-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathusagovdata.py
More file actions
70 lines (53 loc) · 2.1 KB
/
usagovdata.py
File metadata and controls
70 lines (53 loc) · 2.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint
from collections import Counter
from collections import defaultdict
def load_records(path):
    """Parse a file of newline-delimited JSON into a list of objects.

    The file is read as bytes and every line is decoded as UTF-8 before it
    is handed to ``json.loads``.  Blank or whitespace-only lines are skipped
    so that an empty separator line does not abort the whole load with a
    JSON decoding error.
    """
    with open(path, 'rb') as f:
        return [json.loads(line.decode('utf-8')) for line in f if line.strip()]


def count_time_zones(records):
    """Tally the ``'tz'`` value of each record in a ``Counter``.

    Records that have no ``'tz'`` key are ignored; an empty-string value is
    counted under ``'Unknown'``.
    """
    return Counter('Unknown' if rec['tz'] == '' else rec['tz']
                   for rec in records if 'tz' in rec)


def clean_time_zones(frame):
    """Return the ``'tz'`` column of *frame* with gaps given readable labels.

    Null entries become ``'Missing'`` and empty strings become ``'Unknown'``.
    ``fillna`` returns a new Series, so *frame* itself is not modified.
    """
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    return clean_tz


def leading_agent_tokens(agents):
    """Return the first whitespace-delimited token of each string in *agents*.

    Strings that contain no token at all (empty or whitespace-only) are
    skipped instead of raising ``IndexError``.
    """
    split_agents = (agent.split(None, 1) for agent in agents)
    return [parts[0] for parts in split_agents if parts]


def count_by_tz_and_os(frame):
    """Build a table of record counts per time zone and operating system.

    Only rows whose ``'a'`` value is not null are considered.  A row is
    labelled ``'Windows'`` when its ``'a'`` string contains ``'Windows'`` and
    ``'Not Windows'`` otherwise.  The result has one row per ``'tz'`` value
    and one column per label; combinations that never occur hold 0.
    """
    with_agent = frame[frame['a'].notnull()]
    operating_system = np.where(with_agent['a'].str.contains('Windows'),
                                'Windows', 'Not Windows')
    by_tz_os = with_agent.groupby(['tz', operating_system])
    # size() counts each (tz, os) pair; unstack() pivots the os level into
    # columns, leaving NaN for absent pairs, which fillna(0) turns into 0.
    return by_tz_os.size().unstack().fillna(0)


def largest_rows_by_total(counts, n=10):
    """Return the *n* rows of *counts* with the largest row totals.

    Rows are ranked by the sum across all columns (not by a single column)
    and returned in ascending order of that total, largest last.
    """
    indexer = counts.sum(axis=1).argsort()
    return counts.take(indexer)[-n:]


def normalize_rows(counts):
    """Scale every row of *counts* so that its values sum to 1."""
    return counts.div(counts.sum(axis=1), axis=0)


def main(path='data/usagov_bitly_data2012-03-16-1331923249.txt'):
    """Print time-zone and agent summaries for *path* and plot the OS split."""
    records = load_records(path)

    # get top 10 time zones
    print('Top Time Zones:')
    pprint(count_time_zones(records).most_common(10))

    # now do it with pandas DataFrame
    print('\nTop Time Zones Using DataFrame:')
    frame = pd.DataFrame(records)
    tz_counts = clean_time_zones(frame).value_counts()
    print(tz_counts.head(10))

    # optional plot of the ten most common time zones:
    #tz_counts.head(10).plot(kind='barh', rot=0)
    #plt.show()

    # get top agents
    print('\nTop Agents:')
    results = pd.Series(leading_agent_tokens(frame['a'].dropna()))
    print(results.value_counts().head(8))

    # get top time zones with Windows and non-Windows users
    print('\nUsers and Time Zones:')
    agg_counts = count_by_tz_and_os(frame)
    print(agg_counts.head(5))

    # keep the ten time zones with the largest combined counts
    count_subset = largest_rows_by_total(agg_counts, 10)
    print(count_subset)

    # normalize it to the relative share per time zone and plot it
    normed_subset = normalize_rows(count_subset)
    normed_subset.plot(kind='barh', stacked=True)
    plt.show()


if __name__ == '__main__':
    main()