In [39]:
import pandas
# Plot arange(10) as a line. NOTE(review): `plot` and `arange` are not imported
# anywhere in this file — presumably they come from a pylab-style namespace
# (e.g. %pylab); confirm before running on a fresh kernel.
plot(arange(10))
Out[39]:
Out[39]:
In [40]:
import pandas
# Same plot as the previous cell with 100 points. NOTE(review): `plot` and
# `arange` are not imported in this file — presumably pylab names; confirm.
plot(arange(100))
Out[40]:
Out[40]:
In [41]:
# Plot arange(50). NOTE(review): `plot` and `arange` are not imported in this
# file — presumably pylab names; confirm.
plot(arange(50))
Out[41]:
Out[41]:
In [42]:
# Plot the result of rand(100). NOTE(review): `rand` and `plot` are not
# imported in this file — presumably numpy.random.rand / pyplot.plot via
# pylab; confirm. No random seed is set anywhere in this file, so if `rand`
# is a random generator the figure will differ between runs.
a=rand(100)
plot(a)
Out[42]:
Out[42]:
In [43]:
#usagov_bitly_data_path='F:\syn\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
# Raw string literal: the path contains backslashes, and the `\u` in
# `\usagov...` is an invalid unicode escape in a Python 3 string literal
# (SyntaxError). The raw form produces the same value on Python 2 and also
# parses on Python 3.
# NOTE(review): absolute, machine-specific path — adjust for your checkout.
usagov_bitly_data_path = r'E:\git\pydata-book\ch02\usagov_bitly_data2012-03-16-1331923249.txt'
f = open(usagov_bitly_data_path)
# Peek at the first line. This advances `f`, so anything that later iterates
# over `f` starts from the second line of the file.
f.readline()
Out[43]:
Out[43]:
In [44]:
import json
# Parse one JSON object per line of the data file.
# The file is re-opened here instead of iterating the already-open handle `f`:
# the earlier f.readline() peek consumed the first line of `f`, so reusing it
# would silently drop the first record, and re-running this cell would yield
# an empty list because `f` would already be exhausted. The `with` block also
# closes the handle once the records are loaded.
with open(usagov_bitly_data_path) as data_file:
    records = [json.loads(line) for line in data_file]
In [45]:
# Display the first element of `records` (one parsed JSON line).
records[0]
Out[45]:
Out[45]:
In [46]:
# Display the 'tz' field of the first record (shown as its repr).
records[0]['tz']
Out[46]:
Out[46]:
In [47]:
# Print the 'tz' field of the first record. The parenthesised form is a valid
# call on Python 3 and prints the same text on Python 2, whereas the bare
# `print x` statement is a SyntaxError on Python 3.
print(records[0]['tz'])
In [48]:
# Print the 'nk' field of the first record. The parenthesised form is a valid
# call on Python 3 and prints the same text on Python 2, whereas the bare
# `print x` statement is a SyntaxError on Python 3.
print(records[0]['nk'])
In [49]:
# Collect the 'tz' value of every record that has one; records without a
# 'tz' key are skipped rather than raising KeyError.
time_zones = []
for rec in records:
    if 'tz' in rec:
        time_zones.append(rec['tz'])
In [50]:
# Display the first ten collected 'tz' values.
time_zones[:10]
Out[50]:
Out[50]:
In [51]:
# Number of records that carried a 'tz' key.
len(time_zones)
Out[51]:
Out[51]:
In [52]:
# Display the first two parsed records.
records[:2]
Out[52]:
Out[52]:
In [53]:
def get_counts(sequence):
    """Count how many times each distinct item occurs in ``sequence``.

    Parameters
    ----------
    sequence : iterable of hashable items

    Returns
    -------
    dict
        Maps each distinct item to its number of occurrences. An empty
        input gives an empty dict. Items that never occur are absent, so
        looking them up raises ``KeyError``.
    """
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            # First occurrence: a plain dict has no default, so seed it.
            counts[x] = 1
    return counts
In [54]:
# Occurrences of 'Africa/Cairo' in time_zones. get_counts returns a plain
# dict, so this raises KeyError if that zone never occurs.
get_counts(time_zones)['Africa/Cairo']
Out[54]:
Out[54]:
In [55]:
from collections import defaultdict
def get_counts2(sequence):
    """Count occurrences of each distinct item using a ``defaultdict``.

    Parameters
    ----------
    sequence : iterable of hashable items

    Returns
    -------
    collections.defaultdict
        Maps each distinct item to its number of occurrences. Because the
        default factory is ``int``, looking up an item that never occurred
        returns 0 (and inserts that key) instead of raising ``KeyError``.
    """
    counts = defaultdict(int)  # every missing key starts at 0
    for x in sequence:
        counts[x] += 1
    return counts
In [56]:
# Tally the time zones with the defaultdict-based counter.
counts = get_counts2(time_zones)
In [57]:
# Occurrences of 'America/New_York'. `counts` is a defaultdict(int), so a
# zone that never occurs would give 0 here rather than a KeyError.
counts['America/New_York']
Out[57]:
Out[57]:
In [58]:
# Number of collected 'tz' values (one per record that had a 'tz' key).
len(time_zones)
Out[58]:
Out[58]:
In [59]:
def top_counts(count_dict, n=10):
    """Return the ``n`` most frequent entries of a key -> count mapping.

    Parameters
    ----------
    count_dict : mapping of key -> count
    n : int, default 10
        How many of the largest entries to return.

    Returns
    -------
    list of (count, key) tuples
        Sorted in ascending order, so the most frequent entry is last.
        Ties on count are ordered by key. Returns an empty list when
        ``n`` is not positive or ``count_dict`` is empty.
    """
    # Guard: pairs[-0:] is pairs[0:], i.e. the WHOLE list, so n == 0 would
    # otherwise return everything instead of nothing.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
In [60]:
# The ten highest (count, zone) pairs (default n=10), in ascending order.
top_counts(counts)
Out[60]:
Out[60]:
In [61]:
from collections import Counter
# Same tally using the standard-library Counter: start empty, feed it the
# zones, then show the ten most frequent as (zone, count) pairs.
counts2 = Counter()
counts2.update(time_zones)
counts2.most_common(10)
Out[61]:
Out[61]:
In [62]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# Build a table from the list of record dicts (one row per record) and
# display it.
frame = pd.DataFrame(records)
frame
Out[62]:
Out[62]:
In [63]:
# First ten entries of the 'tz' column.
frame['tz'].head(10)
Out[63]:
Out[63]:
In [64]:
# Frequency of each distinct 'tz' value, most frequent first. value_counts
# leaves out missing values by default, so rows without a 'tz' are not counted.
tz_counts = frame['tz'].value_counts()
In [65]:
# Ten most frequent 'tz' values.
tz_counts.head(10)
Out[65]:
Out[65]:
In [66]:
# Give the two kinds of "no time zone" explicit labels before counting:
# a missing 'tz' becomes 'Missing', an empty-string 'tz' becomes 'Unknown'.
clean_tz = frame['tz'].fillna('Missing').replace('', 'Unknown')
tz_counts = clean_tz.value_counts()
tz_counts.head(10)
Out[66]:
Out[66]:
In [67]:
# Horizontal bar chart of the first ten entries of tz_counts.
tz_counts[:10].plot(kind='barh', rot=0)
Out[67]:
Out[67]:
In [68]:
# Inspect the 'a' value at index label 1 (presumably the user-agent string —
# later cells search it for 'Windows'; confirm against the data).
frame['a'][1]
Out[68]:
Out[68]:
In [69]:
# Inspect the 'a' value at index label 50.
frame['a'][50]
Out[69]:
Out[69]:
In [70]:
# Inspect the 'a' value at index label 51.
frame['a'][51]
Out[70]:
Out[70]:
In [71]:
# Keep only the first whitespace-delimited token of each non-missing 'a'
# value; the result is a fresh Series with a new 0..n-1 index.
results = Series([agent.split(None, 1)[0] for agent in frame.a.dropna()])
results.head(5)
Out[71]:
Out[71]:
In [72]:
# Eight most frequent leading tokens.
results.value_counts().head(8)
Out[72]:
Out[72]:
In [73]:
cframe = frame[frame.a.notnull()]
operting_system = np.where(cframe['a'].str.contains('Windows'), 'Windows',
'Not Windows')
In [74]:
operting_system[:5]
Out[74]:
Out[74]:
In [76]:
by_tz_os = cframe.groupby(['tz', operting_system])
In [77]:
# Count rows per (tz, OS label) group, pivot the OS label into columns, and
# put 0 where a tz/OS combination never occurs.
agg_counts = by_tz_os.size().unstack().fillna(0)
In [78]:
# First ten rows of the tz-by-OS count table.
agg_counts.head(10)
Out[78]:
In [79]:
# Row totals across the OS columns, then the positions that would sort those
# totals in ascending order.
indexer = agg_counts.sum(axis=1).argsort()
indexer.head(10)
Out[79]:
In [82]:
# Reorder rows by ascending total and keep the last ten, i.e. the ten time
# zones with the largest totals.
count_subset = agg_counts.take(indexer).tail(10)
count_subset
Out[82]:
In [84]:
# Stacked horizontal bar chart: one bar per row of count_subset, split by
# its columns.
count_subset.plot(kind='barh', stacked=True)
Out[84]:
In [85]:
# Divide every row by its own total so each row sums to 1, then plot the
# resulting proportions as stacked horizontal bars.
normed_subset = count_subset.div(count_subset.sum(axis=1), axis='index')
normed_subset.plot(kind='barh', stacked=True)
Out[85]:
In [ ]: