In [88]:
cd "/home/bakuda/pandas-book/"
In [89]:
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
In [90]:
open(path).readline()
Out[90]:
In [91]:
import json
records = [json.loads(line) for line in open(path)]
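# Note: each line of the file is a standalone JSON object, so json.loads per
# line builds a list of dicts. A slightly more defensive sketch (same result
# assumed) that skips blank lines and closes the file handle:
with open(path) as f:
    records = [json.loads(line) for line in f if line.strip()]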
In [92]:
records[:1]
Out[92]:
In [14]:
records[0]['tz']
Out[14]:
In [16]:
tzs = [rec['tz'] for rec in records if 'tz' in rec]
In [18]:
tzs[:5]
Out[18]:
In [19]:
type(tzs)
Out[19]:
In [20]:
len(tzs)
Out[20]:
In [21]:
len(set(tzs))
Out[21]:
In [23]:
tzs[:10]
Out[23]:
In [25]:
from collections import Counter
In [26]:
counts = Counter(tzs)
In [28]:
counts.most_common(10)
Out[28]:
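# For comparison, Counter is shorthand for the hand-rolled tally below;
# get_counts is a hypothetical helper, sketched here only to show the pattern.
from collections import defaultdict

def get_counts(seq):
    counts = defaultdict(int)  # missing keys start at 0
    for x in seq:
        counts[x] += 1
    return counts

# sorted(get_counts(tzs).items(), key=lambda kv: kv[1])[-10:] should agree
# with counts.most_common(10) above, up to the ordering of ties.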
In [93]:
import pandas as pd
In [94]:
df = pd.DataFrame(records)
In [95]:
df.shape
Out[95]:
In [96]:
df['tz'][:10]
Out[96]:
In [35]:
tz_counts = df['tz'].value_counts()
In [36]:
tz_counts[:10]
Out[36]:
In [37]:
clean_tz = df['tz'].fillna('Missing')
In [38]:
clean_tz[clean_tz == ''] = 'Unknown'
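# The two cleanup steps can also be chained into a single expression; replace()
# handles the empty strings (equivalent result, just a style preference):
clean_tz = df['tz'].fillna('Missing').replace('', 'Unknown')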
In [39]:
tz_counts = clean_tz.value_counts()
In [40]:
tz_counts[:10]
Out[40]:
In [41]:
tz_counts[:10].plot(kind='bar', rot=0)
Out[41]:
In [42]:
tz_counts[:10].plot(kind='barh', rot=0)
Out[42]:
In [43]:
df['a'][:5]
Out[43]:
In [44]:
s = df['a'][0]
In [45]:
s
Out[45]:
In [47]:
s.split()[0]
Out[47]:
In [53]:
results = pd.Series([x.split()[0] for x in df['a'].dropna()])
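# The same first-token extraction can be done with pandas' vectorized string
# methods instead of a list comprehension (a sketch; the result keeps the
# original index rather than a fresh one, which value_counts does not mind):
df['a'].dropna().str.split().str[0][:5]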
In [52]:
# df.tz (attribute access) is equivalent to the bracket indexing below
df['tz'][:5]
Out[52]:
In [54]:
results[:5]
Out[54]:
In [55]:
# note: on newer rpy2 versions the extension is loaded with %load_ext rpy2.ipython
%load_ext rmagic
In [57]:
%%R
install.packages("nycflights13", repos='http://cran.us.r-project.org')
In [58]:
%%R
library(nycflights13)
write.csv(flights, "flights.csv")
In [59]:
flights = pd.read_csv("flights.csv", index_col=0)  # the unnamed first column holds R's row names
In [60]:
flights.shape
Out[60]:
In [62]:
%%R
head(flights)
In [63]:
%%R
search()
In [100]:
#flights.head()
In [66]:
df.loc[:2,:]
Out[66]:
In [67]:
type(records)
Out[67]:
In [68]:
type(records[0])
Out[68]:
In [69]:
df.columns
Out[69]:
In [74]:
df.shape
Out[74]:
In [81]:
df.to_csv('/home/bakuda/pandas-book/ch02/bitly-data-for-R.csv', encoding='utf8', header=True, sep=',')
In [82]:
df = pd.DataFrame(records)
In [99]:
#df.loc[10:18]
In [103]:
results.value_counts()[:10].plot(kind='barh')
Out[103]:
In [104]:
cdf = df[df['a'].notnull()]
In [105]:
cdf.shape
Out[105]:
In [106]:
cdf.groupby('a')
Out[106]:
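# groupby by itself only returns a lazy GroupBy object; nothing is computed
# until an aggregation is applied, for example:
cdf.groupby('a').size()[:5]   # counts per user-agent string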
In [111]:
results.value_counts()[:20].plot(kind='barh')
Out[111]:
In [112]:
import numpy as np
os_types = np.where(cdf['a'].str.contains('Windows'), 'Windows', 'Not Windows')
In [114]:
os_types[:5]
Out[114]:
In [116]:
by_tz_os = cdf.groupby(['tz',os_types])
In [118]:
by_tz_os.size()
Out[118]:
In [119]:
agg_counts = by_tz_os.size().unstack().fillna(0)
In [120]:
agg_counts[:10]
Out[120]:
In [121]:
# unstack() is like tidyr::spread() in R
by_tz_os.size().unstack()
Out[121]:
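# The inverse of unstack() is stack(), roughly tidyr::gather() in R:
agg_counts.stack()[:5]   # back to a Series with a (tz, OS) MultiIndex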
In [123]:
cdf.groupby(['tz',os_types]).size().unstack().fillna(0)[:5]
Out[123]:
In [124]:
indexer = agg_counts.sum(1).argsort()
In [125]:
indexer[:5]
Out[125]:
In [126]:
count_subset = agg_counts.take(indexer)[-10:]
In [127]:
count_subset
Out[127]:
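# An equivalent, label-based way to pull the ten busiest time zones
# (a sketch; note nlargest orders them largest-first, the reverse of
# the argsort/take version above):
agg_counts.loc[agg_counts.sum(1).nlargest(10).index]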
In [130]:
count_subset.plot(kind='barh', stacked=True)
Out[130]:
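# To compare the relative OS split within each time zone rather than raw
# counts, the rows can be normalized to sum to 1 before plotting (a sketch;
# normed_subset is a hypothetical name):
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)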
In [146]:
cd "/home/bakuda/pandas-book/ch02/movielens/"
In [148]:
import pandas as pd
# sep='::' needs the python parser engine; passing it explicitly avoids the fallback warning
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('./users.dat', sep='::', header=None,
                      names=unames, engine='python')
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('./ratings.dat', sep='::', header=None,
                        names=rnames, engine='python')
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./movies.dat', sep='::', header=None,
                       names=mnames, engine='python')
In [149]:
users[:5]
Out[149]:
In [153]:
ratings[:5]
Out[153]:
In [152]:
movies[:5]
Out[152]:
In [154]:
data = pd.merge(pd.merge(ratings, users), movies)
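# pd.merge joins on the overlapping column names by default (user_id first,
# then movie_id); spelling the keys out is equivalent and more explicit:
pd.merge(pd.merge(ratings, users, on='user_id'), movies, on='movie_id')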
In [155]:
data.shape
Out[155]:
In [156]:
data[:10]
Out[156]:
In [158]:
data[:1]
Out[158]:
In [159]:
data.iloc[0]   # .ix has been removed from pandas; use iloc for positional indexing
Out[159]: