In [48]:
import pandas as pd
%matplotlib inline
In [4]:
import json
In [7]:
path="/home/topo/repos/ghub/pydata-book/ch02/usagov_bitly_data2012-03-16-1331923249.txt"
In [9]:
open(path).readline()
Out[9]:
In [10]:
records = [json.loads(line) for line in open(path)]
In [16]:
# First two parsed records (dicts of bitly click metadata)
records[:2]
Out[16]:
In [21]:
# Time-zone field of the first record
records[0]['tz']
Out[21]:
In [23]:
# Collect time zones; guard with 'in' because not every record has a 'tz' key
tzones = [rec['tz'] for rec in records if 'tz' in rec]
In [29]:
tzones[:10]
Out[29]:
In [30]:
# How many records carried a 'tz' key at all
len(tzones)
Out[30]:
In [31]:
# Counting time zones with pandas
In [32]:
from pandas import DataFrame, Series
In [33]:
# One row per record; missing keys become NaN columns automatically
df = DataFrame(records)
In [35]:
df[:2]['tz']
Out[35]:
In [37]:
df['tz']
Out[37]:
In [39]:
# Frequency of each time zone, descending (empty strings count as a value; NaN is dropped)
tz_counts = df['tz'].value_counts()
In [41]:
tz_counts[:12]
Out[41]:
In [42]:
clean_tz = df['tz'].fillna('Missing')
In [44]:
clean_tz[clean_tz == ''] ='Unknown'
In [45]:
clean_tz_counts = clean_tz.value_counts()
print clean_tz_counts
In [49]:
clean_tz_counts[:10].plot(kind='barh')
Out[49]:
In [50]:
clean_tz_counts[:10].plot(kind='bar')
Out[50]:
In [53]:
df['a'][:5]
Out[53]:
In [55]:
t = df['a'][5]
In [56]:
print t
In [58]:
t.split()[0]
Out[58]:
In [59]:
results = Series([x.split()[0] for x in df.a.dropna()])
In [60]:
results.value_counts()
Out[60]:
In [62]:
df.a.notnull().value_counts()
Out[62]:
In [66]:
df_notnull = df[df.a.notnull()]
print df_notnull.a.notnull().value_counts()
In [67]:
import numpy as np
In [69]:
os_type = np.where(df_notnull['a'].str.contains('Windows'), 'Windows', 'Not Windows')
In [71]:
os_type
Out[71]:
In [72]:
by_tz_os = df_notnull.groupby(['tz', os_type])
In [74]:
by_tz_os.size()
Out[74]:
In [78]:
agg_counts = by_tz_os.size().unstack().fillna(0)
print agg_counts[:10]
In [81]:
#column wise sums
agg_counts.sum(axis=0)
Out[81]:
In [90]:
#row wise sums i.e. add "windows" and "not windows" columns
agg_counts.sum(axis=1)[:10]
Out[90]:
In [91]:
order = agg_counts.sum(axis=1).argsort()
In [84]:
agg_counts.sum(axis=1)[-10:]
Out[84]:
In [95]:
subset = agg_counts.take(order)[-10:]
print subset
In [97]:
subset.plot(kind='bar')
Out[97]:
In [100]:
# Sum row-wise to get each time zone's total, then divide each row by its
# total (axis=0 aligns the divisor on the row index) so the two OS shares
# of every zone sum to 1
normed_subset = subset.div(subset.sum(axis=1), axis=0)
In [99]:
normed_subset
Out[99]:
In [103]:
# Raw row totals, for comparison with the normalized view above
subset.sum(axis=1)
Out[103]:
In [106]:
# Stacked bars: relative Windows / Not Windows share per time zone
normed_subset.plot(kind='bar', stacked=True)
Out[106]:
In [8]:
cd /home/topo/repos/ghub/pydata-book/ch02/movielens
In [9]:
# Movie_lens dataset
import pandas as pd
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('users.dat', sep='::', header=None,
names=unames)
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None,
names=rnames)
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None,
names=mnames)
In [6]:
In [10]:
users.count()
Out[10]:
In [14]:
movies.count()
Out[14]:
In [12]:
ratings.count()
Out[12]:
In [15]:
data = pd.merge(pd.merge(ratings, users), movies)
In [28]:
print data.shape
print data.count()
In [17]:
data[:5]
Out[17]:
In [19]:
data[:1]
Out[19]:
In [20]:
data.ix[0]
Out[20]:
In [40]:
#mean rating by gender - updated as per current method...the one in book is not working anymore
import numpy as np
mean_ratings = pd.pivot_table(data, values='rating', index=['gender'], columns=['title'], aggfunc=np.mean)
print mean_ratings[:2]
In [37]:
#unstack and print, else it'll print as above
print mean_ratings[:5].unstack()
In [30]:
help(pd.pivot_table)
In [41]:
ratings_by_title = data.groupby('title').size()
In [42]:
print ratings_by_title
In [43]:
print data.groupby('gender').size()
In [51]:
# Titles with at least 250 ratings (note: code uses >=, not the strict > the
# exploratory cells below check)
active_titles = ratings_by_title.index[ratings_by_title >= 250]
In [49]:
# Boolean mask preview for the strict-> variant
(ratings_by_title > 250)[:5]
Out[49]:
In [50]:
ratings_by_title[ratings_by_title >250]
Out[50]:
In [52]:
active_titles[:5]
Out[52]:
In [54]:
# pick mean_ratings for active_titles only
mean_ratings = mean_ratings.ix[active_titles]
In [55]:
mean_ratings[:5]
Out[55]:
In [56]:
#mean_ratings = data.pivot_table('rating', rows='title', cols='gender', aggfunc='mean')
In [ ]: