In [37]:
from pandas import *
set_printoptions(notebook_repr_html=False) # turn off html, i.e., without table.
set_printoptions(max_rows=500) # getting the summary of the view versus getting everything. Terminal takes time to buffer.
import pandas
pandas.__version__
Out[37]:
In [4]:
names = read_csv('baby-names2.csv')
In [17]:
names
Out[17]:
In [18]:
names.head()
Out[18]:
In [19]:
names[names.year == 1880].head()
Out[19]:
In [14]:
names[names.year == 1880].tail()
Out[14]:
In [38]:
boys = names[names.sex == 'boy'] # segment the data into boy and girl names. We have 2 dataframes now.
girls = names[names.sex == 'girl']
In [22]:
boys.groupby('year')
Out[22]:
In [23]:
boys.groupby('year').size()
Out[23]:
In [24]:
names.groupby(['year', 'sex']).size()
Out[24]:
In [26]:
type(names.groupby(['year', 'sex']).size())
Out[26]:
In [27]:
names.groupby(['year', 'sex']).size().ix[2000] # select out by year: 2001, 2002. Only show the inner key.
Out[27]:
In [28]:
boys[boys.year == 2000] # We would like to find out most popular boy names for each year.
Out[28]:
In [29]:
boys[boys.year == 2000].prop
Out[29]:
In [30]:
boys[boys.year == 2000][:5]
Out[30]:
In [31]:
boys[boys.year == 2000].prop.idxmax()
Out[31]:
In [32]:
boys.ix[boys[boys.year == 2000].prop.idxmax()] # to get the whole row.
Out[32]:
In [35]:
def get_max_record(group):
return group.ix[group.prop.idxmax()]
get_max_record(boys)
Out[35]:
In [49]:
def get_max_record(group):
return group.ix[group.prop.idxmax()]
result = boys.groupby('year').apply(get_max_record)
In [53]:
result # the popularity of a certain name goes down over time.
Out[53]:
In [38]:
result.prop.plot()
Out[38]:
In [39]:
boys[boys.name == 'Travis']
Out[39]:
In [40]:
idf = boys.set_index(['name', 'year']) # pull out the indexes, and move them to the row.
In [41]:
idf[-50:]
Out[41]:
In [42]:
idf.ix['Travis']
Out[42]:
In [43]:
idf.ix['Travis'].prop.plot()
Out[43]:
In [44]:
boys.groupby('name')['prop'].mean()
Out[44]:
In [45]:
boys.groupby('name')['prop'].mean().order()
Out[45]:
In [46]:
boys['prop'].describe() # Getting the summary of statistics from the dataframe, based on proportion.e
Out[46]:
In [47]:
result = boys.groupby('year')['prop'].describe()
In [48]:
result[:50]
Out[48]:
In [49]:
df = boys[boys.year == 2008]
In [50]:
df.prop
Out[50]:
In [51]:
df = boys[boys.year == 2008].sort_index(by='prop', ascending=False) # If not in descending order. Can also do ascending=True for ascending.
In [52]:
df.prop
Out[52]:
In [53]:
df.prop.cumsum() # numpy
Out[53]:
In [54]:
df.prop.cumsum().searchsorted(0.5) # how many does it take to reach 50%. Also called a measure of diversity.
Out[54]:
In [55]:
df.prop.cumsum()[:130]
Out[55]:
In [56]:
def get_quantile_count(group, quantile = 0.5):
df = group.sort_index(by='prop', ascending=False)
return df.prop.cumsum().searchsorted(quantile)
boys.groupby('year').apply(get_quantile_count).plot()
Out[56]:
In [60]:
def get_quantile_count(group, quantile=0.5): # Problem with no different colors for boys and girls.
group = group.groupby('soundex').sum()
df = group.sort_index(by='prop', ascending=False)
return df.prop.cumsum().searchsorted(quantile)
#f = lambda x: get_quantile_count(x, 0.1)
q = 0.25
boy_ct = boys.groupby('year').apply(get_quantile_count, quantile=q) # to pass different values for quantile
girl_ct = girls.groupby('year').apply(get_quantile_count, quantile=q)
boy_ct.plot(label='boy')
girl_ct.plot(label='girl')
legend(loc='best') # with --pylab=inline, we don't have to do plt.legend()
Out[60]:
In [61]:
boys[boys.year == 2008].prop.rank() # mean rank by default.
Out[61]:
In [20]:
grouped = boys.groupby('year')['prop']
In [10]:
grouped.transform(Series.rank) # transform is more rigid than apply. Output the same size as the input.
Out[10]:
In [11]:
boys['year_rank'] = grouped.transform(Series.rank)
In [15]:
boys[boys.name == 'Wesley'].year_rank.plot()
Out[15]:
In [14]:
idf = boys.set_index(['name', 'year']) # same as above
idf.ix['Wesley']
idf.ix['Wesley'].year_rank
idf.ix['Wesley'].year_rank.plot()
Out[14]:
In [16]:
% timeit result = grouped.transform(lambda x: x - x.mean())
In [17]:
% timeit result = grouped.apply(lambda x: x - x.mean())
In [21]:
names # Output only the proportion of each name, not the amount.
Out[21]:
In [39]:
births = read_csv('births.csv')
In [29]:
merged = merge(names, births, on=['year', 'sex']) # merge 2 tables: names and births.
In [28]:
merge(names, births) # same as above. Join is performed using the common columns: year and sex
# Many to many joins in SQL compute the cartesians products of duplicated keys.
Out[28]:
In [30]:
merged['persons'] = np.floor(merged.prop * merged.births)
In [31]:
merged.head()
Out[31]:
In [34]:
merged.groupby(['name', 'sex'])['persons'].sum() # slice and dice. It's a hierarhical labeling.
Out[34]:
In [35]:
merged.groupby(['name', 'sex'])['persons'].sum().order()
Out[35]:
In [40]:
mboys = merge(boys, births) # inner join by default.
In [41]:
mboys['persons'] = np.floor(mboys.prop * mboys.births)
In [42]:
persons = mboys.set_index(['year', 'name']).persons # Select out persons
In [44]:
type(persons) # One dimensional series.
Out[44]:
In [45]:
persons # hierarhical index
Out[45]:
In [46]:
persons.ix[:, 'Christopher'].plot(kind='bar', rot=90) # Select out all the people named Chris. Plot is kind of crowded. Matplotlib doesn't go more than 130 in x axis.
Out[46]:
In [59]:
persons.unstack('name') # Create a data frame whose columns are each unique names, and the row indexes are the years.
Out[59]:
In [60]:
result = _ # underscore in ipython: the output of the last statement, because we don't want to compute again the same thing.
In [61]:
result
Out[61]:
In [62]:
result['Wesley']
Out[62]:
In [63]:
result['Wesley'].plot()
Out[63]:
In [ ]: