In [9]:
import pandas as pd
import numpy as np
In [10]:
# Read the 1880 file. It has no header row, so the column labels are
# supplied explicitly.
names1880 = pd.read_csv(
    r'/Users/Harish/Documents/HK_Work/Python/Python-for-Data-Analysis/chapter 02/names/yob1880.txt',
    header=None,
    names=['name', 'sex', 'births'],
)
# Alternative (Windows) location of the same file:
# names1880 = pd.read_csv(r'C:\Users\hrao\Documents\Personal\HK\Python\Python-for-Data-Analysis\chapter 02\names\yob1880.txt',
#                         names=['name', 'sex', 'births'])
In [11]:
names1880
Out[11]:
In [12]:
# Total 1880 births for each value of the 'sex' column.
names1880.groupby(by='sex')['births'].agg('sum')
Out[12]:
In [13]:
# Years to load: 1880 through 2010 inclusive (range() excludes its stop value).
years = range(1880, 2010 + 1)
In [14]:
# Accumulator for the per-year DataFrames; filled by the loading loop.
pieces = list()
In [15]:
# Column labels passed to read_csv for every yearly file.
columns = [
    'name',
    'sex',
    'births',
]
In [16]:
# Reset the accumulator here so that re-running this cell does not append
# every year's frame a second time.
pieces = []
for year in years:
    path = r'/Users/Harish/Documents/HK_Work/Python/Python-for-Data-Analysis/chapter 02/names/yob%d.txt' % year
    # Alternative (Windows) location of the same files:
    # path = r'C:\Users\hrao\Documents\Personal\HK\Python\Python-for-Data-Analysis\chapter 02\names\yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    # Tag every row with the year of the file it was read from.
    frame['year'] = year
    pieces.append(frame)
In [17]:
# Stack the per-year frames row-wise into one DataFrame; ignore_index
# discards each file's own row labels in favour of a fresh 0..n-1 index.
names = pd.concat(objs=pieces, axis=0, ignore_index=True)
In [18]:
names
Out[18]:
In [19]:
# Births summed per year (rows) and sex (columns).
# The aggregation is named by string: passing the builtin `sum` is
# deprecated in recent pandas and emits a FutureWarning.
total_births = names.pivot_table(
    values='births',
    index='year',
    columns='sex',
    aggfunc='sum',
)
In [20]:
total_births.tail()
Out[20]:
In [21]:
%matplotlib inline
In [22]:
# Line chart with one line per column of total_births.
total_births.plot(kind='line', title='Total births by sex and year')
Out[22]:
In [23]:
def add_prop(group):
    """Add a ``prop`` column holding each row's share of the group's births.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``births`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``group`` object, which now has a ``prop`` column equal to
        ``births / births.sum()``. The input frame is modified in place.
    """
    # Cast to float so the division is a true division on any Python version.
    births = group['births'].astype(float)
    group['prop'] = births / births.sum()
    return group
In [24]:
# Add the per-(year, sex) proportion column to every row.
# group_keys=False keeps the original row index: on pandas >= 2.0 the default
# (True) prepends 'year' and 'sex' as index levels, which makes the later
# groupby(['year', 'sex']) calls ambiguous because the same labels are also
# columns.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
In [25]:
names
Out[25]:
In [26]:
# Sanity check: the proportions within every (year, sex) group sum to 1
# up to floating-point tolerance.
np.allclose(
    names.groupby(by=['year', 'sex'])['prop'].sum(),
    1,
)
Out[26]:
In [27]:
def get_top1000(group):
    """Return the rows of ``group`` with the largest ``births`` values.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``births`` column.

    Returns
    -------
    pandas.DataFrame
        ``group`` sorted by ``births`` in descending order and cut to the
        first 1000 rows. All rows are returned when there are fewer.
    """
    ranked = group.sort_values(by='births', ascending=False)
    # Slicing a DataFrame with [:n] selects rows by position.
    return ranked[:1000]
In [28]:
# Keep the most frequent names within every (year, sex) group.
grouped = names.groupby(by=['year', 'sex'])
top1000 = grouped.apply(get_top1000)
# Replace whatever index apply() produced with plain consecutive integers.
top1000.index = np.arange(top1000.shape[0])
In [29]:
top1000
Out[29]:
In [30]:
# Rows of top1000 whose 'sex' value is 'M'.
boys = top1000.loc[top1000['sex'] == 'M']
In [31]:
# Rows of top1000 whose 'sex' value is 'F'.
girls = top1000.loc[top1000['sex'] == 'F']
In [32]:
boys.head()
Out[32]:
In [33]:
girls.head()
Out[33]:
In [34]:
# Births summed per year (rows) and name (columns), restricted to top1000.
# NOTE: this rebinds total_births, which earlier held the per-sex totals.
# The aggregation is named by string: passing the builtin `sum` is
# deprecated in recent pandas and emits a FutureWarning.
total_births = top1000.pivot_table(
    values='births',
    index='year',
    columns='name',
    aggfunc='sum',
)
In [35]:
total_births.info()
In [36]:
# Select the columns for four names to plot.
subset = total_births.loc[:, ['John', 'Harry', 'Mary', 'Marilyn']]
In [37]:
# One subplot per selected name.
subset.plot(
    title="Number of births per year",
    subplots=True,
    grid=False,
    figsize=(12, 10),
)
Out[37]:
In [38]:
# Sum of 'prop' over the top1000 rows, per year (rows) and sex (columns).
# The aggregation is named by string: passing the builtin `sum` is
# deprecated in recent pandas and emits a FutureWarning.
table = top1000.pivot_table(
    values='prop',
    index='year',
    columns='sex',
    aggfunc='sum',
)
In [39]:
# y ticks every 0.1 from 0 to 1.2; x ticks every ten years from 1880 to 2010.
table.plot(
    title='Sum of table1000.prop by year and sex',
    xticks=range(1880, 2020, 10),
    yticks=np.linspace(0, 1.2, num=13),
)
Out[39]:
In [40]:
# Rows of boys for the year 2010.
df = boys.loc[boys['year'] == 2010]
In [41]:
df
Out[41]:
In [42]:
# Running total of 'prop', accumulated from the largest proportion down.
prop_cumsum = df.sort_values('prop', ascending=False)['prop'].cumsum()
In [43]:
prop_cumsum[:10]
Out[43]:
In [44]:
# Zero-based position at which 0.5 would be inserted into the cumulative
# proportions. Unlike the 1900 computation, no 1 is added here.
np.searchsorted(prop_cumsum.values, 0.5)
Out[44]:
In [45]:
# Rows of boys for the year 1900 (rebinds df).
df = boys.loc[boys['year'] == 1900]
In [46]:
# Running total of 'prop' for 1900, accumulated from the largest down.
in1900 = df.sort_values('prop', ascending=False)['prop'].cumsum()
In [47]:
# searchsorted returns a zero-based position; adding 1 turns it into a count.
1 + np.searchsorted(in1900.values, 0.5)
Out[47]:
In [48]:
def get_quantile_count(group, q=0.5):
    """Count the largest ``prop`` values needed for their running sum to reach ``q``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a ``prop`` column.
    q : float, default 0.5
        Cumulative-proportion threshold.

    Returns
    -------
    int
        One plus the zero-based position at which ``q`` would be inserted
        into the cumulative sum of ``prop`` sorted in descending order.
    """
    ranked = group.sort_values(by='prop', ascending=False)
    running = ranked['prop'].cumsum().values
    # searchsorted is zero-based, so add 1 to express the result as a count.
    return running.searchsorted(q) + 1
In [49]:
# Quantile count per (year, sex), then pivot 'sex' out of the index so each
# sex becomes a column.
diversity = (
    top1000
    .groupby(['year', 'sex'])
    .apply(get_quantile_count)
    .unstack('sex')
)
In [50]:
diversity.head()
Out[50]:
In [51]:
# Line chart with one line per column of diversity.
diversity.plot(kind='line', title="Number of popular names in top 50%")
Out[51]:
In [54]:
def get_last_letter(x):
    """Return the final character of the string ``x``."""
    return x[-1]


# Final letter of every name; the Series is named so the pivot's row index
# is labelled 'last_letter'.
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'

# Births summed per last letter (rows) and (sex, year) column pairs.
# The aggregation is named by string: passing the builtin `sum` is
# deprecated in recent pandas and emits a FutureWarning.
table = names.pivot_table(
    values='births',
    index=last_letters,
    columns=['sex', 'year'],
    aggfunc='sum',
)
In [57]:
# Keep only three representative years from the 'year' column level.
subtable = table.reindex(level='year', columns=[1910, 1960, 2010])
In [58]:
subtable.head()
Out[58]:
In [59]:
subtable.sum()
Out[59]:
In [60]:
# Divide every column by its own total so each column holds proportions.
letter_prop = subtable.div(subtable.sum().astype(float), axis='columns')
In [61]:
import matplotlib.pyplot as plt
In [62]:
# Two stacked bar charts: 'M' columns on top, 'F' columns below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
letter_prop['M'].plot(ax=axes[0], kind='bar', title='Male', rot=0)
# The legend is drawn on the first chart only.
letter_prop['F'].plot(ax=axes[1], kind='bar', title='Female', rot=0,
                      legend=False)
Out[62]:
In [63]:
# Same normalisation over the full table (rebinds letter_prop): every column
# is divided by its own total.
letter_prop = table.div(table.sum().astype(float), axis='columns')
In [64]:
# Rows for the letters d, n and y under the 'M' columns, transposed so the
# remaining column level becomes the row index.
# .loc replaces .ix, which was removed in pandas 1.0; both arguments here
# are labels, so the selection is unchanged.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
In [65]:
dny_ts.head()
Out[65]:
In [66]:
dny_ts.plot()
Out[66]:
In [67]:
# Distinct values of the 'name' column in top1000.
all_names = top1000['name'].unique()
In [68]:
# True for every name containing 'lesl', ignoring case.
# dtype=bool is forced so that an empty all_names still yields a boolean
# array; np.array([]) would otherwise be float64 and unusable as a mask.
mask = np.array(['lesl' in name.lower() for name in all_names], dtype=bool)
In [69]:
# Keep only the entries of all_names where mask is True.
lesley_like = all_names[mask]
In [70]:
lesley_like
Out[70]:
In [71]:
# Rows of top1000 whose name is one of the lesley_like names.
filtered = top1000.loc[top1000['name'].isin(lesley_like)]
In [72]:
# Total births for each of the matching names.
filtered.groupby(by='name')['births'].sum()
Out[72]:
In [73]:
# Births of the matching names summed per year (rows) and sex (columns).
# NOTE: this rebinds table.
# The aggregation is named by string: passing the builtin `sum` is
# deprecated in recent pandas and emits a FutureWarning.
table = filtered.pivot_table(
    values='births',
    index='year',
    columns='sex',
    aggfunc='sum',
)
In [74]:
# Divide each row by its own total so the values in a row sum to 1.
row_totals_axis = 1
table = table.div(table.sum(axis=row_totals_axis), axis='index')
del row_totals_axis
In [75]:
table.tail()
Out[75]:
In [76]:
# Per-column line styles: solid black for 'M', dashed black for 'F'.
table.plot(
    style={
        'M': 'k-',
        'F': 'k--',
    },
)
Out[76]:
In [ ]: