In [9]:
import pandas as pd
import numpy as np

In [10]:
names1880 = pd.read_csv(r'/Users/Harish/Documents/HK_Work/Python/Python-for-Data-Analysis/chapter 02/names/yob1880.txt', 
                        names=['name', 'sex', 'births'])

# names1880 = pd.read_csv(r'C:\Users\hrao\Documents\Personal\HK\Python\Python-for-Data-Analysis\chapter 02\names\yob1880.txt',
#                        names=['name', 'sex', 'births'])

In [11]:
names1880


Out[11]:
name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
5 Margaret F 1578
6 Ida F 1472
7 Alice F 1414
8 Bertha F 1320
9 Sarah F 1288
10 Annie F 1258
11 Clara F 1226
12 Ella F 1156
13 Florence F 1063
14 Cora F 1045
15 Martha F 1040
16 Laura F 1012
17 Nellie F 995
18 Grace F 982
19 Carrie F 949
20 Maude F 858
21 Mabel F 808
22 Bessie F 794
23 Jennie F 793
24 Gertrude F 787
25 Julia F 783
26 Hattie F 769
27 Edith F 768
28 Mattie F 704
29 Rose F 700
... ... ... ...
1970 Philo M 5
1971 Phineas M 5
1972 Presley M 5
1973 Ransom M 5
1974 Reece M 5
1975 Rene M 5
1976 Roswell M 5
1977 Rowland M 5
1978 Sampson M 5
1979 Samual M 5
1980 Santos M 5
1981 Schuyler M 5
1982 Sheppard M 5
1983 Spurgeon M 5
1984 Starling M 5
1985 Sylvanus M 5
1986 Theadore M 5
1987 Theophile M 5
1988 Tilmon M 5
1989 Tommy M 5
1990 Unknown M 5
1991 Vann M 5
1992 Wes M 5
1993 Winston M 5
1994 Wood M 5
1995 Woodie M 5
1996 Worthy M 5
1997 Wright M 5
1998 York M 5
1999 Zachariah M 5

2000 rows × 3 columns


In [12]:
names1880.groupby('sex')['births'].sum()


Out[12]:
sex
F     90993
M    110493
Name: births, dtype: int64

In [13]:
years = range(1880, 2011)

In [14]:
pieces = []

In [15]:
columns = ['name', 'sex', 'births']

In [16]:
for year in years: 
    path = r'/Users/Harish/Documents/HK_Work/Python/Python-for-Data-Analysis/chapter 02/names/yob%d.txt' % year
    #path = r'C:\Users\hrao\Documents\Personal\HK\Python\Python-for-Data-Analysis\chapter 02\names\yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year 
    pieces.append(frame)

In [17]:
names = pd.concat(pieces, ignore_index=True)

In [18]:
names


Out[18]:
name sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
5 Margaret F 1578 1880
6 Ida F 1472 1880
7 Alice F 1414 1880
8 Bertha F 1320 1880
9 Sarah F 1288 1880
10 Annie F 1258 1880
11 Clara F 1226 1880
12 Ella F 1156 1880
13 Florence F 1063 1880
14 Cora F 1045 1880
15 Martha F 1040 1880
16 Laura F 1012 1880
17 Nellie F 995 1880
18 Grace F 982 1880
19 Carrie F 949 1880
20 Maude F 858 1880
21 Mabel F 808 1880
22 Bessie F 794 1880
23 Jennie F 793 1880
24 Gertrude F 787 1880
25 Julia F 783 1880
26 Hattie F 769 1880
27 Edith F 768 1880
28 Mattie F 704 1880
29 Rose F 700 1880
... ... ... ... ...
1690754 Zaviyon M 5 2010
1690755 Zaybrien M 5 2010
1690756 Zayshawn M 5 2010
1690757 Zayyan M 5 2010
1690758 Zeal M 5 2010
1690759 Zealan M 5 2010
1690760 Zecharia M 5 2010
1690761 Zeferino M 5 2010
1690762 Zekariah M 5 2010
1690763 Zeki M 5 2010
1690764 Zeriah M 5 2010
1690765 Zeshan M 5 2010
1690766 Zhyier M 5 2010
1690767 Zildjian M 5 2010
1690768 Zinn M 5 2010
1690769 Zishan M 5 2010
1690770 Ziven M 5 2010
1690771 Zmari M 5 2010
1690772 Zoren M 5 2010
1690773 Zuhaib M 5 2010
1690774 Zyeire M 5 2010
1690775 Zygmunt M 5 2010
1690776 Zykerion M 5 2010
1690777 Zylar M 5 2010
1690778 Zylin M 5 2010
1690779 Zymaire M 5 2010
1690780 Zyonne M 5 2010
1690781 Zyquarius M 5 2010
1690782 Zyran M 5 2010
1690783 Zzyzx M 5 2010

1690784 rows × 4 columns


In [19]:
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc=sum)

In [20]:
total_births.tail()


Out[20]:
sex F M
year
2006 1896468 2050234
2007 1916888 2069242
2008 1883645 2032310
2009 1827643 1973359
2010 1759010 1898382

In [21]:
%matplotlib inline

In [22]:
total_births.plot(title='Total births by sex and year')


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ed989e8>

In [23]:
def add_prop(group):
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group

In [24]:
names = names.groupby(['year','sex']).apply(add_prop)

In [25]:
names


Out[25]:
name sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
5 Margaret F 1578 1880 0.017342
6 Ida F 1472 1880 0.016177
7 Alice F 1414 1880 0.015540
8 Bertha F 1320 1880 0.014507
9 Sarah F 1288 1880 0.014155
10 Annie F 1258 1880 0.013825
11 Clara F 1226 1880 0.013474
12 Ella F 1156 1880 0.012704
13 Florence F 1063 1880 0.011682
14 Cora F 1045 1880 0.011484
15 Martha F 1040 1880 0.011429
16 Laura F 1012 1880 0.011122
17 Nellie F 995 1880 0.010935
18 Grace F 982 1880 0.010792
19 Carrie F 949 1880 0.010429
20 Maude F 858 1880 0.009429
21 Mabel F 808 1880 0.008880
22 Bessie F 794 1880 0.008726
23 Jennie F 793 1880 0.008715
24 Gertrude F 787 1880 0.008649
25 Julia F 783 1880 0.008605
26 Hattie F 769 1880 0.008451
27 Edith F 768 1880 0.008440
28 Mattie F 704 1880 0.007737
29 Rose F 700 1880 0.007693
... ... ... ... ... ...
1690754 Zaviyon M 5 2010 0.000003
1690755 Zaybrien M 5 2010 0.000003
1690756 Zayshawn M 5 2010 0.000003
1690757 Zayyan M 5 2010 0.000003
1690758 Zeal M 5 2010 0.000003
1690759 Zealan M 5 2010 0.000003
1690760 Zecharia M 5 2010 0.000003
1690761 Zeferino M 5 2010 0.000003
1690762 Zekariah M 5 2010 0.000003
1690763 Zeki M 5 2010 0.000003
1690764 Zeriah M 5 2010 0.000003
1690765 Zeshan M 5 2010 0.000003
1690766 Zhyier M 5 2010 0.000003
1690767 Zildjian M 5 2010 0.000003
1690768 Zinn M 5 2010 0.000003
1690769 Zishan M 5 2010 0.000003
1690770 Ziven M 5 2010 0.000003
1690771 Zmari M 5 2010 0.000003
1690772 Zoren M 5 2010 0.000003
1690773 Zuhaib M 5 2010 0.000003
1690774 Zyeire M 5 2010 0.000003
1690775 Zygmunt M 5 2010 0.000003
1690776 Zykerion M 5 2010 0.000003
1690777 Zylar M 5 2010 0.000003
1690778 Zylin M 5 2010 0.000003
1690779 Zymaire M 5 2010 0.000003
1690780 Zyonne M 5 2010 0.000003
1690781 Zyquarius M 5 2010 0.000003
1690782 Zyran M 5 2010 0.000003
1690783 Zzyzx M 5 2010 0.000003

1690784 rows × 5 columns


In [26]:
np.allclose(names.groupby(['year','sex']).prop.sum(),1)


Out[26]:
True

In [27]:
def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]

In [28]:
grouped = names.groupby(['year','sex'])
top1000 = grouped.apply(get_top1000)
top1000.index = np.arange(len(top1000))

In [29]:
top1000


Out[29]:
name sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
5 Margaret F 1578 1880 0.017342
6 Ida F 1472 1880 0.016177
7 Alice F 1414 1880 0.015540
8 Bertha F 1320 1880 0.014507
9 Sarah F 1288 1880 0.014155
10 Annie F 1258 1880 0.013825
11 Clara F 1226 1880 0.013474
12 Ella F 1156 1880 0.012704
13 Florence F 1063 1880 0.011682
14 Cora F 1045 1880 0.011484
15 Martha F 1040 1880 0.011429
16 Laura F 1012 1880 0.011122
17 Nellie F 995 1880 0.010935
18 Grace F 982 1880 0.010792
19 Carrie F 949 1880 0.010429
20 Maude F 858 1880 0.009429
21 Mabel F 808 1880 0.008880
22 Bessie F 794 1880 0.008726
23 Jennie F 793 1880 0.008715
24 Gertrude F 787 1880 0.008649
25 Julia F 783 1880 0.008605
26 Hattie F 769 1880 0.008451
27 Edith F 768 1880 0.008440
28 Mattie F 704 1880 0.007737
29 Rose F 700 1880 0.007693
... ... ... ... ... ...
261847 Yair M 201 2010 0.000106
261848 Talan M 201 2010 0.000106
261849 Keyon M 201 2010 0.000106
261850 Kael M 201 2010 0.000106
261851 Demarion M 200 2010 0.000105
261852 Gibson M 200 2010 0.000105
261853 Reagan M 200 2010 0.000105
261854 Cristofer M 199 2010 0.000105
261855 Daylen M 199 2010 0.000105
261856 Jordon M 199 2010 0.000105
261857 Dashawn M 198 2010 0.000104
261858 Masen M 198 2010 0.000104
261859 Rowen M 197 2010 0.000104
261860 Yousef M 197 2010 0.000104
261861 Thaddeus M 197 2010 0.000104
261862 Kadin M 197 2010 0.000104
261863 Dillan M 197 2010 0.000104
261864 Clarence M 197 2010 0.000104
261865 Slade M 196 2010 0.000103
261866 Clinton M 196 2010 0.000103
261867 Sheldon M 196 2010 0.000103
261868 Keshawn M 195 2010 0.000103
261869 Menachem M 195 2010 0.000103
261870 Joziah M 195 2010 0.000103
261871 Bailey M 194 2010 0.000102
261872 Camilo M 194 2010 0.000102
261873 Destin M 194 2010 0.000102
261874 Jaquan M 194 2010 0.000102
261875 Jaydan M 194 2010 0.000102
261876 Maxton M 193 2010 0.000102

261877 rows × 5 columns


In [30]:
boys = top1000[top1000.sex == 'M']

In [31]:
girls = top1000[top1000.sex == 'F']

In [32]:
boys.head()


Out[32]:
name sex births year prop
942 John M 9655 1880 0.087381
943 William M 9533 1880 0.086277
944 James M 5927 1880 0.053641
945 Charles M 5348 1880 0.048401
946 George M 5126 1880 0.046392

In [33]:
girls.head()


Out[33]:
name sex births year prop
0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188

In [34]:
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc=sum)

In [35]:
total_births.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6868 entries, Aaden to Zuri
dtypes: float64(6868)
memory usage: 6.9 MB

In [36]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [37]:
subset.plot(subplots=True, figsize=(12, 10), grid=False, 
            title="Number of births per year")


Out[37]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x129376e80>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1293b0358>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x122bed518>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x1293eaa58>], dtype=object)

In [38]:
table = top1000.pivot_table('prop', index='year',
                            columns='sex', aggfunc=sum)

In [39]:
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1294ca748>

In [40]:
df = boys[boys.year == 2010]

In [41]:
df


Out[41]:
name sex births year prop
260877 Jacob M 21875 2010 0.011523
260878 Ethan M 17866 2010 0.009411
260879 Michael M 17133 2010 0.009025
260880 Jayden M 17030 2010 0.008971
260881 William M 16870 2010 0.008887
260882 Alexander M 16634 2010 0.008762
260883 Noah M 16281 2010 0.008576
260884 Daniel M 15679 2010 0.008259
260885 Aiden M 15403 2010 0.008114
260886 Anthony M 15364 2010 0.008093
260887 Joshua M 15238 2010 0.008027
260888 Mason M 14728 2010 0.007758
260889 Christopher M 14135 2010 0.007446
260890 Andrew M 14093 2010 0.007424
260891 David M 14042 2010 0.007397
260892 Matthew M 13954 2010 0.007350
260893 Logan M 13943 2010 0.007345
260894 Elijah M 13735 2010 0.007235
260895 James M 13714 2010 0.007224
260896 Joseph M 13657 2010 0.007194
260897 Gabriel M 12722 2010 0.006701
260898 Benjamin M 12280 2010 0.006469
260899 Ryan M 11886 2010 0.006261
260900 Samuel M 11776 2010 0.006203
260901 Jackson M 11693 2010 0.006159
260902 John M 11424 2010 0.006018
260903 Nathan M 11269 2010 0.005936
260904 Jonathan M 11028 2010 0.005809
260905 Christian M 10965 2010 0.005776
260906 Liam M 10852 2010 0.005716
... ... ... ... ... ...
261847 Yair M 201 2010 0.000106
261848 Talan M 201 2010 0.000106
261849 Keyon M 201 2010 0.000106
261850 Kael M 201 2010 0.000106
261851 Demarion M 200 2010 0.000105
261852 Gibson M 200 2010 0.000105
261853 Reagan M 200 2010 0.000105
261854 Cristofer M 199 2010 0.000105
261855 Daylen M 199 2010 0.000105
261856 Jordon M 199 2010 0.000105
261857 Dashawn M 198 2010 0.000104
261858 Masen M 198 2010 0.000104
261859 Rowen M 197 2010 0.000104
261860 Yousef M 197 2010 0.000104
261861 Thaddeus M 197 2010 0.000104
261862 Kadin M 197 2010 0.000104
261863 Dillan M 197 2010 0.000104
261864 Clarence M 197 2010 0.000104
261865 Slade M 196 2010 0.000103
261866 Clinton M 196 2010 0.000103
261867 Sheldon M 196 2010 0.000103
261868 Keshawn M 195 2010 0.000103
261869 Menachem M 195 2010 0.000103
261870 Joziah M 195 2010 0.000103
261871 Bailey M 194 2010 0.000102
261872 Camilo M 194 2010 0.000102
261873 Destin M 194 2010 0.000102
261874 Jaquan M 194 2010 0.000102
261875 Jaydan M 194 2010 0.000102
261876 Maxton M 193 2010 0.000102

1000 rows × 5 columns


In [42]:
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()

In [43]:
prop_cumsum[:10]


Out[43]:
260877    0.011523
260878    0.020934
260879    0.029959
260880    0.038930
260881    0.047817
260882    0.056579
260883    0.065155
260884    0.073414
260885    0.081528
260886    0.089621
Name: prop, dtype: float64

In [44]:
prop_cumsum.values.searchsorted(0.5)


Out[44]:
116

In [45]:
df = boys[boys.year == 1900]

In [46]:
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()

In [47]:
in1900.values.searchsorted(0.5) + 1


Out[47]:
25

In [48]:
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1

In [49]:
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')

In [50]:
diversity.head()


Out[50]:
sex F M
year
1880 38 14
1881 38 14
1882 38 15
1883 39 15
1884 39 16

In [51]:
diversity.plot(title="Number of popular names in top 50%")


Out[51]:
<matplotlib.axes._subplots.AxesSubplot at 0x12958a6a0>

In [54]:
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter) 
last_letters.name = 'last_letter'
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc=sum)

In [57]:
subtable = table.reindex(columns=[1910,1960,2010], level='year')

In [58]:
subtable.head()


Out[58]:
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 108376.0 691247.0 670605.0 977.0 5204.0 28438.0
b NaN 694.0 450.0 411.0 3912.0 38859.0
c 5.0 49.0 946.0 482.0 15476.0 23125.0
d 6750.0 3729.0 2607.0 22111.0 262112.0 44398.0
e 133569.0 435013.0 313833.0 28655.0 178823.0 129012.0

In [59]:
subtable.sum()


Out[59]:
sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64

In [60]:
letter_prop = subtable / subtable.sum().astype(float)

In [61]:
import matplotlib.pyplot as plt

In [62]:
fig, axes = plt.subplots(2, 1, figsize=(10, 8)) 
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male') 
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
legend=False)


Out[62]:
<matplotlib.axes._subplots.AxesSubplot at 0x127611438>

In [63]:
letter_prop = table / table.sum().astype(float)

In [64]:
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T

In [65]:
dny_ts.head()


Out[65]:
last_letter d n y
year
1880 0.083055 0.153213 0.075760
1881 0.083247 0.153214 0.077451
1882 0.085340 0.149560 0.077537
1883 0.084066 0.151646 0.079144
1884 0.086120 0.149915 0.080405

In [66]:
dny_ts.plot()


Out[66]:
<matplotlib.axes._subplots.AxesSubplot at 0x1273b0f98>

In [67]:
all_names = top1000.name.unique()

In [68]:
mask = np.array(['lesl' in x.lower() for x in all_names])

In [69]:
lesley_like = all_names[mask]

In [70]:
lesley_like


Out[70]:
array(['Leslie', 'Lesley', 'Leslee', 'Lesli', 'Lesly'], dtype=object)

In [71]:
filtered = top1000[top1000.name.isin(lesley_like)]

In [72]:
filtered.groupby('name').births.sum()


Out[72]:
name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births, dtype: int64

In [73]:
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc=sum)

In [74]:
table = table.div(table.sum(1), axis=0)

In [75]:
table.tail()


Out[75]:
sex F M
year
2006 1.0 NaN
2007 1.0 NaN
2008 1.0 NaN
2009 1.0 NaN
2010 1.0 NaN

In [76]:
table.plot(style={'M':'k-', 'F': 'k--'})


Out[76]:
<matplotlib.axes._subplots.AxesSubplot at 0x1274a7e48>

In [ ]: