notebook.community

Edit and run



In [100]:

    
#1880-2010年间全美婴儿姓名



In [101]:

    
import pandas as pd



In [102]:

    
#path =  'F:/syn/git/pydata-book/ch02/names/'
path =  'E:/git/pydata-book/ch02/names/'
file1880 = 'yob1880.txt'
# 姓名，性别，出生数
names=['name', 'sex', 'births']



In [103]:

    
names1880 = pd.read_csv(path + file1880, names=names)



In [104]:

    
names1880









    Out[104]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns:
name      2000  non-null values
sex       2000  non-null values
births    2000  non-null values
dtypes: int64(1), object(2)



In [105]:

    
names1880.groupby('sex').births.sum()









    Out[105]:





sex
F       90993
M      110493
Name: births



In [106]:

    
#将1880年至2010年所有数据都组装到一个DataFrame里,并加上一个year字段
#2010是目前最后一个有效统计年度
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    filename = 'yob%d.txt' % year
    frame = pd.read_csv(path + filename, names=columns)
    frame['year'] = year
    pieces.append(frame)

#将所有的数据整合到单个DataFrame中
names = pd.concat(pieces, ignore_index=True)



In [107]:

    
names









    Out[107]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns:
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
dtypes: int64(2), object(2)



In [108]:

    
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)



In [109]:

    
total_births.tail()



In [110]:

    
total_births.plot(title='Total births by sex and year')









    Out[110]:





<matplotlib.axes.AxesSubplot at 0x21a98b90>



In [111]:

    
def add_prop(group):
    #整数除法会向下圆整
    births = group.births.astype(float)
    
    group['prop'] = births / births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)



In [112]:

    
names









    Out[112]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns:
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
prop      1690784  non-null values
dtypes: float64(1), int64(2), object(2)



In [113]:

    
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)









    Out[113]:





True



In [114]:

    
def get_top1000(group):
    return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)



In [115]:

    
#取top100的另一种方法
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000_2 = pd.concat(pieces, ignore_index=True)



In [116]:

    
top1000_2









    Out[116]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 261877 entries, 0 to 261876
Data columns:
name      261877  non-null values
sex       261877  non-null values
births    261877  non-null values
year      261877  non-null values
prop      261877  non-null values
dtypes: float64(1), int64(2), object(2)



In [117]:

    
#比较小的数据集，接下来分析工作针对这个top1000数据集
top1000[:10]



In [118]:

    
#分析命名趋势

#将前1000个名字分为男女两个部分
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']



In [119]:

    
total_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=sum)



In [120]:

    
total_births









    Out[120]:





<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6865 entries, Aaden to Zuri
dtypes: float64(6865)



In [121]:

    
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]



In [122]:

    
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number fo births per year")









    Out[122]:





array([Axes(0.125,0.747826;0.775x0.152174),
       Axes(0.125,0.565217;0.775x0.152174),
       Axes(0.125,0.382609;0.775x0.152174), Axes(0.125,0.2;0.775x0.152174)], dtype=object)



In [123]:

    
#评估命名多样性的增长
table = top1000.pivot_table('prop', rows='year', cols='sex', aggfunc=sum)



In [124]:

    
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))









    Out[124]:





<matplotlib.axes.AxesSubplot at 0x36e31030>



In [125]:

    
df = boys[boys.year == 2010]



In [126]:

    
df









    Out[126]:





<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, 'M', 1676644) to (2010, 'M', 1677643)
Data columns:
name      1000  non-null values
sex       1000  non-null values
births    1000  non-null values
year      1000  non-null values
prop      1000  non-null values
dtypes: float64(1), int64(2), object(2)



In [127]:

    
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()



In [128]:

    
prop_cumsum[:10]









    Out[128]:





year  sex         
2010  M    1676644    0.011523
           1676645    0.020934
           1676646    0.029959
           1676647    0.038930
           1676648    0.047817
           1676649    0.056579
           1676650    0.065155
           1676651    0.073414
           1676652    0.081528
           1676653    0.089621



In [129]:

    
prop_cumsum.searchsorted(0.5) + 1









    Out[129]:





117



In [130]:

    
df = boys[boys.year == 1990]



In [131]:

    
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()



In [132]:

    
in1900.searchsorted(0.5) + 1









    Out[132]:





45



In [133]:

    
def get_quantile_count(group, q=0.5):
    group = group.sort_index(by='prop', ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1



In [134]:

    
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)



In [135]:

    
diversity.head()









    Out[135]:





year  sex
1880  F      38
      M      14
1881  F      38
      M      14
1882  F      38



In [136]:

    
#按年度统计的密度表
diversity.plot(title="Number of popular names in top 50%")









    Out[136]:





<matplotlib.axes.AxesSubplot at 0x36a53a90>



In [137]:

    
##“最后一个字母”的变革



In [138]:

    
#从name列取出最后一个字母
get_last_letter = lambda x:x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', rows=last_letters, cols=['sex', 'year'], aggfunc=sum)



In [139]:

    
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')



In [140]:

    
subtable.head()









    Out[140]:






  
    
      sex
year
      F
1910
      F
1960
      F
2010
      M
1910
      M
1960
      M
2010
    
    
      last_letter
      
      
      
      
      
      
    
  
  
    
      a
       108376
       691247
       670605
         977
         5204
        28438
    
    
      b
          NaN
          694
          450
         411
         3912
        38859
    
    
      c
            5
           49
          946
         482
        15476
        23125
    
    
      d
         6750
         3729
         2607
       22111
       262112
        44398
    
    
      e
       133569
       435013
       313833
       28655
       178823
       129012



In [141]:

    
subtable.sum()









    Out[141]:





sex  year
F    1910     396416
     1960    2022062
     2010    1759010
M    1910     194198
     1960    2132588
     2010    1898382



In [142]:

    
letter_prop = subtable / subtable.sum().astype(float)



In [143]:

    
import matplotlib.pyplot as plt



In [144]:

    
#男孩女孩名字中各个末字母的比例
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='FeMale', legend=False)









    Out[144]:





<matplotlib.axes.AxesSubplot at 0x368ca4d0>



In [145]:

    
letter_prop = table / table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()



In [146]:

    
#各年出生的男孩中名字以d/n/y结尾的人数比例
dny_ts.plot()









    Out[146]:





<matplotlib.axes.AxesSubplot at 0x37216350>



In [147]:

    
#变成女孩名字的男孩名字(以及相反的情况)
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like









    Out[147]:





array([Leslie, Lesley, Leslee, Lesli, Lesly], dtype=object)



In [148]:

    
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()









    Out[148]:





name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births



In [149]:

    
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc='sum')



In [150]:

    
table = table.div(table.sum(1), axis=0)
table.tail()



In [151]:

    
#各个年度使用“Lesley型”名字的男女比例
table.plot(style={'M':'k-', 'F':'k--'})









    Out[151]:





<matplotlib.axes.AxesSubplot at 0x36a46070>



In [151]:

sex	F	M
year
2006	1896468	2050234
2007	1916888	2069242
2008	1883645	2032310
2009	1827643	1973359
2010	1759010	1898382

			name	sex	births	year	prop
year	sex	None
1880	F	0	Mary	F	7065	1880	0.077643
		1	Anna	F	2604	1880	0.028618
		2	Emma	F	2003	1880	0.022013
		3	Elizabeth	F	1939	1880	0.021309
		4	Minnie	F	1746	1880	0.019188
		5	Margaret	F	1578	1880	0.017342
		6	Ida	F	1472	1880	0.016177
		7	Alice	F	1414	1880	0.015540
		8	Bertha	F	1320	1880	0.014507
		9	Sarah	F	1288	1880	0.014155

	d	n	y
year
1880	0.083055	0.153213	0.075760
1881	0.083247	0.153214	0.077451
1882	0.085340	0.149560	0.077537
1883	0.084066	0.151646	0.079144
1884	0.086120	0.149915	0.080405