In [100]:
#1880-2010年间全美婴儿姓名

In [101]:
import pandas as pd

In [102]:
#path =  'F:/syn/git/pydata-book/ch02/names/'
path =  'E:/git/pydata-book/ch02/names/'
file1880 = 'yob1880.txt'
# 姓名,性别,出生数
names=['name', 'sex', 'births']

In [103]:
names1880 = pd.read_csv(path + file1880, names=names)

In [104]:
names1880


Out[104]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns:
name      2000  non-null values
sex       2000  non-null values
births    2000  non-null values
dtypes: int64(1), object(2)

In [105]:
names1880.groupby('sex').births.sum()


Out[105]:
sex
F       90993
M      110493
Name: births

In [106]:
#将1880年至2010年所有数据都组装到一个DataFrame里,并加上一个year字段
#2010是目前最后一个有效统计年度
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    filename = 'yob%d.txt' % year
    frame = pd.read_csv(path + filename, names=columns)
    frame['year'] = year
    pieces.append(frame)

#将所有的数据整合到单个DataFrame中
names = pd.concat(pieces, ignore_index=True)

In [107]:
names


Out[107]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns:
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
dtypes: int64(2), object(2)

In [108]:
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)

In [109]:
total_births.tail()


Out[109]:
sex F M
year
2006 1896468 2050234
2007 1916888 2069242
2008 1883645 2032310
2009 1827643 1973359
2010 1759010 1898382

In [110]:
total_births.plot(title='Total births by sex and year')


Out[110]:
<matplotlib.axes.AxesSubplot at 0x21a98b90>

In [111]:
def add_prop(group):
    #整数除法会向下圆整
    births = group.births.astype(float)
    
    group['prop'] = births / births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)

In [112]:
names


Out[112]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns:
name      1690784  non-null values
sex       1690784  non-null values
births    1690784  non-null values
year      1690784  non-null values
prop      1690784  non-null values
dtypes: float64(1), int64(2), object(2)

In [113]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)


Out[113]:
True

In [114]:
def get_top1000(group):
    return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)

In [115]:
#取top100的另一种方法
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000_2 = pd.concat(pieces, ignore_index=True)

In [116]:
top1000_2


Out[116]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 261877 entries, 0 to 261876
Data columns:
name      261877  non-null values
sex       261877  non-null values
births    261877  non-null values
year      261877  non-null values
prop      261877  non-null values
dtypes: float64(1), int64(2), object(2)

In [117]:
#比较小的数据集,接下来分析工作针对这个top1000数据集
top1000[:10]


Out[117]:
name sex births year prop
year sex None
1880 F 0 Mary F 7065 1880 0.077643
1 Anna F 2604 1880 0.028618
2 Emma F 2003 1880 0.022013
3 Elizabeth F 1939 1880 0.021309
4 Minnie F 1746 1880 0.019188
5 Margaret F 1578 1880 0.017342
6 Ida F 1472 1880 0.016177
7 Alice F 1414 1880 0.015540
8 Bertha F 1320 1880 0.014507
9 Sarah F 1288 1880 0.014155

In [118]:
#分析命名趋势

#将前1000个名字分为男女两个部分
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']

In [119]:
total_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=sum)

In [120]:
total_births


Out[120]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 131 entries, 1880 to 2010
Columns: 6865 entries, Aaden to Zuri
dtypes: float64(6865)

In [121]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]

In [122]:
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number fo births per year")


Out[122]:
array([Axes(0.125,0.747826;0.775x0.152174),
       Axes(0.125,0.565217;0.775x0.152174),
       Axes(0.125,0.382609;0.775x0.152174), Axes(0.125,0.2;0.775x0.152174)], dtype=object)

In [123]:
#评估命名多样性的增长
table = top1000.pivot_table('prop', rows='year', cols='sex', aggfunc=sum)

In [124]:
table.plot(title='Sum of table1000.prop by year and sex', yticks=np.linspace(0,1.2,13), xticks=range(1880,2020,10))


Out[124]:
<matplotlib.axes.AxesSubplot at 0x36e31030>

In [125]:
df = boys[boys.year == 2010]

In [126]:
df


Out[126]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, 'M', 1676644) to (2010, 'M', 1677643)
Data columns:
name      1000  non-null values
sex       1000  non-null values
births    1000  non-null values
year      1000  non-null values
prop      1000  non-null values
dtypes: float64(1), int64(2), object(2)

In [127]:
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()

In [128]:
prop_cumsum[:10]


Out[128]:
year  sex         
2010  M    1676644    0.011523
           1676645    0.020934
           1676646    0.029959
           1676647    0.038930
           1676648    0.047817
           1676649    0.056579
           1676650    0.065155
           1676651    0.073414
           1676652    0.081528
           1676653    0.089621

In [129]:
prop_cumsum.searchsorted(0.5) + 1


Out[129]:
117

In [130]:
df = boys[boys.year == 1990]

In [131]:
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()

In [132]:
in1900.searchsorted(0.5) + 1


Out[132]:
45

In [133]:
def get_quantile_count(group, q=0.5):
    group = group.sort_index(by='prop', ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1

In [134]:
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)

In [135]:
diversity.head()


Out[135]:
year  sex
1880  F      38
      M      14
1881  F      38
      M      14
1882  F      38

In [136]:
#按年度统计的密度表
diversity.plot(title="Number of popular names in top 50%")


Out[136]:
<matplotlib.axes.AxesSubplot at 0x36a53a90>

In [137]:
##“最后一个字母”的变革

In [138]:
#从name列取出最后一个字母
get_last_letter = lambda x:x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', rows=last_letters, cols=['sex', 'year'], aggfunc=sum)

In [139]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')

In [140]:
subtable.head()


Out[140]:
sex
year
F
1910
F
1960
F
2010
M
1910
M
1960
M
2010
last_letter
a 108376 691247 670605 977 5204 28438
b NaN 694 450 411 3912 38859
c 5 49 946 482 15476 23125
d 6750 3729 2607 22111 262112 44398
e 133569 435013 313833 28655 178823 129012

In [141]:
subtable.sum()


Out[141]:
sex  year
F    1910     396416
     1960    2022062
     2010    1759010
M    1910     194198
     1960    2132588
     2010    1898382

In [142]:
letter_prop = subtable / subtable.sum().astype(float)

In [143]:
import matplotlib.pyplot as plt

In [144]:
#男孩女孩名字中各个末字母的比例
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='FeMale', legend=False)


Out[144]:
<matplotlib.axes.AxesSubplot at 0x368ca4d0>

In [145]:
letter_prop = table / table.sum().astype(float)
dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()


Out[145]:
d n y
year
1880 0.083055 0.153213 0.075760
1881 0.083247 0.153214 0.077451
1882 0.085340 0.149560 0.077537
1883 0.084066 0.151646 0.079144
1884 0.086120 0.149915 0.080405

In [146]:
#各年出生的男孩中名字以d/n/y结尾的人数比例
dny_ts.plot()


Out[146]:
<matplotlib.axes.AxesSubplot at 0x37216350>

In [147]:
#变成女孩名字的男孩名字(以及相反的情况)
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like


Out[147]:
array([Leslie, Lesley, Leslee, Lesli, Lesly], dtype=object)

In [148]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()


Out[148]:
name
Leslee      1082
Lesley     35022
Lesli        929
Leslie    370429
Lesly      10067
Name: births

In [149]:
table = filtered.pivot_table('births', rows='year', cols='sex', aggfunc='sum')

In [150]:
table = table.div(table.sum(1), axis=0)
table.tail()


Out[150]:
sex F M
year
2006 1 NaN
2007 1 NaN
2008 1 NaN
2009 1 NaN
2010 1 NaN

In [151]:
#各个年度使用“Lesley型”名字的男女比例
table.plot(style={'M':'k-', 'F':'k--'})


Out[151]:
<matplotlib.axes.AxesSubplot at 0x36a46070>

In [151]: