In [100]:
# US baby names, 1880-2010
In [101]:
import numpy as np
import pandas as pd
In [102]:
# Column labels for the data files, in order: name, sex, number of births.
names = ['name', 'sex', 'births']

# Directory holding the per-year files and the file name for 1880.
# (An alternate checkout lived at 'F:/syn/git/pydata-book/ch02/names/'.)
file1880 = 'yob1880.txt'
path = 'E:/git/pydata-book/ch02/names/'
In [103]:
# Load the 1880 file, labelling its columns with `names`; because labels are
# supplied, the first line of the file is read as data rather than as a header.
names1880 = pd.read_csv(
    path + file1880,
    header=None,
    names=names,
)
In [104]:
# Display the 1880 records.
names1880
Out[104]:
In [105]:
# Total number of births recorded for 1880, split by sex.
names1880.groupby(by='sex')['births'].sum()
Out[105]:
In [106]:
# Assemble the files for 1880 through 2010 into one DataFrame, tagging every
# row with the year of the file it came from.
# 2010 is the last year with valid statistics at the time of writing.
columns = ['name', 'sex', 'births']
years = range(1880, 2011)

pieces = []
for year in years:
    yearly = pd.read_csv(path + 'yob%d.txt' % year, names=columns)
    pieces.append(yearly.assign(year=year))

# Stack the per-year frames into a single DataFrame with a fresh row index.
names = pd.concat(pieces, ignore_index=True)
In [107]:
# Display the combined DataFrame.
names
Out[107]:
In [108]:
# Total births with one row per year and one column per sex.
# `index=` / `columns=` are the current names of the old `rows=` / `cols=`
# keywords, which pandas no longer accepts.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
In [109]:
# Show the last rows of the table.
total_births.tail()
Out[109]:
In [110]:
# Plot every column of total_births against its index.
total_births.plot(title='Total births by sex and year')
Out[110]:
In [111]:
def add_prop(group):
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    all rows of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``births`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index plus the ``prop`` column.
        The frame passed in is left unmodified.
    """
    # Cast to float first: integer division would round down.
    births = group['births'].astype(float)
    # assign() builds a new frame instead of writing into the caller's object.
    return group.assign(prop=births / births.sum())
# Add the per-(year, sex) proportion column to every row.
# group_keys=False keeps the original row index. Without it, recent pandas
# versions prepend 'year' and 'sex' as index levels, which then clash with the
# columns of the same name in later groupby / pivot_table calls.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
In [112]:
# Display the DataFrame again after the groupby/apply step above.
names
Out[112]:
In [113]:
# Sanity check: the 'prop' values of every (year, sex) group should add up to 1
# (compared with a floating-point tolerance).
prop_totals = names.groupby(['year', 'sex'])['prop'].sum()
np.allclose(prop_totals, 1)
Out[113]:
In [114]:
def get_top1000(group):
    """Return the rows of ``group`` with the largest ``births`` values.

    Rows are ordered by ``births`` in descending order and at most the first
    1000 are kept; a group with fewer rows is returned whole (sorted).
    """
    # sort_values(by=...) is the current spelling of the removed
    # sort_index(by=...).
    return group.sort_values(by='births', ascending=False).head(1000)
grouped = names.groupby(['year', 'sex'])
# apply() labels the result with (year, sex) index levels, but both are already
# columns. Recent pandas rejects groupby / pivot_table keys that name an index
# level and a column at the same time, so drop the group index.
top1000 = grouped.apply(get_top1000).reset_index(drop=True)
In [115]:
# Another way to take the top 1000 of each group: loop over the groups explicitly.
pieces = []
# NOTE: the loop variable named `year` actually receives the (year, sex) group key.
for year, group in names.groupby(['year', 'sex']):
    # sort_values(by=...) is the current spelling of the removed sort_index(by=...).
    pieces.append(group.sort_values(by='births', ascending=False)[:1000])
top1000_2 = pd.concat(pieces, ignore_index=True)
In [116]:
# Display the table built by the explicit loop.
top1000_2
Out[116]:
In [117]:
# This is a much smaller data set; the analysis that follows works on the
# top-1000 table. Show its first ten rows.
top1000.head(10)
Out[117]:
In [118]:
# Analysing naming trends.
# Split the top-1000 table into a boys part and a girls part.
is_male = top1000['sex'] == 'M'
is_female = top1000['sex'] == 'F'
boys = top1000[is_male]
girls = top1000[is_female]
In [119]:
# Total births with one row per year and one column per name.
# `index=` / `columns=` are the current names of the old `rows=` / `cols=`
# keywords, which pandas no longer accepts.
total_births = top1000.pivot_table('births', index='year', columns='name', aggfunc='sum')
In [120]:
# Display the year-by-name table.
total_births
Out[120]:
In [121]:
# Keep only the columns for a handful of names.
selected_names = ['John', 'Harry', 'Mary', 'Marilyn']
subset = total_births[selected_names]
In [122]:
# Draw each selected name in its own subplot.
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")
Out[122]:
In [123]:
# Measuring the increase in naming diversity: sum of 'prop' over the top-1000
# rows, with one row per year and one column per sex.
# `index=` / `columns=` are the current names of the old `rows=` / `cols=`
# keywords, which pandas no longer accepts.
table = top1000.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
In [124]:
# Fixed tick positions: 13 evenly spaced y ticks from 0 to 1.2 and one x tick
# every ten years from 1880 to 2010.
y_ticks = np.linspace(0, 1.2, 13)
decade_ticks = range(1880, 2020, 10)
table.plot(
    title='Sum of table1000.prop by year and sex',
    yticks=y_ticks,
    xticks=decade_ticks,
)
Out[124]:
In [125]:
# Boys' rows for the year 2010.
in_2010 = boys['year'] == 2010
df = boys[in_2010]
In [126]:
# Display the selected rows.
df
Out[126]:
In [127]:
# Running total of 'prop', starting from the largest value.
# sort_values(by=...) is the current spelling of the removed sort_index(by=...).
prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()
In [128]:
# First ten entries of the running total.
prop_cumsum[:10]
Out[128]:
In [129]:
# searchsorted gives the zero-based position at which 0.5 would be inserted
# into the sorted running total; + 1 turns that position into a count.
prop_cumsum.searchsorted(0.5) + 1
Out[129]:
In [130]:
# Boys' rows for the year 1900, the year the `in1900` result computed from
# this frame is named after.
df = boys[boys.year == 1900]
In [131]:
# Running total of 'prop', starting from the largest value.
# sort_values(by=...) is the current spelling of the removed sort_index(by=...).
in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()
In [132]:
# Zero-based insertion position of 0.5 in the running total, plus one to make
# it a count.
in1900.searchsorted(0.5) + 1
Out[132]:
In [133]:
def get_quantile_count(group, q=0.5):
    """Count how many of the largest ``prop`` rows are needed to reach ``q``.

    The rows of ``group`` are ordered by ``prop`` in descending order, their
    running total is taken, and the 1-based position at which that running
    total first reaches ``q`` is returned.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with a numeric ``prop`` column.
    q : float, default 0.5
        Cumulative share to reach.

    Returns
    -------
    int
        Number of rows needed (a NumPy integer scalar).
    """
    # sort_values(by=...) is the current spelling of the removed
    # sort_index(by=...).
    ranked = group.sort_values(by='prop', ascending=False)
    # Search the underlying ndarray so a scalar q always yields a scalar,
    # whichever pandas version is installed.
    return ranked['prop'].cumsum().values.searchsorted(q) + 1
In [134]:
# Quantile count for every (year, sex) group, pivoted so that each sex becomes
# its own column indexed by year. Without the unstack the result is a single
# Series over a (year, sex) MultiIndex, which plots as one line alternating
# between the two sexes.
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count).unstack('sex')
In [135]:
# Show the first entries.
diversity.head()
Out[135]:
In [136]:
# Plot the per-year values held in `diversity`.
diversity.plot(title="Number of popular names in top 50%")
Out[136]:
In [137]:
## The "last letter" revolution
In [138]:
# Extract the last letter of every entry in the 'name' column.
get_last_letter = lambda x: x[-1]
last_letters = names['name'].map(get_last_letter)
last_letters.name = 'last_letter'

# Births with one row per last letter and one column per (sex, year) pair.
# `index=` / `columns=` are the current names of the old `rows=` / `cols=`
# keywords, which pandas no longer accepts.
table = names.pivot_table('births', index=last_letters, columns=['sex', 'year'], aggfunc='sum')
In [139]:
# Keep only three representative years from the 'year' column level.
sample_years = [1910, 1960, 2010]
subtable = table.reindex(columns=sample_years, level='year')
In [140]:
# Show the first rows.
subtable.head()
Out[140]:
In [141]:
# Column totals.
subtable.sum()
Out[141]:
In [142]:
# Divide every column by its own total to turn counts into proportions.
column_totals = subtable.sum().astype(float)
letter_prop = subtable / column_totals
In [143]:
import matplotlib.pyplot as plt
In [144]:
# Proportion of each last letter in boys' and girls' names: two stacked bar
# charts, the 'M' columns on top and the 'F' columns below.
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
male_ax, female_ax = axes
letter_prop['M'].plot(kind='bar', rot=0, ax=male_ax, title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=female_ax, title='FeMale', legend=False)
Out[144]:
In [145]:
# Proportions for the full table: every column divided by its own total.
letter_prop = table / table.sum().astype(float)
# Rows 'd', 'n' and 'y' of the 'M' columns, transposed so that the remaining
# column labels become the row index. `.loc` replaces the `.ix` indexer, which
# has been removed from pandas.
dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T
dny_ts.head()
Out[145]:
In [146]:
# Proportion of boys born each year whose names end in d / n / y.
dny_ts.plot()
Out[146]:
In [147]:
# Boy names that became girl names (and vice versa).
# Collect every distinct top-1000 name that contains 'lesl', ignoring case.
all_names = top1000['name'].unique()
mask = np.array(['lesl' in candidate.lower() for candidate in all_names])
lesley_like = all_names[mask]
lesley_like
Out[147]:
In [148]:
# Keep only the rows whose name is one of the Lesley-like names, then total
# the births per name.
is_lesley_like = top1000['name'].isin(lesley_like)
filtered = top1000[is_lesley_like]
filtered.groupby('name')['births'].sum()
Out[148]:
In [149]:
# Births for the Lesley-like names with one row per year and one column per sex.
# `index=` / `columns=` are the current names of the old `rows=` / `cols=`
# keywords, which pandas no longer accepts.
table = filtered.pivot_table('births', index='year', columns='sex', aggfunc='sum')
In [150]:
# Divide every row by its own total, then show the last rows.
row_totals = table.sum(axis=1)
table = table.div(row_totals, axis=0)
table.tail()
Out[150]:
In [151]:
# Male / female proportion of "Lesley-like" names for each year.
# The style dict draws column 'M' as a solid black line and 'F' as a dashed one.
table.plot(style={'M':'k-', 'F':'k--'})
Out[151]:
In [151]: