PfDA
treats the baby names database
In [1]:
%pylab --no-import-all inline
In [2]:
import matplotlib.pyplot as plt
import numpy as np
from pylab import figure, show
from pandas import DataFrame, Series
import pandas as pd
To make it more practical for me to look at your homework, I'm again going to assume a relative placement of files. I placed the files from
https://github.com/pydata/pydata-book
in a local directory, which in my case is "/Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/"
and then symbolically linked (ln -s
) to the pydata-book from the root directory of the working-open-data folder, i.e., on OS X
cd /Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data
ln -s /Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/ pydata-book
That way the files from the pydata-book repository look like they sit in the working-open-data directory -- without having to actually copy the files.
With this arrangement, I should then be able to drop your notebook into my own notebooks directory and run it without having to mess around with paths.
In [3]:
import os

# The baby-names files are expected in the pydata-book checkout one level
# above the directory this notebook runs from.
NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names")

# Fail early, and say where we looked, if the data directory is missing.
assert os.path.isdir(NAMES_DIR), (
    "names directory not found at %r; link or copy pydata-book next to the "
    "notebooks directory" % os.path.abspath(NAMES_DIR))
Please make sure the above assertion works.
discussed in p. 35 of PfDA
book
To download all the data, including that for 2011 and 2012: Popular Baby Names --> includes state by state data.
In [4]:
# take a look at some lines from each of the names files
import os
os.path.join(NAMES_DIR,'yob1880.txt')
yob1880_path = os.path.join(NAMES_DIR,'yob1880.txt')
!head $yob1880_path
In [5]:
# create a DataFrame for 1880 data
import pandas as pd
import codecs

# Decode the file as ISO-8859-1 while reading. The context manager closes the
# handle as soon as the frame is built (the original left it open).
with codecs.open(os.path.join(NAMES_DIR, 'yob1880.txt'), encoding='iso-8859-1') as names1880_file:
    names1880 = pd.read_csv(names1880_file, names=['name', 'sex', 'births'])
names1880.head()
Out[5]:
In [6]:
# total births recorded for girls in the 1880 data
names1880.loc[names1880['sex'] == 'F', 'births'].sum()
Out[6]:
In [7]:
# count of distinct names in the 1880 file
names1880['name'].nunique()
Out[7]:
In [8]:
# Names used for both M and F in 1880: count the rows per name and keep the
# names that occur more than once (i.e. have more than one sex represented).
name_count = names1880.groupby('name').size()
set(name_count.index[name_count > 1])
Out[8]:
In [9]:
# number of births by sex for 1880
# Select 'births' explicitly instead of relying on the aggregation to drop the
# non-numeric 'name' column on its own.
names1880.groupby('sex')[['births']].sum()
Out[9]:
In [10]:
# births summed over every row of the 1880 file
names1880.births.sum()
Out[10]:
In [11]:
# ten most popular names of 1880 (rows ordered by births, largest first)
names1880.sort('births', ascending=False).head(10)
Out[11]:
In [12]:
# ten most popular female names of 1880
names1880[names1880['sex'] == 'F'].sort('births', ascending=False).head(10)
Out[12]:
In [16]:
# try out seaborn if you want
#import seaborn as sns
In [14]:
# Rank/size plot for 1880: births per row, largest first, on a log y-axis.
# Sort once and reuse the result (the original sorted the column twice).
births_desc = np.sort(names1880['births'].values)[::-1]
num_names1880 = len(births_desc)
plt.plot(np.arange(num_names1880), births_desc, 'ro', ms=1)
plt.yscale('log')
plt.xlabel('order of name')
plt.ylabel('number of babies')
Out[14]:
In [17]:
!ls $NAMES_DIR
In [18]:
# The pydata-book repo has one file per year; 2010 is the last one available.
import os

columns = ['name', 'sex', 'births']
years = range(1880, 2011)

pieces = []
for year in years:
    path = os.path.join(NAMES_DIR, 'yob{0:d}.txt'.format(year))
    frame = pd.read_csv(path, names=columns)
    frame['year'] = year  # tag every row with the year of its source file
    pieces.append(frame)

# Stack the per-year frames into one DataFrame with a fresh integer index.
names = pd.concat(pieces, ignore_index=True)

# describe() shows floats because its summary rows (mean, std, percentiles)
# are floating-point statistics, so the whole summary table is float.
names.describe()
Out[18]:
In [19]:
# total births recorded across every year, name and sex
names['births'].sum()
Out[19]:
In [20]:
# total births split by sex (F vs M)
names['births'].groupby(names['sex']).sum()
Out[20]:
In [21]:
# count of distinct names over all years
names['name'].nunique()
Out[21]:
In [22]:
# pivot_table: one row per year, one column per sex, cells hold summed births
total_births = pd.pivot_table(names, values='births', rows='year', cols='sex', aggfunc=sum)
total_births.head()
Out[22]:
In [23]:
# The same table as the pivot_table above, built with groupby: sum births per
# (year, sex) pair, then move sex from the index into the columns.
names.groupby(['year', 'sex'])['births'].sum().unstack()
Out[23]:
In [24]:
# total births per year
# Select 'births' explicitly so the plot does not depend on which other
# numeric columns `names` carries when this cell is (re-)run.
names.groupby('year')[['births']].sum().plot(title="total births by year")
Out[24]:
In [25]:
# births per year with one line per sex
names.groupby(['year', 'sex'])['births'].sum().unstack().plot(title="births (M/F) by year")
Out[25]:
In [26]:
# number of distinct names in 1880
names1880['name'].nunique()
Out[26]:
In [27]:
# groupby accepts several key columns at once
# expected: 131 years x 2 sexes
len(names.groupby(['year', 'sex']).size())
Out[27]:
In [28]:
# number of distinct (name, year) combinations
len(names.groupby(['name', 'year']).size())
Out[28]:
In [ ]:
# from book: add prop to names
def add_prop(group):
    """Return a copy of `group` with a 'prop' column added.

    'prop' is each row's births divided by the group's total births.
    The input frame is left untouched: pandas does not support functions
    that mutate the object handed to them by groupby.apply.
    """
    group = group.copy()
    # Cast to float so the division cannot floor (integer division).
    births = group['births'].astype(float)
    group['prop'] = births / births.sum()
    return group
# attach 'prop' within each (year, sex) group
names = names.groupby(by=['year', 'sex']).apply(add_prop)
In [ ]:
# sanity check: within every (year, sex) group the proportions sum to 1
np.allclose(names.groupby(['year', 'sex'])['prop'].sum(), 1)
In [ ]:
# row count of the full names DataFrame
names.shape[0]
In [ ]:
# from book: keep only the top 1000 rows (by births) of each year/sex group,
# done inline with groupby/apply
names.groupby(['year', 'sex']).apply(lambda g: g.sort_index(by='births', ascending=False).head(1000))
In [ ]:
def get_top1000(group):
    """Return up to 1000 rows of `group`, ordered by births, largest first."""
    ranked = group.sort_index(by='births', ascending=False)
    return ranked.head(1000)


grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()
In [ ]:
# Pivot the top-1000 data: one row per year, one column per name, summed births
top_births = pd.pivot_table(top1000, values='births', rows='year', cols='name', aggfunc=np.sum)
top_births.tail()
In [ ]:
# look up one name's column in top_births and plot it over the years
top_births.Raymond.plot(title='plot for Raymond')
In [ ]:
# Aaden only shows up at the end of the period; pin the x-axis to the full range
top_births['Aaden'].plot(xlim=[1880, 2010])
In [ ]:
# how many distinct names (columns) top_births has
top_births.shape[1]
In [ ]:
# most popular names of all time: total births per name, largest first
most_common_names = top_births.sum().order(ascending=False)
most_common_names[:5]
In [ ]:
# horizontal bar chart of the first 50 entries of most_common_names; the slice
# is reversed so the first entry is drawn at the top
plt.figure()
most_common_names.head(50)[::-1].plot(kind='barh', figsize=(10, 10))
We go from 1880 to 2010.
It might be helpful to calculate a cumulative sum for all names...
In [ ]:
# a missing (year, name) cell becomes zero births -- may not be strictly needed
top_births = top_births.fillna(value=0)
In [ ]:
# running total of births per name, accumulated down the year axis
top_births_cumsum = top_births.cumsum(axis=0)
In [ ]:
def start_year(s):
    """Return the first index label at which `s` is positive, or None.

    Applied to a cumulative-births column indexed by year, this is the first
    year in which the name has any births.
    """
    active_years = s.index[s > 0]
    return active_years[0] if len(active_years) else None


def end_year(s):
    """Return the first index label at which `s` equals its final value.

    For a cumulative series this is the label where the running total stops
    growing. Uses positional .iloc[-1] in place of the deprecated .irow(-1).
    """
    max_years = s.index[s == s.iloc[-1]]
    return max_years[0]


def start_end_years(s):
    """Return a Series with the 'start' and 'end' labels of `s`.

    Delegates to start_year/end_year so the logic lives in one place.
    """
    return Series({'start': start_year(s),
                   'end': end_year(s)})
# one (start, end) pair per name column
top_births_cumsum.apply(start_end_years, axis=0)
In [ ]:
# same pivot as top_births, but over every name rather than only the top 1000
all_births = pd.pivot_table(names, values='births', rows='year', cols='name', aggfunc=sum)
In [ ]:
# a missing (year, name) cell becomes zero births
all_births = all_births.fillna(value=0)
all_births.tail(5)
In [ ]:
# running total per name down the year axis, input to the start/end calculation
all_births_cumsum = all_births.cumsum(axis=0)
In [ ]:
all_births_cumsum.tail(5)
In [ ]:
# NOTE: a helper with this name is also defined in an earlier cell; this
# definition shadows it.
def start_end_years(s):
    """Return a Series with the 'start' and 'end' index labels of `s`.

    'start' is the first label where `s` is positive (None if it never is).
    'end' is the first label where `s` equals its last value, i.e. where a
    cumulative total stops growing.
    """
    active_years = s.index[s > 0]
    # .iloc[-1] is the positional accessor that replaces the deprecated .irow(-1)
    max_years = s.index[s == s.iloc[-1]]
    return Series({'start': active_years[0] if len(active_years) else None,
                   'end': max_years[0]})


all_start_end = all_births_cumsum.apply(start_end_years)
In [ ]:
# all_start_end.to_pickle('Day_12_Baby_Names_all_start_end.pickle')
In [ ]:
all_start_end.tail(5)
In [ ]:
# How many names start / end in each year. Rows are selected by label with
# .loc rather than the deprecated .ix.
vc_start = all_start_end.loc['start'].value_counts()
vc_end = all_start_end.loc['end'].value_counts()

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.set_ylim(0, 2000)
ax1.scatter(vc_start.index, vc_start, c='b')  # blue: starts per year
ax1.scatter(vc_end.index, vc_end, c='r')      # red: ends per year
ax1.set_xlabel('year')
# The original called set_xlabel twice, which overwrote 'year' and left the
# y-axis unlabelled.
ax1.set_ylabel('number of starts/ends')
plt.tight_layout()
plt.show()
In [ ]:
# Years with the largest and smallest total births.
total_births_sum = names.groupby('year').sum()

# Read the extremes from the 'births' column by name, instead of taking the
# first entry of the per-column max/min, which depends on column order.
# (total_births_sum['births'].idxmax() / .idxmin() would give the years directly.)
max_value = total_births_sum['births'].max()
min_value = total_births_sum['births'].min()

is_max = total_births_sum.births == max_value
is_min = total_births_sum.births == min_value
is_max[is_max], is_min[is_min]
In [ ]:
# year-over-year change in the totals -- a rough "derivative" showing when the
# birth counts moved the most
total_births_sum.diff(periods=1).plot()
In [ ]:
# helpers for plotting several names on one plot or on multiple axes
def name_sex_count_in_year(name, sex):
    """Births per year for one (name, sex) pair, as a frame indexed by year."""
    matches = (names.name == name) & (names.sex == sex)
    return names.loc[matches, ['year', 'births']].set_index(keys='year')


def name_sex_prop_in_year(name, sex):
    """'prop' per year for one (name, sex) pair, as a frame indexed by year."""
    matches = (names.name == name) & (names.sex == sex)
    return names.loc[matches, ['year', 'prop']].set_index(keys='year')
# One row per year. np.arange excludes its stop value, so stop at 2011 to keep
# 2010, the last year of data (the original index ended at 2009).
name_df = DataFrame(index=np.arange(1880, 2011))
name_df['Raymond'] = name_sex_count_in_year('Raymond', 'M')
name_df['Laura'] = name_sex_count_in_year('Laura', 'F')
name_df.plot()
In [ ]:
# plot proportion instead of absolute births
# np.arange excludes its stop value, so stop at 2011 to keep 2010, the last
# year of data (the original index ended at 2009).
name_df = DataFrame(index=np.arange(1880, 2011))
name_df['Raymond'] = name_sex_prop_in_year('Raymond', 'M')
name_df['Laura'] = name_sex_prop_in_year('Laura', 'F')
name_df.plot()
In [ ]:
# line chart with one series per sex
total_births.plot(kind='line', title='Total births by sex and year')
In [ ]:
# Background: http://en.wikipedia.org/wiki/Human_sex_ratio
# Two M/F ratios per year: from running totals, and from that year alone.
cum_ratio_by_sex = total_births.M.cumsum() / total_births.F.cumsum()
annual_ratio_by_sex = total_births.M / total_births.F

# One figure holding a single axes
# (meaning of 111: http://stackoverflow.com/a/3584933/7782).
fig = figure()
ax = fig.add_subplot(111)
ax.set_title('Ratio of M to F births')

cum_ratio_by_sex.plot(ax=ax, label="cumulative", color="red")
annual_ratio_by_sex.plot(ax=ax, label="annual", color="green")

ax.legend(loc='best')
fig.canvas.draw()
In [ ]:
# rows (name entries) per year, plotted over time
names.groupby('year')[['name']].count().plot()
In [ ]:
# first attempt to calculate entropy of names
def name_entropy(props):
    """Return -sum(p * ln p) over the proportions in `props`.

    Vectorized with numpy. The original summed a Python list comprehension,
    so results may differ from it in the last floating-point digits.
    """
    props = np.asarray(props, dtype=float)
    return -(props * np.log(props)).sum()


fig = figure()
# meaning of 111: http://stackoverflow.com/a/3584933/7782
ax = fig.add_subplot(111)
ax.set_title('Entropy of names')

# One entropy value per year, computed from that year's 'prop' column.
S_male = names[names.sex == 'M'].groupby('year').prop.agg(name_entropy)
S_male.plot(ax=ax, label="M", color="blue")

S_female = names[names.sex == 'F'].groupby('year').prop.agg(name_entropy)
S_female.plot(ax=ax, label="F", color="red")

ax.legend(loc='best')
ax.set_ylim(0)  # start the y-axis at zero
fig.canvas.draw()
Goal: start to explore names that have been given to both male and female babies. Is there a general trend toward the feminization of names? (That is, is it more likely that names that start as male names become feminine names than vice versa?)