In this assignment, I analyzed baby names from two main perspectives:
Note: to show that Day_13_C_Baby_Names_MF_Completed is running properly, I added my work at the bottom of this notebook.
In [1]:
%matplotlib inline
In [1]:
import matplotlib.pyplot as plt
import numpy as np
from pylab import figure, show
from pandas import DataFrame, Series
import pandas as pd
In [2]:
try:
    import mpld3
    from mpld3 import enable_notebook
    from mpld3 import plugins
    enable_notebook()
except Exception as e:
    print "Attempt to import and enable mpld3 failed", e
In [3]:
# what would seaborn do?
try:
    import seaborn as sns
except Exception as e:
    print "Attempt to import and enable seaborn failed", e
To make it more practical for me to look at your homework, I'm again going to assume a relative placement of files. I placed the files from
https://github.com/pydata/pydata-book
in a local directory, which in my case is "/Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/"
and then symbolically linked (ln -s) to the pydata-book directory from the root directory of the working-open-data folder, i.e., on OS X:
cd /Users/raymondyee/D/Document/Working_with_Open_Data/working-open-data
ln -s /Users/raymondyee/D/Document/Working_with_Open_Data/pydata-book/ pydata-book
That way the files from the pydata-book repository look like they sit in the working-open-data directory -- without having to actually copy the files.
With this arrangement, I should then be able to drop your notebook into my own notebooks directory and run it without having to mess around with paths.
In [5]:
import os
NAMES_DIR = os.path.join(os.pardir, "pydata-book", "ch02", "names")
#NAMES_DIR ---> '../pydata-book/ch02/names'
assert os.path.exists(NAMES_DIR)
Please make sure the above assertion works.
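If the assertion fails on your machine, a minimal workaround (a sketch -- the path below is only a placeholder, not a real location) is to point NAMES_DIR directly at your copy of the data:

if not os.path.exists(NAMES_DIR):
    # hypothetical absolute path -- replace with wherever your pydata-book checkout lives
    NAMES_DIR = "/path/to/pydata-book/ch02/names"
    assert os.path.exists(NAMES_DIR)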
This data set is discussed on p. 35 of the PfDA book.
To download all the data, including that for 2011 and 2012, see the Popular Baby Names page, which also includes state-by-state data.
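For reference, a rough sketch of how one might fetch the full national data set programmatically; the URL is my assumption about where the SSA hosts the zip file, so verify it before relying on it:

import io
import urllib2
import zipfile

SSA_NAMES_URL = "http://www.ssa.gov/oact/babynames/names.zip"  # assumed location of the national data
raw = urllib2.urlopen(SSA_NAMES_URL).read()
with zipfile.ZipFile(io.BytesIO(raw)) as zf:
    zf.extractall("names_all_years")  # writes the yobYYYY.txt files into a local directory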
In [6]:
# show the first five files in the NAMES_DIR
import glob
glob.glob(NAMES_DIR + "/*")[:5]
Out[6]:
In [7]:
# 2010 is the last available year in the pydata-book repo
import os
years = range(1880, 2011)
pieces = []
columns = ['name', 'sex', 'births']
for year in years:
    path = os.path.join(NAMES_DIR, 'yob%d.txt' % year)
    # print path
    frame = pd.read_csv(path, names=columns)
    # print frame
    frame['year'] = year
    # print frame
    pieces.append(frame)
# print pieces
# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
# why floats? describe() reports summary statistics (mean, std, quartiles), which come back as floats even for integer columns
names.describe()
# names = pd.concat(pieces)
# len(names) -->1690784
Out[7]:
In [8]:
names.head()
Out[8]:
In [9]:
names.births
Out[9]:
In [10]:
# how many people, names, males and females are represented in names?
names.births.sum()
Out[10]:
In [11]:
names.groupby('sex').head()
Out[11]:
In [12]:
# F vs M
names.groupby('sex')['births'].sum()
Out[12]:
In [13]:
grp = names.groupby('name')
In [111]:
#experimenting with groups
# from itertools import islice
# for key, g_df in islice(grp,5):
# print key, type(g_df), g_df.columns, g_df, g_df.sex
In [15]:
# total number of names
len(names.groupby('name'))
Out[15]:
In [16]:
# use pivot_table to collect records by year (rows) and sex (columns)
total_births = names.pivot_table('births', rows='year', cols='sex', aggfunc=sum)
total_births.head()
Out[16]:
In [17]:
names.groupby('year').head()
Out[17]:
In [18]:
names.groupby('year').apply(lambda s: s.groupby('sex').agg('sum')).head()
Out[18]:
In [19]:
# You can use groupby to get the equivalent pivot_table calculation
names.groupby('year').apply(lambda s: s.groupby('sex').agg('sum')).unstack()['births'].head()
Out[19]:
In [20]:
# how to calculate the total births / year
names.groupby('year').sum().plot(title="total births by year")
Out[20]:
In [21]:
names.groupby('year').apply(lambda s: s.groupby('sex').agg('sum')).unstack()['births'].plot(title="births (M/F) by year")
Out[21]:
In [110]:
#some more experimentation with groups
# from itertools import islice
# for key, g_df in islice(names.groupby(['year', 'sex']),5):
# print key,g_df
# print key, type(g_df), g_df.columns, g_df, g_df.sex
In [23]:
# from book: add prop to names
def add_prop(group):
    # Integer division floors
    births = group.births.astype(float)
    # print births
    group['prop'] = births / births.sum()
    return group
propped_names = names.groupby(['year', 'sex']).apply(add_prop)
propped_names.head()
Out[23]:
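A sketch of an equivalent way to compute the proportions using transform, which returns a Series aligned with names rather than re-assembling the groups; prop2 is just an illustrative name and not part of the assignment:

births_float = names.births.astype(float)  # cast to float to avoid Python 2 integer division
prop2 = births_float / names.groupby(['year', 'sex']).births.transform(np.sum)
np.allclose(prop2.groupby([names.year, names.sex]).sum(), 1)  # should also be True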
In [24]:
# verify prop --> all adds up to 1
# np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
np.allclose(propped_names.groupby(['year', 'sex']).prop.sum(), 1)
Out[24]:
In [25]:
# number of records in full names dataframe
# len(names) --> 1690784
len(propped_names)
Out[25]:
This section on the top1000 calculation is kept here to provide some inspiration for how to work with the baby names data.
In [26]:
# from book: useful to work with top 1000 for each year/sex combo
# can use groupby/apply
names.groupby(['year', 'sex']).apply(lambda g: g.sort_index(by='births', ascending=False)[:1000]).head()
Out[26]:
In [27]:
def get_top1000(group):
    return group.sort_index(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
top1000.head()
Out[27]:
In [28]:
# do a pivot table: rows = year, cols = name, for the top 1000
top_births = top1000.pivot_table('births', rows='year', cols='name', aggfunc=np.sum)
top_births.tail()
Out[28]:
In [29]:
#instead of pivot, I used groupby here
grp_top_births = top1000.groupby('year').apply(lambda s:s.groupby('name').agg('sum')).unstack()['births']
grp_top_births.tail()
# grp_top_births['Raymond'].plot()
#
Out[29]:
In [30]:
"""my name "Prabha" or "Matta" is missing in the database :("""
# top_births['Matta'].plot(title='plot for Matta')
Out[30]:
In [31]:
# is your name in the top_births list?
top_births['Raymond'].plot(title='plot for Raymond')
Out[31]:
In [32]:
# for Aaden, which shows up at the end
top_births.Aaden.plot(xlim=[1880,2010])
Out[32]:
In [33]:
# number of names represented in top_births
len(top_births.columns)
Out[33]:
In [34]:
top_births.head()
Out[34]:
In [118]:
# how to get the most popular name of all time in top_births?
most_common_names = top_births.sum()
# print most_common_names
most_common_names.sort(ascending=False)
# most_common_names.head()
# # James 5071647
# # John 5060953
# # Robert 4787187
# # Michael 4263083
# # Mary 4117746
# temp=grp_top_births.sum()
# temp.sort()
In [117]:
# most_common_names = top_births.sum()
# # print type(most_common_names)
# most_common_names.sort(ascending=False)
# most_common_names.head()
# # # James 5071647
# # # John 5060953
# # # Robert 4787187
# # # Michael 4263083
# # # Mary 4117746
# temp=grp_top_births.sum()
# print type(temp)
# temp.sort(ascending=False)
# temp.head()
In [37]:
# as of mpld3 v0.1 (2014.03.04), the name labeling doesn't work -- so disable mpld3 for this figure
mpld3.disable_notebook()
plt.figure()
most_common_names[:50][::-1].plot(kind='barh', figsize=(10,10))
Out[37]:
In [38]:
# turn mpld3 back on
mpld3.enable_notebook()
In [39]:
#using groupby
names.groupby('year').apply(lambda s: s.groupby('name').agg('sum')).unstack()['births'].tail()
Out[39]:
In [40]:
# instead of top_births -- get all_births
all_births = names.pivot_table('births', rows='year', cols='name', aggfunc=sum)
all_births.tail()
Out[40]:
In [41]:
all_births = all_births.fillna(0)
all_births.tail()
Out[41]:
In [42]:
# set up to do start/end calculation
all_births_cumsum = all_births.apply(lambda s: s.cumsum(), axis=0)
In [43]:
all_births_cumsum.tail()
Out[43]:
In [44]:
all_births_cumsum['Raymond'].plot()
Out[44]:
In [45]:
# remind ourselves of what's in names
names.head()
Out[45]:
In [46]:
# columns in names
names.columns
Out[46]:
In [112]:
# for key, g_df in islice(names.groupby('sex'),5):
# print key,g_df
In [48]:
# calculate set of male_only, female_only, ambigender names
def calc_of_sex_of_names():
    k = names.groupby('sex').apply(lambda s: set(list(s['name'])))
    print k
    male_only_names = k['M'] - k['F']
    female_only_names = k['F'] - k['M']
    ambi_names = k['F'] & k['M']  # intersection of two
    return {'male_only_names': male_only_names,
            'female_only_names': female_only_names,
            'ambi_names': ambi_names}
names_by_sex = calc_of_sex_of_names()
ambi_names_array = np.array(list(names_by_sex['ambi_names']))
[(k, len(v)) for (k,v) in names_by_sex.items()]
Out[48]:
In [49]:
# total number of people in names
names.births.sum()
Out[49]:
In [50]:
#learning in1d
# >>> test = np.array([0, 1, 2, 5, 0])
# >>> states = [0, 2]
# >>> mask = np.in1d(test, states)
# >>> mask
# array([ True, False, True, False, True], dtype=bool)
# >>> test[mask]
# array([0, 2, 0])
# >>> mask = np.in1d(test, states, invert=True)
# >>> mask
# array([False, True, False, True, False], dtype=bool)
# >>> test[mask]
# array([1, 5])
# pivot table of ambigendered names to aggregate
names_ambi = names[np.in1d(names.name, ambi_names_array)]
ambi_names_pt = names_ambi.pivot_table('births',
                                       rows='year',
                                       cols=['name', 'sex'],
                                       aggfunc='sum')
ambi_names_pt.tail()
Out[50]:
In [51]:
ambi_names_pt['Raymond'].plot()
Out[51]:
In [52]:
# total number of people with ambigendered names -- almost everyone!
ambi_names_pt.sum().sum()
Out[52]:
In [53]:
# fill n/a with 0 and look at the table at the end
ambi_names_pt=ambi_names_pt.fillna(0L)
ambi_names_pt.tail()
Out[53]:
In [54]:
ambi_names_pt.T.head()
Out[54]:
In [55]:
# plot M, F in ambigender_names over time
ambi_names_pt.T.xs('M',level='sex').sum().cumsum().plot()
Out[55]:
In [56]:
ambi_names_pt.T.xs('F',level='sex').sum().cumsum().plot()
Out[56]:
In [57]:
# not sure why the pivot table has dtype float
# https://github.com/pydata/pandas/issues/3283
ambi_names_pt['Raymond', 'M'].dtype
Out[57]:
In [58]:
# calculate proportion of males for given name
def prop_male(name):
    return (ambi_names_pt[name]['M'] /
            (ambi_names_pt[name]['M'] + ambi_names_pt[name]['F']))

def prop_c_male(name):
    return (ambi_names_pt[name]['M'].cumsum() /
            (ambi_names_pt[name]['M'].cumsum() + ambi_names_pt[name]['F'].cumsum()))
In [59]:
prop_c_male('Leslie').plot()
Out[59]:
In [61]:
# I couldn't figure out how to iterate over the names (rather than name/sex combos)
# in a vectorized way; one possible approach is sketched after the next cell.
from itertools import islice
names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]), None))
m = [(name_, ambi_names_pt[name_]['M'] / (ambi_names_pt[name_]['F'] + ambi_names_pt[name_]['M']))
     for name_ in names_to_calc]
p_m_instant = DataFrame(dict(m))
p_m_instant.tail()
Out[61]:
In [62]:
# similar calculation, except instead of looking at the proportions for a given year only,
# we look at the cumulative number of male/female babies for a given name
from itertools import islice
names_to_calc = list(islice(list(ambi_names_pt.T.index.levels[0]), None))
m = [(name_, ambi_names_pt[name_]['M'].cumsum() / (ambi_names_pt[name_]['F'].cumsum() + ambi_names_pt[name_]['M'].cumsum()))
     for name_ in names_to_calc]
p_m_cum = DataFrame(dict(m))
p_m_cum.tail()
Out[62]:
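As noted above, the per-name loop can be avoided. Here is one possible vectorized sketch that computes the cumulative male proportion for all ambigendered names at once (p_m_cum_vec is just an illustrative name):

cum = ambi_names_pt.fillna(0).cumsum()                         # cumulative births per (name, sex)
cum_totals = cum.groupby(level='name', axis=1).sum()           # cumulative M + F births per name
p_m_cum_vec = cum.xs('M', level='sex', axis=1) / cum_totals    # cumulative proportion male
p_m_cum_vec.tail()                                             # should agree with p_m_cum above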
In [63]:
p_m_cum['Donnie'].plot()
Out[63]:
In [64]:
# some metrics that attempt to measure how a time series s has changed
def min_max_range(s):
    """range of s, signed -- positive if the global max occurs after the global min,
    negative otherwise; 0 if they coincide"""
    # note: np.argmax, np.argmin return the position of the first occurrence of the global max, min
    sign = np.sign(np.argmax(s) - np.argmin(s))
    if sign == 0:
        return 0.0
    else:
        return sign * (np.max(s) - np.min(s))

def last_first_diff(s):
    """difference between the latest and earliest value"""
    s0 = s.dropna()
    return s0.iloc[-1] - s0.iloc[0]
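A quick sanity check of these metrics on a made-up toy Series (a sketch, not data from the assignment):

toy = Series([0.1, 0.9, 0.4], index=[1880, 1900, 1920])
min_max_range(toy)    # 0.8 -- the max (0.9) occurs after the min (0.1), so the sign is positive
last_first_diff(toy)  # ~0.3 -- last value (0.4) minus first value (0.1)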
In [65]:
# population distributions of ambigendered names
# might want to remove from consideration names whose overall M/F ratio is too lopsided
# or whose name/sex combo exists for too short a span of years
total_pop_ambiname = all_births.sum()[np.in1d(all_births.sum().index, ambi_names_array)]
total_pop_ambiname.sort(ascending=False)
total_pop_ambiname.plot(logy=True)
Out[65]:
In [66]:
# now calculate a DataFrame to visualize results
# calculate the total population, the change in p_m from last to first appearance,
# the change from max to min in p_m, and the percentage of males overall for name
df = DataFrame()
df['total_pop'] = total_pop_ambiname
df['last_first_diff'] = p_m_cum.apply(last_first_diff)
df['min_max_range'] = p_m_cum.apply(min_max_range)
df['abs_min_max_range'] = np.abs(df.min_max_range)
df['p_m'] = p_m_cum.iloc[-1]
# distance from full ambigender -- p_m=0.5 leads to 1, p_m=1 or 0 -> 0
df['ambi_index'] = df.p_m.apply(lambda p: 1 - 2* np.abs(p-0.5))
df.head()
Out[66]:
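To see the ambi_index formula in isolation, a tiny check on a few hand-picked values of p_m:

[1 - 2 * abs(p - 0.5) for p in (0.0, 0.25, 0.5, 0.75, 1.0)]   # -> [0.0, 0.5, 1.0, 0.5, 0.0]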
In [67]:
# plot: x -> log10 of total population, y->how p_m has changed from first to last
# turn off d3 for this plot
mpld3.disable_notebook()
plt.scatter(np.log10(df.total_pop), df.last_first_diff, s=1)
Out[67]:
In [ ]:
# turn d3 back on
mpld3.enable_notebook()
plt.scatter(np.log10(df.total_pop), df.last_first_diff, s=1)
In [68]:
# general directionality counts -- looking for overall asymmetry
df.groupby(np.sign(df.last_first_diff)).count()
Out[68]:
In [69]:
# let's concentrate on more populous names that have seen big swings in the cumulative p_m
# you can play with the population and range filter
popular_names_with_shifts = df[(df.total_pop>5000) & (df.abs_min_max_range >0.7)]
popular_names_with_shifts.sort_index(by="abs_min_max_range", ascending=False)
Out[69]:
In [70]:
popular_names_with_shifts.groupby(np.sign(df.last_first_diff)).count()
Out[70]:
In [ ]:
#popular_names_with_shifts.to_pickle('popular_names_with_shifts.pickle')
In [71]:
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
x = np.log10(popular_names_with_shifts.total_pop)
y = popular_names_with_shifts.min_max_range
scatter = ax.scatter(x, y)
ax.grid(color='white', linestyle='solid')
ax.set_title("Populous Names with Major Sex Shift", size=20)
ax.set_xlabel('log10(total_pop)')
ax.set_ylabel('min_max_range')
#labels = ['point {0}'.format(i + 1) for i in range(len(x))]
labels = list(popular_names_with_shifts.index)
tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
plugins.connect(fig, tooltip)
In [ ]:
prop_c_male('Leslie').plot()
In [72]:
get_first_letter = lambda x:x[0]
first_letters = names.name.map(get_first_letter)
first_letters.name = 'first_letter'
first_letter_trend = names.pivot_table('births', rows='year', cols=[first_letters,'sex'], aggfunc=sum)
first_letter_trend.head()
Out[72]:
In [73]:
first_letter_trend['A'].plot()
Out[73]:
In [74]:
yearwise_first_letter_trend = names.pivot_table('births', rows=first_letters, cols=['sex','year'], aggfunc=sum)
#trending of names starting with 'A'
# first_letter_trend.plot()
yearwise_first_letter_trend.head()
Out[74]:
In [113]:
#plotting for all the first_letters and years
yearwise_first_letter_trend.plot(legend=False)
Out[113]:
In [108]:
# yearwise_first_letter_trend.sum()
In [77]:
# let us analyze the trend at four representative years: 1880, 1930, 1970, 2010
interval_yearwise_first_letter_trend = yearwise_first_letter_trend.reindex(columns = [1880,1930,1970, 2010], level = 'year')
interval_yearwise_first_letter_trend.head()
letter_prop = interval_yearwise_first_letter_trend/interval_yearwise_first_letter_trend.sum().astype(float)
In [78]:
mpld3.disable_notebook()
import matplotlib.pyplot as plt
# letter_prop = yearwise_first_letter_trend/yearwise_first_letter_trend.sum().astype(float)
fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)
# yearwise_first_letter_trend[2010].plot()
Out[78]:
In [80]:
# some metrics that attempt to measure how a time series s has changed
def min_max_range(s):
    """range of s, signed -- positive if the global max occurs after the global min,
    negative otherwise; 0 if they coincide"""
    # note: np.argmax, np.argmin return the position of the first occurrence of the global max, min
    sign = np.sign(np.argmax(s) - np.argmin(s))
    if sign == 0:
        return 0.0
    else:
        return sign * (np.max(s) - np.min(s))

def last_first_diff(s):
    """difference between the latest and earliest value"""
    s0 = s.dropna()
    return s0.iloc[-1] - s0.iloc[0]
In [81]:
total_pop_ambiname = all_births.sum()[np.in1d(all_births.sum().index, ambi_names_array)]
total_pop_ambiname.sort(ascending=False)
In [102]:
top5_ambi_data = DataFrame()
top5_ambi_data['total_pop'] = total_pop_ambiname
top5_ambi_data['last_first_diff'] = p_m_cum.apply(last_first_diff)
top5_ambi_data['min_max_range'] = p_m_cum.apply(min_max_range)
top5_ambi_data['abs_min_max_range'] = np.abs(df.min_max_range)
top5_ambi_data['p_m'] = p_m_cum.iloc[-1]
# distance from full ambigender -- p_m=0.5 leads to 1, p_m=1 or 0 -> 0
top5_ambi_data['ambi_index'] = df.p_m.apply(lambda p: 1 - 2* np.abs(p-0.5))
In [84]:
# sort the ambigendered names by the largest last-year-to-first-year difference
top5_ambi_data.sort_index(by='last_first_diff', ascending=False).head()
Out[84]:
In [92]:
# we see that Krish has changed the most
# let us analyze Krish
names_ambi = names[np.in1d(names.name, ambi_names_array)]
ambi_names_pt = names_ambi.pivot_table('births',
                                       rows='year',
                                       cols=['name', 'sex'],
                                       aggfunc='sum')
ambi_names_pt = ambi_names_pt.fillna(0L)
In [93]:
# normalize each year's counts by that year's total across all ambigendered name/sex combos
normalized_ambi_names = ambi_names_pt.div(ambi_names_pt.sum(1), axis=0)
In [94]:
normalized_ambi_names.tail()
Out[94]:
In [97]:
#plotting for Krish
normalized_ambi_names['Krish'].plot()
"""Observation for 'Krish'
It is interesting to note that though the name has changed from female to male. Apparantly, Krish has become popular only after 1980's
"""
Out[97]:
In [98]:
#plotting for Lydell
normalized_ambi_names['Lydell'].plot()
"""Observation for 'Lydell'
we see that the name has completely transformed from female to male.
"""
Out[98]:
In [99]:
top5_ambi_data.sort_index(by='last_first_diff', ascending=False).tail()
Out[99]:
In [109]:
#plotting for Hailey
normalized_ambi_names['Hailey'].plot()
"""Observation for 'Hailey'
we see that the name has increasing given to female babies.
"""
Out[109]:
In [101]:
top5_ambi_data.sort_index(by='ambi_index', ascending=False).head()
Out[101]:
In [103]:
#plotting for Challie
normalized_ambi_names['Challie'].plot()
"""Observation for 'Challie'
Very interesting plot
"""
Out[103]:
In [104]:
top5_ambi_data.sort_index(by='ambi_index', ascending=False).tail()
Out[104]:
In [107]:
#plotting for Annabelle
normalized_ambi_names['Annabelle'].plot()
"""Observation for 'Annabelle'
Annabelle is a hot name now :) very trending
"""
Out[107]: