This demo is based on an example in the book: https://www.amazon.com/dp/1449319793/
Python for Data Analysis by William Wesley McKinney (O'Reilly). Copyright 2012 William McKinney, 978-1-449-31979-3
The data can be downloaded from: https://catalog.data.gov/dataset/baby-names-from-social-security-card-applications-national-level-data
Before starting jupyter notebook, set the environment variables SF_ACCOUNT, SF_USER, SF_PASSWORD, SF_S3_BUCKET, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY so the notebook can load the data from the given location. The data files are named in the form yob<year>.txt. For example, the baby names for 1880 are in yob1880.txt. The data row format is <name>,<gender>,<number of births>.
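For example, the first few rows of yob1880.txt look like this:
Mary,F,7065
Anna,F,2604
Emma,F,2003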
Since the year is not included in the rows, I ran a bash script to preprocess the data:
#!/bin/bash
gzip *.txt
for f in *.txt.gz; do
    # Extract the year from the file name, e.g. yob1880.txt.gz -> 1880
    year=${f:3:4}
    # Output name, e.g. yob1880u.tar.gz (a plain gzipped file, despite the suffix)
    new_name=${f:0:7}"u.tar.gz"
    # Strip trailing carriage returns and append the year as a fourth column
    zcat "$f" | awk -v year="$year" '{ gsub(/\r$/,""); print $0 "," year }' | gzip -c > "$new_name"
    aws s3 cp "$new_name" s3://{s3_bucket}/baby_names/data/
done
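After preprocessing, each row carries the year as a fourth column, matching the four-column table created in cell [3] below. For example:
Mary,F,7065  ->  Mary,F,7065,1880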
In [1]:
import os
account = os.getenv('SF_ACCOUNT')
user = os.getenv('SF_USER')
password = os.getenv('SF_PASSWORD')
aws_key = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret = os.getenv('AWS_SECRET_ACCESS_KEY')
sf_s3_bucket = os.getenv('SF_S3_BUCKET')
In [2]:
from snowflake.connector import connect
con = connect(
account=account,
user=user,
password=password,
database='testdb',
schema='public',
)
cur = con.cursor()
In [3]:
cur.execute("""
create or replace table baby_names(
name string,
gender varchar(1),
births integer,
year integer)""")
Out[3]:
In [4]:
cur.execute("""
copy into baby_names from 's3://{s3_bucket}/baby_names/data/'
credentials=(
aws_key_id='{aws_key_id}'
aws_secret_key='{aws_secret_key}'
)
file_format= (
field_delimiter=','
type='csv'
)
""".format(
s3_bucket=sf_s3_bucket,
aws_key_id=aws_key,
aws_secret_key=aws_secret))
Out[4]:
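As a quick sanity check (this step is not part of the original demo), the row count of the freshly loaded table can be verified through the same cursor:
cur.execute("select count(*) from baby_names")
print(cur.fetchone()[0])  # total rows loaded across all years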
In [5]:
import pandas as pd
In [6]:
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
engine = create_engine(URL(
account=account,
user=user,
password=password,
database='testdb',
schema='public',
numpy=True,
))
In [7]:
names1880 = pd.read_sql("""
SELECT name, gender, births
FROM baby_names
WHERE year=1880""", engine)
In [8]:
names1880.groupby('gender')['births'].sum()
Out[8]:
In [9]:
names=pd.read_sql("""
SELECT *
FROM baby_names""", engine)
In [10]:
total_births=pd.pivot_table(
names,
values='births',
index='year',
columns='gender',
aggfunc='sum')
In [11]:
total_births.tail()
Out[11]:
In [12]:
%matplotlib notebook
In [13]:
total_births.plot(title='Total births by gender and year')
Out[13]:
In [14]:
def add_prop(group):
    # Cast to float so the division below yields fractions, not integers
    births = group.births.astype(float)
    group['prop'] = births / births.sum()
    return group
In [15]:
names=names.groupby(['year','gender']).apply(add_prop)
In [16]:
names.head()
Out[16]:
In [17]:
import numpy as np
In [18]:
np.allclose(names.groupby(['year','gender']).prop.sum(),1)
Out[18]:
In [19]:
def get_top1000(group):
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year','gender'])
top1000 = grouped.apply(get_top1000)
top1000.index = np.arange(len(top1000))
In [20]:
pieces = []
for year, group in names.groupby(['year','gender']):
    pieces.append(group.sort_values(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)
In [21]:
top1000
Out[21]:
In [22]:
boys=top1000[top1000.gender=='M']
In [23]:
girls=top1000[top1000.gender=='F']
In [24]:
total_births=pd.pivot_table(
names,
values='births',
index='year',
columns='name',
aggfunc='sum')
In [25]:
total_births.info()
In [26]:
subset=total_births[['John', 'Harry', 'Mary', 'Marilyn']]
In [27]:
subset.plot(subplots=True, figsize=(12,10), grid=False, title="Number of births per year")
Out[27]:
In [28]:
table=pd.pivot_table(
top1000,
values='prop',
index='year',
columns='gender',
aggfunc='sum'
)
In [29]:
table.plot(title='Sum of table1000.prop by year and gender',
yticks=np.linspace(0, 1.2, 13),
xticks=range(1880, 2020, 10))
Out[29]:
In [31]:
df=boys[boys.year==2010]
In [32]:
df
Out[32]:
In [33]:
prop_cumsum=df.sort_values(by='prop', ascending=False).prop.cumsum()
In [34]:
prop_cumsum[:10]
Out[34]:
In [35]:
prop_cumsum.values.searchsorted(0.5)
Out[35]:
In [36]:
df=boys[boys.year==1900]
In [37]:
in1900=df.sort_values(by='prop', ascending=False).prop.cumsum()
In [38]:
in1900.values.searchsorted(0.5)+1
Out[38]:
In [39]:
def get_quantile_count(group, q=0.5):
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year','gender']).apply(get_quantile_count)
diversity = diversity.unstack('gender')
In [40]:
diversity.head()
Out[40]:
In [41]:
diversity.plot(title='Number of popular names in top 50%')
Out[41]:
In [42]:
get_last_letter = lambda x: x[-1]
last_letters=names.name.map(get_last_letter)
last_letters.name = 'last_letter'
table = pd.pivot_table(
names,
values='births',
index=last_letters,
columns=['gender', 'year'],
aggfunc='sum'
)
In [43]:
subtable=table.reindex(columns=[1910, 1960, 2010], level='year')
In [44]:
subtable.head()
Out[44]:
In [45]:
subtable.sum()
Out[45]:
In [46]:
letter_prop=subtable/subtable.sum().astype(float)
In [47]:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 1, figsize=(10,8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female', legend=False)
Out[47]:
In [50]:
letter_prop=table/table.sum().astype(float)
In [51]:
dny_ts = letter_prop.loc[['d','n','y'], 'M'].T
dny_ts.head()
Out[51]:
In [52]:
dny_ts.plot()
Out[52]:
In [53]:
all_names=top1000.name.unique()
In [54]:
mask=np.array(['lesl' in x.lower() for x in all_names])
In [55]:
lesley_like=all_names[mask]
In [56]:
lesley_like
Out[56]:
In [57]:
filtered=top1000[top1000.name.isin(lesley_like)]
In [58]:
filtered.groupby('name').births.sum()
Out[58]:
In [59]:
table=pd.pivot_table(
filtered,
values='births',
index='year',
columns=['gender'],
aggfunc='sum'
)
In [60]:
table=table.div(table.sum(1), axis=0)
In [61]:
table.tail()
Out[61]:
In [62]:
table.plot(style={'M': 'k-', 'F': 'k--'})
Out[62]:
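When the analysis is done, it's good practice to release the Snowflake resources; a minimal cleanup sketch using the objects created above:
cur.close()        # close the cursor from cell [2]
con.close()        # close the connector session
engine.dispose()   # dispose of the SQLAlchemy engine's connection pool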