In [4]:
# Hidden
from datascience import *
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
%matplotlib inline
This workbook shows a example derived from the EDA exercise in Chapter 2 of Doing Data Science, by o'Neil abd Schutt
In [5]:
clicks = Table.read_table("http://stat.columbia.edu/~rachel/datasets/nyt1.csv")
clicks
Out[5]:
Well. Half a million rows. That would be painful in excel.
Add a column of 1's, so that a sum will count people.
In [7]:
age_upper_bounds = [18, 25, 35, 45, 55, 65]
def age_range(n):
if n == 0:
return '0'
lower = 1
for upper in age_upper_bounds:
if lower <= n < upper:
return str(lower) + '-' + str(upper-1)
lower = upper
return str(lower) + '+'
# a little test
np.unique([age_range(n) for n in range(100)])
Out[7]:
In [8]:
clicks["Age Range"] = clicks.apply(age_range, 'Age')
clicks["Person"] = 1
clicks
Out[8]:
Now we can group the table by Age Range
and count how many clicks come from each range.
In [18]:
clicks_by_age = clicks.group('Age Range', sum)
clicks_by_age
Out[18]:
In [19]:
clicks_by_age.select(['Age Range', 'Clicks sum', 'Impressions sum', 'Person sum']).barh('Age Range')
Now we can do some other interesting summaries of these categories
In [20]:
clicks_by_age['Gender Mix'] = clicks_by_age['Gender sum'] / clicks_by_age['Person sum']
clicks_by_age["CTR"] = clicks_by_age['Clicks sum'] / clicks_by_age['Impressions sum']
clicks_by_age.select(['Age Range', 'Person sum', 'Gender Mix', 'CTR'])
Out[20]:
In [26]:
# Format some columns as percent with limited precision
clicks_by_age.set_format('Gender Mix', PercentFormatter(1))
clicks_by_age.set_format('CTR', PercentFormatter(2))
clicks_by_age
Out[26]:
We might want to do the click rate calculation a little more carefully. We don't care about clicks where there are zero impressions or missing age/gender information. So let's filter those out of our data set.
In [27]:
impressed = clicks.where(clicks['Age'] > 0).where('Impressions')
impressed
Out[27]:
In [32]:
# Impressions by age and gender
impressed.pivot(rows='Gender', columns='Age Range', values='Impressions', collect=sum)
Out[32]:
In [39]:
impressed.pivot("Age Range", "Gender", "Clicks",sum)
Out[39]:
In [40]:
impressed.pivot_hist('Age Range','Impressions')
In [34]:
distributions = impressed.pivot_bin('Age Range','Impressions')
distributions
Out[34]:
In [35]:
impressed['Gen'] = [['Male','Female'][i] for i in impressed['Gender']]
impressed
Out[35]:
Group returns a new table. If we wanted to specify the formats on columns of this table, assign it to a name.
In [38]:
# How does gender and clicks vary with age?
gi = impressed.group('Age Range', np.mean).select(['Age Range', 'Gender mean', 'Clicks mean'])
gi.set_format(['Gender mean', 'Clicks mean'], PercentFormatter)
gi
Out[38]:
In [ ]: