In [4]:

    
# Hidden
from datascience import *
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
%matplotlib inline

This workbook shows a example derived from the EDA exercise in Chapter 2 of Doing Data Science, by o'Neil abd Schutt



In [5]:

    
clicks = Table.read_table("http://stat.columbia.edu/~rachel/datasets/nyt1.csv")
clicks









    Out[5]:





    
        
            Age  Gender  Impressions  Clicks  Signed_In
        
    
    
        
            36   0      3           0      1        
        
    
        
            73   1      3           0      1        
        
    
        
            30   0      3           0      1        
        
    
        
            49   1      3           0      1        
        
    
        
            47   1      11          0      1        
        
    
        
            47   0      11          1      1        
        
    
        
            0    0      7           1      0        
        
    
        
            46   0      5           0      1        
        
    
        
            16   0      3           0      1        
        
    
        
            52   0      4           0      1        
        
    

... (458431 rows omitted)

Well. Half a million rows. That would be painful in excel.

Add a column of 1's, so that a sum will count people.



In [7]:

    
age_upper_bounds = [18, 25, 35, 45, 55, 65]

def age_range(n):
    if n == 0:
        return '0'
    lower = 1
    for upper in age_upper_bounds:
        if lower <= n < upper:
            return str(lower) + '-' + str(upper-1)
        lower = upper
    return str(lower) + '+'

# a little test
np.unique([age_range(n) for n in range(100)])









    Out[7]:





array(['0', '1-17', '18-24', '25-34', '35-44', '45-54', '55-64', '65+'], 
      dtype='<U5')



In [8]:

    
clicks["Age Range"] = clicks.apply(age_range, 'Age')
clicks["Person"] = 1
clicks









    Out[8]:





    
        
            Age  Gender  Impressions  Clicks  Signed_In  Age Range  Person
        
    
    
        
            36   0      3           0      1         35-44     1     
        
    
        
            73   1      3           0      1         65+       1     
        
    
        
            30   0      3           0      1         25-34     1     
        
    
        
            49   1      3           0      1         45-54     1     
        
    
        
            47   1      11          0      1         45-54     1     
        
    
        
            47   0      11          1      1         45-54     1     
        
    
        
            0    0      7           1      0         0         1     
        
    
        
            46   0      5           0      1         45-54     1     
        
    
        
            16   0      3           0      1         1-17      1     
        
    
        
            52   0      4           0      1         45-54     1     
        
    

... (458431 rows omitted)

Now we can group the table by Age Range and count how many clicks come from each range.



In [18]:

    
clicks_by_age = clicks.group('Age Range', sum)
clicks_by_age









    Out[18]:





    
        
            Age Range  Age sum  Gender sum  Impressions sum  Clicks sum  Signed_In sum  Person sum
        
    
    
        
            0         0       0          685483          19480      0             137106    
        
    
        
            1-17      211045  9470       69239           2065       13828         13828     
        
    
        
            18-24     847791  21721      203585          2167       40694         40694     
        
    
        
            25-34     1716328  30958      290511          2937       58174         58174     
        
    
        
            35-44     2798593  37676      355824          3662       70860         70860     
        
    
        
            45-54     3181779  34007      322109          3232       64288         64288     
        
    
        
            55-64     2661830  23988      224688          4556       44738         44738     
        
    
        
            65+       2098644  10445      144120          4350       28753         28753



In [19]:

    
clicks_by_age.select(['Age Range', 'Clicks sum', 'Impressions sum', 'Person sum']).barh('Age Range')

Now we can do some other interesting summaries of these categories



In [20]:

    
clicks_by_age['Gender Mix'] = clicks_by_age['Gender sum'] / clicks_by_age['Person sum']
clicks_by_age["CTR"] = clicks_by_age['Clicks sum'] / clicks_by_age['Impressions sum']
clicks_by_age.select(['Age Range', 'Person sum', 'Gender Mix', 'CTR'])









    Out[20]:





    
        
            Age Range  Person sum  Gender Mix  CTR
        
    
    
        
            0         137106     0          0.0284179
        
    
        
            1-17      13828      0.684842   0.0298242
        
    
        
            18-24     40694      0.533764   0.0106442
        
    
        
            25-34     58174      0.532162   0.0101098
        
    
        
            35-44     70860      0.531696   0.0102916
        
    
        
            45-54     64288      0.528979   0.0100339
        
    
        
            55-64     44738      0.536188   0.020277 
        
    
        
            65+       28753      0.363266   0.0301832



In [26]:

    
# Format some columns as percent with limited precision
clicks_by_age.set_format('Gender Mix', PercentFormatter(1))
clicks_by_age.set_format('CTR', PercentFormatter(2))
clicks_by_age









    Out[26]:





    
        
            Age Range  Age sum  Gender sum  Impressions sum  Clicks sum  Signed_In sum  Person sum  Gender Mix  CTR
        
    
    
        
            0         0       0          685483          19480      0             137106     0.0%       2.84%
        
    
        
            1-17      211045  9470       69239           2065       13828         13828      68.5%      2.98%
        
    
        
            18-24     847791  21721      203585          2167       40694         40694      53.4%      1.06%
        
    
        
            25-34     1716328  30958      290511          2937       58174         58174      53.2%      1.01%
        
    
        
            35-44     2798593  37676      355824          3662       70860         70860      53.2%      1.03%
        
    
        
            45-54     3181779  34007      322109          3232       64288         64288      52.9%      1.00%
        
    
        
            55-64     2661830  23988      224688          4556       44738         44738      53.6%      2.03%
        
    
        
            65+       2098644  10445      144120          4350       28753         28753      36.3%      3.02%

We might want to do the click rate calculation a little more carefully. We don't care about clicks where there are zero impressions or missing age/gender information. So let's filter those out of our data set.



In [27]:

    
impressed = clicks.where(clicks['Age'] > 0).where('Impressions')
impressed









    Out[27]:





    
        
            Age  Gender  Impressions  Clicks  Signed_In  Age Range  Person
        
    
    
        
            36   0      3           0      1         35-44     1     
        
    
        
            73   1      3           0      1         65+       1     
        
    
        
            30   0      3           0      1         25-34     1     
        
    
        
            49   1      3           0      1         45-54     1     
        
    
        
            47   1      11          0      1         45-54     1     
        
    
        
            47   0      11          1      1         45-54     1     
        
    
        
            46   0      5           0      1         45-54     1     
        
    
        
            16   0      3           0      1         1-17      1     
        
    
        
            52   0      4           0      1         45-54     1     
        
    
        
            21   0      3           0      1         18-24     1     
        
    

... (319188 rows omitted)



In [32]:

    
# Impressions by age and gender
impressed.pivot(rows='Gender', columns='Age Range', values='Impressions', collect=sum)









    Out[32]:





    
        
            Gender  1-17  18-24  25-34  35-44  45-54  55-64  65+
        
    
    
        
            0      21800  95057  136292  166324  151650  104220  91956
        
    
        
            1      47439  108528  154219  189500  170459  120468  52164



In [39]:

    
impressed.pivot("Age Range", "Gender", "Clicks",sum)









    Out[39]:





    
        
            Gender  1-17  18-24  25-34  35-44  45-54  55-64  65+
        
    
    
        
            0      683  1002  1388  1707  1542  2105  2765
        
    
        
            1      1382  1165  1549  1955  1690  2451  1585



In [40]:

    
impressed.pivot_hist('Age Range','Impressions')



In [34]:

    
distributions = impressed.pivot_bin('Age Range','Impressions')
distributions









    Out[34]:





    
        
            bin  1-17  18-24  25-34  35-44  45-54  55-64  65+
        
    
    
        
            1    1674  4833  6862  8206  7546  5183  3345
        
    
        
            2.9  4345  12833  18382  22260  20230  14025  9131
        
    
        
            4.8  4346  13000  18861  22895  20586  14548  9120
        
    
        
            6.7  2358  6952  9801  12094  11050  7595  4957
        
    
        
            8.6  799  2237  3117  3981  3517  2486  1607
        
    
        
            10.5  179  456   671   814   787   530   326 
        
    
        
            12.4  24   87    95    130   116   90    62  
        
    
        
            14.3  7    8     11    12    12    5     8   
        
    
        
            16.2  0    2     1     1     1     0     0   
        
    
        
            18.1  0    0     0     1     0     0     0   
        
    

... (1 rows omitted)



In [35]:

    
impressed['Gen'] = [['Male','Female'][i] for i in impressed['Gender']]
impressed









    Out[35]:





    
        
            Age  Gender  Impressions  Clicks  Signed_In  Age Range  Person  Gen
        
    
    
        
            36   0      3           0      1         35-44     1      Male  
        
    
        
            73   1      3           0      1         65+       1      Female
        
    
        
            30   0      3           0      1         25-34     1      Male  
        
    
        
            49   1      3           0      1         45-54     1      Female
        
    
        
            47   1      11          0      1         45-54     1      Female
        
    
        
            47   0      11          1      1         45-54     1      Male  
        
    
        
            46   0      5           0      1         45-54     1      Male  
        
    
        
            16   0      3           0      1         1-17      1      Male  
        
    
        
            52   0      4           0      1         45-54     1      Male  
        
    
        
            21   0      3           0      1         18-24     1      Male  
        
    

... (319188 rows omitted)

Group returns a new table. If we wanted to specify the formats on columns of this table, assign it to a name.



In [38]:

    
# How does gender and clicks vary with age?
gi = impressed.group('Age Range', np.mean).select(['Age Range', 'Gender mean', 'Clicks mean'])
gi.set_format(['Gender mean', 'Clicks mean'], PercentFormatter)
gi









    Out[38]:





    
        
            Age Range  Gender mean  Clicks mean
        
    
    
        
            1-17      68.45%      15.04%     
        
    
        
            18-24     53.40%      5.36%      
        
    
        
            25-34     53.20%      5.08%      
        
    
        
            35-44     53.17%      5.20%      
        
    
        
            45-54     52.92%      5.06%      
        
    
        
            55-64     53.60%      10.25%     
        
    
        
            65+       36.33%      15.23%



In [ ]:

Age	Gender	Impressions	Clicks	Signed_In
36	0	3	0	1
73	1	3	0	1
30	0	3	0	1
49	1	3	0	1
47	1	11	0	1
47	0	11	1	1
0	0	7	1	0
46	0	5	0	1
16	0	3	0	1
52	0	4	0	1

Age	Gender	Impressions	Clicks	Signed_In	Age Range	Person
36	0	3	0	1	35-44	1
73	1	3	0	1	65+	1
30	0	3	0	1	25-34	1
49	1	3	0	1	45-54	1
47	1	11	0	1	45-54	1
47	0	11	1	1	45-54	1
0	0	7	1	0	0	1
46	0	5	0	1	45-54	1
16	0	3	0	1	1-17	1
52	0	4	0	1	45-54	1

Age Range	Age sum	Gender sum	Impressions sum	Clicks sum	Signed_In sum	Person sum
0	0	0	685483	19480	0	137106
1-17	211045	9470	69239	2065	13828	13828
18-24	847791	21721	203585	2167	40694	40694
25-34	1716328	30958	290511	2937	58174	58174
35-44	2798593	37676	355824	3662	70860	70860
45-54	3181779	34007	322109	3232	64288	64288
55-64	2661830	23988	224688	4556	44738	44738
65+	2098644	10445	144120	4350	28753	28753

Age Range	Person sum	Gender Mix	CTR
0	137106	0	0.0284179
1-17	13828	0.684842	0.0298242
18-24	40694	0.533764	0.0106442
25-34	58174	0.532162	0.0101098
35-44	70860	0.531696	0.0102916
45-54	64288	0.528979	0.0100339
55-64	44738	0.536188	0.020277
65+	28753	0.363266	0.0301832

Gender	1-17	18-24	25-34	35-44	45-54	55-64	65+
0	21800	95057	136292	166324	151650	104220	91956
1	47439	108528	154219	189500	170459	120468	52164

bin	1-17	18-24	25-34	35-44	45-54	55-64	65+
1	1674	4833	6862	8206	7546	5183	3345
2.9	4345	12833	18382	22260	20230	14025	9131
4.8	4346	13000	18861	22895	20586	14548	9120
6.7	2358	6952	9801	12094	11050	7595	4957
8.6	799	2237	3117	3981	3517	2486	1607
10.5	179	456	671	814	787	530	326
12.4	24	87	95	130	116	90	62
14.3	7	8	11	12	12	5	8
16.2	0	2	1	1	1	0	0
18.1	0	0	0	1	0	0	0

Age	Gender	Impressions	Clicks	Signed_In	Age Range	Person	Gen
36	0	3	0	1	35-44	1	Male
73	1	3	0	1	65+	1	Female
30	0	3	0	1	25-34	1	Male
49	1	3	0	1	45-54	1	Female
47	1	11	0	1	45-54	1	Female
47	0	11	1	1	45-54	1	Male
46	0	5	0	1	45-54	1	Male
16	0	3	0	1	1-17	1	Male
52	0	4	0	1	45-54	1	Male
21	0	3	0	1	18-24	1	Male

Age Range	Gender mean	Clicks mean
1-17	68.45%	15.04%
18-24	53.40%	5.36%
25-34	53.20%	5.08%
35-44	53.17%	5.20%
45-54	52.92%	5.06%
55-64	53.60%	10.25%
65+	36.33%	15.23%

Age	Gender	Impressions	Clicks	Signed_In
36	0	3	0	1
73	1	3	0	1
30	0	3	0	1
49	1	3	0	1
47	1	11	0	1
47	0	11	1	1
0	0	7	1	0
46	0	5	0	1
16	0	3	0	1
52	0	4	0	1

Age	Gender	Impressions	Clicks	Signed_In
36	0	3	0	1
73	1	3	0	1
30	0	3	0	1
49	1	3	0	1
47	1	11	0	1
47	0	11	1	1
0	0	7	1	0
46	0	5	0	1
16	0	3	0	1
52	0	4	0	1