In [8]:
import pandas as pd
import numpy as np #didn't use
import scipy #didn't use
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("/Users/abigaildobyns/Documents/doing_data_science-master/dds_datasets/dds_ch2_nyt/nyt1.csv")#load data set
data #let's look at it


Out[2]:
Age Gender Impressions Clicks Signed_In
0 36 0 3 0 1
1 73 1 3 0 1
2 30 0 3 0 1
3 49 1 3 0 1
4 47 1 11 0 1
5 47 0 11 1 1
6 0 0 7 1 0
7 46 0 5 0 1
8 16 0 3 0 1
9 52 0 4 0 1
10 0 0 8 1 0
11 21 0 3 0 1
12 0 0 4 0 0
13 57 0 6 0 1
14 31 0 5 0 1
15 0 0 6 0 0
16 40 1 3 0 1
17 31 1 5 0 1
18 38 0 4 0 1
19 0 0 5 0 0
20 59 1 4 0 1
21 61 0 6 0 1
22 48 0 7 0 1
23 29 1 2 0 1
24 0 0 4 0 0
25 19 1 4 0 1
26 19 0 3 0 1
27 48 1 9 0 1
28 48 1 4 0 1
29 21 1 5 0 1
30 23 0 4 0 1
31 66 1 3 0 1
32 23 1 7 0 1
33 44 1 4 0 1
34 32 0 2 0 1
35 22 1 2 0 1
36 30 1 8 0 1
37 74 0 2 0 1
38 65 1 5 0 1
39 0 0 7 2 0
40 32 1 9 0 1
41 0 0 4 0 0
42 61 0 2 0 1
43 63 1 4 0 1
44 36 1 1 0 1
45 35 0 7 0 1
46 0 0 3 0 0
47 0 0 7 0 0
48 73 0 4 0 1
49 0 0 5 0 0
50 0 0 5 0 0
51 33 0 4 0 1
52 34 0 5 0 1
53 48 1 1 0 1
54 43 1 4 0 1
55 59 1 2 0 1
56 34 0 3 0 1
57 22 0 5 0 1
58 18 1 1 0 1
59 0 0 4 0 0
... ... ... ... ...

458441 rows × 5 columns

1. Create a new variable, age_group, that categorizes users as "<18", "18-24", "25-34", "35-44", "45-54", "55-64", and "65+".

In [7]:
age_group = pd.Series(index = data.index) #create a series to store the new age group in

for index, row in data['Age'].iteritems(): #iterate over the rows, must explicitly call index too
    if row < 18:
        age_group[index] = 1
    if 18 <= row <= 24:
        age_group[index] = 2
    if 25 <= row <= 34:
        age_group[index] = 3
    if 35 <= row <= 44:
        age_group[index] = 4
    if 45 <= row <= 54:
        age_group[index] = 5
    if 55 <= row <= 64:
        age_group[index] = 6
    if row >= 65:
        age_group[index] = 7

data['age_group'] = age_group #add new categories to the data frame
data #check that it worked


Out[7]:
Age Gender Impressions Clicks Signed_In age_group
0 36 0 3 0 1 4
1 73 1 3 0 1 7
2 30 0 3 0 1 3
3 49 1 3 0 1 5
4 47 1 11 0 1 5
5 47 0 11 1 1 5
6 0 0 7 1 0 1
7 46 0 5 0 1 5
8 16 0 3 0 1 1
9 52 0 4 0 1 5
10 0 0 8 1 0 1
11 21 0 3 0 1 2
12 0 0 4 0 0 1
13 57 0 6 0 1 6
14 31 0 5 0 1 3
15 0 0 6 0 0 1
16 40 1 3 0 1 4
17 31 1 5 0 1 3
18 38 0 4 0 1 4
19 0 0 5 0 0 1
20 59 1 4 0 1 6
21 61 0 6 0 1 6
22 48 0 7 0 1 5
23 29 1 2 0 1 3
24 0 0 4 0 0 1
25 19 1 4 0 1 2
26 19 0 3 0 1 2
27 48 1 9 0 1 5
28 48 1 4 0 1 5
29 21 1 5 0 1 2
30 23 0 4 0 1 2
31 66 1 3 0 1 7
32 23 1 7 0 1 2
33 44 1 4 0 1 4
34 32 0 2 0 1 3
35 22 1 2 0 1 2
36 30 1 8 0 1 3
37 74 0 2 0 1 7
38 65 1 5 0 1 7
39 0 0 7 2 0 1
40 32 1 9 0 1 3
41 0 0 4 0 0 1
42 61 0 2 0 1 6
43 63 1 4 0 1 6
44 36 1 1 0 1 4
45 35 0 7 0 1 4
46 0 0 3 0 0 1
47 0 0 7 0 0 1
48 73 0 4 0 1 7
49 0 0 5 0 0 1
50 0 0 5 0 0 1
51 33 0 4 0 1 3
52 34 0 5 0 1 3
53 48 1 1 0 1 5
54 43 1 4 0 1 4
55 59 1 2 0 1 6
56 34 0 3 0 1 3
57 22 0 5 0 1 2
58 18 1 1 0 1 2
59 0 0 4 0 0 1
... ... ... ... ... ...

458441 rows × 6 columns

Plot the distributions of number impressions and clickthrough- rate (CTR=# clicks/# impressions) for these six age categories.

In [9]:
data['CTR'] = data['Clicks']/data['Impressions'] #make CTR column

In [21]:
data.boxplot(column= 'Impressions', by = 'age_group')
plt.show()

In [22]:
data.boxplot(column= 'CTR', by = 'age_group')
plt.show()
Define a new variable to segment or categorize users based on their click behavior.

In [24]:
click_group = pd.Series(index = data.index) #create a series to store the new age group in

for index, row in data['Clicks'].iteritems(): #iterate over the rows
    if row < 1:
        click_group[index] = 1
    if 1 <= row <= 2:
        click_group[index] = 2
    if row >2:
        click_group[index] = 3


data['click_group'] = click_group #add new categories to the data frame
data #check that it worked


Out[24]:
Age Gender Impressions Clicks Signed_In age_group CTR click_group
0 36 0 3 0 1 4 0.000000 1
1 73 1 3 0 1 7 0.000000 1
2 30 0 3 0 1 3 0.000000 1
3 49 1 3 0 1 5 0.000000 1
4 47 1 11 0 1 5 0.000000 1
5 47 0 11 1 1 5 0.090909 2
6 0 0 7 1 0 1 0.142857 2
7 46 0 5 0 1 5 0.000000 1
8 16 0 3 0 1 1 0.000000 1
9 52 0 4 0 1 5 0.000000 1
10 0 0 8 1 0 1 0.125000 2
11 21 0 3 0 1 2 0.000000 1
12 0 0 4 0 0 1 0.000000 1
13 57 0 6 0 1 6 0.000000 1
14 31 0 5 0 1 3 0.000000 1
15 0 0 6 0 0 1 0.000000 1
16 40 1 3 0 1 4 0.000000 1
17 31 1 5 0 1 3 0.000000 1
18 38 0 4 0 1 4 0.000000 1
19 0 0 5 0 0 1 0.000000 1
20 59 1 4 0 1 6 0.000000 1
21 61 0 6 0 1 6 0.000000 1
22 48 0 7 0 1 5 0.000000 1
23 29 1 2 0 1 3 0.000000 1
24 0 0 4 0 0 1 0.000000 1
25 19 1 4 0 1 2 0.000000 1
26 19 0 3 0 1 2 0.000000 1
27 48 1 9 0 1 5 0.000000 1
28 48 1 4 0 1 5 0.000000 1
29 21 1 5 0 1 2 0.000000 1
30 23 0 4 0 1 2 0.000000 1
31 66 1 3 0 1 7 0.000000 1
32 23 1 7 0 1 2 0.000000 1
33 44 1 4 0 1 4 0.000000 1
34 32 0 2 0 1 3 0.000000 1
35 22 1 2 0 1 2 0.000000 1
36 30 1 8 0 1 3 0.000000 1
37 74 0 2 0 1 7 0.000000 1
38 65 1 5 0 1 7 0.000000 1
39 0 0 7 2 0 1 0.285714 2
40 32 1 9 0 1 3 0.000000 1
41 0 0 4 0 0 1 0.000000 1
42 61 0 2 0 1 6 0.000000 1
43 63 1 4 0 1 6 0.000000 1
44 36 1 1 0 1 4 0.000000 1
45 35 0 7 0 1 4 0.000000 1
46 0 0 3 0 0 1 0.000000 1
47 0 0 7 0 0 1 0.000000 1
48 73 0 4 0 1 7 0.000000 1
49 0 0 5 0 0 1 0.000000 1
50 0 0 5 0 0 1 0.000000 1
51 33 0 4 0 1 3 0.000000 1
52 34 0 5 0 1 3 0.000000 1
53 48 1 1 0 1 5 0.000000 1
54 43 1 4 0 1 4 0.000000 1
55 59 1 2 0 1 6 0.000000 1
56 34 0 3 0 1 3 0.000000 1
57 22 0 5 0 1 2 0.000000 1
58 18 1 1 0 1 2 0.000000 1
59 0 0 4 0 0 1 0.000000 1
... ... ... ... ... ... ... ...

458441 rows × 8 columns

Explore the data and make visual and quantitative comparisons across user segments/demographics (<18-year-old males versus < 18-year-old females or logged-in versus not, for example).

In [26]:
data[data['age_group']==1].boxplot(column = 'Impressions', by ='Gender')
plt.show()

The above example has two sorting steps. the first is in the indexing, where I selected only age group one. i could have also used

data[data['Age']]<18]

to accomplish the same thing.

the grouping is handled by the pd.data.boxplot() call using teh by = parameter.

Create metrics/measurements/statistics that summarize the data. Examples of potential metrics include CTR, quantiles, mean, median, variance, and max, and these can be calculated across the various user segments. Be selective. Think about what will be important to track over time—what will compress the data, but still capture user behavior.

In [23]:
#summary stats by age group for Impressions
byage = data.groupby('age_group')
byage['Impressions'].describe()


Out[23]:
age_group       
1          count    150934.000000
           mean          5.000345
           std           2.243536
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          18.000000
2          count     40694.000000
           mean          5.002826
           std           2.244950
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          17.000000
3          count     58174.000000
           mean          4.993829
           std           2.226877
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          17.000000
4          count     70860.000000
           mean          5.021507
           std           2.237829
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          20.000000
5          count     64288.000000
           mean          5.010406
           std           2.240790
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          17.000000
6          count     44738.000000
           mean          5.022308
           std           2.230206
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          16.000000
7          count     28753.000000
           mean          5.012347
           std           2.249162
           min           0.000000
           25%           3.000000
           50%           5.000000
           75%           6.000000
           max          16.000000
Length: 56, dtype: float64

In [27]:
#summary of CTR by gender
bygender = data.groupby('Gender')
bygender['CTR'].describe()


Out[27]:
Gender       
0       count    2.901760e+05
        mean              inf
        std               NaN
        min      0.000000e+00
        25%      0.000000e+00
        50%      0.000000e+00
        75%      0.000000e+00
        max               inf
1       count    1.682650e+05
        mean              inf
        std               NaN
        min      0.000000e+00
        25%      0.000000e+00
        50%      0.000000e+00
        75%      0.000000e+00
        max               inf
dtype: float64

In [28]:
#summary stats of all users that were signed in
data[data['Signed_In'] == 1].describe()


Out[28]:
Age Gender Impressions Clicks Signed_In age_group CTR click_group
count 321335.000000 321335.000000 321335.000000 321335.000000 321335 321335.000000 3.213350e+05 321335.000000
mean 42.062054 0.523644 5.010584 0.071480 1 4.183537 inf 1.068738
std 16.308117 0.499441 2.238784 0.268659 0 1.613538 NaN 0.253328
min 7.000000 0.000000 0.000000 0.000000 1 1.000000 0.000000e+00 1.000000
25% 29.000000 0.000000 3.000000 0.000000 1 3.000000 0.000000e+00 1.000000
50% 41.000000 1.000000 5.000000 0.000000 1 4.000000 0.000000e+00 1.000000
75% 53.000000 1.000000 6.000000 0.000000 1 5.000000 0.000000e+00 1.000000
max 108.000000 1.000000 20.000000 3.000000 1 7.000000 inf 3.000000

8 rows × 8 columns

other files from this set contain other days. load and process those to compare days. I'm not doing it right now because i've got the jist of this.