In [1]:
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
In online advertising, click-through rate (CTR) is a very important metric for evaluating ad performance.
This competition provides 11 days worth of Avazu data to build and test prediction models.
Additional details:
Experiment with a subset of the training data.
In [1]:
# Take the first 400,000 data rows (plus the header line) of the full
# training file as a working subset for quick experiments.
!head -n 400001 data/train_rev2.csv > data/train_sub.csv
In [2]:
# Load the 400K-row subset created above.
train = pd.read_csv("data/train_sub.csv")
In [3]:
# Preview the first 10 rows to see the raw column formats.
train.head(10)
Out[3]:
In [4]:
# (rows, columns) of the subset.
train.shape
Out[4]:
In [5]:
# List all attribute names.
train.columns
Out[5]:
The total # of clicks/non-clicks was targeted at 200K.
In [13]:
# Non-null row count per 'hour' bucket; equal counts across all columns
# within a row would indicate no missing values.
train.groupby('hour').count()
Out[13]:
The # of clicks is not exactly 200K due to hash collisions. No NaNs exist.
Some thoughts so far:
In [2]:
# Subsample train_rev2.csv into 10 subset files (train_0.csv .. train_9.csv).
#%run subsample.py # CAUTION! it takes a while
# Verify the subset files exist and check their sizes.
!ls -l data/train*
In [3]:
# Load the first of the 10 subsampled files.
train0 = pd.read_csv('data/train_0.csv')
In [34]:
# (rows, columns) of subset 0.
train0.shape
Out[34]:
In [5]:
# Quick look at the first rows of subset 0.
train0.head()
Out[5]:
Encode 'hour' into 'day' and 'short_hour'. That is, extract information from the attribute 'hour'.
In [6]:
# Row count per 'day' (presumably derived from 'hour' by subsample.py,
# per the markdown above -- TODO confirm; the encoding step is not shown here).
train0.groupby('day').count()
Out[6]:
In [7]:
# Row count per hour-of-day ('short_hour', presumably also added by
# subsample.py -- TODO confirm).
train0.groupby('short_hour').count()
Out[7]:
Plot the ratio $\frac{\text{click}}{\text{click} + \text{non-click}}$ over hours and days.
In [8]:
# Click-through ratio per day, per hour-of-day, and per (day, hour) pair.
# 'click' is 0/1, so its group mean is the CTR.
# Select the 'click' column BEFORE aggregating: the result is identical, but
# the mean is computed for one column instead of every column (faster, and
# avoids errors on non-numeric columns in newer pandas versions).
ratio_day = train0.groupby('day')['click'].mean()
ratio_hour = train0.groupby('short_hour')['click'].mean()
ratio_dayhour = train0.groupby(['day', 'short_hour'])['click'].mean()
In [9]:
# Three stacked panels: daily, hourly, and per-(day, hour) CTR.
fig, (ax1, ax2, ax3) = plt.subplots(3)
# Derive the x-range from each series instead of hard-coding 10 / 24 / 239
# (239 is a magic number: 10 days x 24 hours = 240, so at least one
# (day, hour) bucket is missing from this sample). len() stays correct
# even if the bucket coverage changes.
ax1.plot(range(len(ratio_day)), ratio_day)
ax1.set_ylabel('Daily')
ax2.plot(range(len(ratio_hour)), ratio_hour)
ax2.set_ylabel('hourly')
ax3.plot(range(len(ratio_dayhour)), ratio_dayhour)
ax3.set_ylabel('day-hour')
Out[9]:
As one would expect, 'day' and 'hour' are two significant factors.
'banner_pos' is a significant categorical variable as well.
In [10]:
# Column means per banner position; the 'click' column here is the CTR
# for each banner position.
train0.groupby('banner_pos').mean()
Out[10]:
A closer look at the categorical variables.
In [11]:
# Integer-valued columns vs. categorical columns (the latter include the
# hashed string identifiers and small-cardinality device/banner codes).
intcols = ("click","day","short_hour","C1","C17","C18","C19","C20","C21","C22","C23","C24")
catcols = ("banner_pos","site_id","site_domain","site_category","app_id","app_domain","app_category","device_os","device_make","device_model","device_type","device_conn_type")
In [12]:
# Report the cardinality of each categorical column.
# Series.nunique() replaces np.unique(...).shape[0]: it gives the same count
# here (the data has no NaNs, per the note above) without building the full
# sorted unique array. The parenthesized print() form also runs under
# Python 3, unlike the original print statement.
for col_id in catcols:
    print(col_id.ljust(15) + 'has {0} unique values'.format(train0[col_id].nunique()))
In [16]:
# Inspect one full record to see the raw value formats of every column.
train0.iloc[0]
Out[16]:
In [30]:
# String-valued (hashed-identifier) categorical columns to feed into the
# feature hasher below.
chacols = ("site_id","site_domain","site_category","app_id","app_domain","app_category","device_os","device_make","device_model")
In [31]:
# Materialize the selected string columns as a raw object array (preview only;
# this is the row format the hasher will consume).
np.array(train0[list(chacols)])
Out[31]:
In [32]:
from sklearn.feature_extraction import FeatureHasher
fh = FeatureHasher(input_type='string')
# Generate a sparse CSR matrix: each row of string-valued categoricals is
# hashed into a fixed-width feature vector (hash collisions are possible
# by design of the hashing trick).
csr_fh = fh.fit_transform(np.array(train0[list(chacols)].astype(str)))
In [35]:
# Shape and internal CSR buffers of the hashed feature matrix.
# print(single_arg) is valid in both Python 2 and Python 3, unlike the
# original Python-2-only print statements.
print(csr_fh.shape)          # (n_samples, n_features)
print(csr_fh.data.shape)     # one entry per stored non-zero value
print(csr_fh.indices.shape)  # column index of each stored non-zero
print(csr_fh.indptr.shape)   # row boundaries: n_samples + 1 entries
In [36]:
test = pd.read_csv('data/test_rev2.csv')
# BUG FIX: the hasher was fitted on `chacols` (the 9 string columns), but
# the test set was being transformed with `catcols`, which additionally
# contains banner_pos, device_type and device_conn_type -- hashing a wider,
# different column set than the training matrix and making train/test
# feature spaces inconsistent. Use `chacols` here as well.
fh.transform(np.array(test[list(chacols)].astype(str)))
In [51]:
from scipy.sparse import csr_matrix
# Row-oriented (CSR) toy example: for row i, its non-zero values live in
# data[indptr[i]:indptr[i+1]] and their column positions in
# indices[indptr[i]:indptr[i+1]]. These arrays are reused by the cells below.
indptr = np.array([0,2,3,6])
indices = np.array([0,2,2,0,1,2])
data = np.array([1,2,3,4,5,6])
csr_matrix( (data,indices,indptr), shape=(3,3) ).todense()
Out[51]:
In [50]:
# non-zero values of the i-th row
print data[indptr[0]:indptr[0+1]]
print data[indptr[1]:indptr[1+1]]
print data[indptr[2]:indptr[2+1]]
In [52]:
# non-zero positions of the i-th row
print indices[indptr[0]:indptr[0+1]]
print indices[indptr[1]:indptr[1+1]]
print indices[indptr[2]:indptr[2+1]]
In [ ]: