In the second round of feature engineering, we construct features from the 'events' data. Originally, this round was also meant to produce features involving the location of the events, but after the discussions at Kaggle it seems that the location data can be omitted.
In [83]:
import numpy as np
import math
import pandas as pd
import pickle
import time
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
#path to data and features
DATA_PATH = "../../../input/"
FEATURE_PATH = "../../../features/"
# hour marking the beginning of a day
DAWN = 5
# minimum number of events at which the smoothing toward the prior is switched off
NMIN = 25
Next, we load the train and test device IDs and merge them with the events data. We perform an inner join so as to keep only those device IDs for which some events are available.
In [3]:
# load the device IDs of the train and test sets together with the events
train = pd.read_csv('{0}gender_age_train.csv'.format(DATA_PATH))['device_id']
test = pd.read_csv('{0}gender_age_test.csv'.format(DATA_PATH))['device_id']
events = pd.read_csv('{0}events.csv'.format(DATA_PATH))
train_test = pd.concat([train, test])
# inner join: keep only devices that have at least one event
train_test_events = pd.merge(train_test.to_frame(), events, 'inner',
                             on = 'device_id').drop_duplicates().set_index('device_id')
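A quick sanity check along the following lines can confirm that the join behaves as expected; the printed numbers depend on the data, so none are shown here.
In [ ]:
# the merged frame should only contain devices that appear in the events table,
# so the number of distinct device IDs is at most len(train) + len(test)
print(train_test_events.shape)
print(train_test_events.index.nunique(), 'distinct device IDs with events')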
To begin with, we simply count the number of events per device ID.
In [4]:
# number of distinct events per device ID
event_cnt = train_test_events.groupby(level = 0, sort = False)['event_id'].nunique()
cnt_name = ['event_cnt']
As a pleasant side effect, we obtain the device IDs that have at least one event associated with them. We extract and pickle them for later use.
In [5]:
event_cnt_train = pd.merge(train.to_frame(), event_cnt.to_frame(), 'inner',
                           left_on = 'device_id', right_index = True).set_index('device_id')
event_cnt_test = pd.merge(test.to_frame(), event_cnt.to_frame(), 'inner',
                          left_on = 'device_id', right_index = True).set_index('device_id')
# persist the device IDs that have events
with open('{0}train_event_ids'.format(DATA_PATH), 'wb') as f:
    pickle.dump(event_cnt_train.index, f)
with open('{0}test_event_ids'.format(DATA_PATH), 'wb') as f:
    pickle.dump(event_cnt_test.index, f)
From the timestamp column, we first extract the weekday and the hour, and then quantize these predictors into bins.
In [84]:
dates = pd.to_datetime(train_test_events['timestamp'])
start = time.perf_counter()
hours = dates.dt.hour
# shift the hours so that the day starts at DAWN; e.g. 2am becomes 26
binned_hours = pd.cut((hours - DAWN) % 24 + DAWN, [5, 7, 22, 28], include_lowest = True,
                      labels = ['morning', 'day', 'night']).to_frame().rename(columns = {'timestamp': 'binned_hour'})
# Monday through Thursday are lumped together; Friday, Saturday and Sunday are kept separate
weekdays = pd.cut(dates.dt.weekday, [0, 3, 4, 5, 6],
                  labels = ['weekday', 'friday', 'saturday', 'sunday'],
                  include_lowest = True).to_frame().rename(columns = {'timestamp': 'weekday'})
# combine the two categorical features into labels such as 'friday,night'
weekdays_binned_hours = weekdays['weekday'].astype(str) + ',' + binned_hours['binned_hour'].astype(str)
weekdays_binned_hours.name = 'timestamp'
print(time.perf_counter() - start)
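To see what the binning does, here is a small hand-made example (the sample hours are made up): hours before DAWN are shifted past midnight, so 2am becomes 26 and lands in the 'night' bin.
In [ ]:
# made-up sample hours to illustrate the shift and the binning
sample_hours = pd.Series([2, 5, 6, 12, 23])
shifted = (sample_hours - DAWN) % 24 + DAWN   # 2 -> 26, 5 -> 5, 23 -> 23
print(pd.cut(shifted, [5, 7, 22, 28], include_lowest = True,
             labels = ['morning', 'day', 'night']))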
As a first rough feature, we take the mean of the shifted hours per device ID.
In [122]:
# mean shifted hour per device; devices without a value get 0
mean_hour = event_cnt.to_frame().merge(((hours - DAWN) % 24 + DAWN).groupby(level = 0).mean().to_frame(),
                                       'left', left_index = True, right_index = True).drop('event_id', axis = 1).fillna(0)
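The shift matters for the mean: in a made-up toy example with two events at 23:00 and 01:00, the naive mean of the raw hours points at noon, whereas the shifted hours average to 24, i.e. midnight.
In [ ]:
# toy example: events at 23:00 and 01:00
demo = pd.Series([23, 1])
print(demo.mean())                          # 12.0 -- misleadingly points at noon
print(((demo - DAWN) % 24 + DAWN).mean())   # 24.0 -- i.e. midnight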
For each device, we compile a histogram of when its events were triggered. To make these counts comparable across devices with very different activity levels, we smooth them toward the global prior and convert them into log-likelihood ratios relative to a uniform distribution.
In [110]:
def compute_logliks_pivot(data):
    # histogram of events per device ID and category
    hist_raw = data.reset_index().pivot_table(index = 'device_id', columns = data.name,
                                              aggfunc = len, fill_value = 0)
    hist = event_cnt.to_frame().merge(hist_raw, 'left', left_index = True,
                                      right_index = True).drop('event_id', axis = 1)
    # prior distribution of the categories over the whole data set
    prior = data.value_counts(normalize = True).sort_index().values
    return hist.apply(lambda row: compute_logliks(row, prior), axis = 1).fillna(0)

def compute_logliks(row, prior):
    # add 1 to avoid taking the logarithm of 0
    row = row + 1
    # smooth the probabilities toward the prior; the weight of the observed
    # counts grows linearly until NMIN events are reached
    row_sum = row.sum()
    weight = min(row_sum - len(row), NMIN) / NMIN
    row = (1 - weight) * prior + weight * (row / row_sum)
    # log ratio of the class probability to a uniform distribution
    return row.apply(lambda y: math.log(y) - math.log(1.0 / len(row)))
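To illustrate the smoothing on a toy input (all numbers made up): a device with only 4 events, all falling into the first of 3 bins, receives the weight 4/25, so its log-likelihoods stay close to those of the uniform prior.
In [ ]:
# toy example: 4 events in the first of 3 bins, uniform prior
toy_row = pd.Series([4, 0, 0])
toy_prior = np.array([1.0, 1.0, 1.0]) / 3
print(compute_logliks(toy_row, toy_prior))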
We now compute these statistics for the combined weekday and binned-hour labels.
In [111]:
start = time.perf_counter()
days_loglik = compute_logliks_pivot(weekdays_binned_hours)
print(time.perf_counter() - start)
In [125]:
# assemble the time-based features and their names
time_features = np.hstack([days_loglik, mean_hour])
time_names = np.hstack([days_loglik.columns, 'mean_hour'])
Finally, we collect the event features and persist them to disk.
In [126]:
event_features = np.hstack([event_cnt.to_frame(), time_features])
event_features_names = np.hstack([cnt_name, time_names])
with open('{0}event_features_noloc.p'.format(FEATURE_PATH), 'wb') as f:
    pickle.dump(event_features, f)
with open('{0}event_features_noloc_names.p'.format(FEATURE_PATH), 'wb') as f:
    pickle.dump(event_features_names, f)
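In later stages of the pipeline, the features can be read back along these lines (a minimal sketch mirroring the dumps above):
In [ ]:
# reload the persisted features and their names
with open('{0}event_features_noloc.p'.format(FEATURE_PATH), 'rb') as f:
    event_features = pickle.load(f)
with open('{0}event_features_noloc_names.p'.format(FEATURE_PATH), 'rb') as f:
    event_features_names = pickle.load(f)
# each feature column should have a matching name
assert event_features.shape[1] == len(event_features_names)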