In the first round of feature engineering, we extract features from the phone brand and the device model. The most immediate approach is to label-encode these two columns. As an additional feature, we add the regularized class histograms proposed in [reference missing].
import numpy as np
import math
import pickle
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
# path to data and constructed features
DATA_PATH = "../../../input/"
FEATURE_PATH = "../../../features/"
# parameter for crosstab feature; minimum number of occurrences for a brand to be considered
# (compute_log_probs shrinks rows with fewer than NMIN smoothed counts toward the prior)
NMIN = 50
# seed for randomness
SEED = 1747
# number of classes
# NOTE(review): the assignment was missing here although NCLASSES is read further
# down in this file. 12 assumes the twelve gender-age groups of the TalkingData
# data set -- confirm against train['group'].nunique().
NCLASSES = 12
Next, we load train, test and phone data.
# Training labels: keep only the gender-age group, indexed by device_id.
train = pd.read_csv('{0}gender_age_train.csv'.format(DATA_PATH))
train = train[['device_id', 'group']].set_index('device_id')

# Test devices: only the device_id column is needed.
test = pd.read_csv('{0}gender_age_test.csv'.format(DATA_PATH))['device_id']

# Phone data: keep the first row per device_id, in order of first appearance.
phone_brand = pd.read_csv('{0}phone_brand_device_model.csv'.format(DATA_PATH))
phone_brand = phone_brand.groupby('device_id', sort=False).first()

# All device ids, train first, then test.
train_test = np.hstack([train.index, test])

# Re-order the phone data to follow train_test; the left join keeps every
# train/test device even if it has no phone record.
phone_brand = pd.merge(pd.Series(train_test, name='device_id').to_frame(),
                       phone_brand,
                       how='left',
                       left_on='device_id',
                       right_index=True).set_index('device_id')
We relabel the device-ids to obtain a more legible representation and merge the phone data.
# Fit one encoder on all device ids so train, test and phone data share the
# same integer labels.
dec = LabelEncoder().fit(train_test)

# Each transform reads the raw ids, so the order of these assignments does not
# matter as long as train_test itself is replaced last.
train.index = dec.transform(train.index)
test.index = dec.transform(test)
phone_brand.index = dec.transform(phone_brand.index)
train_test = dec.transform(train_test)
We add the prefix of the device model as a new feature.
# The prefix is the first two characters of the device model string.
phone_brand['device_pref'] = phone_brand['device_model'].map(lambda name: name[:2])
To generate the first features, we label-encode the train-test data.
# Work on a copy so that phone_brand keeps its raw string values; every column
# gets its own, independently fitted LabelEncoder.
phone_brand_enc = phone_brand.copy().apply(
    lambda column: LabelEncoder().fit_transform(column), axis=0)
In addition to plain one-hot encoding (OHE), we implement an idea from [reference missing], which suggests characterizing a brand or device model by its gender-age histogram.
To set up the crosstab feature, we need to do a blending-type partition.
def blend_split(df, labels):
    """Split a labelled data set into two stratified blending folds.

    A stratified 80% part is kept and the remaining 20% (the validation set)
    is discarded here. The kept part is then halved, again stratified. Both
    splits use the module-level SEED.

    Parameters
    ----------
    df : data to split
    labels : class labels aligned with df, used for stratification

    Returns
    -------
    The result of the second train_test_split call:
    [df_fold_0, df_fold_1, labels_fold_0, labels_fold_1].
    """
    # stage 1: separate the validation part and keep only the rest
    outer = train_test_split(df, labels, stratify=labels,
                             train_size=0.8, random_state=SEED)
    df_kept, labels_kept = outer[0], outer[2]

    # stage 2: halve the kept part into the two stacking folds
    return train_test_split(df_kept, labels_kept, stratify=labels_kept,
                            train_size=0.5, random_state=SEED)
We generate two splits, one for all devices and the other only for devices with events.
# Device ids that have events, relabelled with the shared encoder.
# The files are opened in a `with` block so the handles are closed again.
# NOTE: pickle.load must only be used on trusted files.
with open('{0}train_event_ids'.format(DATA_PATH), 'rb') as handle:
    train_event_indices = dec.transform(pickle.load(handle))
with open('{0}test_event_ids'.format(DATA_PATH), 'rb') as handle:
    test_event_indices = dec.transform(pickle.load(handle))
train_test_event_indices = np.hstack([train_event_indices, test_event_indices])

# Boolean mask over the training devices: True where the device has events.
train_mask = train.index.isin(train_event_indices)
labels_all = train['group']
labels_event = labels_all[train_mask]

# Blending folds over all training devices ...
blend_all = blend_split(phone_brand[phone_brand.index.isin(train.index)], labels_all)
# ... and over the training devices with events only.
blend_events_raw = blend_split(phone_brand[phone_brand.index.isin(train_event_indices)], labels_event)
We also consider elements that are not associated with events.
# Training devices without any event.
residual_indices = train.index.difference(train_event_indices)
y_res = train[train.index.isin(residual_indices)]['group']
phone_brand_res = phone_brand[phone_brand.index.isin(residual_indices)]

# Append the event-less devices and their labels to BOTH event folds, keeping
# the [features_0, features_1, labels_0, labels_1] layout of blend_split.
# The closing bracket of this list was missing in the original.
blend_events = [pd.concat([blend_events_raw[0], phone_brand_res], axis = 0),
                pd.concat([blend_events_raw[1], phone_brand_res], axis = 0),
                np.hstack([blend_events_raw[2], y_res]),
                np.hstack([blend_events_raw[3], y_res])]
Now, we define a transformer to compute the crosstab features on the blending folds. In order to obtain features that are not constrained to [0,1], we consider the logs of the class probabilities.
class CrossTabEncoder(BaseEstimator, TransformerMixin):
    """A CrossTabEncoder characterizes a feature by its crosstab dataframe.

    NOTE(review): several tokens of this class were lost in the original text
    (empty arguments in zip/concat/left_on/drop and a fused assignment in
    fit). They are reconstructed here from the surrounding data flow: the two
    fold-wise feature columns are stored in fit and reused in transform, and
    the join key is the name of the feature column. Confirm against the
    original notebook.
    """

    def fit(self, blend_data, label_name):
        """For each class of the considered feature, the empirical histogram
        for the prediction classes is computed.

        Parameters
        ----------
        blend_data : data used for the histogram computation; a sequence
            [features_0, features_1, labels_0, labels_1] as produced by
            blend_split
        label_name : name of the feature column to encode

        Returns
        -------
        self
        """
        # compute the prior from the module-level training labels
        prior = pd.Series(labels_all).value_counts(normalize = True).sort_index().values

        # the feature column of each of the two blending folds
        self.blend_features = [features[label_name] for features in blend_data[0:2]]
        # one crosstab per fold ...
        self.crosstabs = [compute_crosstabs(data, classes, prior)
                          for data, classes in zip(self.blend_features, blend_data[2:4])]
        # ... and one over both folds together
        self.crosstab_total = compute_crosstabs(pd.concat(self.blend_features, axis = 0),
                                                np.hstack(blend_data[2:4]), prior)
        return self

    def transform(self, data):
        """The precomputed histograms are joined as features to the given data set.

        Parameters
        ----------
        data : pandas Series holding the feature column to encode; its name is
            used as the join key

        Returns
        -------
        X : array-like object
            Transformed dataset.
        """
        key = data.name

        # Rows that belong to a blending fold are encoded with the crosstab of
        # the OTHER fold (note the reversed crosstab list).
        feat = [feature.to_frame().merge(crosstab, 'left', left_on = feature.name,
                                         right_index = True).drop(feature.name, axis = 1)
                for feature, crosstab in zip(self.blend_features, self.crosstabs[::-1])]

        # A row index may occur in both folds; keep its first occurrence.
        fold_encoded = pd.concat(feat, axis = 0).groupby(level = 0, sort = False).first()
        merge_12 = data.to_frame().merge(fold_encoded, 'left', left_index = True,
                                         right_index = True).drop(key, axis = 1)

        # Encoding with the crosstab over both folds; feature values without a
        # crosstab row are filled with 0.
        merge_total = pd.merge(data.to_frame(), self.crosstab_total, 'left',
                               left_on = key, right_index = True).drop(key, axis = 1).fillna(0)

        # Prefer the fold-wise values and fall back to the total crosstab.
        return merge_12.combine_first(merge_total)
def compute_crosstabs(tab_0, tab_1, prior):
    """Compute the log of relative crosstabs regularized by a prior.

    Parameters
    ----------
    tab_0 : values that form the rows of the crosstab
    tab_1 : values that form the columns of the crosstab
    prior : class probabilities passed on to compute_log_probs for every row

    Returns
    -------
    The frequency table pd.crosstab(tab_0, tab_1) with every row replaced by
    the output of compute_log_probs.
    """
    return pd.crosstab(tab_0, tab_1).apply(lambda row: compute_log_probs(row, prior), axis = 1)
def compute_log_probs(row, prior, nmin=None, nclasses=None):
    """Helper function for computing regularized log probabilities.

    The counts get add-one smoothing and are normalized to probabilities. They
    are then blended with ``prior``: the fewer counts a row has, the more the
    prior dominates. The result is the log of each blended probability
    relative to the uniform probability ``1 / nclasses``.

    Parameters
    ----------
    row : pandas Series of class counts
    prior : array-like of class probabilities with one entry per entry of row
    nmin : number of smoothed counts from which on the prior is ignored;
        defaults to the module-level NMIN
    nclasses : number of classes of the uniform baseline; defaults to
        len(prior)

    Returns
    -------
    pandas Series with the index of ``row`` holding the log ratios.
    """
    if nmin is None:
        nmin = NMIN
    if nclasses is None:
        # one prior entry per class, so this needs no separate module constant
        nclasses = len(prior)

    # add 1 to avoid taking logarithm of 0
    counts = row + 1

    # smooth out the probabilities by using the prior; float() keeps both
    # divisions true divisions even where `/` on integers truncates
    total = counts.sum()
    weight = min(total, nmin) / float(nmin)
    probs = (1 - weight) * prior + weight * (counts / float(total))

    # compute the log ratios of class probabilities and the uniform baseline
    return np.log(probs) - np.log(1.0 / nclasses)
Next, we define a function to compute crosstab-encoders for multiple columns.
def generate_crosstab_features(blend_data, data):
    """Compute the crosstab features for every column of a data frame.

    Parameters
    ----------
    blend_data : blending folds handed to CrossTabEncoder.fit
    data : data frame whose columns are encoded one by one

    Returns
    -------
    The per-column encodings concatenated side by side, in column order.
    """
    encoded_columns = []
    for column in data.columns:
        # a fresh encoder per column, fitted on the blending folds
        encoder = CrossTabEncoder().fit(blend_data, column)
        encoded_columns.append(encoder.transform(data[column]))
    return pd.concat(encoded_columns, axis=1)
We generate the cross-tab features for all devices and the events-devices.
# crosstab features for every train/test device
crosstab_all = generate_crosstab_features(blend_all, phone_brand)

# crosstab features restricted to the train/test devices that have events
crosstab_events = generate_crosstab_features(
    blend_events,
    phone_brand.loc[phone_brand.index.isin(train_test_event_indices)])
We also record the column names.
# Every phone_brand column name is repeated NCLASSES times and paired with the
# crosstab column at the same position.
repeated_feature_names = phone_brand.columns.repeat(NCLASSES).values
crosstab_names = ['{0} {1}'.format(feature_name, class_name)
                  for feature_name, class_name in zip(repeated_feature_names, crosstab_all.columns)]
Finally, we collect all features, and their names and persist them to disk.
# label-encoded columns followed by the crosstab features, for all devices
phone_model_features = np.hstack([phone_brand_enc, crosstab_all])

# the same layout restricted to the devices with events
# NOTE(review): the second operand and the closing brackets were cut off in the
# original; crosstab_events is the only matching event-level feature block
# defined in this file -- confirm.
phone_model_features_events = np.hstack([phone_brand_enc[phone_brand_enc.index.isin(train_test_event_indices)],
                                         crosstab_events])

phone_model_names = np.hstack([phone_brand.columns, crosstab_names])

# Persist features and names; `with` closes every file, which also flushes the
# written data to disk.
with open('{0}phone_model_features.p'.format(FEATURE_PATH), 'wb') as handle:
    pickle.dump(phone_model_features, handle)
with open('{0}phone_model_features_names.p'.format(FEATURE_PATH), 'wb') as handle:
    pickle.dump(phone_model_names, handle)
with open('{0}phone_model_features_event.p'.format(FEATURE_PATH), 'wb') as handle:
    pickle.dump(phone_model_features_events, handle)
with open('{0}phone_model_features_names_event.p'.format(FEATURE_PATH), 'wb') as handle:
    pickle.dump(phone_model_names, handle)
