In the third round of feature engineering, we construct features from app usage. We build three types of features: first, simple counts of installed and active apps; second, a GMM analysis of crosstab features; third, a bag of installed apps together with embeddings provided by a neural network.
In [1]:
import math
import numpy as np
import pandas as pd
import pickle
import time
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Layer, Dropout, Activation
from keras.optimizers import SGD
from scipy import sparse
from scipy.sparse import coo_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import log_loss
from sklearn.mixture import GMM
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#minimum number of times an app is used so that it will be counted
MIN_COUNT = 10
#paths to data and features
DATA_PATH = "../../../input/"
FEATURE_PATH = "../../../features/"
#number of classes to be predicted
NCLASSES = 12
#seed for randomness
SEED = 1747
np.random.seed(SEED)
#number of clusters for GMM
NCLUST = 2
########################
##BLENDING_CONFIG
########################
#number of folds used in blending
N_FOLDS = 4
########################
##NEURAL NETWORK CONFIGS
########################
#batch size for neural network
BATCH_SIZE = 64
#size of the hidden layer in the neural network
HIDDEN_SIZE = 50
#probability for dropout
DROPOUT_PROB = 0.5
#number of neural networks used for bagging
N_NNETS = 4
Next, we load the apps data and join it with the device data. We also re-encode the device and app ids more concisely.
In [2]:
device_apps = pickle.load(open('{0}device_apps_inner'.format(DATA_PATH),'rb'))
device_apps = device_apps.dropna()
train_indices = pickle.load(open('{0}train_event_ids'.format(DATA_PATH),'rb'))
test_indices = pickle.load(open('{0}test_event_ids'.format(DATA_PATH),'rb'))
train_test = pd.concat([pd.Series(train_indices).to_frame(),pd.Series(test_indices).to_frame()])
apps = pd.read_csv('{0}app_labels.csv'.format(DATA_PATH))
apps['app_id'] = apps['app_id'].astype(float)
did_enc = LabelEncoder().fit(train_test['device_id'])
train_test['device_id'] = did_enc.transform(train_test['device_id']).astype(np.int32)
train_indices = did_enc.transform(train_indices)
test_indices = did_enc.transform(test_indices)
device_apps['device_id'] = did_enc.transform(device_apps['device_id']).astype(np.int32)
app_enc = LabelEncoder().fit(np.hstack([device_apps['app_id'], apps['app_id']]))
device_apps['app_id'] = app_enc.transform(device_apps['app_id']).astype(np.int32)
apps['app_id'] = app_enc.transform(apps['app_id']).astype(np.int32)
start = time.clock()
device_apps_inner = train_test.merge(device_apps, 'left', on = 'device_id')
device_label = device_apps_inner[['device_id', 'app_id']].merge(apps, 'left', on = 'app_id')[['device_id', 'label_id']]
device_label_small = device_label.groupby(['device_id', 'label_id'], sort = False).size().reset_index()[['device_id', 'label_id']]
print(time.clock() - start)
del apps
del device_apps
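As an aside, the re-encoding above relies on LabelEncoder assigning each raw 64-bit id a compact integer code. A minimal sketch with made-up ids, reusing the imports from the top of the notebook:
toy_ids = np.array([-8890648629457979026, 1277779817574759137, -8890648629457979026])
toy_enc = LabelEncoder().fit(toy_ids)
print(toy_enc.transform(toy_ids))          #compact codes, e.g. [0 1 0]
print(toy_enc.inverse_transform([0, 1]))   #recovers the original raw ids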
We filter out rarely used apps.
In [3]:
apps_train = device_apps_inner[device_apps_inner['device_id'].isin(train_indices)]['app_id']
apps_test = device_apps_inner[device_apps_inner['device_id'].isin(test_indices)]['app_id']
train_counts = apps_train.value_counts()
test_counts = apps_test.value_counts()
relevant_train = train_counts[train_counts > MIN_COUNT].index
relevant_test = test_counts[test_counts > MIN_COUNT].index
relevant_apps = np.intersect1d(relevant_train, relevant_test)
device_apps_filtered = device_apps_inner[device_apps_inner['app_id'].isin(relevant_apps)][['device_id', 'app_id']]
As a first feature, we count the numbers of installed and active apps.
In [4]:
app_cnt = device_apps_inner.groupby('device_id', sort =False)[['is_installed','is_active']].sum()
app_cnt_names = ['installed_cnt', 'active_cnt']
del device_apps_inner
Next, we introduce a function to create a tf-idf vectorized bag of features...
In [4]:
def make_bag(data, index, feature_name, col_prefix):
bag_of_features_raw = data.groupby(index, as_index = False, sort = False).aggregate(lambda x:
' '.join([str(word) for word in list(x)]))
bag_of_features = train_test.merge(bag_of_features_raw, 'left', on = 'device_id').drop_duplicates().fillna('')
vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, min_df=3, max_features=None, use_idf=1,
smooth_idf=1, sublinear_tf=1, preprocessor = None, stop_words = None)
vectorized_bof = vectorizer.fit_transform(bag_of_features[feature_name])
names = ['{0}{1}'.format(col_prefix, name) for name in vectorizer.get_feature_names()]
return (names, vectorized_bof)
... and apply it to the installed apps.
In [5]:
start = time.clock()
(boa_names, bag_of_apps) = make_bag(device_apps_filtered, 'device_id', 'app_id', 'boa')
print(time.clock()- start)
(bol_names, bag_of_labels) = make_bag(device_label, 'device_id', 'label_id', 'bol')
print(time.clock()- start)
bags = sparse.csr_matrix(sparse.hstack([bag_of_apps, bag_of_labels]))
bags_names = np.hstack([boa_names, bol_names])
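To make the bag representation concrete, here is a toy sketch (device and app ids are made up) of the intermediate step inside make_bag: per device, the app ids are joined into a single space-separated string, which the TfidfVectorizer then turns into a sparse row.
toy_installed = pd.DataFrame({'device_id': [0, 0, 1], 'app_id': [17, 23, 17]})
toy_bag = toy_installed.groupby('device_id', as_index = False, sort = False).aggregate(
    lambda x: ' '.join([str(word) for word in list(x)]))
#toy_bag now holds one row per device with app_id equal to '17 23' and '17'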
We use Keras to train a neural network with a single hidden layer and keep its class predictions as features for later classifiers. The architecture is taken from https://www.kaggle.com/chechir/talkingdata-mobile-user-demographics/keras-on-labels-and-brands.
As a preliminary step, we fetch the training data, filter for devices with events, and extract the labels.
In [6]:
train = pd.read_csv('{0}gender_age_train.csv'.format(DATA_PATH)).loc[:, ['device_id', 'group']]
train_event = train[train['device_id'].isin(did_enc.classes_)]
train_event['device_id'] = did_enc.transform(train_event['device_id'])
labels = LabelEncoder().fit_transform(train_event['group'])
To prevent overfitting in the later stacking step, we use a blend-type split of the training data: we first set aside a holdout set and then split the remaining part both into stratified folds and into two stacking halves.
In [7]:
def blend_split(X, y, n_folds):
    #first, generate the holdout set
    X_train, X_ho, y_train, y_ho = train_test_split(X, y, stratify = y, train_size = 0.8, random_state = SEED)
    #second, split the remaining set into two stacking halves (used later by the CrossTabEncoder)
    ind_a, ind_b, _, _ = train_test_split(range(len(y_train)), y_train, stratify = y_train, train_size = 0.5,
                                          random_state = SEED)
    return [StratifiedKFold(y_train, n_folds = n_folds, shuffle = True, random_state = SEED),
            X_train, X_ho, y_train, y_ho, X_train[ind_a], X_train[ind_b]]
We split the training indices and the bagged features.
In [8]:
ind_split = blend_split(train_indices,labels, N_FOLDS)
bags_split = blend_split(bags[:labels.shape[0]], labels, N_FOLDS)
Since the bag-of-features is a sparse matrix, we need a generator to feed it into Keras.
In [9]:
def batches(X):
idxs = range(X.shape[0])
return [idxs[i:i + BATCH_SIZE] for i in range(0, len(idxs), BATCH_SIZE)]
def sparse_gen(X, y):
y_enc = OneHotEncoder().fit_transform(y.reshape(-1,1)).toarray()
while 1:
batch_list = batches(X)
for batch in batch_list:
yield (X[batch, :].toarray(), y_enc[batch, :])
def batch_predict(clf, X):
batch_list = batches(X)
return np.vstack([clf.predict(X[batch,:].toarray()) for batch in batch_list])
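As a quick sanity check with made-up data, the generator yields pairs of a densified batch and one-hot encoded labels; since BATCH_SIZE exceeds the five toy rows, a single batch covers everything:
toy_X = sparse.csr_matrix(np.eye(5))
toy_y = np.array([0, 1, 2, 0, 1])
toy_batch_X, toy_batch_y = next(sparse_gen(toy_X, toy_y))
#toy_batch_X is a dense (5, 5) array, toy_batch_y a one-hot (5, 3) array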
As suggested in https://www.kaggle.com/chechir/talkingdata-mobile-user-demographics/keras-on-labels-and-brands, we use a neural network with a single hidden layer plus dropout.
In [10]:
def baseline_model():
model = Sequential()
model.add(Dense(HIDDEN_SIZE, input_dim=bags.shape[1], init = 'normal', activation = 'tanh'))
model.add(Dropout(DROPOUT_PROB))
model.add(Dense(NCLASSES, init = 'normal', activation = 'sigmoid'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
return model
Next, we fit neural networks on the training part of each fold and generate out-of-fold predictions on the remaining part. To prepare for bagging, we fit N_NNETS networks per fold, each with a different random initialization.
In [11]:
def keras_fit_folds(data, nb_epoch):
[skf, X_train, X_val, y_train, y_val] = data
result_nnets = []
for train_index, test_index in skf:
nnets = keras_fit(X_train[train_index], X_train[test_index], y_train[train_index], y_train[test_index],
N_NNETS, nb_epoch)
result_nnets = result_nnets + [nnets]
return result_nnets
def keras_fit(X_train, X_val, y_train, y_val, bag_num, nb_epoch):
result_nnets = []
for _ in range(bag_num):
nnet = baseline_model()
nnet.fit_generator(sparse_gen(X_train, y_train),
samples_per_epoch = X_train.shape[0],
validation_data = sparse_gen(X_val, y_val),
callbacks = [EarlyStopping(monitor='val_loss', patience=0, verbose=1)],
nb_val_samples = X_val.shape[0],
nb_epoch = nb_epoch,
verbose = 2)
result_nnets += [nnet]
return result_nnets
For predictions on the training set, we fit on the folds. For predictions on the holdout and test sets, we fit on the full training part, using the holdout set for validation.
In [12]:
nnets_train = keras_fit_folds(bags_split, nb_epoch = 6)
nnets_test = keras_fit(bags_split[1], bags_split[2], bags_split[3], bags_split[4], N_NNETS, nb_epoch = 5)
Now, we compute the predictions on the training set and the test set.
In [26]:
#############TRAINING PREDICTIONS##################
preds_train = []
train_columns = ['NN{0}'.format(col_idx) for col_idx in range(N_NNETS * NCLASSES)]
for fold, (train_index, test_index) in enumerate(bags_split[0]):
train_data = np.hstack([batch_predict(nnet, bags_split[1][test_index]) for nnet in nnets_train[fold]])
ttrain_indices = pd.Series(ind_split[1][test_index], name = 'device_id')
preds_train = preds_train + [pd.DataFrame(train_data, columns = train_columns, index = ttrain_indices)]
train_predictions = pd.concat(preds_train, axis = 0)
For the holdout and test devices, no out-of-fold scheme is needed, so we simply use the networks fitted on the full training part.
In [33]:
#############TEST PREDICTIONS##################
ho_test_input = sparse.csr_matrix(sparse.vstack([bags_split[2], bags[labels.shape[0]:]]))
ho_test_data = np.hstack([batch_predict(net, ho_test_input) for net in nnets_test])
ho_test_index = pd.Series(np.hstack([ind_split[2], train_test['device_id'][labels.shape[0]:]]), name = 'device_id')
ho_test_predictions = pd.DataFrame(ho_test_data , columns = train_columns, index = ho_test_index)
It remains to merge the predictions.
In [34]:
nnet_features = train_test.merge(pd.concat([train_predictions, ho_test_predictions]),
'left', left_on = 'device_id', right_index = True).set_index('device_id')
nnet_names = train_columns
In [35]:
pickle.dump(nnet_features.values, open('{0}nnet_features2.p'.format(FEATURE_PATH), 'wb'))
pickle.dump(nnet_names, open('{0}nnet_names.p'.format(FEATURE_PATH), 'wb'))
Our goal is to embed the bags of apps and labels into a lower-dimensional space. In addition to the neural network considered above, we use an ad hoc approach in which each app is encoded by its class histogram. For large bags, this would still leave a large number of encoded vectors per device, so we summarize them with a GMM with NCLUST components. As a first step, we use the crosstab encoder. In order to avoid distortion of validation scores, the crosstabs are only computed outside of the validation set.
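Before defining the encoder, here is a toy illustration (all ids and groups are made up) of the underlying idea: each app id is characterized by the class histogram of the devices on which it appears.
toy_apps = pd.DataFrame({'group': ['M23-26', 'M23-26', 'F27-28', 'F27-28'],
                         'app_id': [17, 23, 17, 17]})
toy_ct = pd.crosstab(toy_apps['app_id'], toy_apps['group'])
#app 17 occurs once with group M23-26 and twice with F27-28; app 23 occurs only once with M23-26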
In [84]:
class CrossTabEncoder(BaseEstimator, TransformerMixin):
"""CrossTabEncoder
A CrossTabEncoder characterizes a feature by its crosstab dataframe.
"""
def fit(self, data, ids_list):
"""For each class of the considered feature, the empirical histogram for the prediction classes is computed.
Parameters
----------
data : feature column used for the histogram computation
ids_list : list of ids used to split the training ids
"""
self.ids_pair = ids_list
merged_data = [train_event[train_event['device_id'].isin(ids)].merge(data,
'inner', 'device_id').drop('device_id', axis = 1) for ids in ids_list]
data_total = pd.concat(merged_data, axis = 0)
self.crosstabs = [pd.crosstab(mdata.iloc[:, 1], mdata.iloc[:, 0]).fillna(0).apply(compute_log_probs,axis=1)
for mdata in merged_data]
self.crosstab_total = pd.crosstab(data_total.iloc[:, 1], data_total.iloc[:, 0]).fillna(0).apply(compute_log_probs, axis = 1)
return self
def transform(self, data):
"""The precomputed histograms are joined as features to the given data set.
Parameters
----------
data : data that will be augmented by the crosstab feature
Returns
-------
Transformed dataset.
"""
feat_name = data.columns[1]
#indices that are in neither of the trained stacking halves
        residual = pd.Index(train_test['device_id']).difference(self.ids_pair[0]).difference(self.ids_pair[1]).values
#merging the crosstab features with the device data
device_ct = pd.concat([merge_crosstab(data, feat_name, crosstab, ids) for crosstab, ids in
[[self.crosstabs[1], self.ids_pair[0]],[self.crosstabs[0], self.ids_pair[1]]]], axis = 0)
device_ct_total = data[data['device_id'].isin(residual)].merge(self.crosstab_total, 'left',
left_on = feat_name, right_index = True).fillna(0).drop(feat_name, axis = 1)
combined_device = pd.concat([device_ct, device_ct_total], axis = 0)
combined_device['device_id'] = combined_device['device_id'].astype(int)
return combined_device
def compute_log_probs(row):
"""
helper function for computing regularized log probabilities
"""
row = row + np.ones(len(row))
row_sum = row.sum()
return (row/row_sum).apply(lambda y: math.log(y) - math.log(1.0/NCLASSES))
def merge_crosstab(data, feat_name, crosstab, ids):
"""
helper function to join crosstab features with data
"""
ct_data = data.merge(crosstab, 'left', left_on = feat_name, right_index = True).drop(feat_name, axis = 1)
ct_data = ct_data[ct_data['device_id'].isin(ids)]
return ct_data
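As a small worked example with made-up counts, compute_log_probs applies add-one smoothing, normalizes the row, and reports the log-ratio against the uniform probability 1/NCLASSES:
toy_row = pd.Series([11, 0])
print(compute_log_probs(toy_row))
#smoothing gives [12, 1], hence probabilities [12/13, 1/13],
#so the entries are log(12/13) + log(12) ~ 2.40 and log(1/13) + log(12) ~ -0.08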
We define a function to compute the embeddings and join them to the device data.
In [119]:
def embed(data, feat_name):
    #use the two stacking halves of the training ids returned by blend_split
    ct = CrossTabEncoder().fit(data, ind_split[5:7])
device_ct_combine = ct.transform(data).dropna()
group_sizes = device_ct_combine.iloc[:,0:2].groupby('device_id').transform(len)
return [device_ct_combine.iloc[(group_sizes.values >= NCLUST).ravel(), :].set_index('device_id'),ct]
We merge the filtered set with the apps data and compute the crosstab features.
In [120]:
start = time.clock()
[device_apps_ct, ct_apps] = embed(device_apps_filtered, 'app_id')
print(time.clock()- start)
[device_labels_ct, ct_labels] = embed(device_label_small, 'label_id')
print(time.clock()- start)
As features, we consider two geometric summaries of the bag of embedded vectors: first, the mean of the embedded vectors, and second, the parameters of a fitted GMM. To this end, we define a function that computes the means and GMM parameters of the app and label embeddings for each device id.
In [104]:
def fit_gmm(data):
#fitting of gmm
    gmm = GMM(NCLUST, random_state = SEED)
gmm.fit(data)
#column names
mean_cols = ['Mean{0}'.format(i) for i in data.columns]
gmm_mean_cols = ['GMM-Mean{0}'.format(i) for i in range(len(gmm.means_.ravel()))]
gmm_weight_cols = ['GMM-Weight{0}'.format(i) for i in range(len(gmm.weights_))]
gmm_covar_cols = ['GMM-Covars{0}'.format(i) for i in range(len(gmm.covars_.ravel()))]
gmm_features = np.hstack([data.mean(), gmm.means_.ravel(), gmm.weights_, gmm.covars_.ravel()])
gmm_index = np.hstack([mean_cols, gmm_mean_cols, gmm_weight_cols, gmm_covar_cols])
    return pd.Series(gmm_features, index = gmm_index)
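To see what such a GMM summary contains, here is a minimal sketch on made-up two-dimensional embeddings, using the same (legacy) sklearn GMM API as imported above:
toy_emb = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 5])
toy_gmm = GMM(NCLUST, random_state = SEED).fit(toy_emb)
#the feature vector concatenates the overall mean with the fitted means, weights and covariances
print(toy_gmm.means_.ravel(), toy_gmm.weights_, toy_gmm.covars_.ravel())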
Now, we apply the function to the data and join the result back to the original set of train and test ids.
In [133]:
start = time.clock()
gmm_apps = device_apps_ct.groupby(level = 0).apply(fit_gmm)
print(time.clock()- start)
gmm_labels = device_labels_ct.groupby(level = 0).apply(fit_gmm)
print(time.clock()- start)
gmm_apps_feature = train_test.merge(gmm_apps, 'left', left_on = 'device_id', right_index = True).set_index('device_id')
gmm_labels_feature = train_test.merge(gmm_labels, 'left', left_on = 'device_id', right_index = True).set_index('device_id')
We stack the GMM features for apps and labels together.
In [134]:
gmm = np.hstack([gmm_apps_feature, gmm_labels_feature])
gmm_names = np.hstack([['App-{0}'.format(col) for col in gmm_apps_feature], ['Label-{0}'.format(col) for col in gmm_labels_feature]])
Finally, everything is pickled.
In [138]:
app_features = sparse.csr_matrix(sparse.hstack([app_cnt, gmm, bags]))
app_features_names = np.hstack([app_cnt_names, gmm_names, bags_names])
pickle.dump(app_features, open('{0}app_features.p'.format(FEATURE_PATH), 'wb'))
pickle.dump(app_features_names, open('{0}app_features_names.p'.format(FEATURE_PATH), 'wb'))