In this notebook, we generate the submission based on the stacked features.

import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
import time 
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
#path to data, features and classifiers
DATA_PATH = "../../../input/"
FEATURE_PATH = "../../../features/"
CLF_PATH = "../../../models/"

#number of classes considered
#NOTE(review): the assignment under this comment was missing; 12 equals the
#number of age_gender categories listed below - confirm name and value
N_CLASSES = 12

#random seed
SEED = 1747

#do not allow probabilities below EPS
EPS = 0.001

#age_gender categories
cats = ['F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+', 'M22-', 'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+']

#float format used in the submission file
#NOTE(review): FLOAT_FORMAT is passed to DataFrame.to_csv further down but its
#assignment was missing here; '%.6f' is a placeholder - confirm the intended precision
FLOAT_FORMAT = '%.6f'

#parameters for xgb fitting for devices without events
#(keyword arguments forwarded to xgb.train: log every 50 rounds, 120 rounds in total)
FIT_PARAMS_GENERAL = {
    'verbose_eval': 50,
    'num_boost_round': 120,
}

#parameters for xgb fitting for devices with events
#(keyword arguments forwarded to xgb.train: log every 100 rounds, 550 rounds in total)
FIT_PARAMS_EVENTS = {
    'verbose_eval': 100,
    'num_boost_round': 550,
}

#params for xgb
#NOTE(review): the name of this dict was missing from the file; XGB_PARAMS is a
#placeholder - confirm the original name and where it is consumed
XGB_PARAMS = {
    'objective': 'multi:softprob',
    "num_class": 12,
    'eval_metric': 'mlogloss',
}

Now, we load the classifiers, data and features.

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes, even if unpickling raises.
    """
    #NOTE: pickle can execute arbitrary code while loading - only use on trusted files
    with open(path, 'rb') as handle:
        return pickle.load(handle)


#xgb parameter mappings for the two classifiers; the objective is forced to
#'multi:softprob' so that predict() yields one probability per class
classifier_general = _load_pickle("{0}xgb_params_no_events.p".format(CLF_PATH))
classifier_events = _load_pickle("{0}xgb_params_events.p".format(CLF_PATH))
classifier_general['objective'] = 'multi:softprob'
classifier_events['objective'] = 'multi:softprob'

#device tables for train and test
train = pd.read_csv('{0}gender_age_train.csv'.format(DATA_PATH))
test = pd.read_csv('{0}gender_age_test.csv'.format(DATA_PATH))

#ids of the train/test devices that have events
train_ids = _load_pickle('{0}train_event_ids'.format(DATA_PATH))
test_ids = _load_pickle('{0}test_event_ids'.format(DATA_PATH))

#feature matrices: phone model features (converted to CSR) and selected event features
features_general = sparse.csr_matrix(_load_pickle('{0}phone_model_features.p'.format(FEATURE_PATH)))
features_event = _load_pickle('{0}feature_sel_lvl0_events.p'.format(FEATURE_PATH))

Fitting of classifiers

Implement a method to fit an XGB model and predict probabilities.

def fit_predict_xgb(X_train, X_test, y, hyper_params, fit_params):
    """Train an XGBoost booster on the training data and predict on the test data.

    Args:
        X_train: feature matrix used for training.
        X_test: feature matrix to generate predictions for.
        y: target labels for the rows of X_train; encoded to integers with
            a LabelEncoder before training.
        hyper_params: booster parameters passed to xgb.train.
        fit_params: extra keyword arguments forwarded to xgb.train.

    Returns:
        The output of the trained booster's predict() on X_test.
    """
    encoded_targets = LabelEncoder().fit_transform(y)
    train_matrix = xgb.DMatrix(X_train, encoded_targets)
    test_matrix = xgb.DMatrix(X_test)

    #the training set doubles as the evaluation set, so the logged metric is the training loss
    watchlist = [(train_matrix, 'train')]
    booster = xgb.train(hyper_params, train_matrix, evals=watchlist, **fit_params)

    predictions = booster.predict(test_matrix)
    return predictions

Fit classifier and generate predictions on all devices.

#fit_predict_xgb takes (X_train, X_test, y, hyper_params, fit_params); the test
#matrices were missing from both calls.
#NOTE(review): assumes each feature matrix stacks the train rows first and the
#test rows after them - confirm against the feature generation code
prediction_general = fit_predict_xgb(features_general[:len(train)],
                                     features_general[len(train):],
                                     train['group'], classifier_general, FIT_PARAMS_GENERAL)

prediction_events = fit_predict_xgb(features_event[:len(train_ids)],
                                    features_event[len(train_ids):],
                                    train[train['device_id'].isin(train_ids)]['group'], classifier_events, FIT_PARAMS_EVENTS)

[0]	train-mlogloss:2.480607
[50]	train-mlogloss:2.373757
[100]	train-mlogloss:2.339313
[119]	train-mlogloss:2.331777
[0]	train-mlogloss:2.472073
[100]	train-mlogloss:1.950904
[200]	train-mlogloss:1.797024
[300]	train-mlogloss:1.707134
[400]	train-mlogloss:1.638839
[500]	train-mlogloss:1.580103
[549]	train-mlogloss:1.552651

Regularization of probabilities and submission

Now, we need a method to regularize the probabilities and generate a dataframe.

def prediction_frame(ids, y, eps=None, columns=None):
    """Build a frame of floored class probabilities indexed by device id.

    Args:
        ids: device ids, one per row of ``y`` (list, array or Series). They are
            paired with the rows of ``y`` by position and converted to strings.
        y: 2-d collection of class probabilities, one row per device.
        eps: lower bound applied to every probability; defaults to the
            module-level ``EPS``.
        columns: column labels of the result; defaults to the module-level
            ``cats``.

    Returns:
        DataFrame with one column per label and an index named 'device_id'.
        Rows are not re-normalised after the floor is applied.
    """
    floor = EPS if eps is None else eps
    labels = cats if columns is None else columns

    probabilities = np.asarray(y, dtype=float)
    if probabilities.size == 0:
        #keep the expected column count so an empty prediction still yields a valid frame
        probabilities = probabilities.reshape(0, len(labels))

    #raise every probability below the floor up to the floor
    df = pd.DataFrame(np.maximum(probabilities, floor), columns=labels)

    #a plain array carries no pandas index, so the ids line up with the rows
    #by position instead of being re-aligned by label
    device_ids = np.asarray(ids).astype(str)
    df.insert(0, 'device_id', device_ids)
    return df.set_index('device_id')

Using this auxiliary function, we now generate the final submission frame.

#frame built from the predictions of the classifier for devices without events
df_general = prediction_frame(ids=test['device_id'], y=prediction_general)

#frame built from the predictions of the classifier for devices with events
df_events = prediction_frame(ids=test_ids, y=prediction_events)

#event-based rows take precedence; everything they do not cover is filled from df_general
df_total = df_events.combine_first(df_general)

df_total.to_csv(
    "../../../input/subm.csv",
    index=True,
    float_format=FLOAT_FORMAT,
)

