In this notebook we generate the submission file based on the stacked features.
In [1]:
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
import time
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
# --- locations of the raw data, the engineered features and the stored classifiers ---
DATA_PATH = "../../../input/"
FEATURE_PATH = "../../../features/"
CLF_PATH = "../../../models/"

# number of target classes
NCLASSES = 12
# random seed
SEED = 1747
# lower bound for predicted probabilities (nothing below EPS is emitted)
EPS = 0.001
# age/gender categories, one per class
cats = [
    'F23-', 'F24-26', 'F27-28', 'F29-32', 'F33-42', 'F43+',
    'M22-', 'M23-26', 'M27-28', 'M29-31', 'M32-38', 'M39+',
]
# printf-style float format used when writing the submission file
FLOAT_FORMAT = '%.3f'

# --- XGBoost settings ---
# fitting options for devices without events
FIT_PARAMS_GENERAL = dict(verbose_eval=50, num_boost_round=120)
# fitting options for devices with events
FIT_PARAMS_EVENTS = dict(verbose_eval=100, num_boost_round=550)
# booster hyper-parameters
HYPER_PARAMS = dict(
    objective='multi:softprob',
    num_class=NCLASSES,
    eval_metric='mlogloss',
)
Now, we load the classifiers, data and features.
In [2]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Stored XGBoost parameter dictionaries; the objective is overridden so that
# the boosters output per-class probabilities.
classifier_general = _load_pickle("{0}xgb_params_no_events.p".format(CLF_PATH))
classifier_events = _load_pickle("{0}xgb_params_events.p".format(CLF_PATH))
classifier_general['objective'] = 'multi:softprob'
classifier_events['objective'] = 'multi:softprob'

# Device tables for the train and test split.
train = pd.read_csv('{0}gender_age_train.csv'.format(DATA_PATH))
test = pd.read_csv('{0}gender_age_test.csv'.format(DATA_PATH))

# Ids of the train/test devices that have events.
train_ids = _load_pickle('{0}train_event_ids'.format(DATA_PATH))
test_ids = _load_pickle('{0}test_event_ids'.format(DATA_PATH))

# Feature matrices. Presumably train rows are stacked on top of test rows
# (the fitting cell slices them that way) -- TODO confirm against the feature notebooks.
features_general = sparse.csr_matrix(_load_pickle('{0}phone_model_features.p'.format(FEATURE_PATH)))
features_event = _load_pickle('{0}feature_sel_lvl0_events.p'.format(FEATURE_PATH))
Implement a method to fit an XGB model and predict probabilities.
In [3]:
def fit_predict_xgb(X_train, X_test, y, hyper_params, fit_params):
    """Train an XGBoost booster on the training rows and score the test rows.

    Parameters
    ----------
    X_train : feature matrix used for fitting (anything ``xgb.DMatrix`` accepts).
    X_test : feature matrix to predict on.
    y : class labels for the rows of ``X_train``; integer-encoded with a
        ``LabelEncoder`` before training.
    hyper_params : dict of booster parameters, passed as the first argument
        of ``xgb.train``.
    fit_params : dict of extra keyword arguments forwarded to ``xgb.train``.

    Returns
    -------
    The output of ``Booster.predict`` on ``X_test`` (presumably one row of
    class probabilities per test row when the objective is
    ``multi:softprob`` -- verify against the parameters in use).
    """
    encoded_targets = LabelEncoder().fit_transform(y)
    train_matrix = xgb.DMatrix(X_train, encoded_targets)
    test_matrix = xgb.DMatrix(X_test)
    # Only the training set is monitored; there is no hold-out set here.
    watchlist = [(train_matrix, 'train')]
    booster = xgb.train(hyper_params, train_matrix, evals=watchlist, **fit_params)
    return booster.predict(test_matrix)
Fit both classifiers and generate predictions: the general model for all devices, the event model for the devices with events.
In [4]:
# General model: fitted on all train devices, predicts for all test devices.
# The first len(train) rows of features_general are treated as the train rows.
prediction_general = fit_predict_xgb(
    X_train=features_general[:len(train)],
    X_test=features_general[len(train):],
    y=train['group'],
    hyper_params=classifier_general,
    fit_params=FIT_PARAMS_GENERAL,
)

# Event model: restricted to devices that have events.
# NOTE(review): the labels come out in the row order of `train`; this assumes
# the first len(train_ids) rows of features_event follow that same order -- confirm.
prediction_events = fit_predict_xgb(
    X_train=features_event[:len(train_ids)],
    X_test=features_event[len(train_ids):],
    y=train.loc[train['device_id'].isin(train_ids), 'group'],
    hyper_params=classifier_events,
    fit_params=FIT_PARAMS_EVENTS,
)
Now, we need a function that clips the probabilities from below at EPS and wraps them in a dataframe indexed by device id.
In [5]:
def prediction_frame(ids, y):
    """Build a submission-style frame from predicted class probabilities.

    Parameters
    ----------
    ids : sequence of device ids (list, array or Series), one per row of ``y``.
        Ids are matched to rows by position; any pandas index on ``ids`` is
        ignored so a filtered Series cannot misalign the rows.
    y : 2-D array-like of probabilities with one column per entry of ``cats``.

    Returns
    -------
    pandas.DataFrame with columns ``cats``, indexed by the string form of the
    device ids (index name ``'device_id'``), where every value is at least
    ``EPS``.
    """
    # Clip from below in float64, matching what building the frame from
    # Python floats would produce.
    clipped = np.maximum(np.asarray(y, dtype=float), EPS)
    # np.asarray drops any Series index, so ids are attached strictly by position.
    device_index = pd.Index(np.asarray(ids).astype(str), name='device_id')
    return pd.DataFrame(clipped, columns=cats, index=device_index)
Using this auxiliary function, we now generate the final submission frame.
In [7]:
# Frame with the general model's predictions for every test device.
df_general = prediction_frame(ids=test['device_id'], y=prediction_general)
# Frame with the event model's predictions for the test devices that have events.
df_events = prediction_frame(ids=test_ids, y=prediction_events)
# Prefer the event-based prediction where one exists; fall back to the
# general prediction for every other device.
df_total = df_events.combine_first(df_general)
In [8]:
# Write the submission file; the target path is derived from DATA_PATH like
# every other path in this notebook instead of repeating the literal directory.
df_total.to_csv("{0}subm.csv".format(DATA_PATH), index=True, float_format=FLOAT_FORMAT)
In [ ]: