In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import random
import gc
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, recall_score, precision_score, accuracy_score
from sklearn.metrics import average_precision_score

In [ ]:
# Reading in the data
probs = pd.read_parquet('./probs.parquet', engine='pyarrow')
normals = pd.read_parquet('./normals.parquet', engine='pyarrow')

Preparing the data for variable engineering


In [ ]:
pident = set(probs.ident.unique())
nident = set(normals.ident.unique())

probs_coding = {}
norms_coding = {}
i = 0
for ident in list(pident): 
    probs_coding[ident] = i
    i += 1

for ident in list(nident):
    norms_coding[ident] = i
    i += 1
    
probs['id'] = probs.ident.apply(lambda x: probs_coding[x])
normals['id'] = normals.ident.apply(lambda x: norms_coding[x])

In [ ]:
df = pd.concat([probs, normals])

In [ ]:
del probs, normals
gc.collect()

In [ ]:
df.event_id = df.event_id.astype('str')

In [ ]:
# save the raw unix time as an index to keep the order of events,
# then convert 'start' to datetime64 seconds ('M8[s]') for the time-based features
df['index'] = df.start.copy()
df.start = df.start.astype('M8[s]')

Creating the variables


In [ ]:
# Generating timedeltas between consecutive events of the same id
def group_timedelta(group):
    group.sort_values('index', inplace=True)
    group['timedelta'] = group.start - group.start.shift()
    group.timedelta.fillna(0, inplace=True)
    return group

def make_timedelta(df):
    df = df.groupby('id', as_index=False).apply(group_timedelta)
    df = df.reset_index().iloc[:,2:]  # drop the two index columns created by the groupby-apply
    df.timedelta = df.timedelta.dt.seconds
    return df

df = make_timedelta(df)

In [ ]:
# binning the timedeltas (in seconds) into duration categories from under one second up to a day
df.timedelta = pd.cut(df.timedelta, [0,1,5,60,300,1800,3600,3600*5,3600*24], right=False)
df.timedelta = df.timedelta.astype(str)

In [ ]:
df['weekday'] = df.start.dt.weekday_name
df['hour'] = df.start.dt.hour.astype(str)

In [ ]:
# Adding the device info
phone_models = pd.read_msgpack('./phone_models.msgpack')

device_info = phone_models.loc[:,phone_models.columns.drop(['phone_id', 'allocation_date', 'storage_code', 'handsetmodel_id', 'phone_cnt', 'g3', 'g4', 'android'])]
device_info.rename(columns={'seadme_tyyp': 'network_type'}, inplace=True)

df = df.merge(device_info, on='TAC')
df.drop(labels=['TAC'], axis=1, inplace=True)
df.nfc = df.nfc.astype(str).apply(lambda x: None if x == 'nan' else x)

Transforming from categorical to binary variables


In [ ]:
cats = ['event_result', 'cause_code', 'sub_cause_code', 'mecontext', 'event_id', 'network_type', 'phone_type', 'Manufacturer', 'Model', 'os', 'category', 'nfc', 'sim_type', 'screen_size', 'timedelta', 'weekday', 'hour']
non_cats = ['index', 'id', 'ident', 'start', 'probs']
def encode_columns(df):
    return pd.concat([df[non_cats], pd.get_dummies(df[cats], cats, dummy_na = True)], axis=1)

df = encode_columns(df)

Generating train and test datasets


In [ ]:
# Generating random sets of ids for the training and testing set - these need to be kept separate in both the
# non-aggregated and aggregated datasets
probs_indexes = random.sample(range(500), 100)
normals_indexes = random.sample(range(500,5500), 1000)
test_indexes = probs_indexes + normals_indexes

In [ ]:
def partition(id, test = test_indexes):
    if id in test:
        return 1
    else:
        return 0

In [ ]:
df['test'] = df['id'].apply(partition)

In [ ]:
df_train = df.loc[df['test'] == 0]
df_test = df.loc[df['test'] == 1]
del df_train['test']
del df_test['test']

Aggregating the single events


In [ ]:
# The following steps should also be done for the competition dataset, but removing the 'probs' column

In [ ]:
# grouping the variables by information source
non_cats = ['index', 'id', 'ident', 'start', 'probs'] # - remove 'probs' from the list for the competition dataset
identical = ['id', 'ident', 'probs']  # - remove 'probs' from the list for the competition dataset
event_result = [x for x in df.columns.tolist() if x.startswith('event_result')]
cause_code = [x for x in df.columns.tolist() if x.startswith('cause_code')]
sub_cause_code = [x for x in df.columns.tolist() if x.startswith('sub_cause_code')]
event_id = [x for x in df.columns.tolist() if x.startswith('event_id')]
network_type = [x for x in df.columns.tolist() if x.startswith('network_type')]
phone_type = [x for x in df.columns.tolist() if x.startswith('phone_type')]
Manufacturer = [x for x in df.columns.tolist() if x.startswith('Manufacturer')]
Model = [x for x in df.columns.tolist() if x.startswith('Model')]
os = [x for x in df.columns.tolist() if x.startswith('os')]
category = [x for x in df.columns.tolist() if x.startswith('category')]
nfc = [x for x in df.columns.tolist() if x.startswith('nfc')]
sim_type = [x for x in df.columns.tolist() if x.startswith('sim_type')]
screen_size = [x for x in df.columns.tolist() if x.startswith('screen_size')]
timedelta = [x for x in df.columns.tolist() if x.startswith('timedelta')]
weekday = [x for x in df.columns.tolist() if x.startswith('weekday')]
hour = [x for x in df.columns.tolist() if x.startswith('hour')]
history = [x for x in df.columns.tolist() if x.startswith('history')]

In [ ]:
def join_lists(list_of_lists):
    list3 = []
    if type(list_of_lists) is list:
        for element in list_of_lists:
            list3.extend(element)
    else:
        list3.extend(list_of_lists)
    return list(set(list3))

In [ ]:
# determining which variables will be summed and which ones will be averaged
identical_info = join_lists([identical, weekday])
event_info = join_lists([event_result, cause_code, sub_cause_code, event_id])
device_info = join_lists([network_type, phone_type, Manufacturer, Model, os, category, nfc, sim_type, screen_size])
time_info = join_lists([timedelta, weekday, hour])

summable = join_lists([event_info, timedelta, hour])
meanable = join_lists([event_info, device_info, timedelta, hour])

summable.append('id')
meanable.append('id')

In [ ]:
# aggregating the data per id: means of the per-id constant columns, plus sums and
# means of the event/time dummies with 'sum_' / 'mean_' prefixes
grouped_identical = df[identical_info].groupby('id').mean()

grouped_sum = df[summable].groupby('id').sum()
grouped_sum.columns = ['sum_' + x for x in grouped_sum.columns]

grouped_mean = df[meanable].groupby('id').mean()
grouped_mean.columns = ['mean_' + x for x in grouped_mean.columns]

df_daily = pd.concat([grouped_identical, grouped_sum, grouped_mean], axis = 1)

In [ ]:
# Saving the data for later use:
# df_daily.to_csv('df_competition_historical_data.txt', sep = '\t')

Splitting the aggregated data into training and test sets


In [ ]:
df_test_agg = df_daily.loc[test_indexes,:]
df_train_agg = df_daily.loc[~df_daily.index.isin(df_test_agg.index)]

Predicting outcomes for single events so that the aggregated predictions can be used as variables
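
The subsample used for the event-level feature selection below is not defined elsewhere in the notebook. A minimal sketch of one way to build it, assuming a simple random 10% sample of the training events (the names X_train_subsample and y_train_subsample, the sampling fraction, and the random seed are assumptions, not the original code):


In [ ]:
# Hypothetical cell (not in the original notebook): draw a random subsample of the
# training events so the ANOVA F-test below runs on a manageable amount of data.
subsample = df_train.sample(frac=0.1, random_state=42)
y_train_subsample = subsample['probs']
X_train_subsample = subsample.drop(columns=['index', 'id', 'ident', 'start', 'probs'])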


In [ ]:
# Feature selection with ANOVA f-test
from sklearn.feature_selection import chi2, f_classif
f_values, f_p_values = f_classif(np.array(X_train_subsample),np.array(y_train_subsample))
selected_features = X_train_subsample.columns[f_values > 5000]

In [ ]:
y_train = df_train['probs']
y_test = df_test['probs']
id_train = df_train['id']
id_test = df_test['id']
X_train = df_train[selected_features]
X_test = df_test[selected_features]

In [ ]:
# a simple random forest classifier for this large dataset:
model = RandomForestClassifier(n_estimators = 100, max_depth = 3, n_jobs = 16, verbose = 10)
model.fit(X_train, y_train)
check = model.predict_proba(X_test)
train = model.predict_proba(X_train)

In [ ]:
# aggregating the predictions
result_train = pd.DataFrame(data=[])
result_train['id']= id_train
result_train['pred'] = train[:,1]

train_mean = result_train.groupby('id').mean()
train_mean.columns = ['pred_mean']

train_max = result_train.groupby('id').max()
train_max.columns = ['pred_max']

# ...and for the testing set
result_test = pd.DataFrame(data=[])
result_test['id']= id_test
result_test['pred'] = check[:,1]

test_mean = result_test.groupby('id').mean()
test_mean.columns = ['pred_mean']

test_max = result_test.groupby('id').max()
test_max.columns = ['pred_max']

Building the final models


In [ ]:
# adding the single event predictions to the aggregated training data:
data = df_train_agg.merge(train_mean, on='id')
data = data.merge(train_max, on='id')
data.columns = [x.replace('[', '').replace(']', '') for x in data.columns]

# ... and the test data:
test = df_test_agg.merge(test_mean, on='id')
test = test.merge(test_max, on='id')
test.columns = [x.replace('[', '').replace(']', '') for x in test.columns]
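
The ANOVA F-test in the next cell is presumably applied to the aggregated training data, but the notebook does not show X_train and y_train being rebuilt from it at this point. A minimal sketch under that assumption, taking the features and the 'probs' label from the merged data frame (an assumption, not the original code):


In [ ]:
# Hypothetical cell (not in the original notebook): rebuild the training matrix and
# label vector from the aggregated, merged training frame before the F-test below.
y_train = data['probs']
X_train = data.drop(columns=['probs'])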

In [ ]:
# Selecting features with the ANOVA F-test (the F-values are used to rank and filter the variables)
from sklearn.feature_selection import chi2, f_classif

f_values, f_p_values = f_classif(np.array(X_train),np.array(y_train))
selected_features = X_train.columns[f_values > 65]
selected_features

In [ ]:
# The variables that will be used for the modeling:
cols = ['sum_sub_cause_code_NO_VALUE',
 'sum_sub_cause_code_SIGNALING_INTERFERENCE',
 'sum_sub_cause_code_SUBSCRIPTION_CANCELLED',
 'sum_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
 'sum_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
 'sum_event_id_13',
 'sum_event_id_2',
 'sum_event_result_IGNORE',
 'sum_event_id_15',
 'sum_sub_cause_code_GGSN_RESPONDED_WITH_REJECT_CAUSE_CONTEXT_NON_EXISTENT',
 'sum_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
 'sum_cause_code_REACTIVATION_REQUIRED',
 'sum_sub_cause_code_TIMEOUT_PAGING',
 'sum_event_id_8',
 'sum_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
 'mean_sub_cause_code_SIGNALING_INTERFERENCE',
 'mean_event_id_4',
 'mean_event_id_6',
 'mean_event_id_2',
 'mean_timedelta_0, 1)',
 'mean_event_id_5',
 'mean_event_id_8',
 'mean_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
 'mean_event_id_13',
 'mean_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
 'mean_event_id_12',
 'mean_cause_code_REACTIVATION_REQUIRED',
 'mean_event_result_ABORT',
 'mean_sub_cause_code_NO_VALUE',
 'mean_timedelta_60, 300)',
 'mean_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
 'mean_sub_cause_code_SUCCESS',
 'mean_cause_code_NOCAUSECODE',
 'mean_sub_cause_code_DETACH_TRIGGERED_PDN_DISCONNECTION',
 'mean_event_result_IGNORE',
 'mean_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
 'pred_mean',
 'pred_max',
 'sum_hour_22',
 'sum_hour_14',
 'mean_hour_13',
 'weekday_Friday',
 'mean_hour_21',
 'mean_hour_16',
 'weekday_Saturday',
 'mean_hour_23']

In [ ]:
y_train = df_train_agg['probs']
y_test = df_test_agg['probs']

X_train = df_train_agg[cols]
X_test = df_test_agg[cols]

In [ ]:
# Oversampling the minority 'probs' = True class for training
sm = SMOTE(ratio = {True:(4*y_train.sum())}, k_neighbors=5, random_state=np.random.randint(567))
X_train, y_train = sm.fit_sample(X_train, y_train)
X_train = pd.DataFrame(data=X_train, columns = cols)  # fit_sample returns arrays, so restore the column names

XGBoost classifier:


In [ ]:
# tuning the hyperparameters
parameter_ranges = {'max_depth': [3,5,10],
                   'learning_rate': [0.01, 0.05, 0.1],
                   'n_estimators': [50, 100, 500]}


model = xgb.XGBClassifier()  # no need to pre-fit here; GridSearchCV fits the estimator itself
model_tuned = GridSearchCV(model, parameter_ranges, scoring = 'roc_auc', verbose = 10, n_jobs = 16)
model_tuned.fit(X_train, y_train)
model_xgb = model_tuned.best_estimator_ 
print(model_tuned.best_score_)
print(model_tuned.best_params_)

Random Forest classifier:


In [ ]:
# tuning the hyperparameters
parameter_ranges = {'max_depth': [10,15,20],
                   'n_estimators': [100, 500, 1000]}


model = RandomForestClassifier()  # no need to pre-fit here; GridSearchCV fits the estimator itself
model_tuned = GridSearchCV(model, parameter_ranges, scoring = 'roc_auc', verbose = 10, n_jobs = 16)
model_tuned.fit(X_train, y_train)
model_rf = model_tuned.best_estimator_ 
print(model_tuned.best_score_)
print(model_tuned.best_params_)

In [ ]:
# Making the predictions for the test set:
predictions_xgb = model_xgb.predict_proba(X_test)
predictions_rf = model_rf.predict_proba(X_test)

In [ ]:
# Taking the max probability for class 'probs' = True
predictions_final = [np.max([x,y]) for x,y in zip(predictions_xgb[:,1], predictions_rf[:,1])]
predictions_final_binary = [1 if x >= 0.52 else 0 for x in predictions_final]

Looking at the accuracy metrics:


In [ ]:
#area under the precision-recall curve
score = average_precision_score(y_test, predictions_final)
print('area under the precision-recall curve: {:.6f}'.format(score))


check2 = predictions_final_binary
score = precision_score(y_test, check2)
print('precision score: {:.6f}'.format(score))

score = recall_score(y_test, check2)
print('recall score: {:.6f}'.format(score))

score = accuracy_score(y_test, check2)
print('accuracy score: {:.6f}'.format(score))

fpr, tpr, _ = roc_curve(y_test, predictions_final)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.02, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()

Training all the classifiers on the full dataset:


In [ ]:
# Training the single event classifier:
X_full = df[selected_features]
y_full = df['probs']
model = model.fit(X_full, y_full)

In [ ]:
# Training the aggregated data classifiers:
X_full = df_daily[cols]
y_full = df_daily['probs']
model_xgb = model_xgb.fit(X_full, y_full)
model_rf = model_rf.fit(X_full, y_full)

In [ ]:
# Saving the models for later use:
joblib.dump(model, './final_models/nonagg_full_model.pkl')
joblib.dump(model_xgb, './final_models/model_xgb.pkl')
joblib.dump(model_rf, './final_models/model_rf.pkl')

Making the predictions for the competition dataset


In [61]:
competition = pd.read_parquet('./competition_data.parquet', engine='pyarrow')

In [26]:
# This file is created by running the competition data through the same code used to
# aggregate the training data (all steps up to and including "Aggregating the single events",
# with the 'probs' column removed from the column lists)
competition_agg = pd.read_csv('./df_competition_historical_data.txt', sep = '\t')
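
For reference, a rough sketch of how that file could have been produced; the variable name competition_daily and the outline are assumptions based on the training-data cells above, not the original code:


In [ ]:
# Hypothetical cell (not in the original notebook): the competition events would be run
# through the same preparation cells (id coding, timedeltas, device info, dummy encoding)
# and the same per-id aggregation as the training data, with 'probs' removed from the
# column lists, and the resulting frame saved for re-use:
# competition_daily.to_csv('./df_competition_historical_data.txt', sep='\t')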

In [27]:
# Loading in the saved models:
model = joblib.load('./final_models/nonagg_full_model.pkl')
model_xgb = joblib.load('./final_models/model_xgb.pkl')
model_rf = joblib.load('./final_models/model_rf.pkl')

Data preparation - the same steps as at the beginning of the notebook


In [28]:
nident = set(competition.ident.unique())

norms_coding = {}
i = 0

for ident in list(nident):
    norms_coding[ident] = i
    i += 1
    
competition['id'] = competition.ident.apply(lambda x: norms_coding[x])

In [29]:
competition.event_id = competition.event_id.astype('str')

In [30]:
competition['index'] = competition.start.copy()
competition.start = competition.start.astype('M8[s]')

In [31]:
def group_timedelta(group):
    group.sort_values('index', inplace=True)
    group['timedelta'] = group.start - group.start.shift()
    group.timedelta.fillna(0, inplace=True)
    return group

def make_timedelta(df):
    df = df.groupby('id', as_index=False).apply(group_timedelta)
    df = df.reset_index().iloc[:,2:]
    df.timedelta = df.timedelta.dt.seconds
    return df

competition = make_timedelta(competition)

In [32]:
competition.timedelta = pd.cut(competition.timedelta, [0,1,5,60,300,1800,3600,3600*5,3600*24], right=False)
competition.timedelta = competition.timedelta.astype(str)
competition['weekday'] = competition.start.dt.weekday_name
competition['hour'] = competition.start.dt.hour.astype(str)

In [33]:
phone_models = pd.read_msgpack('./phone_models.msgpack')

device_info = phone_models.loc[:,phone_models.columns.drop(['phone_id', 'allocation_date', 'storage_code', 'handsetmodel_id', 'phone_cnt', 'g3', 'g4', 'android'])]
device_info.rename(columns={'seadme_tyyp': 'network_type'}, inplace=True)

competition = competition.merge(device_info, on='TAC')
competition.drop(labels=['TAC'], axis=1, inplace=True)
competition.nfc = competition.nfc.astype(str).apply(lambda x: None if x == 'nan' else x)

In [34]:
cats = ['event_result', 'cause_code', 'sub_cause_code', 'mecontext', 'event_id', 'network_type', 'phone_type', 'Manufacturer', 'Model', 'os', 'category', 'nfc', 'sim_type', 'screen_size', 'timedelta', 'weekday', 'hour']
non_cats = ['index', 'id', 'ident', 'start']
def encode_columns(df):
    return pd.concat([df[non_cats], pd.get_dummies(df[cats], cats, dummy_na = True)], axis=1)

competition = encode_columns(competition)

In [37]:
# this phone model does not appear in the competition dataset, so its dummy column is added as all zeros
competition['Model_GQ3030'] = len(competition.index) * [0]

Making the predictions:


In [41]:
data = competition[['id', 'cause_code_EPS_SERVICES_NOT_ALLOWED_IN_THIS_PLMN',
       'cause_code_NOCAUSECODE', 'cause_code_REACTIVATION_REQUIRED',
       'cause_code_SUCCESSFUL_HANDOVER',
       'cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
       'sub_cause_code_ALL_PACKET_ORIENTED_SERVICES_BARRED',
       'sub_cause_code_NO_VALUE',
       'sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
       'sub_cause_code_SUCCESS', 'event_id_0', 'event_id_1', 'event_id_13',
       'event_id_15', 'event_id_2', 'event_id_4', 'event_id_5', 'event_id_6',
       'event_id_7', 'event_id_8', 'network_type_3G', 'network_type_4G',
       'phone_type_Nutitelefon', 'phone_type_[No Data]', 'Manufacturer_Apple',
       'Manufacturer_Samsung', 'Manufacturer_Teised', 'Manufacturer_ZTE',
       'Model_Alcatel Onetouch Pop 8S', 'Model_Apple iPhone 7 (A1778)',
       'Model_Apple iPhone X 64GB', 'Model_BV7000', 'Model_CAT S41',
       'Model_CAT S60', 'Model_Coolpad Modena', 'Model_E303', 'Model_GQ3030',
       'Model_HUAWEI Y6 Pro', 'Model_Huawei P9 Lite', 'Model_LG G4 Stylus',
       'Model_Lenovo A6000', 'Model_Lenovo Z90-7', 'Model_Maya-L41',
       'Model_MediaPad S7-303u', 'Model_Nokia LUMIA 635',
       'Model_Original One, Original Pure, Original Shock, Kindo',
       'Model_PM-1023-BV', 'Model_SM-G800F Galaxy S5 mini',
       'Model_Samsung Galaxy Alpha 32GB', 'Model_Sony Xperia XA',
       'Model_ZTE Blade V6', 'Model_iPhone 6 (A1586)', 'Model_m3 note',
       'os_Android', 'os_Windows', 'os_iOS', 'nfc_0.0', 'nfc_1.0',
       'sim_type_mini', 'sim_type_nan', 'screen_size_4.7', 'screen_size_5.0',
       'screen_size_5.0000', 'screen_size_5.2000', 'screen_size_7.0',
       'screen_size_nan']]

In [45]:
predictions = model.predict_proba(data.drop(['id'], axis = 1))


[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    5.8s
[Parallel(n_jobs=16)]: Done  29 tasks      | elapsed:    6.8s
[Parallel(n_jobs=16)]: Done  40 tasks      | elapsed:    9.6s
[Parallel(n_jobs=16)]: Done  53 tasks      | elapsed:   12.6s
[Parallel(n_jobs=16)]: Done  66 tasks      | elapsed:   15.9s
[Parallel(n_jobs=16)]: Done  80 out of 100 | elapsed:   18.2s remaining:    4.5s
[Parallel(n_jobs=16)]: Done  91 out of 100 | elapsed:   20.0s remaining:    1.9s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:   21.9s finished

In [46]:
# aggregating the single event predictions
result = pd.DataFrame(data=[])
result['id']= data['id']
result['pred'] = predictions[:,1]

result_mean = result.groupby('id').mean()
result_mean.columns = ['pred_mean']

result_max = result.groupby('id').max()
result_max.columns = ['pred_max']

In [55]:
# creating the id column so I can easily merge
result_mean['id'] = result_mean.index
result_max['id'] = result_max.index

In [59]:
data = competition_agg.merge(result_mean, on='id')
data = data.merge(result_max, on='id')
data.columns = [x.replace('[', '').replace(']', '') for x in data.columns]

In [75]:
# selecting the variables to be used with the aggregated data models
data = data[['sum_sub_cause_code_NO_VALUE',
 'sum_sub_cause_code_SIGNALING_INTERFERENCE',
 'sum_sub_cause_code_SUBSCRIPTION_CANCELLED',
 'sum_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
 'sum_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
 'sum_event_id_13',
 'sum_event_id_2',
 'sum_event_result_IGNORE',
 'sum_event_id_15',
 'sum_sub_cause_code_GGSN_RESPONDED_WITH_REJECT_CAUSE_CONTEXT_NON_EXISTENT',
 'sum_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
 'sum_cause_code_REACTIVATION_REQUIRED',
 'sum_sub_cause_code_TIMEOUT_PAGING',
 'sum_event_id_8',
 'sum_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
 'mean_sub_cause_code_SIGNALING_INTERFERENCE',
 'mean_event_id_4',
 'mean_event_id_6',
 'mean_event_id_2',
 'mean_timedelta_0, 1)',
 'mean_event_id_5',
 'mean_event_id_8',
 'mean_cause_code_UNKNOWN_CAUSE_CODE_PROT_TYPE_0',
 'mean_event_id_13',
 'mean_sub_cause_code_QOS_CHANGED_BY_GGSN_DURING_RAU_OR_SGSN_OR_BSS_MODIFICATION_PROCEDURE',
 'mean_event_id_12',
 'mean_cause_code_REACTIVATION_REQUIRED',
 'mean_event_result_ABORT',
 'mean_sub_cause_code_NO_VALUE',
 'mean_timedelta_60, 300)',
 'mean_sub_cause_code_SGSN_CONTEXT_REQUEST_RECEIVED',
 'mean_sub_cause_code_SUCCESS',
 'mean_cause_code_NOCAUSECODE',
 'mean_sub_cause_code_DETACH_TRIGGERED_PDN_DISCONNECTION',
 'mean_event_result_IGNORE',
 'mean_sub_cause_code_NO_RESPONSE_FROM_MS_DURING_SGSN_INITIATED_MODIFICATION',
 'pred_mean',
 'pred_max',
 'sum_hour_22',
 'sum_hour_14',
 'mean_hour_13',
 'weekday_Friday',
 'mean_hour_21',
 'mean_hour_16',
 'weekday_Saturday',
 'mean_hour_23']]

In [77]:
# predicting probabilities:
predictions_xgb = model_xgb.predict_proba(data)
predictions_rf = model_rf.predict_proba(data)

In [79]:
# creating lists of probabilities and final predictions
predictions_final = [np.max([x,y]) for x,y in zip(predictions_xgb[:,1], predictions_rf[:,1])]
predictions_final_binary = [True if x >= 0.52 else False for x in predictions_final]

In [82]:
# joining the ids and predictions into single dataframe
predictions = pd.DataFrame(data=[])
predictions['ident'] = competition_agg['id']
predictions['probs'] = predictions_final_binary
predictions['probs_probability'] = predictions_final

In [85]:
# checking that the predicted share of 'probs' = True is about the same as in the training dataset (~10%)
predictions['probs'].mean()


Out[85]:
0.11080383480825959

In [86]:
# adding the predictions to the competition dataset
competition_results = competition.merge(predictions, on = 'ident')

In [88]:
competition_results['probs'].mean()


Out[88]:
0.12554338586281305

In [89]:
competition_results['probs_probability'].mean()


Out[89]:
0.13905381070197795

In [95]:
# saving the results
competition_results.to_parquet('./competition_results.parquet')
predictions.to_csv('./results.csv', sep=',')
