Load packages


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split   # moved to sklearn.model_selection in newer versions
from sklearn.cross_validation import StratifiedKFold    # moved to sklearn.model_selection in newer versions
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from scipy import interp
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import sys
sys.path.append('/data1/pypackages_cris')

Create functions


In [24]:
from sklearn.metrics import make_scorer

def my_custom_loss_func(y_true, y_pred):
    # Top-10% capture rate: the share of all actual positives that fall in
    # the top decile of predicted probability. Higher is better.
    Results = pd.DataFrame({'y_prob_inc': y_pred[:, 1],
                            'Actual_Inc': np.asarray(y_true)})
    label = ['rest', 'top 10%']   # labels in ascending bin order
    bins = np.unique(Results['y_prob_inc'].quantile([0, .9, 1]).values)
    if len(label) != len(bins) - 1:   # degenerate bins from tied scores
        return 0
    Results['decile_inc'] = pd.cut(Results['y_prob_inc'], bins,
                                   labels=label, include_lowest=True)
    top = Results[Results['decile_inc'] == 'top 10%']
    return float(top[top['Actual_Inc'] == 1].shape[0])/float(Results[Results['Actual_Inc'] == 1].shape[0])

def my_custom_loss_func2(y_true, y_pred):
    # Top-20% capture rate: same as above for the top two deciles.
    Results = pd.DataFrame({'y_prob_inc': y_pred[:, 1],
                            'Actual_Inc': np.asarray(y_true)})
    label = ['rest', 'top 20%']
    bins = np.unique(Results['y_prob_inc'].quantile([0, .8, 1]).values)
    if len(label) != len(bins) - 1:
        return 0
    Results['decile_inc'] = pd.cut(Results['y_prob_inc'], bins,
                                   labels=label, include_lowest=True)
    top = Results[Results['decile_inc'] == 'top 20%']
    return float(top[top['Actual_Inc'] == 1].shape[0])/float(Results[Results['Actual_Inc'] == 1].shape[0])

def cust_scorer():
    return make_scorer(my_custom_loss_func, greater_is_better=True, needs_proba=True)
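
The scorer factory above can be passed straight to cross-validation. A minimal sketch (not from the original notebook), assuming this era's sklearn, whose needs_proba scorers hand the custom function the full two-column predict_proba output:

In [ ]:
from sklearn.cross_validation import cross_val_score
scorer = cust_scorer()
# hypothetical usage once X_train / y_train are defined below:
# capture = cross_val_score(forest, X_train.fillna(0), y_train, scoring=scorer, cv=3)
# print capture.mean()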

def assessment2(Results, num_bins):
    # Decile analysis: bin by predicted probability, then compute the KS
    # statistic (max gap between cumulative %positive and cumulative %negative).
    bins = np.unique(Results['y_prob_inc'].quantile(np.linspace(0, 1, num_bins)).values)
    Results['decile'] = pd.cut(Results['y_prob_inc'], bins,
                               labels=np.arange(0, len(bins) - 1, 1)[::-1],
                               include_lowest=True)
    group = Results.groupby(['decile', 'Actual_Inc'])
    Results2 = group.Actual_Inc.count().unstack().fillna(0).sort_index(ascending=False)
    Results2.columns = ['Negative', 'Positive']
    Total_neg, Total_pos = Results2.Negative.sum(), Results2.Positive.sum()
    Results2['Neg_perc'] = Results2['Negative']/Total_neg
    Results2['Pos_perc'] = Results2['Positive']/Total_pos
    Results2['Total_perc'] = (Results2['Negative'] + Results2['Positive'])/(Total_neg + Total_pos)
    Results2['Cul_neg_perc'] = Results2['Neg_perc'].cumsum(axis=0)
    Results2['Cul_pos_perc'] = Results2['Pos_perc'].cumsum(axis=0)
    Results2['Cul_tot_perc'] = Results2['Total_perc'].cumsum(axis=0)
    # Gain chart: cumulative %positive vs cumulative %population
    # Lift chart: by decile; the small constant guards against division by zero
    Results2['Lift_perc'] = (Results2['Cul_pos_perc'] + 0.0001)/(Results2['Cul_neg_perc'] + 0.0001)
    Results2['ratio'] = (Results2['Pos_perc'] + 0.0001)/(Results2['Neg_perc'] + 0.0001)
    KS_value = max(Results2['Cul_pos_perc'] - Results2['Cul_neg_perc'])
    # Monotonicity check: count of adjacent deciles whose pos/neg ratio is non-increasing
    num_dec = sum(x >= y for x, y in zip(Results2['ratio'], Results2['ratio'][1:]))
    mono = str(num_dec) + ' out of ' + str(len(Results2['ratio']) - 1)
    return KS_value
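
The decile-based KS above is an approximation on binned scores. As a sanity check, the exact two-sample KS statistic can be computed from the raw score distributions; a sketch using scipy.stats.ks_2samp (ks_exact is a new helper name, not from the original notebook):

In [ ]:
from scipy import stats
def ks_exact(Results):
    # exact KS: distance between the score distributions of positives and negatives
    pos = Results.loc[Results['Actual_Inc'] == 1, 'y_prob_inc']
    neg = Results.loc[Results['Actual_Inc'] == 0, 'y_prob_inc']
    return stats.ks_2samp(pos, neg)[0]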

Read cleaned data


In [26]:
df2 = pd.read_csv('/data2/GMC/sample_dev_cln2.csv')
df_h2 = pd.read_csv('/data2/GMC/sample_allattrib_oot_cln2.csv')

In [6]:
droplist2 = [i for i in df2.columns.values if i[4:15] == 'PDUE_BUCKET']
df2 = df2.drop(droplist2, 1)
y = df2['NEW_BAD']
X = df2.drop(['NEW_BAD', 'SEGMENT_PM2016'], 1)
Xh = df_h2.drop(['NEW_BAD', 'SEGMENT_PM2016'], 1)

In [35]:
keeplist1 = pd.read_table('keeplist', header=None)[0].values
#df3 = df2[keeplist1]
y = df2['NEW_BAD']
X = df2[keeplist1]
Xh = df_h2[keeplist1]

Define X & y and split into train & test


In [36]:
X_train, X_test, y_train, y_test = train_test_split(
             X, y, test_size=0.3, random_state=3523)
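
With only about 2.3% positives, an unstratified split can shift the bad rate between train and test. A sketch of a stratified alternative, assuming the installed sklearn's train_test_split accepts the stratify argument (added in 0.17):

In [ ]:
# stratify keeps the positive rate consistent across the two partitions
X_train, X_test, y_train, y_test = train_test_split(
             X, y, test_size=0.3, random_state=3523, stratify=y)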

In [37]:
print Xh.shape
print X.shape


(199115, 1385)
(615098, 1385)

In [38]:
print df2.NEW_BAD.value_counts(normalize=True)
print df_h2.NEW_BAD.value_counts(normalize=True)


0    0.977135
1    0.022865
Name: NEW_BAD, dtype: float64
0    1
Name: NEW_BAD, dtype: float64

Note: NEW_BAD is all 0 in the OOT holdout, so supervised metrics cannot be computed on it; the corresponding validation block in the evaluation cell below is left commented out.

In [39]:
set(Xh.columns) - set(X.columns)


Out[39]:
set()

Run different ML algorithms

Random forest

In [40]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
forest = RandomForestClassifier(n_estimators=500, 
                                    #class_weight='balanced_subsample',
                                    max_depth = 80,
                                    min_samples_leaf=500,
                                    #max_features=100,
                                    n_jobs=-1)

model = forest

import time
start = time.time()    
model.fit(X_train,y_train)
end = time.time()
print(end - start)


214.895590067
XGBoost

In [19]:
import math
bad_rate = float(len(y[y==1]))/len(y)
min_child_weight=1/math.sqrt(bad_rate)
print min_child_weight


6.61329245441

In [32]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

xgb1=XGBClassifier(learning_rate=0.1, n_estimators=100,max_depth = 8,subsample=0.8, colsample_bytree =0.5)
model = xgb1
import time
start = time.time()    
model.fit(X_train,y_train)
end = time.time()
print(end - start)


545.667114973
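
Note that the min_child_weight heuristic computed earlier is not actually passed to xgb1. If that was the intent, a sketch of wiring it in, together with scale_pos_weight, the usual xgboost knob for a 2.3% positive rate (this variant is an assumption, not a model that was run here):

In [ ]:
xgb_w = XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=8,
                      subsample=0.8, colsample_bytree=0.5,
                      min_child_weight=min_child_weight,           # heuristic from above
                      scale_pos_weight=(1 - bad_rate)/bad_rate)    # upweight rare positives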

Gradient boosting


In [40]:
from sklearn.ensemble import GradientBoostingClassifier

gbc1=GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth = 3,subsample=0.8)
model = gbc1
import time
start = time.time()    
model.fit(X_train,y_train)
end = time.time()
print(end - start)


860.400943041

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

gbc2=GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth = 10,subsample=1)
model = gbc2
import time
start = time.time()    
model.fit(X_train,y_train)
end = time.time()
print(end - start)


3854.47664595

Evaluate performance: KS, ROC AUC, top 10% and top 20% capture


In [41]:
proba= model.predict_proba(X_train)

D = pd.DataFrame(proba.T[1],columns = ['y_prob_inc'])
E = pd.DataFrame(y_train.values,columns=['Actual_Inc'])
Results = pd.concat([D,E],1)
r1 = assessment2(Results,11)
r2 = roc_auc_score(y_train,proba[:,1])
r3 =  my_custom_loss_func(y_train,proba)
r4 = my_custom_loss_func2(y_train,proba)
Result_train = (r1,r2,r3,r4)

proba= model.predict_proba(X_test)
D = pd.DataFrame(proba.T[1],columns = ['y_prob_inc'])
E = pd.DataFrame(y_test.values,columns=['Actual_Inc'])
Results = pd.concat([D,E],1)
r1 = assessment2(Results,11)
r2 = roc_auc_score(y_test,proba[:,1])
r3 = my_custom_loss_func(y_test,proba)
r4 = my_custom_loss_func2(y_test,proba)
Result_test = (r1,r2,r3,r4)
"""
proba= model.predict_proba(Xh)
D = pd.DataFrame(proba.T[1],columns = ['y_prob_inc'])
E = pd.DataFrame(yh.values,columns=['Actual_Inc'])
Results = pd.concat([D,E],1)
r1 = assessment2(Results,11)
r2 = roc_auc_score(yh,proba[:,1])
r3 = my_custom_loss_func(yh,proba)
r4 = my_custom_loss_func2(yh,proba)
Result_valid1 = (r1,r2,r3,r4)"""

Result_final = pd.DataFrame()
Result_final['Train'] = pd.Series(Result_train)
Result_final['Holdout'] = pd.Series(Result_test)
#Result_final['Validation1'] = pd.Series(Result_valid1)
Result_final = Result_final.T
Result_final.columns =['KS_value', 'ROC', 'Top_10%_capture','Top_20%_capture']
print Result_final


         KS_value       ROC  Top_10%_capture  Top_20%_capture
Train    0.682072  0.921978         0.684269         0.866504
Holdout  0.655704  0.909994         0.656988         0.840652
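
The train, test, and (commented) validation blocks above differ only in their inputs; a hedged refactor into a single helper (evaluate is a new name, not part of the original notebook):

In [ ]:
def evaluate(model, X, y, num_bins=11):
    # returns (KS, ROC AUC, top-10% capture, top-20% capture)
    proba = model.predict_proba(X)
    res = pd.DataFrame({'y_prob_inc': proba[:, 1], 'Actual_Inc': np.asarray(y)})
    return (assessment2(res, num_bins), roc_auc_score(y, proba[:, 1]),
            my_custom_loss_func(y, proba), my_custom_loss_func2(y, proba))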

Save the fitted models for later use


In [24]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb1.pkl')

In [34]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb2.pkl')

In [31]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb3.pkl')

In [42]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_rf1.pkl')

In [39]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_rf2.pkl')
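
joblib.dump returns the list of files it wrote (the pickle plus, in older joblib versions, companion .npy files for large numpy arrays), which is why the return value is captured in d.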

Load the fitted models and score the validation dataset


In [43]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb1.pkl') 
score = model.predict_proba(Xh)[:,1]

In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb2.pkl') 
score2 = model.predict_proba(Xh)[:,1]

In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb3.pkl') 
score3 = model.predict_proba(Xh)[:,1]

In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_rf1.pkl') 
score4 = model.predict_proba(Xh)[:,1]

In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_rf2.pkl') 
score5 = model.predict_proba(Xh)[:,1]

In [ ]:
output = pd.concat([pd.Series(score),pd.Series(score2),pd.Series(score3),pd.Series(score4),pd.Series(score5)],1)

In [76]:
output.index = Xh.Seqnum.values
output.columns = ['score1','score2','score3','score4','score5']
output.to_csv('sample_ml_result1.csv')

Variable importance


In [11]:
import matplotlib.pyplot as plt
cols = X_train.columns
def var_importance(model):

    feature_importance = model.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)[::-1]
    top_sorted_idx = sorted_idx[:20]
    pos = np.arange(top_sorted_idx.shape[0]) + .5
    #fig = plt.figure() 
    plt.barh(pos, feature_importance[top_sorted_idx][::-1], align='center')
    plt.yticks(pos, cols[top_sorted_idx][::-1])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    #plt.savefig("var_importance1.png")
    plt.show()
var_importance(model)