In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; these functions live in sklearn.model_selection in modern scikit-learn.
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from scipy import interp  # NOTE(review): scipy.interp is deprecated; numpy.interp is the replacement
import pandas.core.algorithms as algos  # pandas internal module, used for algos.quantile below
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import time
import sys
# NOTE(review): the appended path ends with a trailing space -- confirm that
# the directory name really contains it; otherwise the import path is dead.
sys.path.append('/data1/pypackages_cris ')
In [24]:
from sklearn.metrics import make_scorer
def my_custom_loss_func(y_true, y_pred):
    """Top-decile capture rate for a binary classifier (higher is better).

    Ranks observations by the predicted positive-class probability
    (column 1 of ``y_pred``, i.e. the predict_proba layout) and returns the
    fraction of all actual positives (``y_true == 1``) that land in the top
    10% of predictions.

    Parameters
    ----------
    y_true : pandas.Series or array-like of 0/1 labels.  (The original
        required a Series; plain arrays are now accepted too.)
    y_pred : 2-D array of class probabilities, shape (n_samples, 2).

    Returns
    -------
    float -- share of positives captured in the top decile, or 0 when the
    90th-percentile cutoff is degenerate (equal to the min or max score),
    mirroring the original collapsed-quantile-bin guard.
    """
    # The original built quantile bins with pandas internals
    # (algos.quantile + pd.tools.tile._bins_to_cuts), which were removed from
    # pandas; np.percentile uses the same linear interpolation.
    prob = np.asarray(y_pred)[:, 1]
    actual = np.asarray(y_true.values if hasattr(y_true, 'values') else y_true)
    cutoff = np.percentile(prob, 90)
    # Degenerate bins: np.unique([min, q90, max]) collapsed in the original,
    # and the score was defined as 0.
    if cutoff <= prob.min() or cutoff >= prob.max():
        return 0
    # Right-closed binning: the "top 10%" bucket is (q90, max].
    top_positives = (actual[prob > cutoff] == 1).sum()
    total_positives = (actual == 1).sum()
    return float(top_positives) / float(total_positives)
def my_custom_loss_func2(y_true, y_pred):
    """Top-two-decile (20%) capture rate for a binary classifier.

    Same contract as ``my_custom_loss_func`` but with an 80th-percentile
    cutoff: returns the fraction of all actual positives that fall in the
    top 20% of predicted positive-class probabilities.

    Parameters
    ----------
    y_true : pandas.Series or array-like of 0/1 labels.
    y_pred : 2-D array of class probabilities, shape (n_samples, 2).

    Returns
    -------
    float -- share of positives captured in the top 20%, or 0 when the
    80th-percentile cutoff is degenerate (equal to the min or max score).
    """
    # np.percentile replaces the removed pandas internals used originally
    # (algos.quantile + pd.tools.tile._bins_to_cuts); interpolation matches.
    prob = np.asarray(y_pred)[:, 1]
    actual = np.asarray(y_true.values if hasattr(y_true, 'values') else y_true)
    cutoff = np.percentile(prob, 80)
    # Guard mirrors the original's collapsed-bin check (score defined as 0).
    if cutoff <= prob.min() or cutoff >= prob.max():
        return 0
    # Right-closed binning: the "top 20%" bucket is (q80, max].
    top_positives = (actual[prob > cutoff] == 1).sum()
    total_positives = (actual == 1).sum()
    return float(top_positives) / float(total_positives)
def cust_scorer():
    # Wrap the top-10% capture metric as a scikit-learn scorer object;
    # needs_proba=True makes model-selection utilities pass the full
    # predict_proba output (n_samples, 2) to the metric function.
    return make_scorer(my_custom_loss_func, greater_is_better=True,needs_proba=True)
from sklearn.metrics import make_scorer
# NOTE(review): this cell re-defines functions already defined earlier in the
# notebook (exact duplicates that silently shadow the same names); keep only
# one copy.
def my_custom_loss_func(y_true, y_pred):
    """Top-decile capture rate for a binary classifier (higher is better).

    Ranks observations by the predicted positive-class probability
    (column 1 of ``y_pred``) and returns the fraction of all actual
    positives (``y_true == 1``) in the top 10% of predictions.

    Parameters
    ----------
    y_true : pandas.Series or array-like of 0/1 labels.
    y_pred : 2-D array of class probabilities, shape (n_samples, 2).

    Returns
    -------
    float -- share of positives captured in the top decile, or 0 when the
    90th-percentile cutoff is degenerate (equal to the min or max score).
    """
    # np.percentile replaces the removed pandas internals used originally.
    prob = np.asarray(y_pred)[:, 1]
    actual = np.asarray(y_true.values if hasattr(y_true, 'values') else y_true)
    cutoff = np.percentile(prob, 90)
    if cutoff <= prob.min() or cutoff >= prob.max():
        return 0
    top_positives = (actual[prob > cutoff] == 1).sum()
    total_positives = (actual == 1).sum()
    return float(top_positives) / float(total_positives)
# NOTE(review): duplicate of the earlier definition in this notebook.
def my_custom_loss_func2(y_true, y_pred):
    """Top-two-decile (20%) capture rate for a binary classifier.

    Same contract as ``my_custom_loss_func`` but with an 80th-percentile
    cutoff.

    Parameters
    ----------
    y_true : pandas.Series or array-like of 0/1 labels.
    y_pred : 2-D array of class probabilities, shape (n_samples, 2).

    Returns
    -------
    float -- share of positives captured in the top 20%, or 0 when the
    80th-percentile cutoff is degenerate (equal to the min or max score).
    """
    prob = np.asarray(y_pred)[:, 1]
    actual = np.asarray(y_true.values if hasattr(y_true, 'values') else y_true)
    cutoff = np.percentile(prob, 80)
    if cutoff <= prob.min() or cutoff >= prob.max():
        return 0
    top_positives = (actual[prob > cutoff] == 1).sum()
    total_positives = (actual == 1).sum()
    return float(top_positives) / float(total_positives)
# NOTE(review): duplicate of the cust_scorer defined earlier in the notebook.
def cust_scorer():
    # Scorer wrapper for the top-10% capture metric; needs_proba=True passes
    # predict_proba output to the metric.
    return make_scorer(my_custom_loss_func, greater_is_better=True,needs_proba=True)
def assessment2(Results, num_bins):
    """Kolmogorov-Smirnov statistic for a scored binary-classification sample.

    Buckets ``Results['y_prob_inc']`` into up to ``num_bins - 1`` quantile
    buckets, labelled in descending order so bucket 0 holds the highest
    predicted probabilities, then accumulates the per-bucket class
    distributions from the top bucket down and returns
    KS = max(cumulative %positives - cumulative %negatives).

    Side effect: adds a 'decile' column (bucket label) to ``Results``.

    Parameters
    ----------
    Results : DataFrame with columns 'y_prob_inc' (score) and 'Actual_Inc'
              (0/1 outcome; both classes must be present).
    num_bins : int, number of quantile edges (so up to num_bins - 1 buckets).

    Returns
    -------
    float : the KS value.
    """
    # Quantile edges; np.unique sorts and drops duplicate edges when the
    # score distribution has ties, so the bucket count can shrink.
    edges = np.unique(Results['y_prob_inc'].quantile(np.linspace(0, 1, num_bins)).values)
    labels = np.arange(0, len(edges) - 1, 1)[::-1]  # descending: 0 = top-score bucket
    # pd.cut replaces the removed internal pd.tools.tile._bins_to_cuts.
    Results['decile'] = pd.cut(Results['y_prob_inc'], edges, labels=labels,
                               include_lowest=True)
    per_bucket = (Results.groupby(['decile', 'Actual_Inc'])
                         .Actual_Inc.count().unstack().fillna(0)
                         .sort_index(ascending=False))  # top-score bucket first
    per_bucket.columns = ['Negative', 'Positive']
    # (The original computed these totals twice; the first assignment was dead.)
    total_neg = per_bucket.Negative.sum()
    total_pos = per_bucket.Positive.sum()
    per_bucket['Neg_perc'] = per_bucket['Negative'] / total_neg
    per_bucket['Pos_perc'] = per_bucket['Positive'] / total_pos
    per_bucket['Cul_neg_perc'] = per_bucket['Neg_perc'].cumsum(axis=0)
    per_bucket['Cul_pos_perc'] = per_bucket['Pos_perc'].cumsum(axis=0)
    # KS: maximum separation between the cumulative class distributions.
    # (Unused lift/monotonicity diagnostics from the original were dead code.)
    return max(per_bucket['Cul_pos_perc'] - per_bucket['Cul_neg_perc'])
In [26]:
# Load the cleaned samples; presumably df2 is the development sample and
# df_h2 the out-of-time (OOT) holdout -- TODO confirm provenance/refresh date.
df2 = pd.read_csv('/data2/GMC/sample_dev_cln2.csv')
df_h2 = pd.read_csv('/data2/GMC/sample_allattrib_oot_cln2.csv')
In [6]:
# Remove delinquency-bucket attributes: columns whose name carries
# 'PDUE_BUCKET' at character positions 4:15.
droplist2 = [i for i in df2.columns.values if i[4:15] == 'PDUE_BUCKET']
# The original dropped the same labels twice; the second drop raises KeyError
# because the columns are already gone, so drop exactly once.
df2 = df2.drop(droplist2, axis=1)
y = df2['NEW_BAD']  # binary target
X = df2.drop(['NEW_BAD', 'SEGMENT_PM2016'], axis=1)     # development features
Xh = df_h2.drop(['NEW_BAD', 'SEGMENT_PM2016'], axis=1)  # out-of-time features
In [35]:
# Restrict features to a curated keep-list (one column name per line in the
# 'keeplist' file); supersedes the drop-list cell above when run instead.
keeplist1 = pd.read_table('keeplist',header=None)[0].values
#df3 = df2[keeplist1]
y= df2['NEW_BAD']  # binary target
X = df2[keeplist1]     # development features restricted to the keep-list
Xh = df_h2[keeplist1]  # out-of-time features, same column set
In [36]:
# 70/30 random split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=3523)
In [37]:
# Sanity check: dev and OOT matrices should share the same column count.
# print() form works in both Python 2 and 3 (the bare statement is py2-only).
print(Xh.shape)
print(X.shape)
In [38]:
# Compare target prevalence between the development and OOT samples.
# print() form works in both Python 2 and 3 (the bare statement is py2-only).
print(df2.NEW_BAD.value_counts(normalize=True))
print(df_h2.NEW_BAD.value_counts(normalize=True))
In [39]:
# Columns present in the OOT frame but missing from the dev frame;
# should be empty before scoring Xh with models trained on X.
set(Xh.columns) - set(X.columns)
Out[39]:
In [40]:
# Impute missing values with 0 (NOTE(review): assumes 0 is a neutral fill for
# these attributes -- confirm; the OOT frame Xh is not imputed here).
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)
# Random forest candidate: 500 deep trees regularized by a large leaf size.
forest = RandomForestClassifier(n_estimators=500,
#class_weight='balanced_subsample',
max_depth = 80,
min_samples_leaf=500,
#max_features=100,
n_jobs=-1)
model = forest  # rebinds `model`; downstream cells use whichever model was fit last
import time  # already imported at the top of the notebook
start = time.time()
model.fit(X_train,y_train)
end = time.time()
print(end - start)  # wall-clock fit time in seconds
In [19]:
import math
# Empirical bad rate of the target; 1/sqrt(rate) is a common heuristic for
# weighting/regularization on imbalanced data.
# NOTE(review): min_child_weight is computed but never passed to the
# XGBClassifier below -- confirm whether it was meant to be used.
bad_rate = float(len(y[y == 1])) / len(y)
min_child_weight = 1 / math.sqrt(bad_rate)
# print() form works in both Python 2 and 3 (the bare statement is py2-only).
print(min_child_weight)
In [32]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
# XGBoost candidate: 100 rounds, depth 8, 80% row subsample, 50% column subsample.
xgb1=XGBClassifier(learning_rate=0.1, n_estimators=100,max_depth = 8,subsample=0.8, colsample_bytree =0.5)
model = xgb1  # rebinds `model`; downstream cells use whichever model was fit last
import time  # already imported at the top of the notebook
start = time.time()
model.fit(X_train,y_train)
end = time.time()
print(end - start)  # wall-clock fit time in seconds
In [40]:
from sklearn.ensemble import GradientBoostingClassifier
# GBM candidate 1: shallow trees (depth 3), slow learning rate, 200 rounds.
gbc1=GradientBoostingClassifier(learning_rate=0.05, n_estimators=200,max_depth = 3,subsample=0.8)
model = gbc1  # rebinds `model`; downstream cells use whichever model was fit last
import time  # already imported at the top of the notebook
start = time.time()
model.fit(X_train,y_train)
end = time.time()
print(end - start)  # wall-clock fit time in seconds
In [26]:
from sklearn.ensemble import GradientBoostingClassifier
# GBM candidate 2: deeper trees (depth 10), faster learning rate, no subsampling.
gbc2=GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth = 10,subsample=1)
model = gbc2  # rebinds `model`; downstream cells use whichever model was fit last
import time  # already imported at the top of the notebook
start = time.time()
model.fit(X_train,y_train)
end = time.time()
print(end - start)  # wall-clock fit time in seconds
In [41]:
def _evaluate_split(clf, features, target):
    """Score one data split with the fitted classifier.

    Returns a tuple (KS value, ROC AUC, top-10% capture, top-20% capture).
    """
    proba = clf.predict_proba(features)
    scored = pd.concat([pd.DataFrame(proba[:, 1], columns=['y_prob_inc']),
                        pd.DataFrame(target.values, columns=['Actual_Inc'])],
                       axis=1)
    return (assessment2(scored, 11),
            roc_auc_score(target, proba[:, 1]),
            my_custom_loss_func(target, proba),
            my_custom_loss_func2(target, proba))

# The original copy-pasted this scoring logic per split; a helper removes it.
Result_train = _evaluate_split(model, X_train, y_train)
Result_test = _evaluate_split(model, X_test, y_test)
# Out-of-time validation was disabled in the original (yh is never defined
# in this notebook):
# Result_valid1 = _evaluate_split(model, Xh, yh)

Result_final = pd.DataFrame()
Result_final['Train'] = pd.Series(Result_train)
Result_final['Holdout'] = pd.Series(Result_test)
# Result_final['Validation1'] = pd.Series(Result_valid1)
Result_final = Result_final.T
Result_final.columns = ['KS_value', 'ROC', 'Top_10%_capture', 'Top_20%_capture']
# print() form works in both Python 2 and 3 (the bare statement is py2-only).
print(Result_final)
In [24]:
# Persist whichever estimator is currently bound to `model`; the file name
# records which experiment it came from (xgb1/xgb2/xgb3/rf1/rf2).
# NOTE(review): sklearn.externals.joblib is deprecated/removed in modern
# scikit-learn -- `import joblib` directly instead.
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb1.pkl')
In [34]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb2.pkl')
In [31]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_xgb3.pkl')
In [42]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_rf1.pkl')
In [39]:
from sklearn.externals import joblib
d = joblib.dump(model, '/data2/GMC/sales_model/pickle/region_rf2.pkl')
In [43]:
# Reload each pickled candidate and score the out-of-time sample Xh;
# each score* holds the positive-class probability for one model.
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb1.pkl')
score = model.predict_proba(Xh)[:,1]
In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb2.pkl')
score2 = model.predict_proba(Xh)[:,1]
In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_xgb3.pkl')
score3 = model.predict_proba(Xh)[:,1]
In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_rf1.pkl')
score4 = model.predict_proba(Xh)[:,1]
In [ ]:
model = joblib.load('/data2/GMC/sales_model/pickle/region_rf2.pkl')
score5 = model.predict_proba(Xh)[:,1]
In [ ]:
# Assemble the five model scores side by side, one column per pickled model.
# axis is passed as a keyword: the positional form is deprecated in pandas.
output = pd.concat([pd.Series(score), pd.Series(score2), pd.Series(score3),
                    pd.Series(score4), pd.Series(score5)], axis=1)
In [76]:
# Key the scored output by account sequence number and export for downstream use.
# NOTE(review): assumes a 'Seqnum' column exists in the OOT frame -- confirm
# (it is not in keeplist-based Xh cells above).
output.index = Xh.Seqnum.values
output.columns = ['score1','score2','score3','score4','score5']
output.to_csv('sample_ml_result1.csv')
In [11]:
import matplotlib.pyplot as plt
cols = X_train.columns  # default feature-name source for var_importance

def var_importance(model, feature_names=None, top_n=20):
    """Horizontal bar chart of the top_n most important features.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : optional sequence of names aligned with the importances;
        defaults to the module-level ``cols`` (the original hard-coded this
        global -- it is now overridable).
    top_n : int, how many of the highest-importance features to show
        (originally hard-coded to 20).
    """
    names = cols if feature_names is None else feature_names
    importance = model.feature_importances_
    # Rescale so the most important feature reads as 100%.
    importance = 100.0 * (importance / importance.max())
    top_idx = np.argsort(importance)[::-1][:top_n]
    pos = np.arange(top_idx.shape[0]) + .5
    # Values and labels reversed so the most important bar sits on top.
    plt.barh(pos, importance[top_idx][::-1], align='center')
    plt.yticks(pos, names[top_idx][::-1])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()

var_importance(model)