In [41]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.cross_validation import train_test_split,KFold
import numpy as np
In [42]:
# File names of the training CSV and the test CSV (relative to the working directory).
datafile_train = r'carvan_train.csv'
datafile_test = r'carvan_test.csv'
# Read both files into DataFrames using pandas' default parsing options.
cd_train, cd_test = (
    pd.read_csv(csv_path) for csv_path in (datafile_train, datafile_test)
)
In [43]:
# Number of rows in the training frame.
cd_train.shape[0]
Out[43]:
In [44]:
# Preview the first five training rows.
cd_train.iloc[:5]
Out[44]:
In [45]:
# Full training set split into features (everything except V86) and target (V86).
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
x = cd_train.drop(['V86'], axis=1)
y = cd_train['V86']
In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
In [47]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
ld_train, ld_test = train_test_split(
    cd_train,
    random_state=2,
    test_size=0.2,
)
In [48]:
# Features / target for the 80% fitting split and the 20% hold-out split.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
x80_train = ld_train.drop(['V86'], axis=1)
y80_train = ld_train['V86']
x20_test = ld_test.drop(['V86'], axis=1)
y20_test = ld_test['V86']
In [49]:
# L1-penalised logistic regression without class re-weighting.
# The solver is pinned to liblinear: it was the default when this notebook was
# written, and the newer default (lbfgs) rejects the L1 penalty with a ValueError.
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2,solver="liblinear")
In [50]:
# Fit model 1 on the 80% split.
model_logr1.fit(X=x80_train, y=y80_train)
Out[50]:
In [70]:
# Hard 0/1 class predictions of model 1 for the 20% hold-out split.
y20_test_pred1 = np.where(model_logr1.predict(x20_test)==1,1,0)
# Pair every prediction with the V1 value of the hold-out row it was made for.
# Zipping against cd_test['V1'] attached values from a different dataset and
# would silently truncate the predictions if cd_test had fewer rows.
# NOTE(review): assumes the training data has a 'V1' column, as cd_test does.
temp_df1 = pd.DataFrame({'V1': x20_test['V1'].values, 'V86': y20_test_pred1}, columns=['V1','V86'])
y_test_pred1 = temp_df1['V86']
In [72]:
# AUC is a ranking metric, so score the positive-class probabilities instead of
# thresholded 0/1 labels (hard labels reduce the ROC curve to a single point).
roc_auc_score(y20_test, model_logr1.predict_proba(x20_test)[:, 1])
Out[72]:
In [53]:
# L2-penalised logistic regression without class re-weighting.
model_logrl2 = LogisticRegression(
    penalty="l2",
    class_weight=None,
    random_state=2,
)
In [54]:
# Fit model 2 on the 80% split.
model_logrl2.fit(X=x80_train, y=y80_train)
Out[54]:
In [57]:
# Hard 0/1 class predictions of model 2 for the 20% hold-out split.
y20_test_pred2 = np.where(model_logrl2.predict(x20_test)==1,1,0)
# Pair every prediction with the V1 value of the hold-out row it was made for.
# Zipping against cd_test['V1'] attached values from a different dataset and
# would silently truncate the predictions if cd_test had fewer rows.
# NOTE(review): assumes the training data has a 'V1' column, as cd_test does.
temp_df2 = pd.DataFrame({'V1': x20_test['V1'].values, 'V86': y20_test_pred2}, columns=['V1','V86'])
y_test_pred2 = temp_df2['V86']
In [58]:
# AUC is a ranking metric, so score the positive-class probabilities instead of
# thresholded 0/1 labels (hard labels reduce the ROC curve to a single point).
roc_auc_score(y20_test, model_logrl2.predict_proba(x20_test)[:, 1])
Out[58]:
In [59]:
# L1-penalised logistic regression with class weights balanced by frequency.
# The solver is pinned to liblinear: it was the default when this notebook was
# written, and the newer default (lbfgs) rejects the L1 penalty with a ValueError.
model_logr3 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2,solver="liblinear")
In [60]:
# Fit model 3 on the 80% split.
model_logr3.fit(X=x80_train, y=y80_train)
Out[60]:
In [61]:
# Hard 0/1 class predictions of model 3 for the 20% hold-out split.
y20_test_pred3 = np.where(model_logr3.predict(x20_test)==1,1,0)
# Pair every prediction with the V1 value of the hold-out row it was made for.
# Zipping against cd_test['V1'] attached values from a different dataset and
# would silently truncate the predictions if cd_test had fewer rows.
# NOTE(review): assumes the training data has a 'V1' column, as cd_test does.
temp_df3 = pd.DataFrame({'V1': x20_test['V1'].values, 'V86': y20_test_pred3}, columns=['V1','V86'])
y_test_pred3 = temp_df3['V86']
In [62]:
# AUC is a ranking metric, so score the positive-class probabilities instead of
# thresholded 0/1 labels (hard labels reduce the ROC curve to a single point).
roc_auc_score(y20_test, model_logr3.predict_proba(x20_test)[:, 1])
Out[62]:
In [116]:
# L2-penalised logistic regression with balanced class weights, fitted with the
# newton-cg solver.
model_logr4 = LogisticRegression(
    penalty="l2",
    class_weight="balanced",
    random_state=2,
    solver="newton-cg",
)
In [117]:
# Fit model 4 on the 80% split.
model_logr4.fit(X=x80_train, y=y80_train)
Out[117]:
In [118]:
# Hard 0/1 class predictions of model 4 for the 20% hold-out split.
y20_test_pred4 = np.where(model_logr4.predict(x20_test)==1,1,0)
# Pair every prediction with the V1 value of the hold-out row it was made for.
# Zipping against cd_test['V1'] attached values from a different dataset and
# would silently truncate the predictions if cd_test had fewer rows.
# NOTE(review): assumes the training data has a 'V1' column, as cd_test does.
temp_df4 = pd.DataFrame({'V1': x20_test['V1'].values, 'V86': y20_test_pred4}, columns=['V1','V86'])
y_test_pred4 = temp_df4['V86']
In [119]:
# AUC is a ranking metric, so score the positive-class probabilities instead of
# thresholded 0/1 labels (hard labels reduce the ROC curve to a single point).
roc_auc_score(y20_test, model_logr4.predict_proba(x20_test)[:, 1])
Out[119]:
In [124]:
# Second column of predict_proba for every row of the 80% split, as a Series
# with a fresh 0..n-1 index.
train_probabilities = model_logr4.predict_proba(x80_train)
prob_score = pd.Series(train_probabilities[:, 1])
In [127]:
# 100 evenly spaced candidate probability cutoffs covering [0, 1] inclusive.
cutoffs = np.linspace(start=0, stop=1, num=100)
For each of these cutoffs, we look at the TP, FP, TN and FN counts and calculate the KS statistic (true-positive rate minus false-positive rate). We then choose the cutoff with the highest KS as the best one.
In [129]:
# Evaluate the KS statistic (true-positive rate minus false-positive rate) of
# the training-set scores at every candidate cutoff.
KS_cut=[]
for cutoff in cutoffs:
    # A row is predicted positive when its probability exceeds the cutoff.
    predicted = pd.Series([0]*len(y80_train))
    predicted[prob_score > cutoff] = 1
    # zip pairs labels and predictions by position, ignoring y80_train's index.
    df = pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])
    TP=len(df[(df["real"]==1) &(df["predicted"]==1) ])
    FP=len(df[(df["real"]==0) &(df["predicted"]==1) ])
    TN=len(df[(df["real"]==0) &(df["predicted"]==0) ])
    FN=len(df[(df["real"]==1) &(df["predicted"]==0) ])
    P=TP+FN
    N=TN+FP
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)
cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])
# Keep exactly one cutoff even when several tie for the maximum KS: idxmax
# returns the first maximal row. The previous boolean filter returned every
# tied row, and converting that multi-element Series to float raises.
# The result stays a one-row Series so existing float(KS_cutoff) uses keep working.
KS_cutoff=cutoff_data.loc[[cutoff_data["KS"].idxmax()],"cutoff"]
Now we'll see how this model, with the cutoff determined here, performs on the test data.
In [132]:
# Performance on test data
# Positive-class probability for every hold-out row (fresh 0..n-1 index).
prob_score_test=pd.Series(model_logr4.predict_proba(x20_test)[:,1])
# KS_cutoff is a Series; take its first entry explicitly. float(Series) is
# deprecated in recent pandas and raises when the Series holds several values.
best_cutoff=float(KS_cutoff.iloc[0])
predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test>best_cutoff]=1
# zip pairs labels and predictions by position, ignoring y20_test's index.
df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])
# Force a full 2x2 table so the positional lookups below stay valid even when
# only one class is predicted (or present) in the hold-out split.
k=pd.crosstab(df_test['real'],df_test["predicted"]).reindex(index=[0,1],columns=[0,1],fill_value=0)
print('confusion matrix :\n \n ',k)
# Rows are the real class, columns the predicted class.
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP
In [136]:
# Hold-out metrics derived from the confusion-matrix counts TP, TN, P and N.
accuracy = (TP+TN)/(P+N)   # share of all rows classified correctly
sensitivity = TP/P         # share of real positives that were flagged
specificity = TN/N         # share of real negatives that were left unflagged
for _label, _value in (
    ("accuracy : ", accuracy),
    ("sensitivity : ", sensitivity),
    ("specificity : ", specificity),
):
    print(_label, _value)
Next we see how the cutoff determined by the F-beta score performs on the test data for beta values 0.5, 1, 1.5, 2, 2.5 and 3.
In [141]:
# Candidate cutoffs for the F-beta search; the extremes 0 and 1 are excluded.
cutoffs=np.linspace(0.010,0.99,100)


def Fbeta_perf(beta,cutoffs,y80_train,prob_score,model=None,x_test=None,y_test=None):
    """Pick the cutoff that maximises F-beta on training scores and print hold-out metrics.

    Parameters
    ----------
    beta : float
        Weight of recall relative to precision in the F-beta score.
    cutoffs : iterable of float
        Candidate probability thresholds; a row is predicted positive when its
        score is strictly greater than the threshold.
    y80_train : array-like of {0, 1}
        True training labels, aligned by position with ``prob_score``.
    prob_score : array-like of float
        Predicted positive-class probabilities for the training rows.
    model, x_test, y_test : optional
        Fitted classifier exposing ``predict_proba`` plus the hold-out features
        and labels. When omitted they fall back to the notebook globals
        ``model_logr4``, ``x20_test`` and ``y20_test``, looked up at call time.

    Returns
    -------
    None
        Prints beta, accuracy, sensitivity and specificity on the hold-out data.
    """
    if model is None:
        model = model_logr4
    if x_test is None:
        x_test = x20_test
    if y_test is None:
        y_test = y20_test

    real = np.asarray(y80_train)
    scores = np.asarray(prob_score)

    FB_cut=[]
    for cutoff in cutoffs:
        flagged = scores > cutoff
        TP=int(np.sum(flagged & (real==1)))
        FP=int(np.sum(flagged & (real==0)))
        FN=int(np.sum(~flagged & (real==1)))
        if TP==0:
            # No true positives: precision and/or recall is 0 or undefined.
            # Score the cutoff as 0 instead of dividing by zero.
            FB=0.0
        else:
            Precision=TP/(TP+FP)
            Recall=TP/(TP+FN)
            FB=(1+beta**2)*Precision*Recall/((beta**2)*Precision+Recall)
        FB_cut.append(FB)

    cutoff_data=pd.DataFrame(list(zip(cutoffs,FB_cut)),columns=["cutoff","FB"])
    # idxmax returns the first maximal row, so ties still give a single cutoff.
    FB_cutoff=float(cutoff_data.loc[cutoff_data["FB"].idxmax(),"cutoff"])

    # Apply the chosen cutoff to the hold-out probabilities.
    prob_score_test=np.asarray(model.predict_proba(x_test))[:,1]
    predicted_test=(prob_score_test>FB_cutoff).astype(int)
    # Force a full 2x2 table so the positional lookups stay valid even when
    # only one class is predicted.
    k=pd.crosstab(
        pd.Series(np.asarray(y_test),name="real"),
        pd.Series(predicted_test,name="predicted"),
    ).reindex(index=[0,1],columns=[0,1],fill_value=0)
    TN=k.iloc[0,0]
    TP=k.iloc[1,1]
    FP=k.iloc[0,1]
    FN=k.iloc[1,0]
    P=TP+FN
    N=TN+FP
    print('For beta :',beta)
    print('Accuracy is :',(TP+TN)/(P+N))
    print('Sensitivity is :',(TP/P))
    print('Specificity is :',(TN/N))
    print('\n \n \n')
In [178]:
# Report hold-out performance for each beta, from precision-leaning (0.5) to
# strongly recall-leaning (3.0).
for beta_value in (0.5, 1, 1.5, 2, 2.5, 3.0):
    Fbeta_perf(beta_value, cutoffs, y80_train, prob_score)
In [147]:
from sklearn.metrics import fbeta_score
In [177]:
# scikit-learn F-beta of model 4's hard hold-out predictions for five betas
# between 1 and 3, under each averaging mode.
betas = np.linspace(1, 3, num=5)
for beta_value in betas:
    shared_args = dict(y_true=y20_test, y_pred=y_test_pred4, beta=beta_value)
    print('\n')
    print('Beta : ', beta_value)
    fscorema = fbeta_score(average='macro', **shared_args)
    print('fscore_ma : ' ,fscorema)
    fscoremi = fbeta_score(average='micro', **shared_args)
    print('fscore_mi : ' ,fscoremi)
    fscorew = fbeta_score(average='weighted', **shared_args)
    print('fscore_w : ' ,fscorew)
    # average=None returns one score per class instead of a single number.
    fscoren = fbeta_score(average=None, **shared_args)
    print('fscore_n : ' ,fscoren)
In [165]:
# Echo the scores left over from the last beta evaluated in the loop.
for _label, _score in (
    ('fscorema : ', fscorema),
    ('fscoremi : ', fscoremi),
    ('fscorew : ', fscorew),
    ('fscoren : ', fscoren),
):
    print(_label, _score)
In [ ]:
# Refit model 4 on the complete training set before scoring cd_test.
model_logr4.fit(X=x, y=y)
In [181]:
# Score cd_test with model 4 and translate class 1 / anything else into the
# "Yes" / "No" strings written to the submission.
raw_labels = model_logr4.predict(cd_test)
prediction = np.where(raw_labels == 1, "Yes", "No")
# One row per cd_test record: its V1 value next to the predicted V86 label.
submission = pd.DataFrame(
    {'V1': list(cd_test['V1']), 'V86': list(prediction)},
    columns=['V1', 'V86'],
)
In [182]:
# Predicted labels from the submission and the real V86 labels of the training set.
pred_y = submission.loc[:, 'V86']
actual_y = cd_train.loc[:, 'V86']
In [183]:
# Preview the first four submission rows.
submission.iloc[:4]
Out[183]:
In [185]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission_carvan.csv', index=False)
This submission will get you an AUC score of approximately 0.50, slightly less than what's required for passing the course. You'll have to make changes.