notebook.community

Edit and run



In [41]:

    
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.cross_validation import train_test_split,KFold
import numpy as np



In [42]:

    
datafile_train=r'carvan_train.csv'
datafile_test=r'carvan_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)



In [43]:

    
len(cd_train)









    Out[43]:





5822



In [44]:

    
cd_train.head(5)









    Out[44]:






  
    
      
      V1
      V2
      V3
      V4
      V5
      V6
      V7
      V8
      V9
      V10
      ...
      V77
      V78
      V79
      V80
      V81
      V82
      V83
      V84
      V85
      V86
    
  
  
    
      0
      33
      1
      3
      2
      8
      0
      5
      1
      3
      7
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      1
      37
      1
      2
      2
      8
      1
      4
      1
      4
      6
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      2
      37
      1
      2
      2
      8
      0
      4
      2
      4
      3
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      3
      9
      1
      3
      3
      3
      2
      3
      2
      4
      5
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      4
      40
      1
      4
      2
      10
      1
      4
      1
      4
      7
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
  

5 rows × 86 columns



In [45]:

    
x = cd_train.drop(['V86'],1)
y = cd_train['V86']



In [46]:

    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report

Optimizing model...

Run train_test splits on the train data



In [47]:

    
ld_train, ld_test = train_test_split(cd_train, test_size=0.2, random_state=2)



In [48]:

    
x80_train = ld_train.drop(['V86'],1)
y80_train = ld_train['V86']

x20_test = ld_test.drop(['V86'],1)
y20_test = ld_test['V86']

1. Check ROC_AUC_SCORE {penalty='l1', class_weight=None}



In [49]:

    
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2)



In [50]:

    
model_logr1.fit(x80_train, y80_train)









    Out[50]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [70]:

    
y20_test_pred1 = np.where(model_logr1.predict(x20_test)==1,1,0)
temp_df1 = pd.DataFrame(list(zip(cd_test['V1'],list(y20_test_pred1))), columns=['V1','V86'])

y_test_pred1 = temp_df1['V86']



In [72]:

    
roc_auc_score(y20_test, y_test_pred1)









    Out[72]:





0.50574923547400619

2. Check ROC_AUC_SCORE {penalty='l2', class_weight=None}



In [53]:

    
model_logrl2 = LogisticRegression(penalty="l2",class_weight=None,random_state=2)



In [54]:

    
model_logrl2.fit(x80_train, y80_train)









    Out[54]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [57]:

    
y20_test_pred2 = np.where(model_logrl2.predict(x20_test)==1,1,0)
temp_df2 = pd.DataFrame(list(zip(cd_test['V1'],list(y20_test_pred2))), columns=['V1','V86'])

y_test_pred2 = temp_df2['V86']



In [58]:

    
roc_auc_score(y20_test, y_test_pred2)









    Out[58]:





0.50574923547400619

3. Check ROC_AUC_SCORE {penalty='l1', class_weight='balanced'}



In [59]:

    
model_logr3 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2)



In [60]:

    
model_logr3.fit(x80_train, y80_train)









    Out[60]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)



In [61]:

    
y20_test_pred3 = np.where(model_logr3.predict(x20_test)==1,1,0)
temp_df3 = pd.DataFrame(list(zip(cd_test['V1'],list(y20_test_pred3))), columns=['V1','V86'])

y_test_pred3 = temp_df3['V86']



In [62]:

    
roc_auc_score(y20_test, y_test_pred3)









    Out[62]:





0.67596330275229366

4. Check ROC_AUC_SCORE {penalty='l2', class_weight='balanced'}



In [116]:

    
model_logr4 = LogisticRegression(penalty="l2",class_weight="balanced",random_state=2, solver="newton-cg")



In [117]:

    
model_logr4.fit(x80_train, y80_train)









    Out[117]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)



In [118]:

    
y20_test_pred4 = np.where(model_logr4.predict(x20_test)==1,1,0)
temp_df4 = pd.DataFrame(list(zip(cd_test['V1'],list(y20_test_pred4))), columns=['V1','V86'])

y_test_pred4 = temp_df4['V86']



In [119]:

    
roc_auc_score(y20_test, y_test_pred4)









    Out[119]:





0.67688073394495418



In [124]:

    
prob_score=pd.Series(list(zip(*model_logr4.predict_proba(x80_train)))[1])

2. Calculate optimum FBeta score

a. Calculate cutoffs and best KS



In [127]:

    
cutoffs=np.linspace(0,1,100)

For each of these cutoff , we are going to look at TP,FP,TN,FN values and calculate KS. Then we'll chose the best cutoff as the one having highest KS.



In [129]:

    
KS_cut=[]
for cutoff in cutoffs:
    predicted = pd.Series([0]*len(y80_train))
    predicted[prob_score > cutoff] = 1
    df = pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])
    TP=len(df[(df["real"]==1) &(df["predicted"]==1) ])
    FP=len(df[(df["real"]==0) &(df["predicted"]==1) ])
    TN=len(df[(df["real"]==0) &(df["predicted"]==0) ])
    FN=len(df[(df["real"]==1) &(df["predicted"]==0) ])
    P=TP+FN
    N=TN+FP
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)

cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])

KS_cutoff=cutoff_data[cutoff_data["KS"]==cutoff_data["KS"].max()]["cutoff"]

Now we'll see how this model with the cutoff determined here , performs on the test data.



In [132]:

    
# Performance on test data
prob_score_test=pd.Series(list(zip(*model_logr4.predict_proba(x20_test)))[1])

predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test>float(KS_cutoff)]=1

df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])

k=pd.crosstab(df_test['real'],df_test["predicted"])
print('confusion matrix :\n \n ',k)
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP









    



confusion matrix :
 
  predicted    0    1
real               
0          833  257
1           34   41



In [136]:

    
# Accuracy of test
accuracy = (TP+TN)/(P+N)
# Sensitivity on test
sensitivity = TP/P
#Specificity on test
specificity = TN/N

print("accuracy : ", accuracy)
print("sensitivity : ", sensitivity)
print("specificity : ", specificity)









    



accuracy :  0.750214592275
sensitivity :  0.546666666667
specificity :  0.764220183486

Next we see how cutoff determined by F_beta score performs on test data for beta values : 0.5,1,2



In [141]:

    
cutoffs=np.linspace(0.010,0.99,100)
def Fbeta_perf(beta,cutoffs,y80_train,prob_score):
    FB_cut=[]
    for cutoff in cutoffs:
        predicted=pd.Series([0]*len(y80_train))
        predicted[prob_score>cutoff]=1
        df=pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])

        TP=len(df[(df["real"]==1) &(df["predicted"]==1) ])
        FP=len(df[(df["real"]==0) &(df["predicted"]==1) ])
        FN=len(df[(df["real"]==1) &(df["predicted"]==0) ])
        P=TP+FN
        
        
        Precision=TP/(TP+FP)
        Recall=TP/P
        FB=(1+beta**2)*Precision*Recall/((beta**2)*Precision+Recall)
        FB_cut.append(FB)

    cutoff_data=pd.DataFrame(list(zip(cutoffs,FB_cut)),columns=["cutoff","FB"])

    FB_cutoff=cutoff_data[cutoff_data["FB"]==cutoff_data["FB"].max()]["cutoff"]

    prob_score_test=pd.Series(list(zip(*model_logr4.predict_proba(x20_test)))[1])

    predicted_test=pd.Series([0]*len(y20_test))
    predicted_test[prob_score_test>float(FB_cutoff)]=1

    df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])

    k=pd.crosstab(df_test['real'],df_test["predicted"])
#     print('confusion matrix :\n \n ',k)
    TN=k.iloc[0,0]
    TP=k.iloc[1,1]
    FP=k.iloc[0,1]
    FN=k.iloc[1,0]
    P=TP+FN
    N=TN+FP
    print('For beta :',beta)
    print('Accuracy is :',(TP+TN)/(P+N))
    print('Sensitivity is :',(TP/P))
    print('Specificity is :',(TN/N))
    print('\n \n \n')



In [178]:

    
Fbeta_perf(0.5,cutoffs,y80_train,prob_score)
Fbeta_perf(1,cutoffs,y80_train,prob_score)
Fbeta_perf(1.5,cutoffs,y80_train,prob_score)
Fbeta_perf(2,cutoffs,y80_train,prob_score)
Fbeta_perf(2.5,cutoffs,y80_train,prob_score)
Fbeta_perf(3.0,cutoffs,y80_train,prob_score)









    



For beta : 0.5
Accuracy is : 0.915021459227
Sensitivity is : 0.226666666667
Specificity is : 0.962385321101

 
 

For beta : 1
Accuracy is : 0.843776824034
Sensitivity is : 0.426666666667
Specificity is : 0.87247706422

 
 

For beta : 1.5
Accuracy is : 0.781115879828
Sensitivity is : 0.506666666667
Specificity is : 0.8

 
 

For beta : 2
Accuracy is : 0.763090128755
Sensitivity is : 0.533333333333
Specificity is : 0.778899082569

 
 

For beta : 2.5
Accuracy is : 0.763090128755
Sensitivity is : 0.533333333333
Specificity is : 0.778899082569

 
 

For beta : 3.0
Accuracy is : 0.634334763948
Sensitivity is : 0.693333333333
Specificity is : 0.630275229358

b. Calculate FBeta score on original optimal model {model_logr4}



In [147]:

    
from sklearn.metrics import fbeta_score



In [177]:

    
betas = np.linspace(1,3,num=5)
for ta in betas:
    print('\n')
    print('Beta : ', ta)
    fscorema = fbeta_score(y20_test, y_test_pred4, average='macro', beta=ta)
    print('fscore_ma : ' ,fscorema)
    fscoremi = fbeta_score(y20_test, y_test_pred4, average='micro', beta=ta)
    print('fscore_mi : ' ,fscoremi)
    fscorew = fbeta_score(y20_test, y_test_pred4, average='weighted', beta=ta)
    print('fscore_w : ' ,fscorew)
    fscoren = fbeta_score(y20_test, y_test_pred4, average=None, beta=ta)
    print('fscore_n : ' ,fscoren)









    




Beta :  1.0
fscore_ma :  0.220689655172
fscore_mi :  0.220689655172
fscore_w :  0.220689655172
fscore_n :  [ 0.82110818  0.22068966]


Beta :  1.5
fscore_ma :  0.295035460993
fscore_mi :  0.295035460993
fscore_w :  0.295035460993
fscore_n :  [ 0.77620875  0.29503546]


Beta :  2.0
fscore_ma :  0.363636363636
fscore_mi :  0.363636363636
fscore_w :  0.363636363636
fscore_n :  [ 0.75314618  0.36363636]


Beta :  2.5
fscore_ma :  0.419909502262
fscore_mi :  0.419909502262
fscore_w :  0.419909502262
fscore_n :  [ 0.74046603  0.4199095 ]


Beta :  3.0
fscore_ma :  0.463768115942
fscore_mi :  0.463768115942
fscore_w :  0.463768115942
fscore_n :  [ 0.73292511  0.46376812]



In [165]:

    
print('fscorema : ' ,fscorema)
print('fscoremi : ' ,fscoremi)
print('fscorew : ' ,fscorew)
print('fscoren : ' ,fscoren)









    



fscorema :  0.419909502262
fscoremi :  0.419909502262
fscorew :  0.419909502262
fscoren :  [ 0.74046603  0.4199095 ]

Fit the optimized model on actual x,y and predict y from test dataset



In [ ]:

    
model_logr4.fit(x,y)



In [181]:

    
prediction = np.where(model_logr4.predict(cd_test)==1,"Yes","No")
submission = pd.DataFrame(list(zip(cd_test['V1'],list(prediction))),
                       columns=['V1','V86'])



In [182]:

    
pred_y = submission['V86']
actual_y = cd_train['V86']



In [183]:

    
submission.head(4)



In [185]:

    
submission.to_csv('submission_carvan.csv',index=False)

This submission will get you auc score of approx 0.50, slightly less than whats required for passing the course. You'll have to make changes

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V80
0	33	1	3	2	8	0	5	1	3	7	...	1
1	37	1	2	2	8	1	4	1	4	6	...	1
2	37	1	2	2	8	0	4	2	4	3	...	1
3	9	1	3	3	3	2	3	2	4	5	...	1
4	40	1	4	2	10	1	4	1	4	7	...	1

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V80
0	33	1	3	2	8	0	5	1	3	7	...	1
1	37	1	2	2	8	1	4	1	4	6	...	1
2	37	1	2	2	8	0	4	2	4	3	...	1
3	9	1	3	3	3	2	3	2	4	5	...	1
4	40	1	4	2	10	1	4	1	4	7	...	1

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V80
0	33	1	3	2	8	0	5	1	3	7	...	1
1	37	1	2	2	8	1	4	1	4	6	...	1
2	37	1	2	2	8	0	4	2	4	3	...	1
3	9	1	3	3	3	2	3	2	4	5	...	1
4	40	1	4	2	10	1	4	1	4	7	...	1