In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.model_selection import train_test_split, KFold
import numpy as np
In [2]:
datafile_train=r'Data/consumer/Consumer_Complaints_train.csv'
datafile_test=r'Data/consumer/Consumer_Complaints_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)
In [ ]:
cd_train.dtypes
In [3]:
for col in ['Date received','Date sent to company']:
    cd_train[col]=pd.to_datetime(cd_train[col],infer_datetime_format=True)
    cd_test[col]=pd.to_datetime(cd_test[col],infer_datetime_format=True)
In [4]:
# Days between receipt and forwarding to the company; pd.to_numeric on a
# timedelta would give nanoseconds, .dt.days gives whole days
cd_train['day_diff']=(cd_train['Date sent to company']-cd_train['Date received']).dt.days
cd_test['day_diff']=(cd_test['Date sent to company']-cd_test['Date received']).dt.days
In [5]:
for col in ['Date received','Date sent to company']:
    cd_train.drop([col],axis=1,inplace=True)
    cd_test.drop([col],axis=1,inplace=True)
In [ ]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())
In [ ]:
cd_train.isnull().sum()
In [ ]:
# Missing values in 'Tags' versus total number of rows
pd.isnull(cd_train['Tags']).sum(), len(cd_train)
In [6]:
# These columns are mostly missing; replace each with a 0/1 missingness flag
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(' ','_')+'_isNan'
    cd_train[varname]=np.where(pd.isnull(cd_train[col]),1,0)
    cd_train.drop([col],axis=1,inplace=True)
    cd_test[varname]=np.where(pd.isnull(cd_test[col]),1,0)
    cd_test.drop([col],axis=1,inplace=True)
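As a quick sanity check (a sketch, not part of the original flow), the mean of each 0/1 indicator equals the fraction of missing values it encodes:
In [ ]:
# Share of rows where each source column was missing
nan_flags=[c for c in cd_train.columns if c.endswith('_isNan')]
cd_train[nan_flags].mean()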
In [ ]:
cd_train.head(4)
In [15]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())
In [7]:
# Too many distinct levels to dummify usefully; drop them
for col in ['ZIP code','Company']:
    cd_train.drop([col],axis=1,inplace=True)
    cd_test.drop([col],axis=1,inplace=True)
In [8]:
cd_train['Consumer disputed?']=np.where(cd_train['Consumer disputed?']=="Yes",1,0)
In [9]:
k=cd_train['Issue'].value_counts()
# Dummy variables for the 10 most frequent issues only
for val in k.index[:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['Issue']==val,1,0)
    cd_test[varname]=np.where(cd_test['Issue']==val,1,0)
del cd_train['Issue']
del cd_test['Issue']
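Keeping only the ten most frequent issues is a deliberate trade-off; a minimal sketch (reusing k from the cell above) to check how much of the training data those ten levels actually cover:
In [ ]:
# Fraction of training rows whose Issue falls in the dummified top 10
k.iloc[:10].sum()/len(cd_train)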
In [13]:
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())
In [10]:
k=cd_train['State'].value_counts()
# Dummy variables for the 15 most frequent states only
for val in k.index[:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['State']==val,1,0)
    cd_test[varname]=np.where(cd_test['State']==val,1,0)
del cd_train['State']
del cd_test['State']
In [11]:
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    temp=pd.get_dummies(cd_train[col],prefix=col,drop_first=True)
    cd_train=pd.concat([temp,cd_train],axis=1)
    cd_train.drop([col],axis=1,inplace=True)
    temp=pd.get_dummies(cd_test[col],prefix=col,drop_first=True)
    cd_test=pd.concat([temp,cd_test],axis=1)
    cd_test.drop([col],axis=1,inplace=True)
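One pitfall with dummifying train and test separately: if a categorical level appears in only one of the files, the two frames end up with different columns and fit/predict will disagree. A minimal sketch to reconcile them (under the frames built above; train-only dummies are added to test as zeros, test-only ones are dropped, and the column order is made to match):
In [ ]:
# Align the test frame to the training columns (the target exists only in train)
feature_cols=[c for c in cd_train.columns if c!='Consumer disputed?']
cd_test=cd_test.reindex(columns=feature_cols,fill_value=0)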
In [12]:
x = cd_train.drop(['Consumer disputed?','Complaint ID'],axis=1)
y = cd_train['Consumer disputed?']
In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
In [14]:
ld_train, ld_test = train_test_split(cd_train, test_size=0.2, random_state=2)
In [15]:
x80_train = ld_train.drop(['Consumer disputed?','Complaint ID'],axis=1)
y80_train = ld_train['Consumer disputed?']
x20_test = ld_test.drop(['Consumer disputed?','Complaint ID'],axis=1)
y20_test = ld_test['Consumer disputed?']
In [19]:
# liblinear is needed for the l1 penalty with current sklearn defaults
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2,solver="liblinear")
In [20]:
model_logr1.fit(x80_train, y80_train)
Out[20]:
In [21]:
# Hard 0/1 predictions on the 20% holdout
y20_test_pred = model_logr1.predict(x20_test)
In [23]:
roc_auc_score(y20_test, y20_test_pred)
Out[23]:
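Note that roc_auc_score above is fed hard 0/1 labels, which throws away the model's ranking information and caps the attainable AUC. Feeding it the predicted probability of the positive class is usually more informative (a sketch using the same model and split):
In [ ]:
# AUC on class-1 probabilities instead of hard predictions
roc_auc_score(y20_test, model_logr1.predict_proba(x20_test)[:,1])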
In [24]:
model_logrl2 = LogisticRegression(penalty="l2",class_weight=None,random_state=2,solver="liblinear")
In [25]:
model_logrl2.fit(x80_train, y80_train)
Out[25]:
In [26]:
y20_test_pred = model_logrl2.predict(x20_test)
In [27]:
roc_auc_score(y20_test, y20_test_pred)
Out[27]:
In [28]:
model_logr2 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2,solver="liblinear")
In [29]:
model_logr2.fit(x80_train, y80_train)
Out[29]:
In [30]:
y20_test_pred2 = model_logr2.predict(x20_test)
In [31]:
roc_auc_score(y20_test, y20_test_pred2)
Out[31]:
In [32]:
model_logr3 = LogisticRegression(penalty="l2",class_weight="balanced",random_state=2,solver="liblinear")
In [33]:
model_logr3.fit(x80_train, y80_train)
Out[33]:
In [34]:
y20_test_pred3 = model_logr3.predict(x20_test)
In [35]:
roc_auc_score(y20_test, y20_test_pred3)
Out[35]:
In [36]:
from sklearn.model_selection import cross_val_predict
In [38]:
predicted = cross_val_predict(model_logr2, x, y, cv=10)
print(accuracy_score(y, predicted))
print(classification_report(y, predicted))
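If AUC rather than accuracy is the metric of interest, cross_val_score can evaluate it fold by fold directly (a sketch on the same model and data):
In [ ]:
from sklearn.model_selection import cross_val_score
# Mean and spread of per-fold AUC across 10 folds
auc_scores = cross_val_score(model_logr2, x, y, cv=10, scoring='roc_auc')
print(auc_scores.mean(), auc_scores.std())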
In [55]:
# Probability of the positive class for each training row
prob_score=pd.Series(model_logr2.predict_proba(x80_train)[:,1])
In [56]:
cutoffs=np.linspace(0,1,100)
For each of these cutoffs we compute the TP, FP, TN and FN counts and the KS statistic, KS = TP/P - FP/N (i.e. TPR - FPR). We then choose the cutoff with the highest KS.
In [58]:
KS_cut=[]
for cutoff in cutoffs:
    predicted=pd.Series([0]*len(y80_train))
    predicted[prob_score>cutoff]=1
    df=pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])
    TP=len(df[(df["real"]==1) & (df["predicted"]==1)])
    FP=len(df[(df["real"]==0) & (df["predicted"]==1)])
    TN=len(df[(df["real"]==0) & (df["predicted"]==0)])
    FN=len(df[(df["real"]==1) & (df["predicted"]==0)])
    P=TP+FN
    N=TN+FP
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)
cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])
# Cutoff with the highest KS, extracted as a scalar
KS_cutoff=cutoff_data.loc[cutoff_data["KS"].idxmax(),"cutoff"]
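The same KS-optimal cutoff can be found without an explicit loop: sklearn's roc_curve evaluates every distinct score as a threshold, and KS is just max(TPR - FPR). A sketch reusing prob_score and y80_train from above:
In [ ]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y80_train, prob_score)
# KS statistic and the threshold that attains it
ks = (tpr - fpr).max()
ks_threshold = thresholds[(tpr - fpr).argmax()]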
Now let's see how the model performs on the test data with this cutoff.
In [60]:
# Performance on test data
prob_score_test=pd.Series(model_logr2.predict_proba(x20_test)[:,1])
predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test > KS_cutoff]=1
df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])
k=pd.crosstab(df_test['real'],df_test['predicted'])
print('Confusion matrix:\n\n',k)
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP
In [61]:
# Accuracy of test
(TP+TN)/(P+N)
Out[61]:
In [62]:
# Sensitivity on test
TP/P
Out[62]:
In [63]:
#Specificity on test
TN/N
Out[63]:
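The manual counts above can be cross-checked against sklearn's own confusion matrix and per-class report (a sketch reusing df_test from the cutoff evaluation):
In [ ]:
from sklearn.metrics import confusion_matrix
# Rows are actual classes, columns are predicted classes
print(confusion_matrix(df_test['real'], df_test['predicted']))
print(classification_report(df_test['real'], df_test['predicted']))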
In [39]:
model_logr2.fit(x,y)
Out[39]:
In [40]:
prediction = np.where(model_logr2.predict(cd_test.drop(['Complaint ID'],axis=1))==1,"Yes","No")
submission = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(prediction))),
columns=['Complaint ID','Consumer disputed?'])
In [49]:
pred_y = submission['Consumer disputed?']
actual_y = cd_train['Consumer disputed?']
# roc_auc_score(actual_y, pred_y) would be meaningless here: actual_y comes from
# the training records while pred_y is for the test records, so the rows do not
# correspond (and the lengths differ)
In [52]:
submission.head(4)
Out[52]:
In [53]:
submission.to_csv('submission_new.csv',index=False)
This submission scores an AUC of roughly 0.50 (slightly below); try to improve on it.