In [ ]:

    
%matplotlib inline

Model Building Part 3

Code for building the models
Author: Jimmy Charité
Email: jimmy.charite@gmail.com

Following up with part one, I try the bag of words approach



In [ ]:

    
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML



In [ ]:

    
# Move the working directory up one level so the './...' relative paths
# used by the later cells resolve against the parent directory.
# os.chdir returns None, so `retval` only keeps the cell from echoing output.
# NOTE(review): not idempotent -- re-running this cell climbs another level.
retval = os.chdir(os.pardir)



In [ ]:

    
# Load the pickled data set from ./clean_data (it is used as a DataFrame below).
# NOTE(review): unpickling executes arbitrary code; only load trusted files.
clean_data = pd.read_pickle('./clean_data/clean_data.pkl')



In [ ]:

    
# Preview the first five rows (five is also the default for head()).
clean_data.head(n=5)



In [ ]:

    
# Columns kept for modelling. Order matters: position 0 becomes the target
# `y` and position 1 becomes the text input `X` further down.
kept_cols = [
    'helpful',
    'text_lemma',
]

Training and Testing Split



In [ ]:

    
# Hold-out fraction and the seed shared by the randomised steps below.
test_size = 0.25
my_rand_state = 0



In [ ]:

    
from sklearn.model_selection import train_test_split



In [ ]:

    
# Pull the two kept columns out as plain Python lists:
# column 1 of `kept_cols` is the model input, column 0 is the target.
X, y = (
    clean_data[kept_cols].iloc[:, position].tolist()
    for position in (1, 0)
)



In [ ]:

    
# Hold out `test_size` of the rows for the final evaluation; the split is
# seeded with `my_rand_state`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=my_rand_state,
)

Text



In [ ]:

    
from sklearn.feature_extraction.text import TfidfVectorizer



In [ ]:

    
# TF-IDF vectorizer used as the first step of every pipeline below.
# lowercase=False switches off the vectorizer's own lowercasing of the input.
tfidf = TfidfVectorizer(lowercase=False)

Classification Models



In [ ]:

    
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

Although tuning is not necessary for Naive Bayes, I pass the default parameters of that model to GridSearchCV anyway so that I can do a direct pair-wise comparison with the other models across the different steps of cross-validation.

In the interest of time, I didn't use the SVM classifier.



In [ ]:

    
# Gaussian Naive Bayes with a one-point grid: only the default prior (None).
priors = [None]
nb_clf = GaussianNB()



In [ ]:

    
# Quadratic discriminant analysis and the regularisation values to search.
reg_param = [0.0, 0.25, 0.5, 0.75]
qda_clf = QuadraticDiscriminantAnalysis()



In [ ]:

    
# L2-penalised logistic regression and the candidate values searched for its
# `C` parameter.
C = [0.001, 0.01, 10, 100, 1000]
log_clf = LogisticRegression(penalty='l2')



In [ ]:

    
# Random forest plus the grid of forest sizes and per-split feature fractions.
# The forest is seeded with the same `my_rand_state` that seeds the
# train/test split; without a seed, every fit grows a different forest and
# the grid-search scores cannot be reproduced from run to run.
rf_clf = RandomForestClassifier(random_state=my_rand_state)
n_estimators = [100, 200]
max_features = [.1, .3, .5]



In [ ]:

    
# Bagged, fully grown decision trees and the sample fractions to search.
dtree = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, but it has
# always been the first positional parameter, so this form works on both.
# The ensemble is seeded with the notebook-wide seed for reproducible fits.
bagTree_clf = BaggingClassifier(dtree, random_state=my_rand_state)
max_samples = [.3, .6]



In [ ]:

    
# Class-weight candidates for the grid search: scikit-learn's 'balanced'
# heuristic followed by explicit weights of 1, 2 and 10 on class 1.
class_weight = ['balanced'] + [{1: w} for w in (1, 2, 10)]

Creating Pipelines



In [ ]:

    
from imblearn import pipeline #needed if mixing imblearn with sklearn classes
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

I plan on using imblearn classes for later iterations, so I use its pipeline from the beginning for convenience.



In [ ]:

    
# Number of parallel workers handed to every GridSearchCV below.
n_jobs = 4



In [ ]:

    
# Stratified 10-fold splitter shared by every grid search so that the models
# are compared on identical folds.
n_folds = 10
# `random_state` is intentionally not passed: with shuffle=False it has no
# effect on the folds, and recent scikit-learn releases raise a ValueError
# when a seed is combined with shuffle=False. The folds are unchanged.
skfold = StratifiedKFold(n_splits=n_folds, shuffle=False)

Naive Bayes Estimators



In [ ]:

    
# TF-IDF -> Gaussian Naive Bayes, wrapped in an ROC-AUC grid search.
# NOTE(review): GaussianNB expects dense input while TfidfVectorizer emits a
# sparse matrix -- confirm this pipeline fits before relying on it.
nb_clf_b = pipeline.Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', nb_clf),
])
nb_clf_est_b = GridSearchCV(
    estimator=nb_clf_b,
    param_grid={'clf__priors': priors},
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

QDA Estimators



In [ ]:

    
# TF-IDF -> QDA, wrapped in an ROC-AUC grid search over `reg_param`.
# NOTE(review): QDA expects dense input while TfidfVectorizer emits a sparse
# matrix -- confirm this pipeline fits before relying on it.
qda_clf_b = pipeline.Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', qda_clf),
])
qda_clf_est_b = GridSearchCV(
    estimator=qda_clf_b,
    param_grid={'clf__reg_param': reg_param},
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

Logistic Estimators



In [ ]:

    
# TF-IDF -> logistic regression, wrapped in an ROC-AUC grid search over the
# `C` values and the class-weight candidates.
log_clf_b = pipeline.Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', log_clf),
])
log_clf_est_b = GridSearchCV(
    estimator=log_clf_b,
    param_grid={
        'clf__C': C,
        'clf__class_weight': class_weight,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

Random Forest Estimators



In [ ]:

    
# TF-IDF -> random forest, wrapped in an ROC-AUC grid search over forest
# size, per-split feature fraction and the class-weight candidates.
rf_clf_b = pipeline.Pipeline(steps=[
    ('tfidf', tfidf),
    ('clf', rf_clf),
])
rf_clf_est_b = GridSearchCV(
    estimator=rf_clf_b,
    param_grid={
        'clf__n_estimators': n_estimators,
        'clf__max_features': max_features,
        'clf__class_weight': class_weight,
    },
    scoring='roc_auc',
    cv=skfold,
    n_jobs=n_jobs,
)

Fitting Estimators



In [ ]:

    
# `sklearn.externals.joblib` was deprecated and then removed from
# scikit-learn; the standalone `joblib` package is the supported import.
# The fallback keeps the notebook working on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

Basic Estimators: bag of words (TF-IDF) features, no PCA



In [ ]:

    
# Run the grid search for the logistic pipeline on the training split, then
# persist the fitted search object to disk.
log_clf_est_b.fit(X_train, y_train)
joblib.dump(
    log_clf_est_b,
    './other_output/bow/log_clf_est_b.pkl',
)



In [ ]:

    
# Run the grid search for the random-forest pipeline on the training split,
# then persist the fitted search object to disk.
rf_clf_est_b.fit(X_train, y_train)
joblib.dump(
    rf_clf_est_b,
    './other_output/bow/rf_clf_est_b.pkl',
)

Testing Estimators



In [ ]:

    
from sklearn.metrics import roc_curve, auc



In [ ]:

    
# Reload the persisted grid searches so the evaluation cells can run without
# refitting.
# NOTE(review): joblib.load unpickles its input -- only load trusted files.
rf_clf_est_b = joblib.load('./other_output/bow/rf_clf_est_b.pkl')
log_clf_est_b = joblib.load('./other_output/bow/log_clf_est_b.pkl')

Basic Estimators: bag of words (TF-IDF) features, no PCA



In [ ]:

    
# ROC curve and area under it on the held-out split, one model at a time.
# Column 1 of predict_proba is used as the score handed to roc_curve; the
# thresholds returned by roc_curve are discarded.

# Logistic regression
log_fpr, log_tpr, _ = roc_curve(
    y_test,
    log_clf_est_b.predict_proba(X_test)[:, 1],
)
log_roc_auc = auc(log_fpr, log_tpr)

# Random forest
rf_fpr, rf_tpr, _ = roc_curve(
    y_test,
    rf_clf_est_b.predict_proba(X_test)[:, 1],
)
rf_roc_auc = auc(rf_fpr, rf_tpr)



In [ ]:

    
# Draw both ROC curves against the chance diagonal, save the figure, show it.

# One dashed curve per model, labelled with its AUC.
plt.plot(
    log_fpr, log_tpr,
    label='LOG (area = %0.2f)' % log_roc_auc,
    color='seagreen', linestyle='--', lw=2,
)
plt.plot(
    rf_fpr, rf_tpr,
    label='RF (area = %0.2f)' % rf_roc_auc,
    color='blue', linestyle='--', lw=2,
)

# Diagonal reference line for a classifier that guesses at random.
plt.plot(
    [0, 1], [0, 1],
    label='Luck',
    color='k', linestyle='--', lw=2,
)

# Pad both axes slightly so the curves do not sit on the frame.
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Basic Models Using BOW')
plt.legend(loc="lower right")

# Save before show() so the written file contains the rendered figure.
plt.savefig('./plots/ROC_Basic_BOW.png', bbox_inches='tight')
plt.show()