ZIKA CLASSIFICATION MODEL

IMPORTS


In [123]:
# Algorithms
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Metrics
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score
from sklearn.metrics import classification_report, precision_recall_curve

# Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.decomposition import TruncatedSVD
from modules.transformers import *

# Visuals
# from modules.custom_plot import plot_confusion_matrix
import matplotlib.pyplot as plt
% matplotlib inline

# Miscellaneous
from sklearn.cross_validation import train_test_split
from glob import glob
import pandas as pd
import numpy as np
import pickle
import re

LOAD DATA


In [2]:
# Read the labelled data and discard every row that contains a NaN,
# because missing values would confuse the algorithms downstream.
df = pd.read_csv('data/161207_ZikaLabels.csv').dropna(axis=0)

# `diagnosisRAW` is the model input; `zika` is the label column.
X, y = df.diagnosisRAW, df.zika

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [71]:
# Give every distinct label an integer code; np.unique returns the labels sorted,
# so the codes follow that sorted order.
class_mapping = dict((label, code) for code, label in enumerate(np.unique(df.zika)))
encoded_y_test = y_test.map(class_mapping)

# Number of held-out samples carrying each encoded label.
N = float((encoded_y_test == 0).sum())  # samples encoded as 0
P = float((encoded_y_test == 1).sum())  # samples encoded as 1

# Fraction of the test set encoded as 1; drawn as the reference line on the PR plots.
baseline_PR = P / (P + N)

ALGORITHMS


In [26]:
# Candidate classifiers keyed by the display name used in reports, plots and file names.
# Every estimator that accepts a seed gets the same one (GaussianNB takes none here).
algorithms = {
    'Gradient_Boost': GradientBoostingClassifier(random_state=42),
    'Logistic_Regression': LogisticRegression(random_state=42),
    'Random_Forest': RandomForestClassifier(random_state=42),
    'Gauss_Naive_Bayes': GaussianNB(),
}

TRAIN MODELS


In [72]:
# Latent Semantic Analysis (LSA): TF-IDF term weights reduced with a truncated SVD.
# The SVD is seeded like every other stochastic step in this notebook; without a
# random_state its randomized solver can give different components on each run.
lsa = make_pipeline(TfidfVectorizer(),
                    TruncatedSVD(n_components=500, random_state=42))
# n_components = 100-500 is the general range for LSA applications and depends on size of corpus
# https://medium.com/@adi_enasoaie/easy-lsi-pipeline-using-scikit-learn-a073f2484408#.j50q4rwnz

# Feature Extractions: the LSA features are concatenated with the outputs of the
# project's ZikaCounterTransformer and SentimentTransformer.
feature_union = make_union(lsa, ZikaCounterTransformer(), SentimentTransformer())

In [142]:
def _draw_confusion_matrix(ax, matrix, classes, title):
    """Render a confusion matrix as an annotated heat map on ``ax``.

    ax      -- matplotlib Axes to draw on
    matrix  -- 2-D array of counts as returned by sklearn's confusion_matrix
               (rows are true labels, columns are predicted labels)
    classes -- class labels in the same order as the matrix rows/columns
    title   -- title placed above the panel
    """
    image = ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(image, ax=ax)
    ticks = np.arange(len(classes))
    tick_text = [str(label) for label in classes]
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_text)
    ax.set_yticks(ticks)
    ax.set_yticklabels(tick_text)
    # Use a light font on dark cells so every count stays legible.
    cutoff = matrix.max() / 2.0
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            ax.text(col, row, str(matrix[row, col]), ha='center', va='center',
                    color='white' if matrix[row, col] > cutoff else 'black')
    ax.set_title(title)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')


# Fit every candidate algorithm on the training split, report its metrics on the
# held-out split, and draw PR curve, ROC curve and confusion matrix side by side.
# print(...) with a single argument behaves the same on Python 2 and Python 3.

# Sorted labels match the row/column order confusion_matrix uses; they do not
# change between iterations, so compute them once.
class_names = sorted(df.zika.unique())

for name, algorithm in algorithms.items():
    # Data Pipeline: text clean-up steps, then feature extraction, then the classifier
    pipeline = make_pipeline(AsciiTransformer(),
                             LowerCaseTransformer(),
                             RemoveSymsTransformer(),
                             RemoveStopWordsTransformer(),
                             feature_union,
                             algorithm)
    # Train Model
    model = pipeline.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)
    y_pred_probs = model.predict_proba(X_test)

    # Metrics (for model evaluation)
    cnf_matrix = confusion_matrix(y_test, y_pred)
    score = accuracy_score(y_test, y_pred)  # accuracy = (correct preds)/(num samples) = (TP+TN)/(TP+TN+FP+FN)
    # Column 1 of predict_proba is scored against the labels encoded as 1.
    precision, recall, threshold_PR = precision_recall_curve(
        encoded_y_test, y_pred_probs[:, 1], pos_label=1)
    fpr, tpr, threshold_ROC = roc_curve(encoded_y_test, y_pred_probs[:, 1], pos_label=1)
    AUC = auc(fpr, tpr)
    print('#' * 90)
    print('\t MODEL:{} \t ACCURACY:{} \t AUC:{}'.format(name, score, AUC))
    print('#' * 90)
    print(classification_report(y_test, y_pred))

    # Plot Figures
    fig, axs = plt.subplots(nrows=1, ncols=3)
    fig.set_figwidth(15)
    fig.set_figheight(5)
    # PR Curve, with the class-prevalence baseline as a dashed reference
    ax = axs[0]
    ax.plot(recall, precision)
    ax.plot([0, 1], [baseline_PR, baseline_PR], '--r')
    ax.set_title('Precision-Recall ({})'.format(name))
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.legend(['PR Curve', 'PR Baseline'])
    ax.grid(True)
    # ROC Curve, with the chance diagonal as a dashed reference
    ax = axs[1]
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], '--r')
    ax.set_title('ROC Curve ({})'.format(name))
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(['ROC Curve', 'ROC Baseline'])
    ax.grid(True)
    # Confusion Matrix: drawn explicitly on the third panel. The import of the
    # project's plot_confusion_matrix helper is commented out in the imports cell,
    # so that name is not guaranteed to exist on a fresh kernel.
    _draw_confusion_matrix(axs[2], cnf_matrix, class_names,
                           'Confusion Matrix ({})'.format(name))
    plt.show()
    plt.close(fig)  # release the figure; one is created per algorithm
    print('')


##########################################################################################
	 MODEL:Logistic_Regression 	 ACCURACY:0.929133858268 	 AUC:0.498171234689
##########################################################################################
             precision    recall  f1-score   support

      False       0.93      1.00      0.96      1298
       True       0.00      0.00      0.00        99

avg / total       0.86      0.93      0.90      1397

##########################################################################################
	 MODEL:Gradient_Boost 	 ACCURACY:0.926270579814 	 AUC:0.525089103672
##########################################################################################
             precision    recall  f1-score   support

      False       0.93      1.00      0.96      1298
       True       0.17      0.01      0.02        99

avg / total       0.88      0.93      0.89      1397

##########################################################################################
	 MODEL:Gauss_Naive_Bayes 	 ACCURACY:0.312097351467 	 AUC:0.470467385722
##########################################################################################
             precision    recall  f1-score   support

      False       0.91      0.29      0.44      1298
       True       0.06      0.65      0.12        99

avg / total       0.85      0.31      0.41      1397

##########################################################################################
	 MODEL:Random_Forest 	 ACCURACY:0.924838940587 	 AUC:0.503498000031
##########################################################################################
             precision    recall  f1-score   support

      False       0.93      0.99      0.96      1298
       True       0.20      0.02      0.04        99

avg / total       0.88      0.92      0.90      1397


OPTIMIZE N PRINCIPAL COMPONENTS


In [116]:
# Sweep the number of SVD components for every algorithm and plot four metrics
# (accuracy, ROC AUC, true positive rate, true negative rate) against it.

# Initiate Figures: one 2x2 grid shared by all algorithms
fig, axs = plt.subplots(nrows=2, ncols=2)
fig.set_figwidth(15)
fig.set_figheight(15)
labels = []  # model names, in plotting order, for the legends

for name, algorithm in algorithms.items():
    labels.append(name)
    scores = []        # accuracy score = (TP+TN)/(TP+TN+FP+FN)
    aucs = []          # area under ROC curve
    tprs = []          # true positive rate = sensitivity = TP/P = TP/(TP+FN)
    tnrs = []          # true negative rate = specificity = TN/N = TN/(TN+FP)
    n_components = []  # number of SVD components tried
    for n in range(10, 501, 10):
        # Data Pipeline (SVD seeded so the sweep is repeatable run to run)
        lsa = make_pipeline(TfidfVectorizer(),
                            TruncatedSVD(n_components=n, random_state=42))
        feature_union = make_union(lsa, ZikaCounterTransformer(), SentimentTransformer())
        pipeline = make_pipeline(AsciiTransformer(),
                                 LowerCaseTransformer(),
                                 RemoveSymsTransformer(),
                                 RemoveStopWordsTransformer(),
                                 feature_union,
                                 algorithm)
        # Train Model
        model = pipeline.fit(X_train, y_train)

        # Make Predictions
        y_pred = model.predict(X_test)
        y_pred_probs = model.predict_proba(X_test)

        # Metrics (for model evaluation)
        score = accuracy_score(y_test, y_pred)
        fpr, tpr, threshold_ROC = roc_curve(encoded_y_test, y_pred_probs[:, 1], pos_label=1)
        AUC = auc(fpr, tpr)
        cnf_matrix = confusion_matrix(y_test, y_pred)
        # Binary confusion matrix is [[TN, FP], [FN, TP]]; flatten it in that order.
        TN, FP, FN, TP = [float(count) for count in cnf_matrix.ravel()]
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)

        # Save Data
        scores.append(score)
        aucs.append(AUC)
        tprs.append(TPR)
        tnrs.append(TNR)
        n_components.append(n)

    # One dashed, marked line per algorithm on each panel (row-major panel order).
    for ax, series in zip(axs.ravel(), (scores, aucs, tprs, tnrs)):
        ax.plot(n_components, series, '--o')

# Titles, axis labels and legends. Nothing is plotted here: every panel already
# holds exactly one line per algorithm, which is what the legend labels refer to.
panel_text = [('Accuracy Score', 'Score'),
              ('Area Under Curve', 'AUC'),
              ('True Positive Rate', 'TPR'),
              ('True Negative Rate', 'TNR')]
for ax, (title, ylabel) in zip(axs.ravel(), panel_text):
    ax.set_title(title)
    ax.set_xlabel('n_component')
    ax.set_ylabel(ylabel)
    ax.legend(labels, loc='best')
    ax.grid(True)


SAVE MODELS


In [117]:
# Refit every algorithm with the full 500-component LSA pipeline and, when
# SAVE_MODELS is switched on, pickle the fitted pipeline to models/MODEL_<name>.plk.
# The flag defaults to False, so nothing is written unless it is changed.
# The '.plk' extension is what the model-loading cell globs for, so it is kept as is.
SAVE_MODELS = False

for name, algorithm in algorithms.items():
    # SVD seeded so the saved model can be reproduced exactly.
    lsa = make_pipeline(TfidfVectorizer(),
                        TruncatedSVD(n_components=500, random_state=42))
    feature_union = make_union(lsa, ZikaCounterTransformer(), SentimentTransformer())
    pipeline = make_pipeline(AsciiTransformer(),
                             LowerCaseTransformer(),
                             RemoveSymsTransformer(),
                             RemoveStopWordsTransformer(),
                             feature_union,
                             algorithm)
    model = pipeline.fit(X_train, y_train)
    if SAVE_MODELS:
        with open('models/MODEL_{}.plk'.format(name), 'wb') as f:
            pickle.dump(model, f)

RUN MODEL


In [143]:
def run_model(model, text, show_proba=False):
    """Print a fitted model's prediction for one piece of text.

    model      -- fitted estimator or pipeline exposing ``predict`` (and
                  ``predict_proba`` when ``show_proba`` is True)
    text       -- a single raw message, given as one string
    show_proba -- when True, also print the two class probabilities rounded
                  to 4 decimal places
    """
    # The pipeline was fitted on a collection of documents. A bare string is
    # itself an iterable of characters, so wrap it to predict on the whole message.
    docs = [text]
    print('Text input: \"{}\"'.format(text))
    print('Prediction: {}'.format(model.predict(docs)[0]))
    if show_proba:
        # assumes the classes are ordered [False, True] in predict_proba's columns
        # (the sorted label order) -- TODO confirm against model.classes_
        probs = model.predict_proba(docs)[0]
        print('Probability the model thinks you do NOT have the Zika Virus (FALSE): {}'.format(round(probs[0], 4)))
        print('Probability the model thinks you DO have the Zika Virus (TRUE): {}'.format(round(probs[1], 4)))
    print('')

In [144]:
# Portuguese probe sentences, each with its English meaning alongside.
sentences = [
    'Eu tenho o vírus zika',      # I have the zika virus
    'Eu nao tenho o virus zika',  # I do not have the zika virus
    'Tenho febre, erupções cutâneas, dores nas articulações e olhos vermelhos.',  # I have a fever, rash, joint pain, and red eyes.
    'Estou completamente saudável',  # I am completely healthy
]
# Individual names kept for cells that refer to a single sentence.
sent1, sent2, sent3, sent4 = sentences

In [145]:
# Load every pickled pipeline under models/ and print its prediction for each
# probe sentence. glob returns paths in arbitrary order, so sort them to keep
# the printed report in a stable order.
file_names = sorted(glob('models/MODEL_*.plk'))
if not file_names:
    print('No saved models found matching models/MODEL_*.plk')

for name in file_names:
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # were produced by this notebook or come from a trusted source.
    with open(name, 'rb') as f:
        model = pickle.load(f)
    print('#' * 90)
    print('\t \t \t {}'.format(name))
    print('#' * 90)
    for sent in sentences:
        run_model(model, sent)


##########################################################################################
	 	 	 models/MODEL_Gauss_Naive_Bayes.plk
##########################################################################################
Text input: "Eu tenho o vírus zika"
Prediction: True

Text input: "Eu nao tenho o virus zika"
Prediction: True

Text input: "Tenho febre, erupções cutâneas, dores nas articulações e olhos vermelhos."
Prediction: False

Text input: "Estou completamente saudável"
Prediction: False

##########################################################################################
	 	 	 models/MODEL_Gradient_Boost.plk
##########################################################################################
Text input: "Eu tenho o vírus zika"
Prediction: False

Text input: "Eu nao tenho o virus zika"
Prediction: False

Text input: "Tenho febre, erupções cutâneas, dores nas articulações e olhos vermelhos."
Prediction: False

Text input: "Estou completamente saudável"
Prediction: False

##########################################################################################
	 	 	 models/MODEL_Logistic_Regression.plk
##########################################################################################
Text input: "Eu tenho o vírus zika"
Prediction: False

Text input: "Eu nao tenho o virus zika"
Prediction: False

Text input: "Tenho febre, erupções cutâneas, dores nas articulações e olhos vermelhos."
Prediction: False

Text input: "Estou completamente saudável"
Prediction: False

##########################################################################################
	 	 	 models/MODEL_Random_Forest.plk
##########################################################################################
Text input: "Eu tenho o vírus zika"
Prediction: False

Text input: "Eu nao tenho o virus zika"
Prediction: False

Text input: "Tenho febre, erupções cutâneas, dores nas articulações e olhos vermelhos."
Prediction: False

Text input: "Estou completamente saudável"
Prediction: False

ANALYSIS

From the examples above, it is clear that the models cannot tell whether someone has Zika based on that person's text message.

So, why is that? I have spent some time thinking about whether it is possible at all. It is possible, but we will need more data, more sophisticated feature extraction, and/or more intricate models (such as a convolutional neural network), and that will take more time and effort. As a quick sanity check, let's look at the semantic relationships between words by converting the words into vectors (Word2Vec). The resulting word vectors can be visualized with t-SNE, a method for visualizing high-dimensional spaces.

Check out Word2Vec_model.ipynb to see the semantic relationships learned from various example corpora.