In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import PunktSentenceTokenizer
import time
import seaborn as sb
%matplotlib inline
stemmer = snowball.SnowballStemmer("english")
In [29]:
def build_roc(df):
    """Plot an ROC curve from grid-search results and report the trapezoidal AUC."""
    df = df.copy()  # avoid mutating the caller's (possibly sliced) frame
    df['TPR'] = df['recall']
    df['FPR'] = df['FP'] / (df['FP'] + df['TN'])
    plt.figure()
    # plt.plot([0, 1], [0, 1], 'k', linewidth=0.5)  # optional chance diagonal
    plt.plot(df.FPR.values, df.TPR.values, 'r*', markersize=7)
    plt.xlabel('FPR')
    plt.xlim([0, 1])
    plt.ylabel('TPR')
    plt.ylim([0, 1])
    auc_val = np.trapz(df.TPR.values[::-1], x=df.FPR.values[::-1])
    titlestr = "AUC: {} k = {}".format(auc_val, int(df.k.unique()[0]))
    plt.title(titlestr)
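As a cross-check on the hand-rolled trapezoid integration, the same AUC can be computed with scikit-learn; a minimal sketch, assuming the grid-search frame carries the same recall/FP/TN columns used in build_roc (check_auc is not part of the original notebook):

from sklearn.metrics import auc

def check_auc(df):
    # Rebuild the FPR/TPR points exactly as build_roc does.
    fpr = (df['FP'] / (df['FP'] + df['TN'])).values
    tpr = df['recall'].values
    # sklearn's auc() requires x to be monotonic, so sort by FPR first.
    order = np.argsort(fpr)
    return auc(fpr[order], tpr[order])

For a given k, check_auc(df2[df2['k'] == 11]) should closely agree with the AUC printed in the build_roc title.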
In [30]:
resultspath2 = '../../data/gridsearch_modelbase2mini_on_test.csv'
df2 = pd.read_csv(resultspath2)
In [39]:
df2.head()
Out[39]:
In [34]:
build_roc(df2[df2['k']==11])
In [4]:
resultspath = '../../data/twitter_test_xgboost_doc2vec_results.csv'
dresults = pd.read_csv(resultspath)
In [51]:
plt.figure()
yvals = df2['recall']
xvals = df2['FP']/(df2['FP'] + df2['TN'])
plt.plot(xvals,yvals,'r',linewidth=2,label='doc2vec')
plt.plot(FPR,TPR,'b',linewidth=2,label='TFIDF')
plt.xlabel('False Positive Rate')
plt.xlim([0,1])
plt.ylabel('True Positive Rate')
plt.ylim([0,1])
plt.legend()
doc2vecAUC = np.round(np.trapz(yvals[::-1],x=xvals[::-1]),decimals=2)
TFIDF_AUC = np.round(np.trapz(TPR[::-1],x=FPR[::-1]),decimals=2)
titlestr = "doc2vec AUC: {} | TFIDF AUC: {}".format(doc2vecAUC,TFIDF_AUC)
plt.title(titlestr)
Out[51]:
In [5]:
labels = dresults['label'].values
TPR = []
FPR = []
for i in range(101):
    threshold = i / 100.0
    predict = dresults['xgboost_predict'].values >= threshold
    # predict is boolean and labels are 0/1, so sums/differences pick out each confusion cell
    TP = sum(predict + labels == 2)
    TN = sum(predict + labels == 0)
    FP = sum(predict - labels == 1)
    FN = sum(predict - labels == -1)
    # print("accuracy: {} | threshold {}".format((TP + TN) / float(len(labels)), threshold))
    TPR.append(TP / float(TP + FN))
    FPR.append(FP / float(FP + TN))
plt.plot(FPR, TPR, 'r*', markersize=7)
plt.xlabel('FPR')
plt.xlim([0, 1])
plt.ylabel('TPR')
plt.ylim([0, 1])
titlestr = "AUC: {}".format(np.trapz(TPR[::-1], x=FPR[::-1]))
plt.title(titlestr)
Out[5]:
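The same curve can also come straight from the predicted scores; a minimal sketch, assuming xgboost_predict holds probabilities in [0, 1] and label is 0/1 ground truth:

from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(dresults['label'], dresults['xgboost_predict'])
print("sklearn AUC: {:.3f}".format(roc_auc_score(dresults['label'], dresults['xgboost_predict'])))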
In [6]:
#output from test_model.py on test set with k = 11 and threshold = 0.63
# accuracy: 0.794011976048
# recall: 0.781922525108
# precision: 0.801863658656
# TP: 1635
# TN: 1680
# FN: 456
# FP: 404
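Those metrics are internally consistent with the reported confusion counts, which a quick check confirms:

TP, TN, FN, FP = 1635, 1680, 456, 404
print("accuracy:  {:.6f}".format((TP + TN) / float(TP + TN + FP + FN)))  # 0.794012
print("recall:    {:.6f}".format(TP / float(TP + FN)))                   # 0.781923
print("precision: {:.6f}".format(TP / float(TP + FP)))                   # 0.801864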
In [ ]:
In [7]:
def myfunc(x):
    # threshold value chosen from the cross-validation set analysis
    threshold = 0.41
    if x >= threshold:
        return 1
    else:
        return 0
In [8]:
df = dresults.copy()
df['xgbfinal4_predict'] = df['xgboost_predict'].map(myfunc)
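The element-wise function is equivalent to a single vectorized comparison; a sketch, assuming the same 0.41 threshold:

df['xgbfinal4_predict'] = (df['xgboost_predict'] >= 0.41).astype(int)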
In [9]:
df.head()
Out[9]:
In [10]:
# sum of 0 means label and both predictions are all 0: both models correct on a good tweet
df_agree_good = df[(df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']) == 0]
In [18]:
x = df_agree_good['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [12]:
# sum of 3 means label and both predictions are all 1: both models correct on a bad tweet
df_agree_bad = df[(df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']) == 3]
In [19]:
x = df_agree_bad['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [14]:
# label is 0 but both models predict 1: good tweets both models wrongly flag
df_missed_good = df[(df['label']==0) & (df['xgbfinal4_predict']==1) & (df['doc2vec_predict']==1)]
In [20]:
x = df_missed_good['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [16]:
# label is 1 but both models predict 0: bad tweets both models miss
df_missed_bad = df[(df['label']==1) & (df['xgbfinal4_predict']==0) & (df['doc2vec_predict']==0)]
In [21]:
x = df_missed_bad['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [22]:
# doc2vec matches the label where the xgboost model does not
df_doc2vec_win = df[(df['label']==df['doc2vec_predict']) & (df['xgbfinal4_predict']!=df['label'])]
In [26]:
x = df_doc2vec_win['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [24]:
# the xgboost model matches the label where doc2vec does not
df_doc2vec_lose = df[(df['label']!=df['doc2vec_predict']) & (df['xgbfinal4_predict']==df['label'])]
In [28]:
x = df_doc2vec_lose['tweet_text'].values
print(len(x))
print("")
for i in range(5):
    print(x[i])
    print("")
In [ ]: