In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import PunktSentenceTokenizer
import time
%matplotlib inline
stemmer = snowball.SnowballStemmer("english")
In [2]:
df = pd.read_csv('../../data/twitter_cross_val_xgboost_doc2vec_results.csv')
In [3]:
df.head()
Out[3]:
In [4]:
def myfunc(x):
    # Binarize the raw xgboost score at the chosen probability cutoff.
    threshold = 0.41
    if x >= threshold:
        return 1
    else:
        return 0
In [5]:
df['xgbfinal4_predict'] = df['xgboost_predict'].map(myfunc)
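The 0.41 cutoff above is taken as given. As a hedged sketch (not the original selection procedure), one way such a threshold could be chosen is to sweep candidate cutoffs over the cross-validated xgboost scores and keep the value that maximizes F1 against the true labels:
In [ ]:
from sklearn.metrics import f1_score

# Illustrative threshold sweep; assumes 'label' is the ground truth (0 = good, 1 = bad)
# and 'xgboost_predict' holds the raw cross-validation scores.
candidate_thresholds = np.arange(0.05, 0.96, 0.01)
f1_scores = [f1_score(df['label'], (df['xgboost_predict'] >= t).astype(int))
             for t in candidate_thresholds]
print(candidate_thresholds[int(np.argmax(f1_scores))])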
In [6]:
df.head()
Out[6]:
In [12]:
df['doc2vec_predict'] = df['doc2vec_predict'].astype(bool)
In [13]:
df_agree_good = df[(df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']) == 0]
In [25]:
x = df_agree_good['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()
In [14]:
df_agree_bad = df[(df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']) == 3]
In [26]:
x = df_agree_bad['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()
In [35]:
df_missed_good = df[(df['label']==0) & (df['xgbfinal4_predict']==1) & (df['doc2vec_predict']==1)]
In [36]:
x = df_missed_good['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()
In [37]:
df_missed_bad = df[(df['label']==1) & (df['xgbfinal4_predict']==0) & (df['doc2vec_predict']==0)]
In [38]:
x = df_missed_bad['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()
In [43]:
df_doc2vec_win = df[(df['label']==df['doc2vec_predict']) & (df['xgbfinal4_predict']!=df['label'])]
In [44]:
x = df_doc2vec_win['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()
In [45]:
df_doc2vec_lose = df[(df['label']!=df['doc2vec_predict']) & (df['xgbfinal4_predict']==df['label'])]
In [46]:
x = df_doc2vec_lose['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print()