In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import cPickle as pickle
from string import punctuation
from nltk import word_tokenize
from nltk.stem import snowball
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import PunktSentenceTokenizer
import time
%matplotlib inline
# English Snowball stemmer. NOTE(review): no cell visible in this notebook
# references `stemmer`; confirm whether it is still needed.
stemmer = snowball.SnowballStemmer("english")


/home/mgupta/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# Load the saved results table (tweet text, label, and the XGBoost / doc2vec
# prediction columns shown in the preview below).
results_csv = '../../data/twitter_cross_val_xgboost_doc2vec_results.csv'
df = pd.read_csv(results_csv)

In [3]:
# Preview the first five rows of the loaded results.
df.head(n=5)


Out[3]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 tweet_text label xgboost_predict doc2vec_predict
0 0 12168 12168 I love how Jayden acts like we have no right t... 1 0.409886 0.0
1 1 13497 13497 I been POPPIN since kindergarten nigga you a l... 1 0.509634 1.0
2 2 10058 10058 Like a real life mean person could make you cr... 0 0.329897 0.0
3 3 4559 4559 Which fags are getting down and tributing Xmas... 1 0.527562 0.0
4 4 5514 5514 From last night...my thoughts on J-Up and what... 0 0.339007 0.0

In [4]:
def myfunc(x, threshold=0.41):
    """Binarize a predicted score against a decision threshold.

    Parameters
    ----------
    x : number
        Score to classify.
    threshold : number, optional
        Cut-off; scores at or above it map to 1. Defaults to 0.41, the
        value that was previously hard-coded in the function body.

    Returns
    -------
    int
        1 if ``x >= threshold``, otherwise 0. A NaN score compares false
        against any threshold and therefore yields 0.
    """
    return 1 if x >= threshold else 0

In [5]:
# Turn the XGBoost scores into 0/1 predictions by applying myfunc to each value.
df['xgbfinal4_predict'] = df['xgboost_predict'].map(myfunc)

In [6]:
# Preview the first five rows again, now including xgbfinal4_predict.
df.head(n=5)


Out[6]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 tweet_text label xgboost_predict doc2vec_predict xgbfinal4_predict
0 0 12168 12168 I love how Jayden acts like we have no right t... 1 0.409886 0.0 0
1 1 13497 13497 I been POPPIN since kindergarten nigga you a l... 1 0.509634 1.0 1
2 2 10058 10058 Like a real life mean person could make you cr... 0 0.329897 0.0 0
3 3 4559 4559 Which fags are getting down and tributing Xmas... 1 0.527562 0.0 1
4 4 5514 5514 From last night...my thoughts on J-Up and what... 0 0.339007 0.0 0

In [12]:
# Convert each doc2vec prediction to a Python bool (overwrites the column).
df['doc2vec_predict'] = df['doc2vec_predict'].map(bool)

In [13]:
# Rows whose label plus both predictions sum to 0. Assuming each of the three
# columns is 0/1 (or False/True) as in the preview, that means all three are 0.
agreement_total = df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']
df_agree_good = df.loc[agreement_total == 0]

In [25]:
# Print up to five tweets from df_agree_good, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_agree_good['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


Like a real life mean person could make you cry right now

From last night...my thoughts on J-Up and what I'd give him as a free agent. https://t.co/NwzUJlvn1v

You're truly amiable @Harry_Styles.
2 days until Christmas, may you please
follow @hsviola to make it exquisite? Stay well pal.. x1

@paranoiapupz never said it was haha

@shailenewoodley are you writing a book?


In [14]:
# Rows whose label plus both predictions sum to 3. Assuming each of the three
# columns is 0/1 (or False/True) as in the preview, that means all three are 1.
agreement_total = df['label'] + df['xgbfinal4_predict'] + df['doc2vec_predict']
df_agree_bad = df.loc[agreement_total == 3]

In [26]:
# Print up to five tweets from df_agree_bad, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_agree_bad['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


I been POPPIN since kindergarten nigga you a lame to me �_�__崁��

@Brady_Cutler What're you a fuckin' faggot?

Haha bitch i will never forgive the faggots who betray me #neverforgive2015 #sandrabland https://t.co/4qty8ZPPAs

@AddictedtoPesos twitter bash of the faggots

I just had a back and forth with someone who actually tweeted 'I hate fat bitches'.....conclusion: I am an idiot


In [35]:
# Rows labelled 0 that both the thresholded XGBoost column and the doc2vec
# column mark as 1.
label_is_0 = df['label'] == 0
xgb_says_1 = df['xgbfinal4_predict'] == 1
doc2vec_says_1 = df['doc2vec_predict'] == 1
df_missed_good = df.loc[label_is_0 & xgb_says_1 & doc2vec_says_1]

In [36]:
# Print up to five tweets from df_missed_good, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_missed_good['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


@wolfmikey_ WOW UPDATE SHES LEAVING AND HE CALLED HER WHITE TRASH AND SAID WHERE R U GOING LMAO

He looks so damn hot help https://t.co/oh6iYfz5d0

@BDUTT @sardesairajdeep would tweet like "Haha..All is under control " 
but the chinks in the armour quite visible https://t.co/sYnBIqUa7G

Going to miss my first LSU basketball game tonight in a long time tonight. I blame the weather.

I WAKE UP WITH THA BONG NEXT TO MY BED LIKE A TRUE WHITE BOY


In [37]:
# Rows labelled 1 that both the thresholded XGBoost column and the doc2vec
# column mark as 0.
label_is_1 = df['label'] == 1
xgb_says_0 = df['xgbfinal4_predict'] == 0
doc2vec_says_0 = df['doc2vec_predict'] == 0
df_missed_bad = df.loc[label_is_1 & xgb_says_0 & doc2vec_says_0]

In [38]:
# Print up to five tweets from df_missed_bad, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_missed_bad['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


I love how Jayden acts like we have no right to be mad and we are "to far up Ashton's ass"

When people act gay and say no homo _�㢉�_�ۢ https://t.co/DMpGWqWt7B

@hemant_batra Because Dubai is full of Indians Saudi is the hot destination for #TortureTourism @SushmaSwaraj @dhanyarajendran

I don't trust dykes

hyporcrits who bash queers but pretend to be about lgbt if it pisses the gop off. rape culture fans etc @SedaryRaymaker @scootey @KailiJoy


In [43]:
# Rows where the doc2vec prediction equals the label but the thresholded
# XGBoost prediction does not.
doc2vec_matches = df['label'] == df['doc2vec_predict']
xgb_differs = df['xgbfinal4_predict'] != df['label']
df_doc2vec_win = df.loc[doc2vec_matches & xgb_differs]

In [44]:
# Print up to five tweets from df_doc2vec_win, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_doc2vec_win['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


I be cuffin these hoes, I don't be smashing these hoes.

it's just dark and the stuff on the chair is white rip

Cuffin these hoes thats where u went bad at

@Visit_Wakefield He looks nothing like him!

The chinks in the mall great at repair _�㢉ۢ���


In [45]:
# Rows where the doc2vec prediction differs from the label but the
# thresholded XGBoost prediction equals it.
doc2vec_differs = df['label'] != df['doc2vec_predict']
xgb_matches = df['xgbfinal4_predict'] == df['label']
df_doc2vec_lose = df.loc[doc2vec_differs & xgb_matches]

In [46]:
# Print up to five tweets from df_doc2vec_lose, each followed by a blank line.
# Slicing (instead of indexing positions 0..4) avoids an IndexError when
# fewer than five rows matched the filter.
x = df_doc2vec_lose['tweet_text'].values
for tweet in x[:5]:
    print(tweet)
    print("")


Which fags are getting down and tributing Xmas presents! Prove urself that u wanna be owned Hey look at: https://t.co/owhnInz7cJ @Amazon

I hate when u know something is wrong but they won't tell u. You just feel so unhelpful

@thehill @freegalt Hillary is the EPITOME of WHITE TRASH!

so all these fucking adults are like "fuck yeah! snakes and shit!" so off we go

s/o to the undercover cop in shorewood for pulling me over to give me a Dunkin' giftcard���_�̪叁��_�̪� https://t.co/LwazeDTYhS


In [ ]:


In [ ]: