In [1]:
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')
start_time = time.time()
# skip the handful of malformed rows in the CSV instead of raising
dataset = pd.read_csv('data/Sentiment Analysis Dataset.csv', error_bad_lines=False)
# drop the columns we will not use
del dataset['ItemID']
del dataset['SentimentSource']
elapsed_time = time.time() - start_time
print elapsed_time
print dataset.shape
print len(dataset)
dataset.head()
Out[1]:
In [2]:
# work on a reference to the full dataset (note: this is a reference, not a copy)
test_train = dataset
test_train.head()
Out[2]:
In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import HTMLParser # In Python 3.4+ import html
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
ps = PorterStemmer()
def Clean(unescaped_tweet):
    '''Take a raw tweet and return a list of lower-cased word tokens.'''
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenize on word characters
    cleaned_tweet_tokens = tokenizer.tokenize(unescaped_tweet.lower())
    # optional: remove stop words
    #cleaned_tweet_tokens = [word for word in cleaned_tweet_tokens if word not in stopwords.words('english')]
    # optional: stem each token
    #cleaned_tweet_tokens = [ps.stem(w) for w in cleaned_tweet_tokens]
    return cleaned_tweet_tokens
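A quick sanity check of Clean on a made-up tweet (the string below is purely illustrative, not from the dataset):
In [ ]:
# illustrative only: Clean lowercases the tweet and keeps runs of word characters
print(Clean('I LOVE this <3 #happy http://t.co/xyz'))
# expected: ['i', 'love', 'this', '3', 'happy', 'http', 't', 'co', 'xyz']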
In [4]:
# start_time = time.time()
# test_train['token'] = test_train['SentimentText'].apply(lambda tweet: Clean(tweet))
# test_train.head()
# elapsed_time = time.time() - start_time
# print elapsed_time
In [5]:
test_train.head()
Out[5]:
In [ ]:
In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
import nltk
stemmer = SnowballStemmer('english')
analyzer = TfidfVectorizer().build_analyzer()
def stemmed_words(doc):
    '''Analyzer that lowercases/tokenizes a document and stems every token.'''
    return (stemmer.stem(w) for w in analyzer(doc))

vect = TfidfVectorizer(analyzer=stemmed_words,
                       # note: because a callable analyzer is supplied, scikit-learn ignores
                       # the tokenizer and stop_words arguments below
                       tokenizer=nltk.tokenize.casual.TweetTokenizer().tokenize,
                       stop_words='english',
                       #min_df = 0.001,  # drop words appearing in fewer than 0.1% of tweets
                       #max_df = 0.1     # drop words appearing in more than 10% of tweets
                       )
#test stemmer
#print(vect.fit_transform(sm_set.head()[:10]))
#print(vect.get_feature_names())
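A minimal check of the stemming analyzer on a throwaway sentence (illustrative only; the exact stems depend on the Snowball rules):
In [ ]:
# illustrative: the default analyzer lowercases and tokenizes, then each token is stemmed
print(list(stemmed_words('loving these stemmed features')))
# e.g. 'loving' -> 'love', 'stemmed' -> 'stem', 'features' -> 'featur'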
In [6]:
from sklearn.utils import shuffle
In [11]:
# keep only a random sample of 100,000 shuffled tweets
sm_set = pd.DataFrame(shuffle(test_train)[:100000]).reset_index(drop=True)
In [12]:
sm_set.SentimentText.head()
Out[12]:
In [14]:
# based on the text of each tweet, create a (sparse) matrix containing the occurrences of each word and store it in X.
# this is going to be our feature matrix, which we will feed to the classifier to "learn" the sentiment.

# use the full dataset here (this overrides the 100,000-tweet sample above)
sm_set = test_train  #[:100000]
start_time = time.time()
# fit_transform builds the vocabulary and returns the tf-idf feature matrix of the tweets
X = vect.fit_transform(sm_set.SentimentText)
y = sm_set.Sentiment
elapsed_time = time.time() - start_time
print elapsed_time, 'sec to fit transform', len(sm_set), 'samples'
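As a toy illustration of what fit_transform produces (the strings below are made up, not from the dataset):
In [ ]:
# toy example: fit_transform returns a sparse (n_samples x n_features) tf-idf matrix
toy = TfidfVectorizer().fit_transform(['good day', 'bad day'])
print(toy.shape)       # (2, 3): vocabulary is ['bad', 'day', 'good']
print(toy.toarray())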
In [ ]:
In [15]:
print len(vect.get_feature_names())
print vect.get_feature_names()
In [16]:
from sklearn.model_selection import train_test_split
In [17]:
#split the dataset in a training (X_train, y_train) and test dataset (X_test,y_test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
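A quick check of the resulting split sizes (sketch):
In [ ]:
# sketch: number of tweets in the training and test splits
print('%i train / %i test' % (X_train.shape[0], X_test.shape[0]))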
In [ ]:
In [ ]:
In [18]:
from sklearn.naive_bayes import MultinomialNB
In [19]:
#train the classifier
start_time = time.time()
clf = MultinomialNB()
clf.fit(X_train, y_train)
elapsed_time = time.time() - start_time
print elapsed_time
In [20]:
# print roughly 40 evenly spaced feature names (integer division under Python 2)
print vect.get_feature_names()[::len(vect.get_feature_names())/40]
In [21]:
len(vect.get_feature_names())
Out[21]:
In [22]:
vect
Out[22]:
In [23]:
from sklearn.metrics import classification_report
In [24]:
print "Results for %i training samples and %i test samples (trained on %f sec)" %(len(y_train),len(y_test),elapsed_time)
print classification_report(y_test,clf.predict(X_test))
In [ ]:
By now, vect has a vocabulary that includes every word in the corpus; we will trim it with a statistical feature-selection method such as chi2.
In [ ]:
In [25]:
from sklearn.feature_selection import chi2,f_classif,SelectPercentile
In [26]:
import matplotlib.pyplot as plt
In [27]:
y_train_bool = map(lambda x: x == 1, y_train)  # boolean labels (under Python 2, map returns a list)
In [28]:
pd.Series(f_classif(X_train,y_train_bool)[0]).plot()  # ANOVA F-scores per feature
plt.show()
In [29]:
pd.Series(f_classif(X_train,y_train_bool)[1]).plot()  # corresponding p-values per feature
plt.show()
In [ ]:
In [ ]:
In [30]:
# keep only the top 1% of features ranked by chi2 score
selector = SelectPercentile(chi2, percentile=1)
In [31]:
selector.fit(X_train,y_train)
Out[31]:
In [32]:
clf.fit(selector.transform(X_train),y_train)
Out[32]:
In [33]:
#predict
results = clf.predict(selector.transform(X_test))
print classification_report(y_test,results)
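To see what the chi2 trimming actually kept, we can (as a sketch) map the selector's support mask back onto the vectorizer's feature names:
In [ ]:
# sketch: list a few of the terms kept by the fitted chi2 selector
import numpy as np
feature_names = np.array(vect.get_feature_names())
kept = feature_names[selector.get_support()]
print(len(kept))
print(kept[:20])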
In [ ]:
In [ ]:
Logistic Regression
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
clf = LogisticRegression(penalty='l2')
In [ ]:
# fit a feature selector on our data (keep the top 10% of features by chi2 score)
selector = SelectPercentile(chi2, percentile=10)
selector.fit(X_train,y_train)
In [ ]:
#fit the model
clf.fit(selector.transform(X_train),y_train)
In [ ]:
#predict
results = clf.predict(selector.transform(X_test))
print classification_report(y_test,results)
In [ ]:
In [ ]:
sample_text = ['Aris is a bit dubtful about me being a smart ass',
               'aris doesnt love sklearn yet',
               'but he will definitely love it soon',
               'fuck', 'bad', 'amazing', 'this is a sentence',
               'this is a bad sentence']
In [ ]:
vect.transform(sample_text)
In [ ]:
# clf was last fitted on the chi2-selected features, so the selector must be applied here too
predictions = clf.predict(selector.transform(vect.transform(sample_text)))
for sent, pred in zip(sample_text, predictions):
    print sent, pred
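Both classifiers used here also expose predict_proba, so we can print a class-probability estimate next to each sample sentence (sketch, assuming clf and selector are fitted as above):
In [ ]:
# sketch: class probabilities for the sample sentences
probs = clf.predict_proba(selector.transform(vect.transform(sample_text)))
for sent, p in zip(sample_text, probs):
    print('%s %s' % (sent, p))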
In [ ]:
In [ ]:
In [405]:
time.localtime()[1:5]
Out[405]:
In [278]:
#save the clf classifier in a file to load it in a different notebook/at a different time
timestr = "%i-%i_%i,%i"%time.localtime()[1:5]
from sklearn.externals import joblib
joblib.dump(clf, 'trained models/'+'descr'+timestr+'.pkl')
joblib.dump(vect, 'trained models/vect'+'descr'+timestr+'.pkl')
#load it later with:
#clf = joblib.load('NaiveBayesCl_67k_tweets.pkl')
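A sketch of reloading the persisted objects (it reuses the timestamped names produced above; note that the chi2 selector would also need to be dumped and loaded to reproduce the full pipeline):
In [ ]:
# sketch: reload the saved vectorizer and classifier using the same timestamped names
clf_loaded = joblib.load('trained models/'+'descr'+timestr+'.pkl')
vect_loaded = joblib.load('trained models/vect'+'descr'+timestr+'.pkl')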
In [ ]:
import pickle
s = pickle.dumps(vect)
vec2 = pickle.loads(s)
In [ ]: