Problem Statement: Classify text as +ve or -ve sentiment



In [24]:

    
#Reference: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
import nltk
import pandas as pd
import numpy as np
import scipy
import pickle

# Lets specify some positive tweets for evaluation purpose
pos_tweets = [('I love this car', 'positive'), ('This view is amazing', 'positive'), 
              ('I feel great this morning', 'positive'), ('I am so excited about the concert', 'positive'),
              ('He is my best friend', 'positive'), ('The beer is good', 'positive'), 
              ('I do love ice-cream', 'positive'), ('morning is good', 'positive'), ('welcome morning', 'positive')]

# Similary some negative tweets
neg_tweets = [('I do not like this car', 'negative'), ('This view is horrible', 'negative'), 
              ('I am not looking forward to the party', 'negative'),
              ('He is my enemy', 'negative'), ('very annoying', 'negative')]



In [25]:

    
# 1. Transform the array of positive and negative tweets to a tuple2 (tweet, sentiment)
# 2. Filtered some stop words, where word length < 3
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
print(pd.DataFrame(tweets))









    



                                      0         1
0                     [love, this, car]  positive
1                 [this, view, amazing]  positive
2          [feel, great, this, morning]  positive
3        [excited, about, the, concert]  positive
4                        [best, friend]  positive
5                     [the, beer, good]  positive
6                     [love, ice-cream]  positive
7                       [morning, good]  positive
8                    [welcome, morning]  positive
9                [not, like, this, car]  negative
10               [this, view, horrible]  negative
11  [not, looking, forward, the, party]  negative
12                              [enemy]  negative
13                     [very, annoying]  negative



In [26]:

    
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      # append words to a list
      all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # make use of nltk.FreqDist function to compute TF
    w_l = nltk.FreqDist(wordlist)
    return w_l



In [27]:

    
word_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features



In [28]:

    
unique_word_list = np.unique(get_words_in_tweets(tweets))
print('Unique words in the corpus: {}'.format(unique_word_list))
print('Count of unique words in corpus: {}'.format(len(unique_word_list)))
print(pd.DataFrame(word_features.most_common(50)))









    



Unique words in the corpus: ['about' 'amazing' 'annoying' 'beer' 'best' 'car' 'concert' 'enemy'
 'excited' 'feel' 'forward' 'friend' 'good' 'great' 'horrible' 'ice-cream'
 'like' 'looking' 'love' 'morning' 'not' 'party' 'the' 'this' 'very' 'view'
 'welcome']
Count of unique words in corpus: 27
            0  1
0        this  5
1     morning  3
2         the  3
3        love  2
4        good  2
5         not  2
6         car  2
7        view  2
8     concert  1
9        feel  1
10  ice-cream  1
11   annoying  1
12       best  1
13    amazing  1
14    looking  1
15       beer  1
16   horrible  1
17    forward  1
18      party  1
19    excited  1
20     friend  1
21       very  1
22    welcome  1
23      about  1
24      enemy  1
25      great  1
26       like  1



In [29]:

    
training_set = nltk.classify.apply_features(extract_features, tweets)



In [44]:

    
print(training_set)









    



[({'contains(looking)': False, 'contains(not)': False, 'contains(excited)': False, 'contains(view)': False, 'contains(welcome)': False, 'contains(forward)': False, 'contains(ice-cream)': False, 'contains(love)': True, 'contains(enemy)': False, 'contains(very)': False, 'contains(horrible)': False, 'contains(beer)': False, 'contains(party)': False, 'contains(about)': False, 'contains(concert)': False, 'contains(feel)': False, 'contains(like)': False, 'contains(annoying)': False, 'contains(great)': False, 'contains(the)': False, 'contains(friend)': False, 'contains(morning)': False, 'contains(best)': False, 'contains(good)': False, 'contains(this)': True, 'contains(car)': True, 'contains(amazing)': False}, 'positive'), ({'contains(looking)': False, 'contains(not)': False, 'contains(excited)': False, 'contains(view)': True, 'contains(welcome)': False, 'contains(forward)': False, 'contains(ice-cream)': False, 'contains(love)': False, 'contains(enemy)': False, 'contains(very)': False, 'contains(horrible)': False, 'contains(beer)': False, 'contains(party)': False, 'contains(about)': False, 'contains(concert)': False, 'contains(feel)': False, 'contains(like)': False, 'contains(annoying)': False, 'contains(great)': False, 'contains(the)': False, 'contains(friend)': False, 'contains(morning)': False, 'contains(best)': False, 'contains(good)': False, 'contains(this)': True, 'contains(car)': False, 'contains(amazing)': True}, 'positive'), ...]



In [10]:

    
classifier = nltk.NaiveBayesClassifier.train(training_set)



In [22]:

    
print(classifier.show_most_informative_features(25))









    



Most Informative Features
       contains(concert) = True           negati : positi =      1.7 : 1.0
           contains(car) = True           negati : positi =      1.7 : 1.0
          contains(view) = True           negati : positi =      1.7 : 1.0
           contains(not) = False          positi : negati =      1.6 : 1.0
       contains(morning) = False          negati : positi =      1.4 : 1.0
          contains(like) = False          positi : negati =      1.3 : 1.0
      contains(annoying) = False          positi : negati =      1.3 : 1.0
       contains(looking) = False          positi : negati =      1.3 : 1.0
         contains(enemy) = False          positi : negati =      1.3 : 1.0
       contains(forward) = False          positi : negati =      1.3 : 1.0
          contains(very) = False          positi : negati =      1.3 : 1.0
      contains(horrible) = False          positi : negati =      1.3 : 1.0
          contains(good) = False          negati : positi =      1.2 : 1.0
          contains(love) = False          negati : positi =      1.2 : 1.0
          contains(this) = True           negati : positi =      1.2 : 1.0
       contains(concert) = False          positi : negati =      1.1 : 1.0
           contains(car) = False          positi : negati =      1.1 : 1.0
          contains(view) = False          positi : negati =      1.1 : 1.0
          contains(this) = False          positi : negati =      1.1 : 1.0
          contains(beer) = False          negati : positi =      1.1 : 1.0
       contains(excited) = False          negati : positi =      1.1 : 1.0
         contains(about) = False          negati : positi =      1.1 : 1.0
     contains(ice-cream) = False          negati : positi =      1.1 : 1.0
       contains(welcome) = False          negati : positi =      1.1 : 1.0
       contains(amazing) = False          negati : positi =      1.1 : 1.0
None



In [12]:

    
# A positive example
tweet = 'Larry is my friend'
transformed_features = extract_features(tweet.split())
print (pd.DataFrame(transformed_features.items()))
print classifier.classify(extract_features(tweet.split()))









    



                      0      1
0     contains(looking)  False
1         contains(not)  False
2     contains(excited)  False
3        contains(view)  False
4     contains(welcome)  False
5     contains(forward)  False
6   contains(ice-cream)  False
7        contains(love)  False
8       contains(enemy)  False
9        contains(very)  False
10   contains(horrible)  False
11       contains(beer)  False
12      contains(about)  False
13    contains(concert)  False
14       contains(feel)  False
15       contains(like)  False
16   contains(annoying)  False
17      contains(great)  False
18        contains(the)  False
19     contains(friend)   True
20    contains(morning)  False
21       contains(best)  False
22       contains(good)  False
23       contains(this)  False
24        contains(car)  False
25    contains(amazing)  False
positive



In [13]:

    
# A failed example
tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))









    



negative



In [128]:

    
# Add the words annoying to the list and repeat
tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))









    



negative



In [233]:

    
tweet = 'love the summers'
print classifier.classify(extract_features(tweet.split()))









    



positive



In [234]:

    
tweet = 'hate the winters'
print classifier.classify(extract_features(tweet.split()))









    



positive



In [213]:

    
tweet = 'review on Black Mirror'
print classifier.classify(extract_features(tweet.split()))









    



positive



In [235]:

    
tweet = 'i got a bad grade'
print classifier.classify(extract_features(tweet.split()))









    



positive



In [236]:

    
tweet = 'This is the best course ever'
print classifier.classify(extract_features(tweet.split()))









    



positive



In [46]:

    
# Persist the model for usage using pickle
# Reference: https://docs.python.org/2/library/pickle.html
serialized_classifier = open("nb_sentiment.pickle","wb")
pickle.dump(classifier, serialized_classifier)
serialized_classifier.close()