Problem Statement: Classify text as +ve or -ve sentiment


In [24]:
#Reference: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
import nltk
import pandas as pd
import numpy as np
import scipy
import pickle

# Lets specify some positive tweets for evaluation purpose
pos_tweets = [('I love this car', 'positive'), ('This view is amazing', 'positive'), 
              ('I feel great this morning', 'positive'), ('I am so excited about the concert', 'positive'),
              ('He is my best friend', 'positive'), ('The beer is good', 'positive'), 
              ('I do love ice-cream', 'positive'), ('morning is good', 'positive'), ('welcome morning', 'positive')]

# Similary some negative tweets
neg_tweets = [('I do not like this car', 'negative'), ('This view is horrible', 'negative'), 
              ('I am not looking forward to the party', 'negative'),
              ('He is my enemy', 'negative'), ('very annoying', 'negative')]

In [25]:
# 1. Transform the array of positive and negative tweets to a tuple2 (tweet, sentiment)
# 2. Filtered some stop words, where word length < 3
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
print(pd.DataFrame(tweets))


                                      0         1
0                     [love, this, car]  positive
1                 [this, view, amazing]  positive
2          [feel, great, this, morning]  positive
3        [excited, about, the, concert]  positive
4                        [best, friend]  positive
5                     [the, beer, good]  positive
6                     [love, ice-cream]  positive
7                       [morning, good]  positive
8                    [welcome, morning]  positive
9                [not, like, this, car]  negative
10               [this, view, horrible]  negative
11  [not, looking, forward, the, party]  negative
12                              [enemy]  negative
13                     [very, annoying]  negative

In [26]:
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      # append words to a list
      all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # make use of nltk.FreqDist function to compute TF
    w_l = nltk.FreqDist(wordlist)
    return w_l

In [27]:
word_features = get_word_features(get_words_in_tweets(tweets))

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [28]:
unique_word_list = np.unique(get_words_in_tweets(tweets))
print('Unique words in the corpus: {}'.format(unique_word_list))
print('Count of unique words in corpus: {}'.format(len(unique_word_list)))
print(pd.DataFrame(word_features.most_common(50)))


Unique words in the corpus: ['about' 'amazing' 'annoying' 'beer' 'best' 'car' 'concert' 'enemy'
 'excited' 'feel' 'forward' 'friend' 'good' 'great' 'horrible' 'ice-cream'
 'like' 'looking' 'love' 'morning' 'not' 'party' 'the' 'this' 'very' 'view'
 'welcome']
Count of unique words in corpus: 27
            0  1
0        this  5
1     morning  3
2         the  3
3        love  2
4        good  2
5         not  2
6         car  2
7        view  2
8     concert  1
9        feel  1
10  ice-cream  1
11   annoying  1
12       best  1
13    amazing  1
14    looking  1
15       beer  1
16   horrible  1
17    forward  1
18      party  1
19    excited  1
20     friend  1
21       very  1
22    welcome  1
23      about  1
24      enemy  1
25      great  1
26       like  1

In [29]:
training_set = nltk.classify.apply_features(extract_features, tweets)

In [44]:
print(training_set)


[({'contains(looking)': False, 'contains(not)': False, 'contains(excited)': False, 'contains(view)': False, 'contains(welcome)': False, 'contains(forward)': False, 'contains(ice-cream)': False, 'contains(love)': True, 'contains(enemy)': False, 'contains(very)': False, 'contains(horrible)': False, 'contains(beer)': False, 'contains(party)': False, 'contains(about)': False, 'contains(concert)': False, 'contains(feel)': False, 'contains(like)': False, 'contains(annoying)': False, 'contains(great)': False, 'contains(the)': False, 'contains(friend)': False, 'contains(morning)': False, 'contains(best)': False, 'contains(good)': False, 'contains(this)': True, 'contains(car)': True, 'contains(amazing)': False}, 'positive'), ({'contains(looking)': False, 'contains(not)': False, 'contains(excited)': False, 'contains(view)': True, 'contains(welcome)': False, 'contains(forward)': False, 'contains(ice-cream)': False, 'contains(love)': False, 'contains(enemy)': False, 'contains(very)': False, 'contains(horrible)': False, 'contains(beer)': False, 'contains(party)': False, 'contains(about)': False, 'contains(concert)': False, 'contains(feel)': False, 'contains(like)': False, 'contains(annoying)': False, 'contains(great)': False, 'contains(the)': False, 'contains(friend)': False, 'contains(morning)': False, 'contains(best)': False, 'contains(good)': False, 'contains(this)': True, 'contains(car)': False, 'contains(amazing)': True}, 'positive'), ...]

In [10]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [22]:
print(classifier.show_most_informative_features(25))


Most Informative Features
       contains(concert) = True           negati : positi =      1.7 : 1.0
           contains(car) = True           negati : positi =      1.7 : 1.0
          contains(view) = True           negati : positi =      1.7 : 1.0
           contains(not) = False          positi : negati =      1.6 : 1.0
       contains(morning) = False          negati : positi =      1.4 : 1.0
          contains(like) = False          positi : negati =      1.3 : 1.0
      contains(annoying) = False          positi : negati =      1.3 : 1.0
       contains(looking) = False          positi : negati =      1.3 : 1.0
         contains(enemy) = False          positi : negati =      1.3 : 1.0
       contains(forward) = False          positi : negati =      1.3 : 1.0
          contains(very) = False          positi : negati =      1.3 : 1.0
      contains(horrible) = False          positi : negati =      1.3 : 1.0
          contains(good) = False          negati : positi =      1.2 : 1.0
          contains(love) = False          negati : positi =      1.2 : 1.0
          contains(this) = True           negati : positi =      1.2 : 1.0
       contains(concert) = False          positi : negati =      1.1 : 1.0
           contains(car) = False          positi : negati =      1.1 : 1.0
          contains(view) = False          positi : negati =      1.1 : 1.0
          contains(this) = False          positi : negati =      1.1 : 1.0
          contains(beer) = False          negati : positi =      1.1 : 1.0
       contains(excited) = False          negati : positi =      1.1 : 1.0
         contains(about) = False          negati : positi =      1.1 : 1.0
     contains(ice-cream) = False          negati : positi =      1.1 : 1.0
       contains(welcome) = False          negati : positi =      1.1 : 1.0
       contains(amazing) = False          negati : positi =      1.1 : 1.0
None

In [12]:
# A positive example
tweet = 'Larry is my friend'
transformed_features = extract_features(tweet.split())
print (pd.DataFrame(transformed_features.items()))
print classifier.classify(extract_features(tweet.split()))


                      0      1
0     contains(looking)  False
1         contains(not)  False
2     contains(excited)  False
3        contains(view)  False
4     contains(welcome)  False
5     contains(forward)  False
6   contains(ice-cream)  False
7        contains(love)  False
8       contains(enemy)  False
9        contains(very)  False
10   contains(horrible)  False
11       contains(beer)  False
12      contains(about)  False
13    contains(concert)  False
14       contains(feel)  False
15       contains(like)  False
16   contains(annoying)  False
17      contains(great)  False
18        contains(the)  False
19     contains(friend)   True
20    contains(morning)  False
21       contains(best)  False
22       contains(good)  False
23       contains(this)  False
24        contains(car)  False
25    contains(amazing)  False
positive

In [13]:
# A failed example
tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))


negative

In [128]:
# Add the words annoying to the list and repeat
tweet = 'Your song is annoying'
print classifier.classify(extract_features(tweet.split()))


negative

In [233]:
tweet = 'love the summers'
print classifier.classify(extract_features(tweet.split()))


positive

In [234]:
tweet = 'hate the winters'
print classifier.classify(extract_features(tweet.split()))


positive

In [213]:
tweet = 'review on Black Mirror'
print classifier.classify(extract_features(tweet.split()))


positive

In [235]:
tweet = 'i got a bad grade'
print classifier.classify(extract_features(tweet.split()))


positive

In [236]:
tweet = 'This is the best course ever'
print classifier.classify(extract_features(tweet.split()))


positive

In [46]:
# Persist the model for usage using pickle
# Reference: https://docs.python.org/2/library/pickle.html
serialized_classifier = open("nb_sentiment.pickle","wb")
pickle.dump(classifier, serialized_classifier)
serialized_classifier.close()