Problem Statement: Classify text as positive or negative sentiment
In [24]:
#Reference: http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
import nltk
import pandas as pd
import numpy as np
import scipy
import pickle
# Small hand-labelled corpus used for training and evaluation.
# Each entry is a (tweet_text, sentiment_label) pair.
pos_tweets = [
    ('I love this car', 'positive'),
    ('This view is amazing', 'positive'),
    ('I feel great this morning', 'positive'),
    ('I am so excited about the concert', 'positive'),
    ('He is my best friend', 'positive'),
    ('The beer is good', 'positive'),
    ('I do love ice-cream', 'positive'),
    ('morning is good', 'positive'),
    ('welcome morning', 'positive'),
]
# Similarly, some negative tweets.
neg_tweets = [
    ('I do not like this car', 'negative'),
    ('This view is horrible', 'negative'),
    ('I am not looking forward to the party', 'negative'),
    ('He is my enemy', 'negative'),
    ('very annoying', 'negative'),
]
In [25]:
# Build (token_list, sentiment) pairs from the labelled corpus:
# lowercase every token and drop words shorter than 3 characters
# as a crude stop-word filter.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], sentiment)
    for (text, sentiment) in pos_tweets + neg_tweets
]
print(pd.DataFrame(tweets))
In [26]:
def get_words_in_tweets(tweets):
    """Flatten all (token_list, label) pairs into a single list of tokens.

    The label of each pair is ignored; token order is preserved.
    """
    return [word for (words, _label) in tweets for word in words]
def get_word_features(wordlist):
    """Return an nltk.FreqDist (term-frequency table) over *wordlist*."""
    return nltk.FreqDist(wordlist)
In [27]:
# Corpus-wide term frequencies (nltk.FreqDist); read by extract_features
# below as its feature vocabulary.
word_features = get_word_features(get_words_in_tweets(tweets))
def extract_features(document, vocabulary=None):
    """Map a tokenized document to a bag-of-words feature dict.

    Args:
        document: iterable of tokens (e.g. ``tweet.split()``).
        vocabulary: iterable of feature words. Defaults to the module-level
            ``word_features`` distribution built from the corpus, preserving
            the original behavior; passing it explicitly removes the hidden
            global dependency and makes the function testable in isolation.

    Returns:
        dict mapping 'contains(word)' -> bool for every word in vocabulary.
    """
    if vocabulary is None:
        vocabulary = word_features  # fall back to the corpus-wide FreqDist
    # Membership tests against a set are O(1) per vocabulary word.
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words)
            for word in vocabulary}
In [28]:
# Inspect the vocabulary: distinct corpus tokens plus the 50 most frequent.
unique_word_list = np.unique(get_words_in_tweets(tweets))
vocab_size = len(unique_word_list)
print('Unique words in the corpus: {}'.format(unique_word_list))
print('Count of unique words in corpus: {}'.format(vocab_size))
top_terms = word_features.most_common(50)
print(pd.DataFrame(top_terms))
In [29]:
# Lazily apply extract_features to every (tokens, label) pair, yielding
# nltk-style (feature_dict, label) training examples.
training_set = nltk.classify.apply_features(extract_features, tweets)
In [44]:
# Inspect the featurized training examples.
print(training_set)
In [10]:
# Train a Naive Bayes classifier on the featurized tweets.
classifier = nltk.NaiveBayesClassifier.train(training_set)
In [22]:
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() emitted a stray "None" line — call it directly.
classifier.show_most_informative_features(25)
In [12]:
# A positive example: the corpus associates 'friend' with positive tweets.
tweet = 'Larry is my friend'
transformed_features = extract_features(tweet.split())
# list() materializes the py3 dict-items view before building the DataFrame;
# the Python 2 print statement is replaced with the print() function used
# elsewhere in this file.
print(pd.DataFrame(list(transformed_features.items())))
print(classifier.classify(extract_features(tweet.split())))
In [13]:
# A failed example (Python 2 print statement fixed to the print() function).
tweet = 'Your song is annoying'
print(classifier.classify(extract_features(tweet.split())))
In [128]:
# Add the word 'annoying' to the training list and repeat
# (Python 2 print statement fixed to the print() function).
tweet = 'Your song is annoying'
print(classifier.classify(extract_features(tweet.split())))
In [233]:
# Probe with positive phrasing (py3 print() fix).
tweet = 'love the summers'
print(classifier.classify(extract_features(tweet.split())))
In [234]:
# Probe with negative phrasing (py3 print() fix).
tweet = 'hate the winters'
print(classifier.classify(extract_features(tweet.split())))
In [213]:
# Probe with a neutral phrase outside the training vocabulary (py3 print() fix).
tweet = 'review on Black Mirror'
print(classifier.classify(extract_features(tweet.split())))
In [235]:
# Probe with an unseen negative phrase (py3 print() fix).
tweet = 'i got a bad grade'
print(classifier.classify(extract_features(tweet.split())))
In [236]:
# Probe with an unseen positive phrase (py3 print() fix).
tweet = 'This is the best course ever'
print(classifier.classify(extract_features(tweet.split())))
In [46]:
# Persist the trained model with pickle so it can be reloaded without retraining.
# Reference: https://docs.python.org/3/library/pickle.html
# The with-statement guarantees the file handle is closed even if dump() raises.
with open("nb_sentiment.pickle", "wb") as serialized_classifier:
    pickle.dump(classifier, serialized_classifier)