by Andrew Trask
In [50]:
def pretty_print_review_and_label(i):
    """Print the label and a short preview of the review at index ``i``.

    Reads the module-level ``labels`` and ``reviews`` sequences and writes a
    single line to stdout: the label, a tab-separated colon, the first 80
    characters of the review, and a trailing ellipsis.
    """
    label = labels[i]
    preview = reviews[i][:80]  # cap the review text at 80 characters
    print("".join((label, "\t:\t", preview, "...")))
# Load the two parallel files: entry k of `reviews` and entry k of `labels`
# come from line k of reviews.txt and labels.txt respectively.
#
# Only the line terminator is stripped. Slicing with [:-1] would also chop a
# real character off the final line when the file does not end in a newline.
# The `with` blocks close each file even if reading raises.
with open('reviews.txt', 'r') as g:  # What we know!
    reviews = [line.rstrip('\n') for line in g]

with open('labels.txt', 'r') as g:  # What we WANT to know!
    # Labels are upper-cased so later comparisons can use 'POSITIVE'/'NEGATIVE'.
    labels = [line.rstrip('\n').upper() for line in g]
In [51]:
# Number of review lines read from reviews.txt.
len(reviews)
Out[51]:
In [52]:
# Inspect the first review's text (line terminator already removed on load).
reviews[0]
Out[52]:
In [53]:
# Inspect the label paired with the first review (upper-cased on load).
labels[0]
Out[53]:
In [54]:
# Column header, then a handful of hand-picked label/review pairs.
print("labels.txt \t : \t reviews.txt\n")
for sample_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_index)
In [98]:
import numpy as np
In [114]:
# Only words occurring more than this many times across all reviews are ranked.
MIN_WORD_COUNT = 500

bag_of_words = {}  # word -> occurrences across all reviews
pos_words = {}     # word -> occurrences in reviews labelled 'POSITIVE'
neg_words = {}     # word -> occurrences in reviews labelled 'NEGATIVE'

for i, review in enumerate(reviews):
    # The label is fixed for the whole review, so choose the per-sentiment
    # counter once instead of re-testing the label for every word. Reviews
    # with any other label contribute only to the overall count.
    label = labels[i]
    if label == 'POSITIVE':
        sentiment_counts = pos_words
    elif label == 'NEGATIVE':
        sentiment_counts = neg_words
    else:
        sentiment_counts = None

    for word in review.split(' '):
        if word not in bag_of_words:
            # Register a new word in all three dicts at once so every word
            # seen has an entry (possibly 0) in both sentiment counters.
            bag_of_words[word] = 0
            pos_words[word] = 0
            neg_words[word] = 0
        bag_of_words[word] += 1
        if sentiment_counts is not None:
            sentiment_counts[word] += 1

# Log of the positive-to-negative count ratio for each frequent word. The +1
# in the denominator avoids division by zero for words never seen in a
# negative review. A word never seen in a positive review yields
# np.log(0.0), i.e. -inf (NumPy emits a divide-by-zero RuntimeWarning), which
# sorts to the very end of the list.
words_pos_neg_ratio = [
    (word, np.log(pos_words[word] / float(neg_words[word] + 1)))
    for word, count in bag_of_words.items()
    if count > MIN_WORD_COUNT
]
# Most positive first; the sort is stable, so ties keep first-seen order.
words_pos_neg_ratio.sort(key=lambda pair: pair[1], reverse=True)
In [115]:
# Show the ten words with the highest log ratio. Slicing (rather than
# indexing 0..9) prints whatever is available instead of raising IndexError
# when fewer than ten words pass the frequency threshold.
print('\nTop positive words: \n')
for word, log_ratio in words_pos_neg_ratio[:10]:
    print(word, ': ', round(log_ratio, 10), sep='')
In [116]:
# Show the ten words with the lowest log ratio, most negative first. The list
# is sorted in descending order, so these are its last ten entries read
# backwards. Slicing avoids the IndexError that indices -1..-10 would raise
# when fewer than ten words pass the frequency threshold.
print('\nTop negative words: \n')
for word, log_ratio in reversed(words_pos_neg_ratio[-10:]):
    print(word, ': ', round(log_ratio, 10), sep='')
In [ ]: