by Andrew Trask
In [1]:
def pretty_print_review_and_label(i):
    """Print one line showing the label and a preview of the review at index ``i``.

    The line consists of ``labels[i]``, a tab-colon-tab separator, the first
    80 characters of ``reviews[i]`` and a literal ``...`` suffix (the suffix is
    appended even when the review is shorter than 80 characters).

    Reads the notebook-level ``labels`` and ``reviews`` lists; returns None.
    """
    label = labels[i]
    preview = reviews[i][:80]
    print(label + "\t:\t" + preview + "...")
# Load the dataset. Each file is read line by line into a list; labels are
# upper-cased so they can be compared against 'POSITIVE' later on.
#
# rstrip('\n') removes only the line terminator. Slicing with [:-1] would also
# chop a real character off the final line when the file does not end with a
# newline. The `with` blocks close the files even if reading raises.
with open('reviews.txt', 'r') as g:  # What we know!
    reviews = [line.rstrip('\n') for line in g]

with open('labels.txt', 'r') as g:  # What we WANT to know!
    labels = [line.rstrip('\n').upper() for line in g]
In [2]:
# Number of entries in `reviews` (one per line read from reviews.txt).
len(reviews)
Out[2]:
In [3]:
# Text of the first entry in `reviews`.
reviews[0]
Out[3]:
In [4]:
# First entry in `labels` (upper-cased when loaded); the notebook pairs it by index with reviews[0].
labels[0]
Out[4]:
In [5]:
# Print a header row followed by a few hand-picked examples, one
# "label : review preview" line per index.
print("labels.txt \t : \t reviews.txt\n")
for sample_idx in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_idx)
In [6]:
from collections import Counter
import numpy as np
In [7]:
# Three independent, initially empty tallies keyed by token:
# one for reviews labelled POSITIVE, one for all other reviews, one overall.
positive_count, negative_count, total_count = Counter(), Counter(), Counter()
In [8]:
# Tally how often each token occurs in positive reviews, in all other
# reviews, and overall.
#
# The counters are emptied first so that re-running this cell yields the same
# totals instead of stacking a second pass on top of the first.
positive_count.clear()
negative_count.clear()
total_count.clear()

for i in range(len(reviews)):
    # Any label other than 'POSITIVE' is counted on the negative side.
    class_count = positive_count if labels[i] == 'POSITIVE' else negative_count
    # NOTE: split(" ") yields empty-string tokens for consecutive spaces;
    # they are counted like any other token.
    for word in reviews[i].split(" "):
        class_count[word] += 1
        total_count[word] += 1
In [9]:
# The 15 highest-count entries of `positive_count`, as (token, count) pairs.
positive_count.most_common(15)
Out[9]:
In [10]:
# For every token seen more than 100 times overall, store the share of its
# occurrences that fall in positive reviews and in the remaining reviews.
# The denominator is the overall count plus one.
pos_ratios = Counter()
neg_ratios = Counter()
for word, count in total_count.most_common():
    if count <= 100:
        continue  # token too rare; skip it
    denominator = float(count + 1)
    pos_ratio = positive_count[word] / denominator
    neg_ratio = negative_count[word] / denominator
    pos_ratios[word] = pos_ratio
    neg_ratios[word] = neg_ratio
In [11]:
# The 15 entries of `pos_ratios` with the largest ratio, as (token, ratio) pairs.
pos_ratios.most_common(15)
Out[11]:
In [12]:
# The 15 entries of `neg_ratios` with the largest ratio, as (token, ratio) pairs.
neg_ratios.most_common(15)
Out[12]:
In [ ]: