by Andrew Trask
In [1]:
def pretty_print_review_and_label(i):
    """Print the label and an 80-character preview of the review at index ``i``.

    Reads the module-level ``labels`` and ``reviews`` lists. The output line is
    the label, a tab-colon-tab separator, the first 80 characters of the
    review, and a trailing ellipsis.
    """
    label = labels[i]
    preview = reviews[i][:80]  # keep the printed line short
    print("\t:\t".join((label, preview)) + "...")
# Load the two parallel data files, one record per line.
# `with` guarantees each handle is closed even if reading fails, and
# rstrip('\n') removes only the line terminator, so a final line that lacks a
# trailing newline no longer loses its last character (as `x[:-1]` would).
with open('reviews.txt', 'r') as g:  # What we know!
    reviews = [line.rstrip('\n') for line in g]

with open('labels.txt', 'r') as g:  # What we WANT to know!
    # Labels are upper-cased on load.
    labels = [line.rstrip('\n').upper() for line in g]
In [2]:
# Number of review lines read from reviews.txt.
len(reviews)
Out[2]:
In [3]:
# Inspect the first review (one line of reviews.txt, line terminator removed).
reviews[0]
Out[3]:
In [4]:
# Inspect the first label (upper-cased on load); presumably it belongs to reviews[0] -- confirm the files are aligned line by line.
labels[0]
Out[4]:
In [5]:
# Show a handful of hand-picked (label, review preview) pairs.
# The indices live in one tuple so the sample can be changed in a single place
# instead of editing a stack of copy-pasted calls.
SAMPLE_INDICES = (2137, 12816, 6267, 21934, 5297, 4998)

print("labels.txt \t : \t reviews.txt\n")
for sample_index in SAMPLE_INDICES:
    pretty_print_review_and_label(sample_index)
In [36]:
from toolz import concat, frequencies, groupby, reduceby, merge_with, valmap, first, last, mapcat, pluck, concatv, itemmap, get_in
import numpy as np
# Number of leading records used for training; everything after is held out for testing.
TRAIN_SIZE = 15000

training_reviews = reviews[:TRAIN_SIZE]
training_labels = labels[:TRAIN_SIZE]
test_reviews = reviews[TRAIN_SIZE:]
test_labels = labels[TRAIN_SIZE:]

# Pair each training review with its label so the records can be grouped by label.
review_dict = [{'review': r, 'label': l} for r, l in zip(training_reviews, training_labels)]
grouped = groupby('label', review_dict)

# label -> list of review texts. pluck() alone returns a one-shot iterator that
# would be left exhausted after the counting step below, so it is materialised
# here to keep `review_pluck` inspectable and the later steps re-runnable.
review_pluck = valmap(lambda x: list(pluck('review', x)), grouped)

# label -> {word: count}. Periods are dropped and the text is split on whitespace.
review_comb = valmap(
    lambda x: frequencies(mapcat(lambda y: y.replace('.', '').split(), x)),
    review_pluck,
)

# word -> total count summed over every label.
combined = merge_with(sum, [review_comb[x] for x in review_comb])

# word -> (POSITIVE count - NEGATIVE count) / total count.
# A word absent under a label contributes 0 for that label.
value_dict = {
    x: (get_in(['POSITIVE', x], review_comb, 0)
        - get_in(['NEGATIVE', x], review_comb, 0)) / combined[x]
    for x in combined
}
In [37]:
def label_review(r, d):
    """Classify a review as 'POSITIVE' or 'NEGATIVE' from per-word weights.

    Parameters
    ----------
    r : str
        Review text. Periods are removed and the text is split on whitespace.
    d : dict
        Maps a word to its numeric weight; words missing from ``d`` count as 0.

    Returns
    -------
    str
        'POSITIVE' when the summed weight is >= 0 (so ties, and reviews with no
        known words, are labelled positive); otherwise 'NEGATIVE'.
    """
    words = r.replace('.', '').split()
    # A flat lookup only needs dict.get, which keeps this function free of
    # third-party helpers imported in another cell.
    score = sum(d.get(word, 0) for word in words)
    return 'POSITIVE' if score >= 0 else 'NEGATIVE'
In [38]:
## training_score
# Fraction of training reviews whose predicted label equals the true label.
training_score = np.mean([
    label_review(review, value_dict) == label
    for review, label in zip(training_reviews, training_labels)
])
training_score
Out[38]:
In [40]:
# Fraction of held-out test reviews whose predicted label equals the true label.
test_score = np.mean([
    label_review(review, value_dict) == label
    for review, label in zip(test_reviews, test_labels)
])
test_score
Out[40]:
In [ ]:
In [ ]: