In [3]:
from collections import defaultdict
import csv
import sys

#Raise the field size limit so very long review fields parse cleanly
csv.field_size_limit(sys.maxsize)

data = "/Users/skhederian/restaurant-health/format_reviews.csv"

#Combine reviews with the same violation count into the same document
reviews = defaultdict(list)
with open(data, "r") as review_file:
    reader = csv.reader(review_file, delimiter=',')
    next(reader)  #skip the header row
    for row in reader:
        reviews[row[3]].append(row[4])  #key: violation count, value: review text

#Join with a space so words at review boundaries don't run together
for violations, string_agg in reviews.items():
    reviews[violations] = " ".join(string_agg)

#Append documents to corpus
corpus = []
for violations, review in reviews.items():
    corpus.append(review)
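As a sanity check, here is a minimal sketch of what the aggregation produces, using made-up rows rather than the real CSV (reusing the defaultdict import above):

toy = defaultdict(list)
for count, text in [("2", "Great tacos."), ("2", "Slow service."), ("0", "Spotless kitchen.")]:
    toy[count].append(text)
for count, parts in toy.items():
    toy[count] = " ".join(parts)
print(dict(toy))
#{'2': 'Great tacos. Slow service.', '0': 'Spotless kitchen.'}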
In [13]:
#Vectorizer over unigrams, bigrams, and trigrams; min_df=.25 drops any term
#appearing in fewer than 25% of documents
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=.25, stop_words='english')
#Second vectorizer with no document-frequency cutoff, kept for comparison
tf2 = TfidfVectorizer(min_df=0)
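A minimal sketch of how ngram_range and min_df shape the vocabulary, on a made-up three-document corpus (reusing the TfidfVectorizer import above):

toy_corpus = ["the soup was cold", "the soup was great", "service was great"]
toy_tf = TfidfVectorizer(ngram_range=(1, 3), min_df=2/3, stop_words='english')
toy_tf.fit(toy_corpus)
#Only terms appearing in at least 2 of the 3 documents survive the cutoff
print(toy_tf.get_feature_names())
#['great', 'soup']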
In [14]:
#Fit both vectorizers to our corpus
tfidf_matrix = tf.fit_transform(corpus)
tfidf_matrixfull = tf2.fit_transform(corpus)
feature_names = tf.get_feature_names()  #get_feature_names_out() on scikit-learn >= 1.0
In [15]:
tfidf_matrix
Out[15]:
In [16]:
tfidf_matrixfull
Out[16]:
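Both objects are scipy sparse matrices, so the reprs above report shape and stored-element counts. A quick sketch for inspecting them directly (standard scipy attributes):

print(tfidf_matrix.shape, tfidf_matrix.nnz)          #filtered vocabulary
print(tfidf_matrixfull.shape, tfidf_matrixfull.nnz)  #full vocabulary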
In [17]:
#Densify the matrix and count features in the first document's row
dense = tfidf_matrix.todense()
len(dense[0].tolist()[0])
Out[17]:
In [18]:
review = dense[0].tolist()[0]
#Keep (feature index, score) pairs for terms actually present in the document
phrase_scores = [pair for pair in enumerate(review) if pair[1] > 0]
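The same pairs can be read off the sparse row directly, without densifying the whole matrix; a sketch using the variables above:

row = tfidf_matrix.getrow(0).tocoo()
sparse_phrase_scores = list(zip(row.col, row.data))
#Should match phrase_scores up to ordering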
In [19]:
len(phrase_scores)
Out[19]:
In [20]:
#Top 20 features from the first document, highest TF-IDF scores first
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1], reverse=True)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))
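The same pattern generalizes to any document; a sketch of a small helper (the name top_phrases is ours, not from the notebook):

def top_phrases(doc_index, n=20):
    """Return the n highest-scoring (phrase, score) pairs for one document."""
    row = tfidf_matrix.getrow(doc_index).tocoo()
    scored = sorted(zip(row.col, row.data), key=lambda t: t[1], reverse=True)
    return [(feature_names[i], s) for i, s in scored[:n]]

top_phrases(0)  #should reproduce the listing above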