In [3]:
from collections import defaultdict
import csv
import sys

#Raise the field size limit so very long review fields parse cleanly
csv.field_size_limit(sys.maxsize)

data = "/Users/skhederian/restaurant-health/format_reviews.csv"

#Combine reviews with the same violation count into the same document
reviews = defaultdict(list)
with open(data, "r") as review_file:
    reader = csv.reader(review_file, delimiter=',')
    next(reader)  #skip the header row
    for row in reader:
        reviews[row[3]].append(row[4])  #key: violation count, value: review text

#Join with a space so words at review boundaries don't run together
for violations, string_agg in reviews.items():
    reviews[violations] = " ".join(string_agg)

#Append documents to corpus
corpus = []
for violations, review in reviews.items():
    corpus.append(review)
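As a sanity check, here is a minimal sketch of what the aggregation produces, using made-up rows rather than the real CSV (reusing the defaultdict import above):

toy = defaultdict(list)
for count, text in [("2", "Great tacos."), ("2", "Slow service."), ("0", "Spotless kitchen.")]:
    toy[count].append(text)
for count, parts in toy.items():
    toy[count] = " ".join(parts)
print(dict(toy))
#{'2': 'Great tacos. Slow service.', '0': 'Spotless kitchen.'}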
In [13]:
#Vectorizer over unigrams, bigrams, and trigrams; min_df=.25 drops any term
#appearing in fewer than 25% of documents
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=.25, stop_words='english')
#Second vectorizer with no document-frequency cutoff, kept for comparison
tf2 = TfidfVectorizer(min_df=0)
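A minimal sketch of how ngram_range and min_df shape the vocabulary, on a made-up three-document corpus (reusing the TfidfVectorizer import above):

toy_corpus = ["the soup was cold", "the soup was great", "service was great"]
toy_tf = TfidfVectorizer(ngram_range=(1, 3), min_df=2/3, stop_words='english')
toy_tf.fit(toy_corpus)
#Only terms appearing in at least 2 of the 3 documents survive the cutoff
print(toy_tf.get_feature_names())
#['great', 'soup']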
In [14]:
#Fit both vectorizers to our corpus
tfidf_matrix = tf.fit_transform(corpus)
tfidf_matrixfull = tf2.fit_transform(corpus)
feature_names = tf.get_feature_names()  #get_feature_names_out() on scikit-learn >= 1.0
In [15]:
tfidf_matrix
Out[15]:
In [16]:
tfidf_matrixfull
Out[16]:
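Both objects are scipy sparse matrices, so the reprs above report shape and stored-element counts. A quick sketch for inspecting them directly (standard scipy attributes):

print(tfidf_matrix.shape, tfidf_matrix.nnz)          #filtered vocabulary
print(tfidf_matrixfull.shape, tfidf_matrixfull.nnz)  #full vocabulary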
In [17]:
#Densify the matrix and count features in the first document's row
dense = tfidf_matrix.todense()
len(dense[0].tolist()[0])
Out[17]:
In [18]:
review = dense[0].tolist()[0]
#Keep (feature index, score) pairs for terms actually present in the document
phrase_scores = [pair for pair in enumerate(review) if pair[1] > 0]
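The same pairs can be read off the sparse row directly, without densifying the whole matrix; a sketch using the variables above:

row = tfidf_matrix.getrow(0).tocoo()
sparse_phrase_scores = list(zip(row.col, row.data))
#Should match phrase_scores up to ordering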
In [19]:
len(phrase_scores)
Out[19]:
In [20]:
#Top 20 features from the first document, highest TF-IDF scores first
sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1], reverse=True)
for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:20]:
    print('{0: <20} {1}'.format(phrase, score))
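The same pattern generalizes to any document; a sketch of a small helper (the name top_phrases is ours, not from the notebook):

def top_phrases(doc_index, n=20):
    """Return the n highest-scoring (phrase, score) pairs for one document."""
    row = tfidf_matrix.getrow(doc_index).tocoo()
    scored = sorted(zip(row.col, row.data), key=lambda t: t[1], reverse=True)
    return [(feature_names[i], s) for i, s in scored[:n]]

top_phrases(0)  #should reproduce the listing above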