In [1]:
import json
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
In [2]:
def load_file(file_path):
    """Load a file of newline-delimited JSON records into a list of dictionaries."""
    with open(file_path) as json_file:
        return [json.loads(line) for line in json_file]
In [3]:
def drop_fields(fields, dictionary_list):
    """
    Remove the specified fields from every dictionary in dictionary_list.

    :param fields: a list of strings naming the fields to remove
        from every dictionary in dictionary_list
    :param dictionary_list: a list of dictionaries
    :rtype: None
    """
    for record in dictionary_list:
        for field in fields:
            del record[field]

def filter_records(records, field, values):
    """Keep only the records whose value for `field` is in `values`."""
    filtered_records = [record for record in records
                        if record[field] in values]
    return filtered_records
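For reference, each line of the tip file is a single JSON object. Based on the fields used below, a record looks roughly like this (the values shown are made up for illustration):
{"business_id": "EWMwV5V9BxNs_U6nNVMeqw", "user_id": "...", "text": "Try the burgers!", "date": "2012-05-01", "likes": 0, "type": "tip"}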
We analyze the tip dataset and select the businesses with the highest number of tips
In [4]:
def analyze(file_path, n):
    """Print the n businesses with the most tips, plus the fields left in each record."""
    records = load_file(file_path)
    drop_fields(['text', 'type', 'date', 'user_id', 'likes'], records)
    data_frame = DataFrame(records)
    # Count the tips per business and keep the n largest counts
    counts = data_frame.groupby('business_id').size()
    counts = counts.sort_values(ascending=False)
    top_counts = counts[:n]
    print(top_counts)
    print(records[0].keys())
Here we take the ten businesses with the most tips
In [5]:
data_folder = 'data/'
tip_file_path = data_folder + 'yelp_academic_dataset_tip.json'
analyze(tip_file_path, 10)
In [6]:
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')
from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidfVectorizer(TfidfVectorizer):
    """A TfidfVectorizer whose analyzer also stems every token."""
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
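As a quick sanity check (this snippet and its sample sentence are not part of the original cells), the stemmed analyzer lowercases the text, removes English stop words, and stems every remaining token:
analyzer = StemmedTfidfVectorizer(stop_words='english').build_analyzer()
print(list(analyzer("The burgers were great")))  # expected something like ['burger', 'great']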
Now we create a function to apply TF-IDF to the text inside the tips
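With scikit-learn's default settings, the score of a term t in a tip d is roughly tf(t, d) * (ln((1 + n) / (1 + df(t))) + 1), where n is the number of tips used to fit the vectorizer and df(t) is how many of them contain t; each tip's vector is then L2-normalized. Words that are frequent within a tip but rare across the whole dataset therefore get the highest scores.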
In [7]:
def tf_idf(file_path, business_id, stem):
    """Print the summed tf-idf score of every word in the tips of one business."""
    records = load_file(file_path)
    data = [record['text'] for record in records]
    # Fit the vectorizer on the text of every tip in the dataset
    vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    if stem:
        vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
    train = vectorizer.fit_transform(data)
    # print("Vocabulary:", vectorizer.get_feature_names())
    num_samples, num_features = train.shape
    print("#samples: %d, #features: %d" % (num_samples, num_features))
    # Transform only the tips that belong to the business we are interested in
    business_records = filter_records(records, 'business_id', [business_id])
    business_data = [record['text'] for record in business_records]
    freq_term_matrix = vectorizer.transform(business_data)
    vocabulary = vectorizer.get_feature_names()
    # Collect one (tip, word, tf-idf) row per non-zero entry of the sparse matrix
    my_list = []
    rows, cols = freq_term_matrix.nonzero()
    for row, col in zip(rows, cols):
        my_dict = {'tip_id': row,
                   'word': vocabulary[col],
                   'tfidf': freq_term_matrix[row, col]}
        my_list.append(my_dict)
    data_frame = DataFrame(my_list)
    # Sum the tf-idf of each word over all the business's tips and sort ascending
    suma = data_frame.groupby('word').aggregate(np.sum)['tfidf']
    ordenado = suma.sort_values()
    print(ordenado)
We execute the code to see the most relevant words in the tips
In [8]:
tf_idf(tip_file_path, 'EWMwV5V9BxNs_U6nNVMeqw', 0)
Now we execute the same code but with stemming, so that words that share a root, like burger and burgers, are not counted as different words
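As a quick check (not one of the original cells), the Snowball stemmer indeed maps both forms to the same token:
print(english_stemmer.stem('burger'), english_stemmer.stem('burgers'))  # both should print 'burger'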
In [9]:
tf_idf(tip_file_path, 'EWMwV5V9BxNs_U6nNVMeqw', 1)