In [1]:
import json
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk

In [2]:
def load_file(file_path):
    # Parse a file that stores one JSON record per line
    with open(file_path) as json_file:
        records = [json.loads(line) for line in json_file]
    return records
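
Each line in the Yelp academic dataset files is a standalone JSON object, so a file can also be read lazily instead of all at once. A minimal streaming sketch (the iter_records helper is illustrative and not used below, since the tip file fits in memory):


In [ ]:
def iter_records(file_path):
    # Generator variant of load_file for files too large to load at once
    with open(file_path) as json_file:
        for line in json_file:
            yield json.loads(line)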

In [3]:
def drop_fields(fields, dictionary_list):
    """
    Removes the specified fields from every dictionary in dictionary_list

    :rtype: None
    :param fields: a list of strings with the names of the fields that are
    going to be removed from every dictionary in dictionary_list
    :param dictionary_list: a list of dictionaries
    """
    for record in dictionary_list:
        for field in fields:
            del record[field]


def filter_records(records, field, values):
    filtered_records = [record for record in records if
                        record[field] in values]
    return filtered_records
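
As a quick, self-contained check of these two helpers (the dictionaries here are made up for illustration, not real Yelp records):


In [ ]:
sample = [{'business_id': 'a', 'likes': 0},
          {'business_id': 'b', 'likes': 2}]
drop_fields(['likes'], sample)
print(filter_records(sample, 'business_id', ['a']))
# -> [{'business_id': 'a'}]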

We analyze the tip dataset and select the businesses that have the highest number of tips


In [4]:
def analyze(file_path, n):
    # Load the tips and keep only the business_id field
    records = load_file(file_path)
    drop_fields(['text', 'type', 'date', 'user_id', 'likes'], records)
    data_frame = DataFrame(records)
    # Count the tips per business and print the n businesses with the most
    counts = data_frame.groupby('business_id').size()
    counts.sort(ascending=False)
    top_counts = counts[:n]
    print(top_counts)

    print(records[0].keys())
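
The in-place Series.sort call above comes from the pandas release this notebook was written against and was removed in later versions. A sketch of the same ranking on a newer pandas (an assumption, not run against the outputs below):


In [ ]:
def top_businesses(records, n):
    # value_counts returns per-value counts already sorted in descending
    # order, so no explicit sort call is needed
    data_frame = DataFrame(records)
    return data_frame['business_id'].value_counts()[:n]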

Here we take the top ten businesses with the most tips


In [5]:
data_folder = 'data/'
tip_file_path = data_folder + 'yelp_academic_dataset_tip.json'
analyze(tip_file_path, 10)


business_id
hW0Ne_HTHEAgGF1rAdmR-g    1420
JokKtdXU7zXHcr20Lrk29A     391
0UZ31UTcOLRKuqPqPe-VBA     355
aRkYtXfmEKYG-eTDf_qUsw     336
-sC66z4SO3tR7nFCjfQwuQ     291
_FXql6eVhbM923RdCi94SA     284
EWMwV5V9BxNs_U6nNVMeqw     248
L9UYbtAUOcfTgZFimehlXw     243
uFJwKlHL6HyHSJmORO8-5w     242
WS1z1OAR0tRl4FsjdTGUFQ     239
dtype: int64
[u'business_id']

In [6]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer that stems every token with the English Snowball stemmer"""

    def build_analyzer(self):
        # Run the standard analyzer (tokenization, stop words) and then
        # stem each resulting token
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
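
As a quick check of what the stemmer does (the stems below match the servic and fri entries in the stemmed output further down):


In [ ]:
print([english_stemmer.stem(w) for w in ['burgers', 'fries', 'service']])
# -> [u'burger', u'fri', u'servic']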

Now we create a function to apply TF-IDF to the text inside the tips
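
TF-IDF weighs how often a word appears in one document against how many documents of the corpus contain it, so words that show up everywhere get low scores. A toy illustration on three made-up mini-documents:


In [ ]:
toy = TfidfVectorizer()
toy_matrix = toy.fit_transform(['the burger was great',
                                'great fries',
                                'the service was slow'])
print(toy.get_feature_names())
print(toy_matrix.shape)
# words like 'the' and 'was' appear in two of the three documents, so
# their inverse document frequency, and hence their tf-idf, is lower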


In [7]:
def tf_idf(file_path, business_id, stem):
    records = load_file(file_path)
    data = [record['text'] for record in records]
    # Fit the vectorizer on the full tip corpus, with or without stemming
    if stem:
        vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
    else:
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    train = vectorizer.fit_transform(data)
    # print("Vocabulary:", vectorizer.get_feature_names())
    num_samples, num_features = train.shape
    print("#samples: %d, #features: %d" % (num_samples, num_features))

    # Score only the tips of the requested business
    business_records = filter_records(records, 'business_id', [business_id])
    business_data = [record['text'] for record in business_records]
    freq_term_matrix = vectorizer.transform(business_data)
    vocabulary = vectorizer.get_feature_names()

    # Build one (tip_id, word, tfidf) record per non-zero cell of the matrix
    my_list = []
    rows, cols = freq_term_matrix.nonzero()
    for row, col in zip(rows, cols):
        my_dict = {'tip_id': row,
                   'word': vocabulary[col],
                   'tfidf': freq_term_matrix[row, col]}
        my_list.append(my_dict)

    data_frame = DataFrame(my_list)
    # Sum each word's tf-idf over all the business's tips (suma) and print
    # the words sorted by that total in ascending order (ordenado)
    suma = data_frame.groupby('word').aggregate(np.sum)['tfidf']
    ordenado = suma.order()
    print(ordenado)
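
As an aside, the same per-word totals can be computed directly on the sparse matrix, avoiding the row/column loop. A sketch (the word_totals helper is illustrative, assuming a vectorizer fitted as above):


In [ ]:
def word_totals(vectorizer, business_data):
    # Sum each column (word) of the tf-idf matrix over all the tips
    matrix = vectorizer.transform(business_data)
    totals = np.asarray(matrix.sum(axis=0)).ravel()
    words = vectorizer.get_feature_names()
    return DataFrame({'word': words, 'tfidf': totals})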

We execute the code to see the most relevant words in the tips of one of the top businesses


In [8]:
tf_idf(tip_file_path, 'EWMwV5V9BxNs_U6nNVMeqw', False)


#samples: 113993, #features: 32066
word
hot         0.198456
make        0.202844
az          0.203616
door        0.220653
spinach     0.224708
gone        0.229656
taste       0.230293
lots        0.231347
said        0.231943
coming      0.234097
pie         0.239452
does        0.244336
add         0.248715
bathroom    0.250183
ambience    0.250342
...
night       3.509764
hummus      3.674564
sweet       3.981846
service     4.056771
potato      4.193176
food        4.232536
place       4.393503
best        5.365621
burgers     5.517702
love        5.891557
great       7.127851
fries       7.444558
burger      7.939506
brunch      8.035746
fez        17.164686
Name: tfidf, Length: 611, dtype: float64

Now we execute the same code but with stemming, so that variants of the same word, such as burger and burgers, are not counted separately


In [9]:
tf_idf(tip_file_path, 'EWMwV5V9BxNs_U6nNVMeqw', True)


#samples: 113993, #features: 24571
word
make       0.191378
hot        0.201586
custom     0.212081
az         0.216548
tast       0.219641
door       0.228683
spinach    0.233878
pass       0.235770
gone       0.239028
said       0.239881
pie        0.243106
cake       0.248671
order      0.249398
doe        0.252706
add        0.254718
...
servic       4.173222
sweet        4.175118
potato       4.201844
food         4.341459
martini      4.371138
special      4.523931
place        4.668387
cocktail     4.746489
best         5.580851
fri          7.059537
love         7.082217
great        7.569483
brunch       8.266688
burger      12.550479
fez         17.762831
Name: tfidf, Length: 562, dtype: float64