In [ ]:
import json
import pandas as pd
import re
import random
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, log_loss

from gensim import corpora, models, similarities, matutils
import tqdm
import sys
sys.path.append('/Users/ed/yelp-classification/machine_learning')
import yelp_ml as yml

In [ ]:
# Load the positive/negative opinion lexicons and the user-review dictionary
with open('../input/negative-words.txt', 'r', encoding="ISO-8859-1") as f:
    lh_neg = f.read().split('\n')
with open('../input/positive-words.txt', 'r', encoding="ISO-8859-1") as f:
    lh_pos = f.read().split('\n')
with open('../input/many_reviews_dictionary.json') as f:
    users = json.load(f)

word_list = list(set(lh_pos + lh_neg))
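
word_list merges the positive and negative opinion lexicons and is consumed by the feature builders later on. As a minimal sketch of how such a lexicon can be turned into features, one could count positive and negative hits per review (lexicon_counts is an illustrative helper, not part of yelp_ml):

In [ ]:
def lexicon_counts(review, pos_words=frozenset(lh_pos), neg_words=frozenset(lh_neg)):
    # Count how many tokens in the review appear in each opinion lexicon
    tokens = re.findall(r"[a-z']+", review.lower())
    return (sum(t in pos_words for t in tokens),
            sum(t in neg_words for t in tokens))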

In [ ]:
# Fix the users JSON: build a dictionary keyed by user_id, where each entry
# is that user's list of reviews. We have 228 users in total.
users_dict = {}
user_ids = []

for review_list in users['reviews']:
    users_dict[review_list[0]['user_id']] = review_list
    user_ids.append(review_list[0]['user_id'])

with open('cleaned_large_user_dictionary.json', 'w') as outfile:
    json.dump(users_dict, outfile)

Try running a few tests on a subset of users; the keys of users_dict are our unique user IDs. We proceed as follows for each user ID:

1. Create a user dataframe with the columns (review_text, review rating, business_id).

2. Create a list of unique business IDs for that user.

3. Connect to the MongoDB server and pull all of the reviews for the restaurants that the user has reviewed.

4. Create a restaurant dataframe with the columns (review_text, biz rating, business_id).

5. Do a 70/30 training/test split, randomizing over the set of the user's reviewed restaurants.

6. Train the LSI model on the set of training reviews and get the number of topics used in fitting.

7. Set up the FeatureUnion with the desired features, then fit it on the train reviews and transform them.

8. Fit each candidate model on the transformed features and evaluate it on the test set.

In [ ]:
##### Test machine learning algorithms
ip = 'Insert IP here'
conn = MongoClient(ip, 27017)
conn.list_database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')

1. Create a user dataframe with the columns (review_text, review rating, business_id).


In [ ]:
# Collect the unique user IDs in a list
useridlist = list(users_dict.keys())
print(useridlist[1])

In [ ]:
def make_user_df(user_specific_reviews):
    """Build a dataframe from one user's reviews.

    Input: user_specific_reviews, a list of reviews for a specific user.
    Output: a dataframe with the columns (review_text, rating, biz_id).
    """
    user_reviews = []
    user_ratings = []
    business_ids = []

    for review in user_specific_reviews:
        user_reviews.append(review['text'])
        user_ratings.append(review['stars'])
        business_ids.append(review['business_id'])

    # Optional: strip punctuation from the review text with our own rules
    # (e.g. via str.translate) before building the dataframe.

    user_df = pd.DataFrame({'review_text': user_reviews, 'rating': user_ratings, 'biz_id': business_ids})
    return user_df

In [ ]:
# Test that users_dict and make_user_df work together
user = useridlist[0]
user_specific_reviews = users_dict[user]
user_df = make_user_df(user_specific_reviews)
user_df.head()

2. Create a list of unique business IDs for that user.


In [ ]:
business_ids = list(set(user_df['biz_id']))

3. Connect to the MongoDB server and pull all of the reviews for the restaurants that the user has reviewed.


In [ ]:
# Pull every review for each business the user has reviewed
restreview = {}

for biz_id in business_ids:
    restreview[biz_id] = list(reviews.find({'business_id': biz_id}))

4. Create a restaurant dataframe with the columns (review_text, biz rating, business_id).


In [ ]:
restaurant_df = yml.make_biz_df(user, restreview)

5. Do a 70/30 training/test split, randomizing over the set of the user's reviewed restaurants.


In [ ]:
# Hold out a random contiguous ~30% slice of the user's businesses as the test
# set; the remaining businesses form the training set
split_samp = .30
random_int = random.randint(1, len(business_ids)-1)
len_random = int(len(business_ids) * split_samp)
test_set = business_ids[random_int:random_int+len_random]
training_set = business_ids[0:random_int] + business_ids[random_int+len_random:len(business_ids)]
train_reviews, train_ratings = [], []
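
The slice above holds out one contiguous run of the user's businesses. Since train_test_split is already imported, a non-contiguous random split is a one-liner; a minimal sketch (alt_train_set and alt_test_set are illustrative names, not used below):

In [ ]:
# Alternative split: sample the ~30% test businesses at random rather than
# taking a contiguous slice
alt_train_set, alt_test_set = train_test_split(business_ids, test_size=split_samp, random_state=42)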

In [ ]:
#Create a list of training reviews and training ratings
for rest_id in training_set:
    train_reviews.extend(list(user_df[user_df['biz_id'] == rest_id]['review_text']))
    train_ratings.extend(list(user_df[user_df['biz_id'] == rest_id]['rating']))

In [ ]:
#Transform the star labels into a binary class problem, 0 if rating is < 4 else 1
train_labels = [1 if x >=4 else 0 for x in train_ratings]

6. Train the LSI model on the set of training reviews and get the number of topics used in fitting.


In [ ]:
# This re-implementation is just to show how the model works under the hood
def fit_lsi(train_reviews):
    """Train an LSI feature transformer.

    Input: train_reviews, a list of reviews used to train the LSI model.
    Output: the trained LSI model, the number of topics, and the dictionary.
    """
    stop_words = set(stopwords.words('english'))
    texts = [[word for word in review.lower().split() if word not in stop_words]
             for review in train_reviews]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Pick the number of topics as the mean singular value of the dense
    # term-document matrix
    numpy_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
    singular_values = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
    topics = int(singular_values.mean())

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)

    return lsi, topics, dictionary
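
For a single unseen review, the trained model maps a bag-of-words into a dense vector of length topics holding the LSI weights. A minimal sketch of that per-review transformation, which is presumably what yml.get_lsi_features does for each review (lsi_vector is an illustrative helper):

In [ ]:
def lsi_vector(review, lsi, dictionary, topics):
    # Convert the review to a bag-of-words with the training dictionary,
    # project into LSI space, and pad to a fixed-length dense vector
    bow = dictionary.doc2bow(review.lower().split())
    return matutils.sparse2full(lsi[bow], topics)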

In [ ]:
#Fit LSI model and return number of LSI topics
lsi, topics, dictionary = yml.fit_lsi(train_reviews)

7. Set up the FeatureUnion with the desired features, then fit it on the train reviews and transform them.


In [ ]:
# Make a FeatureUnion object with the desired features, then fit on the train reviews
comb_features = yml.make_featureunion()
comb_features.fit(train_reviews)

train_features = comb_features.transform(train_reviews)
train_lsi = yml.get_lsi_features(train_reviews, lsi, topics, dictionary)
train_features = sparse.hstack((train_features, train_lsi))
train_features = train_features.todense()
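
yml.make_featureunion's internals are not shown in this notebook. A plausible sketch, assuming it unions bag-of-words counts with TF-IDF weights (both vectorizers are imported above), would be:

In [ ]:
def make_featureunion_sketch():
    # Hypothetical stand-in for yml.make_featureunion: stacks raw term counts
    # and TF-IDF weights into one sparse feature matrix
    return FeatureUnion([
        ('counts', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfVectorizer(stop_words='english')),
    ])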

In [ ]:
# Fit each model in turn; each tuple toggles one model flag. Note that only the
# first three flags (svm_clf, RandomForest, nb) are passed to yml.fit_model
# here, so the last two runs fall through to fit_model's default model.
model_runs = [(True, False, False, False, False), (False, True, False, False, False),
              (False, False, True, False, False), (False, False, False, True, False),
              (False, False, False, False, True)]

test_results = {}

for i in tqdm.tqdm(range(0, len(model_runs))):
    clf = yml.fit_model(train_features, train_labels, svm_clf=model_runs[i][0],
                        RandomForest=model_runs[i][1], nb=model_runs[i][2])
    threshold = 0.7
    error = yml.test_user_set(test_set, clf, restaurant_df, user_df, comb_features, threshold, lsi, topics, dictionary)
    test_results[clf] = error

In [ ]:
# Report the log loss for each fitted model (this assumes yml.get_log_loss
# accepts that model's test results; the original code called it with no
# arguments and ignored the loop key)
for clf, results in test_results.items():
    model_log_loss = yml.get_log_loss(results)
    print(model_log_loss)
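
sklearn's log_loss (imported above but otherwise unused) is presumably what yml.get_log_loss wraps. Given true binary labels and predicted class probabilities for the test reviews, the direct call would look like this (test_features and test_labels are illustrative names; the notebook builds its test data inside yml.test_user_set):

In [ ]:
# Direct log-loss computation with sklearn; assumes clf exposes predict_proba,
# which holds for the models above except a bare SVC without probability=True
probs = clf.predict_proba(test_features)
print(log_loss(test_labels, probs))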
