In [ ]:
import json
import pandas as pd
import re
import random
from scipy import sparse
import numpy as np
from pymongo import MongoClient
from nltk.corpus import stopwords
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities, matutils
import tqdm
import sys
sys.path.append('/Users/ed/yelp-classification/machine_learning')
import yelp_ml as yml
#from importlib import reload; reload(yml)  #uncomment to reload yml after edits (Python 3)
In [ ]:
with open('../input/negative-words.txt', 'r', encoding="ISO-8859-1") as f:
    lh_neg = f.read().split('\n')
with open('../input/positive-words.txt', 'r', encoding="ISO-8859-1") as f:
    lh_pos = f.read().split('\n')
with open('../input/many_reviews_dictionary.json') as f:
    users = json.load(f)
word_list = list(set(lh_pos + lh_neg))
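If these are the Hu & Liu opinion-lexicon files (an assumption based on the file names and encoding), they begin with ';'-prefixed header comments and may contain blank lines, so filtering both before building word_list is safer:
In [ ]:
#Assumes the Hu & Liu lexicon format: drop ';'-prefixed headers and empty strings
lh_neg = [w for w in lh_neg if w and not w.startswith(';')]
lh_pos = [w for w in lh_pos if w and not w.startswith(';')]
word_list = list(set(lh_pos + lh_neg))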
In [ ]:
#Fix users JSON: key each user's review list by their user_id
users_dict = {}
user_ids = []
for review_list in users['reviews']:
    users_dict[review_list[0]['user_id']] = review_list
    user_ids.append(review_list[0]['user_id'])
#We have 228 users; create a new dictionary where the user_ids are the keys and the entries are a list of reviews
with open('cleaned_large_user_dictionary.json', 'w') as outfile:
    json.dump(users_dict, outfile)
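For reference, the loop above implies the following shape for the input JSON (a hypothetical example inferred from the indexing, not taken from the actual file):
In [ ]:
#Hypothetical shape of users['reviews'], inferred from the loop above:
#users = {"reviews": [
#    [{"user_id": "u1", "business_id": "b1", "text": "...", "stars": 4},
#     {"user_id": "u1", "business_id": "b2", "text": "...", "stars": 2}],
#    ...one inner list per user...
#]}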
Try running a few tests on a subset of users; the keys are our unique user IDs. We proceed as follows for each user ID:
1. Create a user dataframe with the columns (review_text, review_rating, business_id)
2. Create a list of unique business IDs for that user
3. Connect to the MongoDB server and pull all of the reviews for the restaurants that the user has reviewed
4. Create a restaurant dataframe with the columns (review_text, biz_rating, business_id)
5. Do a 70/30 training/test split, randomizing over the set of the user's reviewed restaurants
6. Train the LSI model on the set of training reviews and get the number of topics used in fitting
7. Set up the FeatureUnion with the desired features, fit it on the train reviews, and transform the train reviews
8. Fit each candidate model on the train features and evaluate it on the held-out test restaurants
In [ ]:
#####Test Machine Learning Algorithms
ip = 'Insert IP here'
conn = MongoClient(ip, 27017)
conn.list_database_names()
db = conn.get_database('cleaned_data')
reviews = db.get_collection('restaurant_reviews')
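To sanity-check the connection before the per-user pulls, one can fetch a single document (the field names 'business_id', 'text', and 'stars' are assumptions based on how the reviews are used below):
In [ ]:
#Optional sanity check: pull one review document and inspect its fields
sample = reviews.find_one()
print(sample)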
1. Create a user dataframe with the columns (review_text, review_rating, business_id)
In [ ]:
useridlist = list(users_dict.keys())
print(useridlist[1])
In [ ]:
def make_user_df(user_specific_reviews):
    #Input: user_specific_reviews, a list of reviews for a specific user
    #Output: A dataframe with the columns (review_text, rating, biz_id)
    user_reviews = []
    user_ratings = []
    business_ids = []
    for review in user_specific_reviews:
        user_reviews.append(review['text'])
        user_ratings.append(review['stars'])
        business_ids.append(review['business_id'])
    #TODO: decide on our own punctuation-stripping rules, e.g. with str.translate:
    #https://www.tutorialspoint.com/python/string_translate.htm
    #user_reviews = [review.translate(str.maketrans('', '', string.punctuation)) for review in user_reviews]
    user_df = pd.DataFrame({'review_text': user_reviews, 'rating': user_ratings, 'biz_id': business_ids})
    return user_df
In [ ]:
#Test that users_dict and make_user_df work
user = useridlist[0]
user_specific_reviews = users_dict[user]
user_df = make_user_df(user_specific_reviews)
user_df.head()
2. Create a list of unique business IDs for that user
In [ ]:
business_ids = list(set(user_df['biz_id']))
3. Connect to the MongoDB server and pull all of the reviews for the restaurants that the user has reviewed
In [ ]:
restreview = {}
for biz_id in business_ids:
    rlist = []
    for obj in reviews.find({'business_id': biz_id}):
        rlist.append(obj)
    restreview[biz_id] = rlist
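As a design note, the loop above issues one query per restaurant; the same pull can be done in a single query with MongoDB's $in operator. A sketch against the same collection:
In [ ]:
#Single-query alternative using MongoDB's $in operator
restreview_alt = {biz_id: [] for biz_id in business_ids}
for obj in reviews.find({'business_id': {'$in': business_ids}}):
    restreview_alt[obj['business_id']].append(obj)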
4. Create a restaurant dataframe with the columns (review_text, biz_rating, business_id)
In [ ]:
restaurant_df = yml.make_biz_df(user, restreview)
5. Do a 70/30 training/test split, randomizing over the set of the user's reviewed restaurants
In [ ]:
#Create a training and test sample from the user's reviewed restaurants (70/30 split)
split_samp = .30
random_int = random.randint(1, len(business_ids)-1)
len_random = int(len(business_ids) * split_samp)
test_set = business_ids[random_int:random_int+len_random]
training_set = business_ids[0:random_int]+business_ids[random_int+len_random:len(business_ids)]
train_reviews, train_ratings = [], []
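Because the slice above takes one contiguous window of business IDs as the test set, neighbors in the original ordering always land in the same split. A sketch of a fully randomized alternative, using only the already-imported random module:
In [ ]:
#Shuffle a copy of the IDs so the test set is not a contiguous block
shuffled_ids = business_ids[:]
random.shuffle(shuffled_ids)
len_test = int(len(shuffled_ids) * split_samp)
test_set_alt = shuffled_ids[:len_test]
training_set_alt = shuffled_ids[len_test:]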
In [ ]:
#Create a list of training reviews and training ratings
for rest_id in training_set:
    train_reviews.extend(list(user_df[user_df['biz_id'] == rest_id]['review_text']))
    train_ratings.extend(list(user_df[user_df['biz_id'] == rest_id]['rating']))
In [ ]:
#Transform the star labels into a binary class problem, 0 if rating is < 4 else 1
train_labels = [1 if x >=4 else 0 for x in train_ratings]
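A quick sanity check of the thresholding on a toy list of ratings (4 and 5 stars map to 1, everything else to 0):
In [ ]:
assert [1 if x >= 4 else 0 for x in [5, 3, 4, 2, 1]] == [1, 0, 1, 0, 0]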
6. Train the LSI model on the set of training reviews and get the number of topics used in fitting
In [ ]:
#This is just for my understanding of how the model works under the hood
def fit_lsi(train_reviews):
    #Input: train_reviews, a list of reviews used to train the LSI feature transformer
    #Output: A trained LSI model, the number of topics, and the gensim dictionary
    stop_words = set(stopwords.words('english'))
    texts = [[word for word in review.lower().split() if word not in stop_words]
             for review in train_reviews]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    #Pick the topic count from the mean singular value of the dense term-document matrix
    numpy_matrix = matutils.corpus2dense(corpus, num_terms=10000)
    singular_values = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
    topics = int(singular_values.mean())
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    return lsi, topics, dictionary
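A tiny illustration of the gensim pieces used inside fit_lsi, on a hypothetical two-review corpus:
In [ ]:
#doc2bow maps each tokenized review to (token_id, count) pairs
demo_texts = [['great', 'tacos', 'great', 'service'], ['slow', 'service']]
demo_dict = corpora.Dictionary(demo_texts)
demo_corpus = [demo_dict.doc2bow(text) for text in demo_texts]
print(demo_corpus)  #e.g. [[(0, 2), (1, 1), (2, 1)], [(1, 1), (3, 1)]]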
In [ ]:
#Fit LSI model and return number of LSI topics
lsi, topics, dictionary = yml.fit_lsi(train_reviews)
7. Set up the FeatureUnion with the desired features, fit it on the train reviews, and transform the train reviews
In [ ]:
#Make a FeatureUnion object with the desired features then fit to train reviews
comb_features = yml.make_featureunion()
comb_features.fit(train_reviews)
train_features = comb_features.transform(train_reviews)
train_lsi = yml.get_lsi_features(train_reviews, lsi, topics, dictionary)
train_features = sparse.hstack((train_features, train_lsi))
train_features = train_features.todense()
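yml.make_featureunion is project-specific code; a minimal sketch of such a FeatureUnion, assuming it combines bag-of-words counts with tf-idf weights (both vectorizers are already imported above), might look like this:
In [ ]:
#Sketch only: the real feature set lives in yelp_ml.make_featureunion
def make_featureunion_sketch():
    return FeatureUnion([
        ('counts', CountVectorizer(stop_words='english', max_features=5000)),
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
    ])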
In [ ]:
#Fit each model in turn; the call below passes only the svm_clf, RandomForest,
#and nb flags, so one run per classifier
model_runs = [(True, False, False), (False, True, False), (False, False, True)]
test_results = {}
for i in tqdm.tqdm(range(0, len(model_runs))):
    clf = yml.fit_model(train_features, train_labels, svm_clf=model_runs[i][0],
                        RandomForest=model_runs[i][1], nb=model_runs[i][2])
    threshold = 0.7
    error = yml.test_user_set(test_set, clf, restaurant_df, user_df, comb_features, threshold, lsi, topics, dictionary)
    test_results[clf] = error
In [ ]:
#Get the log loss for each fitted model
for key in test_results.keys():
    results = test_results[key]
    model_log_loss = yml.get_log_loss()
    print(model_log_loss)
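yml.get_log_loss is also project code; given true labels and predicted class probabilities, sklearn's log_loss (imported above) computes the same metric. A self-contained toy example:
In [ ]:
#Toy data only: each row holds predicted probabilities for classes [0, 1]
toy_true = [1, 0, 1]
toy_probs = [[0.2, 0.8], [0.7, 0.3], [0.4, 0.6]]
print(log_loss(toy_true, toy_probs))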