In [1]:
import re
import warnings
import pandas as pd
from bs4 import BeautifulSoup
import mongo
warnings.filterwarnings("ignore")
In [2]:
# Connect to MongoDB using the client's default settings (no host/port is given)
# and select the 'Yelp' database.
from pymongo import MongoClient
client = MongoClient()
db = client['Yelp']
# One unfiltered query per collection ({} matches every document).
# mongo_business and mongo_user are materialised further down with list(...).
# NOTE(review): presumably each find() result can only be iterated once, which
# would explain why mongo_user is re-created before it is used — confirm.
mongo_review = db['review'].find({})
mongo_tip = db['tip'].find({})
mongo_business = db['business'].find({})
mongo_user = db['user'].find({})
In [3]:
# Materialise the business query and keep only the join key and the category labels.
business = pd.DataFrame(list(mongo_business))[['business_id', 'categories']]
business.head()
Out[3]:
In [4]:
# One-hot encode the per-business category lists: join each list into a single
# delimited string, then let get_dummies split it back into indicator columns.
# The delimiter is '|' rather than ',' because a category label may itself
# contain a comma (e.g. "Beer, Wine & Spirits"); with ',' such a label would be
# split into several bogus columns.
categories_df = business['categories'].str.join(sep='|').str.get_dummies(sep='|')
# Array of the distinct category names (the indicator column labels).
categories = categories_df.columns.values
# Attach the indicator columns to the business frame, aligned on the row index.
business = pd.merge(business, categories_df, left_index=True, right_index=True)
# Convert each category collection to a tuple.
business['categories'] = business['categories'].apply(tuple)
In [5]:
# Tally how many times each category label occurs across all businesses.
cat = {}
for labels in business['categories']:
    for label in labels:
        cat[label] = cat.get(label, 0) + 1
In [36]:
import operator
# Rank the (category, count) pairs from most to least frequent;
# itemgetter(1) selects the count as the sort key.
a = sorted(cat.items(), key=operator.itemgetter(1) ,reverse = True)
In [37]:
b = a[:10]
b
Out[37]:
In [38]:
mongo_user = db['user'].find({})
In [39]:
user = pd.DataFrame(list(mongo_user))
In [41]:
user.to_pickle('user.pkl')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Load every review document, keep the join key and the text,
# and expose the text column under the name 'review'.
reviewData = (
    pd.DataFrame(list(mongo.mongo_review))[['business_id', 'text']]
    .rename(columns={'text': 'review'})
)
In [ ]:
# FIXME: unfinished cell. The loop header below was left without a colon or a
# body, so running the cell raised SyntaxError. It is kept as a comment until
# the intended body is written.
# for item in business['categories']:
#     ...
In [ ]:
In [2]:
tip = pd.read_pickle('tip.pkl')
tip.head()
Out[2]:
In [ ]:
def replicateClasses(review):
    """Expand a frame so that every category of a row gets a row of its own.

    Parameters
    ----------
    review : pandas.DataFrame
        Frame whose columns are, in order, ``business_id``, a text column
        (its name is taken from ``review.columns[1]``) and ``categories``.
        Each ``categories`` cell must be an iterable of labels.

    Returns
    -------
    pandas.DataFrame
        Frame with the same column names as ``review`` and one row per
        (input row, label) pair; the ``categories`` cell holds a single
        label. Rows keep the input order and the index runs from 0.

    Side effects
    ------------
    Prints the total number of labels that were expanded.
    """
    cols = review.columns
    # Walk the three columns in lock-step. Iterating positionally (instead of
    # looking rows up by integer label) keeps this correct for any row index.
    # All output rows are collected first and the frame is built once, which
    # avoids the quadratic cost of appending to a DataFrame row by row.
    records = [
        (business_id, text, item)
        for business_id, text, labels in zip(
            review["business_id"], review[cols[1]], review["categories"]
        )
        for item in labels
    ]
    count = len(records)
    print("Total Categories:\t",count)
    return pd.DataFrame(records, columns=cols)
In [ ]:
tip = replicateClasses(tip)
In [ ]:
tip.head()
In [ ]:
In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")
In [3]:
review_multi = pd.read_pickle('review.pkl')
review_multi.head()
Out[3]:
In [7]:
# Work on the first 8041 rows only, then hold out 20% of them for testing
# (fixed random_state so the split is repeatable).
review_multi = review_multi[:8041]
X_train, X_test, y_train, y_test = train_test_split(
    review_multi["review"],
    review_multi["categories"],
    test_size=0.20,
    random_state=4212,
)
# The TF-IDF vocabulary is learned from the training text only;
# the test text is then projected onto that same vocabulary.
vectorizer = TfidfVectorizer()
tfidfXtrain = vectorizer.fit_transform(X_train)
tfidfXtest = vectorizer.transform(X_test)
In [5]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
#result = lb.inverse_transform(result)
#output = pd.DataFrame( data={"predicted":result,"actual":y_test,'review':X_test} )
#string = output.iloc[0]['review']
#output.head()
In [6]:
# Encode the label collections as a binary indicator matrix (one column per class).
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
# transform, not fit_transform: refitting on the test labels would rebuild the
# class-to-column mapping from the test set alone, so the test columns would no
# longer line up with the columns the classifier is trained on.
y_test = lb.transform(y_test)
# One linear SVM per class, trained on the TF-IDF features.
one = OneVsRestClassifier(LinearSVC()).fit(tfidfXtrain, y_train)
result = one.predict(tfidfXtest)
In [9]:
# Encode the label collections as a binary indicator matrix (one column per class).
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
# transform, not fit_transform: refitting on the test labels would rebuild the
# class-to-column mapping from the test set alone, so the test columns would no
# longer line up with the training columns.
y_test = lb.transform(y_test)
In [22]:
# First 5000 rows; column positions 1 and 2 are used as the text and label columns
# (presumably 'review' and 'categories' — confirm against review.pkl).
# A scalar column position yields a Series, so the vectorizer and the binarizer
# iterate over the 5000 cell values. The previous slices (1:2 / 2:3) produced
# one-column DataFrames, whose iteration yields the column labels instead of the rows.
train = review_multi.iloc[:5000, 1]
labl = review_multi.iloc[:5000, 2]
vectorizer = TfidfVectorizer()
tfidfXtrain = vectorizer.fit_transform(train)
lb = MultiLabelBinarizer()
y_test = lb.fit_transform(labl)
In [10]:
# Random forest on the TF-IDF features: 100 trees, 200 candidate features per
# split, trained on 4 workers. random_state is pinned (same seed as the
# train/test split) so that re-running the cell reproduces the same model.
forest = RandomForestClassifier(
    max_features=200,
    n_estimators=100,
    n_jobs=4,
    random_state=4212,
)
forest = forest.fit(tfidfXtrain, y_train)
result = forest.predict(tfidfXtest)
In [12]:
test = forest.predict_proba(tfidfXtest)
In [21]:
#lb.inverse_transform(result)
# NOTE(review): `result` comes from forest.predict, so it is presumably already a
# binary indicator matrix; transform() expects collections of labels, which makes
# inverse_transform() look like the intended call — confirm that `lb` was fitted
# on the same labels the forest was trained on before switching.
lb.transform(result)
In [8]:
# Self-contained check of MultiOutputRegressor on synthetic data
# (10 samples, 3 targets); it does not use the Yelp data.
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=10, n_targets=3, random_state=1)
# Fit one gradient-boosting regressor per target and predict on the training inputs.
MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X, y).predict(X)
Out[8]:
In [16]:
# The cell was left as a dangling attribute access ("y_test."), which is a SyntaxError.
# Show the shape instead of dumping the whole label matrix.
y_test.shape
Out[16]:
In [ ]:
review = pd.read_pickle('final.pkl')
len(review)
In [3]:
# Work on the first 8041 rows only, then hold out 20% of them for testing
# (fixed random_state so the split is repeatable).
review = review[:8041]
X_train, X_test, y_train, y_test = train_test_split(
    review["review"],
    review["categories"],
    test_size=0.20,
    random_state=4212,
)
# The TF-IDF vocabulary is learned from the training text only;
# the test text is then projected onto that same vocabulary.
vectorizer = TfidfVectorizer()
tfidfXtrain = vectorizer.fit_transform(X_train)
tfidfXtest = vectorizer.transform(X_test)
In [4]:
len(review)
Out[4]:
In [5]:
# Random forest on the TF-IDF features: 100 trees, 200 candidate features per
# split, trained on 4 workers. random_state is pinned (same seed as the
# train/test split) so that re-running the cell reproduces the same model.
forest = RandomForestClassifier(
    max_features=200,
    n_estimators=100,
    n_jobs=4,
    random_state=4212,
)
forest = forest.fit(tfidfXtrain, y_train)
result = forest.predict(tfidfXtest)
In [6]:
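# Reference: "Yelp Advisor Report" (PDF). Only a local copy is cited here
# (file:///C:/Users/Arihant/Downloads/Yelp_Advisor_Report.pdf), which other readers
# cannot open — TODO: replace with a shareable link or full citation.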
In [7]:
len(result)
Out[7]:
In [8]:
len(y_test)
Out[8]:
In [9]:
# Put each prediction next to its true label and the review text it was made for.
output = pd.DataFrame({"predicted": result, "actual": y_test, "review": X_test})
# Text of the first row of the table; later cells use it as the reference review.
string = output["review"].iloc[0]
output.head()
Out[9]:
In [10]:
print ("accuracy_score: ", accuracy_score(y_test.values,result))
In [11]:
# Print the predicted label of every row whose text is exactly the reference review.
# Equality is used instead of a substring test (`in`): a substring test also matches
# any shorter review contained in the reference text, and an empty review matches always.
for predicted in output.loc[output['review'] == string, 'predicted']:
    print(predicted)
In [12]:
# Print the recorded category of every row whose text is exactly the reference review.
# Equality is used instead of a substring test (`in`): a substring test also matches
# any shorter review contained in the reference text, and an empty review matches always.
for categories_value in review.loc[review['review'] == string, 'categories']:
    print(categories_value)
In [ ]:
In [ ]:
In [ ]:
In [3]:
dataset = replicateClasses(review)
dataset.head()
Out[3]:
In [4]:
len(dataset)
Out[4]:
In [5]:
dataset.to_pickle('final.pkl')
In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 5 13:09:04 2016
@author: Vipul Munot
"""
import re
import warnings
import pandas as pd
from bs4 import BeautifulSoup
import mongo
warnings.filterwarnings("ignore")
In [2]:
def convert_words( raw_review ):
    """Normalise a raw text into lower-case, space-separated alphabetic words.

    The text is parsed with BeautifulSoup (lxml parser) and reduced to its
    plain text, every character other than a-z / A-Z is replaced by a space,
    and the remaining words are lower-cased and joined with single spaces.
    """
    plain_text = BeautifulSoup(raw_review, 'lxml').get_text()
    tokens = re.sub("[^a-zA-Z]", " ", plain_text).lower().split()
    return " ".join(tokens)
In [3]:
# Businesses: keep only the join key and the category labels.
business = pd.DataFrame(list(mongo.mongo_business))[['business_id', 'categories']]
# Tips: keep the join key and the text, expose the text as 'tip',
# then normalise every tip with convert_words.
tipData = pd.DataFrame(list(mongo.mongo_tip))[['business_id', 'text']]
tipData = tipData.rename(columns={'text': 'tip'})
tipData.loc[:, 'tip'] = tipData['tip'].map(convert_words)
In [4]:
tip = pd.merge(tipData, business, on='business_id')
In [5]:
# Reviews: keep the join key and the text, and expose the text as 'review'.
reviewData = (
    pd.DataFrame(list(mongo.mongo_review))[['business_id', 'text']]
    .rename(columns={'text': 'review'})
)
# Normalise the review text in its own column. The cleaned text used to be written
# to a new 'tip' column (copy-paste slip from the tip cell), which left 'review'
# uncleaned and added an extra column to the frame.
reviewData.loc[:, 'review'] = reviewData['review'].map(convert_words)
In [6]:
review = pd.merge(reviewData, business, on='business_id')
In [7]:
review.head()
Out[7]:
In [8]:
tip.head()
Out[8]:
In [ ]: