In [1]:
import re
import warnings
import pandas as pd
from bs4 import BeautifulSoup
import mongo
warnings.filterwarnings("ignore")

In [2]:
from pymongo import MongoClient

client = MongoClient()


db = client['Yelp']
mongo_review = db['review'].find({})
mongo_tip = db['tip'].find({})
mongo_business = db['business'].find({})
mongo_user = db['user'].find({})

In [3]:
business =  pd.DataFrame(list(mongo_business))
business = business[['business_id','categories']]
business.head()


Out[3]:
business_id categories
0 5UmKMjUEUNdYWqANhGckJw [Fast Food, Restaurants]
1 UsFtqoBl7naz8AVUBZMjQQ [Nightlife]
2 cE27W9VPgO88Qxe4ol6y_g [Active Life, Mini Golf, Golf]
3 mVHrayjG3uZ_RLHkLj-AMg [Bars, American (New), Nightlife, Lounges, Res...
4 mYSpR_SLPgUVymYOvTQd_Q [Active Life, Golf]

In [4]:
categories_df = business['categories'].str.join(sep=',').str.get_dummies(sep=',')
categories = categories_df.columns.values
business = pd.merge(business, categories_df, left_index = True, right_index = True)
business['categories'] = business['categories'].apply(lambda x: tuple(x))

In [5]:
cat = {}
for index, row in business.iterrows():
    for item in row['categories']:
        if item in cat:
            cat[item]+=1
        else:
            cat[item]=1

In [36]:
import operator
a = sorted(cat.items(), key=operator.itemgetter(1) ,reverse = True)

In [37]:
b = a[:10]
b


Out[37]:
[('Restaurants', 26729),
 ('Shopping', 12444),
 ('Food', 10143),
 ('Beauty & Spas', 7490),
 ('Health & Medical', 6106),
 ('Home Services', 5866),
 ('Nightlife', 5507),
 ('Automotive', 4888),
 ('Bars', 4727),
 ('Local Services', 4041)]

In [38]:
mongo_user = db['user'].find({})

In [39]:
user =  pd.DataFrame(list(mongo_user))

In [41]:
user.to_pickle('user.pkl')

In [ ]:


In [ ]:


In [ ]:


In [ ]:
reviewData =  pd.DataFrame(list(mongo.mongo_review))
reviewData = reviewData[['business_id','text']]
reviewData.columns = ['business_id','review']

In [ ]:
for item in business['categories']

In [ ]:


In [2]:
tip = pd.read_pickle('tip.pkl')
tip.head()


Out[2]:
business_id tip categories
0 5UmKMjUEUNdYWqANhGckJw pizza is garbage hoagies are excellent [Fast Food, Restaurants]
1 cE27W9VPgO88Qxe4ol6y_g don t waste your time [Active Life, Mini Golf, Golf]
2 mVHrayjG3uZ_RLHkLj-AMg not easy to find be sure to put in directions ... [Bars, American (New), Nightlife, Lounges, Res...
3 mVHrayjG3uZ_RLHkLj-AMg your gps will not allow you to find this place... [Bars, American (New), Nightlife, Lounges, Res...
4 KayYbHCt-RkbGcPdGOThNg great drink specials [Bars, American (Traditional), Nightlife, Rest...

In [ ]:
def replicateClasses(review):
    cols = review.columns
    categories = review["categories"]    
    data = pd.DataFrame(columns=cols)   
    r_index = 0
    count = 0
    for i in range(len(categories)):
        for item in categories[i]:
            # print(item)
            data.loc[len(data)-1] = [review["business_id"][r_index], review[cols[1]][r_index], item]
        r_index += 1
        count += len(categories[i])
    print("Total Categories:\t",count)
    return (data)

In [ ]:
tip = replicateClasses(tip)

In [ ]:
tip.head()

In [ ]:


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings("ignore")

In [3]:
review_multi = pd.read_pickle('review.pkl')
review_multi.head()


Out[3]:
business_id review categories
0 5UmKMjUEUNdYWqANhGckJw mr hoagie is an institution walking in it does... [Fast Food, Restaurants]
1 5UmKMjUEUNdYWqANhGckJw excellent food superb customer service i miss ... [Fast Food, Restaurants]
2 5UmKMjUEUNdYWqANhGckJw yes this place is a little out dated and not o... [Fast Food, Restaurants]
3 5UmKMjUEUNdYWqANhGckJw pros italian hoagie was delicious friendly cou... [Fast Food, Restaurants]
4 5UmKMjUEUNdYWqANhGckJw first the only reason this place could possibl... [Fast Food, Restaurants]

In [7]:
review_multi = review_multi[:8041]
X_train, X_test, y_train, y_test = train_test_split(review_multi["review"], review_multi["categories"], test_size=0.20, random_state=4212)
vectorizer = TfidfVectorizer() 
tfidfXtrain = vectorizer.fit_transform(X_train)
tfidfXtest = vectorizer.transform(X_test)

In [5]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

#result = lb.inverse_transform(result)
#output = pd.DataFrame( data={"predicted":result,"actual":y_test,'review':X_test} )
#string = output.iloc[0]['review']
#output.head()

In [6]:
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.fit_transform(y_test)
one = OneVsRestClassifier(LinearSVC())
one = one.fit( tfidfXtrain, y_train )
result = one.predict(tfidfXtest)

In [9]:
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.fit_transform(y_test)

In [22]:
train = review_multi.iloc[:5000,1:2]
labl = review_multi.iloc[:5000,2:3]
vectorizer = TfidfVectorizer() 
tfidfXtrain = vectorizer.fit_transform(train)
lb = MultiLabelBinarizer()
y_test = lb.fit_transform(labl)

In [10]:
forest = RandomForestClassifier(max_features=200,n_estimators=100, n_jobs=4)
forest = forest.fit( tfidfXtrain, y_train )
result = forest.predict(tfidfXtest)

In [12]:
test = forest.predict_proba(tfidfXtest)

In [21]:
#lb.inverse_transform(result)
lb.transform(result)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-21-f39f4a3c8676> in <module>()
      1 #lb.inverse_transform(result)
----> 2 lb.transform(result)

C:\Users\Arihant\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py in transform(self, y)
    761 
    762         class_to_index = dict(zip(self.classes_, range(len(self.classes_))))
--> 763         yt = self._transform(y, class_to_index)
    764 
    765         if not self.sparse_output:

C:\Users\Arihant\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py in _transform(self, y, class_mapping)
    785         indptr = array.array('i', [0])
    786         for labels in y:
--> 787             indices.extend(set(class_mapping[label] for label in labels))
    788             indptr.append(len(indices))
    789         data = np.ones(len(indices), dtype=int)

C:\Users\Arihant\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py in <genexpr>(.0)
    785         indptr = array.array('i', [0])
    786         for labels in y:
--> 787             indices.extend(set(class_mapping[label] for label in labels))
    788             indptr.append(len(indices))
    789         data = np.ones(len(indices), dtype=int)

KeyError: 0.0

In [8]:
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=10, n_targets=3, random_state=1)
MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X, y).predict(X)


Out[8]:
array([[-154.75474165, -147.03498585,  -50.03812219],
       [   7.12165031,    5.12914884,  -81.46081961],
       [-187.8948621 , -100.44373091,   13.88978285],
       [-141.62745778,   95.02891072, -191.48204257],
       [  97.03260883,  165.34867495,  139.52003279],
       [ 123.92529176,   21.25719016,   -7.84253   ],
       [-122.25193977,  -85.16443186, -107.12274212],
       [ -30.170388  ,  -94.80956739,   12.16979946],
       [ 140.72667194,  176.50941682,  -17.50447799],
       [ 149.37967282,  -81.15699552,   -5.72850319]])

In [16]:
y_test.


Out[16]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [ ]:
review = pd.read_pickle('final.pkl')
len(review)

In [3]:
review = review[:8041]
X_train, X_test, y_train, y_test = train_test_split(review["review"], review["categories"], test_size=0.20, random_state=4212)
vectorizer = TfidfVectorizer() 
tfidfXtrain = vectorizer.fit_transform(X_train)
tfidfXtest = vectorizer.transform(X_test)

In [4]:
len(review)


Out[4]:
8041

In [5]:
forest = RandomForestClassifier(max_features=200,n_estimators=100, n_jobs=4)
forest = forest.fit( tfidfXtrain, y_train )
result = forest.predict(tfidfXtest)

In [6]:
# Reference: file:///C:/Users/Arihant/Downloads/Yelp_Advisor_Report.pdf

In [7]:
len(result)


Out[7]:
1609

In [8]:
len(y_test)


Out[8]:
1609

In [9]:
output = pd.DataFrame( data={"predicted":result,"actual":y_test,'review':X_test} )
string = output.iloc[0]['review']
output.head()


Out[9]:
actual predicted review
116 Lounges Bars we have been coming here for over years consis...
2217 Nightlife Arcades pm on a friday afternoon and i got turned away...
5061 Restaurants American (Traditional) we were really looking forward to the acclaime...
7108 Cafes Restaurants finally a great neighborhood coffee shop that ...
4211 Gluten-Free Restaurants we hadn t been at pf changs for awhile and now...

In [10]:
print ("accuracy_score: ", accuracy_score(y_test.values,result))


accuracy_score:  0.0205096333126

In [11]:
for index, row in output.iterrows():
    if row['review'] in string:
        print(row['predicted'])


Bars
Bars

In [12]:
for index, row in review.iterrows():
    if row['review'] in string:
        print(row['categories'])


Bars
American (New)
Nightlife
Lounges
Restaurants

In [ ]:


In [ ]:


In [ ]:


In [3]:
dataset = replicateClasses(review)
dataset.head()


Out[3]:
business_id review categories
-1 5UmKMjUEUNdYWqANhGckJw mr hoagie is an institution walking in it does... Fast Food
0 5UmKMjUEUNdYWqANhGckJw mr hoagie is an institution walking in it does... Restaurants
1 5UmKMjUEUNdYWqANhGckJw excellent food superb customer service i miss ... Fast Food
2 5UmKMjUEUNdYWqANhGckJw excellent food superb customer service i miss ... Restaurants
3 5UmKMjUEUNdYWqANhGckJw yes this place is a little out dated and not o... Fast Food
4 5UmKMjUEUNdYWqANhGckJw yes this place is a little out dated and not o... Restaurants
5 5UmKMjUEUNdYWqANhGckJw pros italian hoagie was delicious friendly cou... Fast Food
6 5UmKMjUEUNdYWqANhGckJw pros italian hoagie was delicious friendly cou... Restaurants
7 5UmKMjUEUNdYWqANhGckJw first the only reason this place could possibl... Fast Food
8 5UmKMjUEUNdYWqANhGckJw first the only reason this place could possibl... Restaurants
9 5UmKMjUEUNdYWqANhGckJw normally i do not do reviews of an establishme... Fast Food
10 5UmKMjUEUNdYWqANhGckJw normally i do not do reviews of an establishme... Restaurants
11 5UmKMjUEUNdYWqANhGckJw i like this place a lot it s a good toasted ho... Fast Food
12 5UmKMjUEUNdYWqANhGckJw i like this place a lot it s a good toasted ho... Restaurants
13 UsFtqoBl7naz8AVUBZMjQQ all the food is great here but the best thing ... Nightlife
14 UsFtqoBl7naz8AVUBZMjQQ we checked this place out this past monday for... Nightlife
15 UsFtqoBl7naz8AVUBZMjQQ wing sauce is like water pretty much a lot of ... Nightlife
16 UsFtqoBl7naz8AVUBZMjQQ cold cheap beer good bar food good service loo... Nightlife
17 UsFtqoBl7naz8AVUBZMjQQ possibly the most overhyped establishment in a... Nightlife
18 cE27W9VPgO88Qxe4ol6y_g decent range somewhat close to the city the ma... Active Life
19 cE27W9VPgO88Qxe4ol6y_g decent range somewhat close to the city the ma... Mini Golf
20 cE27W9VPgO88Qxe4ol6y_g decent range somewhat close to the city the ma... Golf
21 cE27W9VPgO88Qxe4ol6y_g owning a driving range inside the city limits ... Active Life
22 cE27W9VPgO88Qxe4ol6y_g owning a driving range inside the city limits ... Mini Golf
23 cE27W9VPgO88Qxe4ol6y_g owning a driving range inside the city limits ... Golf
24 cE27W9VPgO88Qxe4ol6y_g this place is absolute garbage half of the tee... Active Life
25 cE27W9VPgO88Qxe4ol6y_g this place is absolute garbage half of the tee... Mini Golf
26 cE27W9VPgO88Qxe4ol6y_g this place is absolute garbage half of the tee... Golf
27 cE27W9VPgO88Qxe4ol6y_g before i finally made it over to this range i ... Active Life
28 cE27W9VPgO88Qxe4ol6y_g before i finally made it over to this range i ... Mini Golf
... ... ... ...
95010 BOp3jcpYy31M85H1DLnEEw i ve been to upstream twice now and had except... Seafood
95011 BOp3jcpYy31M85H1DLnEEw i ve been to upstream twice now and had except... Sushi Bars
95012 BOp3jcpYy31M85H1DLnEEw i ve been to upstream twice now and had except... Restaurants
95013 BOp3jcpYy31M85H1DLnEEw i was amazed on my first visit to upstream i e... Seafood
95014 BOp3jcpYy31M85H1DLnEEw i was amazed on my first visit to upstream i e... Sushi Bars
95015 BOp3jcpYy31M85H1DLnEEw i was amazed on my first visit to upstream i e... Restaurants
95016 BOp3jcpYy31M85H1DLnEEw disappointed every time i ve gone won t be try... Seafood
95017 BOp3jcpYy31M85H1DLnEEw disappointed every time i ve gone won t be try... Sushi Bars
95018 BOp3jcpYy31M85H1DLnEEw disappointed every time i ve gone won t be try... Restaurants
95019 BOp3jcpYy31M85H1DLnEEw upstream is conveniently located in phillips p... Seafood
95020 BOp3jcpYy31M85H1DLnEEw upstream is conveniently located in phillips p... Sushi Bars
95021 BOp3jcpYy31M85H1DLnEEw upstream is conveniently located in phillips p... Restaurants
95022 BOp3jcpYy31M85H1DLnEEw i went here for lunch time restaurant feel rom... Seafood
95023 BOp3jcpYy31M85H1DLnEEw i went here for lunch time restaurant feel rom... Sushi Bars
95024 BOp3jcpYy31M85H1DLnEEw i went here for lunch time restaurant feel rom... Restaurants
95025 BOp3jcpYy31M85H1DLnEEw while in charlotte for business we ventured to... Seafood
95026 BOp3jcpYy31M85H1DLnEEw while in charlotte for business we ventured to... Sushi Bars
95027 BOp3jcpYy31M85H1DLnEEw while in charlotte for business we ventured to... Restaurants
95028 BOp3jcpYy31M85H1DLnEEw i brought my partner to upstream for his first... Seafood
95029 BOp3jcpYy31M85H1DLnEEw i brought my partner to upstream for his first... Sushi Bars
95030 BOp3jcpYy31M85H1DLnEEw i brought my partner to upstream for his first... Restaurants
95031 BOp3jcpYy31M85H1DLnEEw i am a big fan of this place it was home away ... Seafood
95032 BOp3jcpYy31M85H1DLnEEw i am a big fan of this place it was home away ... Sushi Bars
95033 BOp3jcpYy31M85H1DLnEEw i am a big fan of this place it was home away ... Restaurants
95034 BOp3jcpYy31M85H1DLnEEw came here for restaurant week last july when i... Seafood
95035 BOp3jcpYy31M85H1DLnEEw came here for restaurant week last july when i... Sushi Bars
95036 BOp3jcpYy31M85H1DLnEEw came here for restaurant week last july when i... Restaurants
95037 BOp3jcpYy31M85H1DLnEEw go on half priced bottle night i went one nigh... Seafood
95038 BOp3jcpYy31M85H1DLnEEw go on half priced bottle night i went one nigh... Sushi Bars
95039 BOp3jcpYy31M85H1DLnEEw go on half priced bottle night i went one nigh... Restaurants

95041 rows × 3 columns


In [4]:
len(dataset)


Out[4]:
95041

In [5]:
dataset.to_pickle('final.pkl')

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sat Nov  5 13:09:04 2016

@author: Vipul Munot
"""
import re
import warnings
import pandas as pd
from bs4 import BeautifulSoup
import mongo
warnings.filterwarnings("ignore")

In [2]:
def convert_words( raw_review ):
    review_text = BeautifulSoup(raw_review,'lxml').get_text() 
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    words = letters_only.lower().split()                             
    return( " ".join( words ))

In [3]:
business =  pd.DataFrame(list(mongo.mongo_business))
business = business[['business_id','categories']]
tipData =  pd.DataFrame(list(mongo.mongo_tip))
tipData = tipData[['business_id','text']]
tipData.columns = ['business_id','tip']
tipData.loc[:,'tip'] = tipData['tip'].map(convert_words)

In [4]:
tip = pd.merge(tipData, business, on='business_id')

In [5]:
reviewData =  pd.DataFrame(list(mongo.mongo_review))
reviewData = reviewData[['business_id','text']]
reviewData.columns = ['business_id','review']
reviewData.loc[:,'tip'] = reviewData['review'].map(convert_words)

In [6]:
review = pd.merge(reviewData, business, on='business_id')

In [7]:
review.head()


Out[7]:
business_id review tip categories
0 5UmKMjUEUNdYWqANhGckJw Mr Hoagie is an institution. Walking in, it do... mr hoagie is an institution walking in it does... [Fast Food, Restaurants]
1 5UmKMjUEUNdYWqANhGckJw Excellent food. Superb customer service. I mis... excellent food superb customer service i miss ... [Fast Food, Restaurants]
2 5UmKMjUEUNdYWqANhGckJw Yes this place is a little out dated and not o... yes this place is a little out dated and not o... [Fast Food, Restaurants]
3 5UmKMjUEUNdYWqANhGckJw PROS: Italian hoagie was delicious. Friendly ... pros italian hoagie was delicious friendly cou... [Fast Food, Restaurants]
4 5UmKMjUEUNdYWqANhGckJw First the only reason this place could possibl... first the only reason this place could possibl... [Fast Food, Restaurants]

In [8]:
tip.head()


Out[8]:
business_id tip categories
0 5UmKMjUEUNdYWqANhGckJw pizza is garbage hoagies are excellent [Fast Food, Restaurants]
1 cE27W9VPgO88Qxe4ol6y_g don t waste your time [Active Life, Mini Golf, Golf]
2 mVHrayjG3uZ_RLHkLj-AMg not easy to find be sure to put in directions ... [Bars, American (New), Nightlife, Lounges, Res...
3 mVHrayjG3uZ_RLHkLj-AMg your gps will not allow you to find this place... [Bars, American (New), Nightlife, Lounges, Res...
4 KayYbHCt-RkbGcPdGOThNg great drink specials [Bars, American (Traditional), Nightlife, Rest...

In [ ]: