yelp_reviews

a notebook that adds yelp hotel reviews to the AWS RDS MySQL DB



In [1]:

    
import pandas as pd
import json
import sqlalchemy
import connect_aws_db as cadb



In [2]:

    
%load_ext autoreload



In [3]:

    
%autoreload 2



In [4]:

    
data = []
with open('../../data/yelp_academic_dataset_review.json') as the_dat:
    for line in the_dat:
        data.append(json.loads(line))



In [5]:

    
data[0]









    Out[5]:





{u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
 u'date': u'2007-05-17',
 u'review_id': u'15SdjuK7DmYqUAj6rjGowg',
 u'stars': 5,
 u'text': u"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 u'type': u'review',
 u'user_id': u'Xqd0DzHaiyRqVH3WRG7hzg',
 u'votes': {u'cool': 1, u'funny': 0, u'useful': 2}}



In [6]:

    
df = pd.io.json.json_normalize(data)



In [7]:

    
for col in df.columns:
    print(col)









    



business_id
date
review_id
stars
text
type
user_id
votes.cool
votes.funny
votes.useful



In [8]:

    
df.head(3)









    Out[8]:






  
    
      
      business_id
      date
      review_id
      stars
      text
      type
      user_id
      votes.cool
      votes.funny
      votes.useful
    
  
  
    
      0
      vcNAWiLM4dR7D2nwwJ7nCA
      2007-05-17
      15SdjuK7DmYqUAj6rjGowg
      5
      dr. goldberg offers everything i look for in a...
      review
      Xqd0DzHaiyRqVH3WRG7hzg
      1
      0
      2
    
    
      1
      vcNAWiLM4dR7D2nwwJ7nCA
      2010-03-22
      RF6UnRTtG7tWMcrO2GEoAg
      2
      Unfortunately, the frustration of being Dr. Go...
      review
      H1kH6QZV7Le4zqTRNxoZow
      0
      0
      2
    
    
      2
      vcNAWiLM4dR7D2nwwJ7nCA
      2012-02-14
      -TsVN230RCkLYKBeLsuz7A
      4
      Dr. Goldberg has been my doctor for years and ...
      review
      zvJCcrpm2yOZrxKffwGQLA
      1
      0
      1



In [9]:

    
len(df)









    Out[9]:





1569264



In [10]:

    
engine = cadb.connect_aws_db()



In [11]:

    
conn = engine.connect()



In [12]:

    
cmd = "SELECT DISTINCT business_id FROM yelp_hotels"



In [13]:

    
hotel_biz_ids = conn.execute(cmd).fetchall()



In [14]:

    
len(hotel_biz_ids)









    Out[14]:





128



In [15]:

    
hbis = [id[0] for id in hotel_biz_ids]



In [16]:

    
hbis[:3]









    Out[16]:





['bWWrrsPWuoHuAGUCgH8Tyg', 'uXuiI-Y998JChGn-ebcSdw', 'i2CtyEZiC_164MdfWjdwsg']



In [17]:

    
hotel_reviewdfbig = df[df['business_id'].isin(hbis)].copy()



In [18]:

    
len(hotel_reviewdfbig)









    Out[18]:





6263



In [19]:

    
hotel_reviewdfbig.columns









    Out[19]:





Index([u'business_id', u'date', u'review_id', u'stars', u'text', u'type', u'user_id', u'votes.cool', u'votes.funny', u'votes.useful'], dtype='object')



In [20]:

    
hotel_reviewdf = hotel_reviewdfbig[['business_id', 'date', 'review_id', 'stars', 'text', 'user_id']].copy()



In [21]:

    
hotel_reviewdf.columns = ['business_id',
                          'yelp_review_date',
                          'yelp_review_id',
                          'review_rating',
                          'review_text',
                          'user_id']



In [22]:

    
hotel_reviewdf['rev_id'] = 0



In [23]:

    
hotel_reviewdf.head(1)









    Out[23]:






  
    
      
      business_id
      yelp_review_date
      yelp_review_id
      review_rating
      review_text
      user_id
      rev_id
    
  
  
    
      7023
      bWWrrsPWuoHuAGUCgH8Tyg
      2007-10-28
      TMiAAS5RB-P5EwXs7B9aBw
      4
      I stayed at the hotel for several months durin...
      ZqoMKTrJOtyYZOcvP8rTUA
      0

Now writing to SQL



In [24]:

    
from connect_aws_db import connect_aws_db



In [25]:

    
engine = cadb.connect_aws_db(write_unicode=True)



In [26]:

    
conn = engine.connect()



In [27]:

    
cmd = "DROP TABLE yelp_reviews"



In [28]:

    
result = conn.execute(cmd)



In [29]:

    
cmd = """
        CREATE TABLE yelp_reviews
        (
        rev_id MEDIUMINT AUTO_INCREMENT,
        business_id VARCHAR(256),
        yelp_review_date DATE,
        yelp_review_id VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        user_id VARCHAR(256),
        PRIMARY KEY (rev_id)
        )
        """



In [30]:

    
result = conn.execute(cmd)



In [31]:

    
hotel_reviewdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)



In [ ]:

	business_id	date	review_id	stars	text	type	user_id	votes.cool	votes.useful
0	vcNAWiLM4dR7D2nwwJ7nCA	2007-05-17	15SdjuK7DmYqUAj6rjGowg	5	dr. goldberg offers everything i look for in a...	review	Xqd0DzHaiyRqVH3WRG7hzg	1	2
1	vcNAWiLM4dR7D2nwwJ7nCA	2010-03-22	RF6UnRTtG7tWMcrO2GEoAg	2	Unfortunately, the frustration of being Dr. Go...	review	H1kH6QZV7Le4zqTRNxoZow	0	2
2	vcNAWiLM4dR7D2nwwJ7nCA	2012-02-14	-TsVN230RCkLYKBeLsuz7A	4	Dr. Goldberg has been my doctor for years and ...	review	zvJCcrpm2yOZrxKffwGQLA	1	1