yelp_reviews

a notebook that adds yelp hotel reviews to the AWS RDS MySQL DB


In [1]:
import pandas as pd
import json
import sqlalchemy
import connect_aws_db as cadb

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
data = []
with open('../../data/yelp_academic_dataset_review.json') as the_dat:
    for line in the_dat:
        data.append(json.loads(line))

In [5]:
data[0]


Out[5]:
{u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
 u'date': u'2007-05-17',
 u'review_id': u'15SdjuK7DmYqUAj6rjGowg',
 u'stars': 5,
 u'text': u"dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 u'type': u'review',
 u'user_id': u'Xqd0DzHaiyRqVH3WRG7hzg',
 u'votes': {u'cool': 1, u'funny': 0, u'useful': 2}}

In [6]:
df = pd.io.json.json_normalize(data)

In [7]:
for col in df.columns:
    print(col)


business_id
date
review_id
stars
text
type
user_id
votes.cool
votes.funny
votes.useful

In [8]:
df.head(3)


Out[8]:
business_id date review_id stars text type user_id votes.cool votes.funny votes.useful
0 vcNAWiLM4dR7D2nwwJ7nCA 2007-05-17 15SdjuK7DmYqUAj6rjGowg 5 dr. goldberg offers everything i look for in a... review Xqd0DzHaiyRqVH3WRG7hzg 1 0 2
1 vcNAWiLM4dR7D2nwwJ7nCA 2010-03-22 RF6UnRTtG7tWMcrO2GEoAg 2 Unfortunately, the frustration of being Dr. Go... review H1kH6QZV7Le4zqTRNxoZow 0 0 2
2 vcNAWiLM4dR7D2nwwJ7nCA 2012-02-14 -TsVN230RCkLYKBeLsuz7A 4 Dr. Goldberg has been my doctor for years and ... review zvJCcrpm2yOZrxKffwGQLA 1 0 1

In [9]:
len(df)


Out[9]:
1569264

In [10]:
engine = cadb.connect_aws_db()

In [11]:
conn = engine.connect()

In [12]:
cmd = "SELECT DISTINCT business_id FROM yelp_hotels"

In [13]:
hotel_biz_ids = conn.execute(cmd).fetchall()

In [14]:
len(hotel_biz_ids)


Out[14]:
128

In [15]:
hbis = [id[0] for id in hotel_biz_ids]

In [16]:
hbis[:3]


Out[16]:
['bWWrrsPWuoHuAGUCgH8Tyg', 'uXuiI-Y998JChGn-ebcSdw', 'i2CtyEZiC_164MdfWjdwsg']

In [17]:
hotel_reviewdfbig = df[df['business_id'].isin(hbis)].copy()

In [18]:
len(hotel_reviewdfbig)


Out[18]:
6263

In [19]:
hotel_reviewdfbig.columns


Out[19]:
Index([u'business_id', u'date', u'review_id', u'stars', u'text', u'type', u'user_id', u'votes.cool', u'votes.funny', u'votes.useful'], dtype='object')

In [20]:
hotel_reviewdf = hotel_reviewdfbig[['business_id', 'date', 'review_id', 'stars', 'text', 'user_id']].copy()

In [21]:
hotel_reviewdf.columns = ['business_id',
                          'yelp_review_date',
                          'yelp_review_id',
                          'review_rating',
                          'review_text',
                          'user_id']

In [22]:
hotel_reviewdf['rev_id'] = 0

In [23]:
hotel_reviewdf.head(1)


Out[23]:
business_id yelp_review_date yelp_review_id review_rating review_text user_id rev_id
7023 bWWrrsPWuoHuAGUCgH8Tyg 2007-10-28 TMiAAS5RB-P5EwXs7B9aBw 4 I stayed at the hotel for several months durin... ZqoMKTrJOtyYZOcvP8rTUA 0

Now writing to SQL


In [24]:
from connect_aws_db import connect_aws_db

In [25]:
engine = cadb.connect_aws_db(write_unicode=True)

In [26]:
conn = engine.connect()

In [27]:
cmd = "DROP TABLE yelp_reviews"

In [28]:
result = conn.execute(cmd)

In [29]:
cmd = """
        CREATE TABLE yelp_reviews
        (
        rev_id MEDIUMINT AUTO_INCREMENT,
        business_id VARCHAR(256),
        yelp_review_date DATE,
        yelp_review_id VARCHAR(256),
        review_rating INT,
        review_text VARCHAR(5000),
        user_id VARCHAR(256),
        PRIMARY KEY (rev_id)
        )
        """

In [30]:
result = conn.execute(cmd)

In [31]:
hotel_reviewdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)

In [ ]: