In [1]:
import pandas as pd
import json
import sqlalchemy
import connect_aws_db as cadb
In [2]:
%load_ext autoreload
In [3]:
%autoreload 2
In [4]:
data = []
with open('../../data/yelp_academic_dataset_review.json') as the_dat:
for line in the_dat:
data.append(json.loads(line))
In [5]:
data[0]
Out[5]:
In [6]:
df = pd.io.json.json_normalize(data)
In [7]:
for col in df.columns:
print(col)
In [8]:
df.head(3)
Out[8]:
In [9]:
len(df)
Out[9]:
In [10]:
engine = cadb.connect_aws_db()
In [11]:
conn = engine.connect()
In [12]:
cmd = "SELECT DISTINCT business_id FROM yelp_hotels"
In [13]:
hotel_biz_ids = conn.execute(cmd).fetchall()
In [14]:
len(hotel_biz_ids)
Out[14]:
In [15]:
hbis = [id[0] for id in hotel_biz_ids]
In [16]:
hbis[:3]
Out[16]:
In [17]:
hotel_reviewdfbig = df[df['business_id'].isin(hbis)].copy()
In [18]:
len(hotel_reviewdfbig)
Out[18]:
In [19]:
hotel_reviewdfbig.columns
Out[19]:
In [20]:
hotel_reviewdf = hotel_reviewdfbig[['business_id', 'date', 'review_id', 'stars', 'text', 'user_id']].copy()
In [21]:
hotel_reviewdf.columns = ['business_id',
'yelp_review_date',
'yelp_review_id',
'review_rating',
'review_text',
'user_id']
In [22]:
hotel_reviewdf['rev_id'] = 0
In [23]:
hotel_reviewdf.head(1)
Out[23]:
In [24]:
from connect_aws_db import connect_aws_db
In [25]:
engine = cadb.connect_aws_db(write_unicode=True)
In [26]:
conn = engine.connect()
In [27]:
cmd = "DROP TABLE yelp_reviews"
In [28]:
result = conn.execute(cmd)
In [29]:
cmd = """
CREATE TABLE yelp_reviews
(
rev_id MEDIUMINT AUTO_INCREMENT,
business_id VARCHAR(256),
yelp_review_date DATE,
yelp_review_id VARCHAR(256),
review_rating INT,
review_text VARCHAR(5000),
user_id VARCHAR(256),
PRIMARY KEY (rev_id)
)
"""
In [30]:
result = conn.execute(cmd)
In [31]:
hotel_reviewdf.to_sql('yelp_reviews', engine, if_exists='append', index=False)
In [ ]: