In [1]:
import json
In [10]:
# Open the three training-set files; nothing is read in this cell.
# NOTE(review): the next cell rebinds review/business/user to the test-set
# files, so whichever of the two cells ran last decides what the later
# `for line in review` loops actually read. No close() for these handles
# appears anywhere in the notebook.
review=open('yelp_training_set_review.json')
business=open('yelp_training_set_business.json')
user=open('yelp_training_set_user.json')
In [13]:
# Open the three test-set files; nothing is read in this cell.
# NOTE(review): this reuses the names review/business/user that the previous
# cell bound to the training files, so those training handles become
# unreachable once this cell runs. No close() for these handles appears
# anywhere in the notebook.
review=open('yelp_test_set_review.json')
business=open('yelp_test_set_business.json')
user=open('yelp_test_set_user.json')
In [15]:
# Container for the raw test-set lines; later filled under the keys
# 'review', 'business' and 'user'.
test_data={}
In [16]:
# Load each test-set file into test_data as a list of raw lines (one JSON
# record per line, trailing newline kept).
#
# The files are opened here by path rather than through the shared
# review/business/user handles: a file handle can be iterated only once, and
# those three names are also bound to the training files elsewhere in the
# notebook, so reading through them makes the result depend on which cell ran
# last. The `with` block also guarantees each file is closed.
for key, path in (
    ('review', 'yelp_test_set_review.json'),
    ('business', 'yelp_test_set_business.json'),
    ('user', 'yelp_test_set_user.json'),
):
    with open(path, encoding='utf-8') as handle:
        test_data[key] = list(handle)
In [3]:
# Container for the raw training-set lines; later filled under the keys
# 'review', 'business' and 'user'.
train_data={}
In [4]:
# Load each training-set file into train_data as a list of raw lines (one
# JSON record per line, trailing newline kept).
#
# The files are opened here by path rather than through the shared
# review/business/user handles. Read top to bottom, those names point at the
# test-set files by the time this cell runs and have already been iterated to
# the end, which would leave every train_data list empty. The `with` block
# also guarantees each file is closed.
for key, path in (
    ('review', 'yelp_training_set_review.json'),
    ('business', 'yelp_training_set_business.json'),
    ('user', 'yelp_training_set_user.json'),
):
    with open(path, encoding='utf-8') as handle:
        train_data[key] = list(handle)
In [5]:
# Field names of the first training review.
# Parsed with json.loads instead of eval: the line is external data, and eval
# would execute whatever expression it contains and fails on the JSON
# literals true/false/null.
json.loads(train_data['review'][0]).keys()
Out[5]:
In [18]:
# Field names of the second training business record.
# Parsed with json.loads instead of eval on text patched with
# replace('true', '"true"'): that patch left false/null undefined for eval and
# also rewrote the letters "true" inside ordinary string values.
json.loads(train_data['business'][1]).keys()
Out[18]:
In [10]:
# Print the category list of each of the first 100 training businesses.
# The original chained a bare `.replace()` (no arguments), which raises
# TypeError before anything is printed. json.loads understands true/false/null
# natively, so no text patching (and no eval of external data) is needed.
for raw_line in train_data['business'][:100]:
    print(json.loads(raw_line)['categories'])
In [21]:
# Field names of the first test-set business record.
# Parsed with json.loads instead of eval on text patched with
# replace('true', '"True"'): that patch left false/null undefined for eval and
# also rewrote the letters "true" inside ordinary string values.
json.loads(test_data['business'][0]).keys()
Out[21]:
In [ ]:
In [ ]:
In [42]:
# Build one flat record per training business whose categories include
# 'Restaurants', plus a parallel list of their original business ids.
#
# Each line is parsed once with json.loads. The original called eval() on the
# line roughly ten times per business after string-replacing true/false,
# which executes external data as code, breaks on null, and corrupts any
# string value that happens to contain the letters "true" or "false".
#
# new_id is a 1-based sequential id in order of appearance.
restaurants = []
restaurantIds = []
for raw_line in train_data['business']:
    biz = json.loads(raw_line)
    if 'Restaurants' not in biz['categories']:
        continue
    record = {
        'restaurant_id': biz['business_id'],
        # The decoded address contains real newline characters, which the
        # original two-character '\\n' pattern never matched; both forms are
        # replaced so the address ends up on a single line.
        'full_address': biz['full_address'].replace('\\n', ',').replace('\n', ','),
        'city': biz['city'],
        'review_count': biz['review_count'],
        'longitude': biz['longitude'],
        'latitude': biz['latitude'],
        'state': biz['state'],
        'rating': biz['stars'],
        'restaurant_name': biz['name'],
        'new_id': len(restaurants) + 1,
    }
    restaurants.append(record)
    restaurantIds.append(record['restaurant_id'])
In [33]:
# Build one flat record per test-set business whose categories include
# 'Restaurants', plus a parallel list of their original business ids.
# Unlike the training records, no 'rating' field is stored here.
#
# Each line is parsed once with json.loads. The original called eval() on the
# line repeatedly after string-replacing true/false, which executes external
# data as code, breaks on null, and corrupts any string value that happens to
# contain the letters "true" or "false".
#
# new_id is a 1-based sequential id in order of appearance.
trestaurants = []
trestaurantIds = []
for raw_line in test_data['business']:
    biz = json.loads(raw_line)
    if 'Restaurants' not in biz['categories']:
        continue
    record = {
        'restaurant_id': biz['business_id'],
        # The decoded address contains real newline characters, which the
        # original two-character '\\n' pattern never matched; both forms are
        # replaced so the address ends up on a single line.
        'full_address': biz['full_address'].replace('\\n', ',').replace('\n', ','),
        'city': biz['city'],
        'review_count': biz['review_count'],
        'longitude': biz['longitude'],
        'latitude': biz['latitude'],
        'state': biz['state'],
        'restaurant_name': biz['name'],
        'new_id': len(trestaurants) + 1,
    }
    trestaurants.append(record)
    trestaurantIds.append(record['restaurant_id'])
In [12]:
# Show the field names of the first restaurant record.
restaurants[0].keys()
Out[12]:
In [13]:
# Show the first restaurant record in full.
restaurants[0]
Out[13]:
In [43]:
# Lookup table: original business id -> sequential new_id, one entry per
# record in `restaurants`.
reskey = {entry['restaurant_id']: entry['new_id'] for entry in restaurants}
In [36]:
# Number of test-set restaurant records.
len(trestaurants)
Out[36]:
In [ ]:
In [ ]:
In [30]:
# Show the first training review as the raw, unparsed line.
train_data['review'][0]
Out[30]:
In [32]:
# Field names of the first training review.
# Parsed with json.loads instead of eval: the line is external data, and eval
# would execute whatever expression it contains and fails on the JSON
# literals true/false/null.
json.loads(train_data['review'][0]).keys()
Out[32]:
In [44]:
# Flatten the training reviews of known restaurants into CSV-ready records.
#
# Produces:
#   reviews - one dict per kept review, with the user and restaurant replaced
#             by their sequential ids from `userkey` / `reskey`
#   userid  - the original user_id of every kept review (duplicates included)
#
# A review is skipped when its business is not in `restaurantIds`, or when a
# lookup raises KeyError (user missing from `userkey`, restaurant missing from
# `reskey`, or an expected field missing from the record).
#
# NOTE(review): `userkey` is built in a later cell from `users`, and `users`
# is filtered by the `userid` list produced here, so on a fresh kernel this
# cell has no `userkey` to read. Confirm the intended bootstrap order.
#
# Changes from the original:
#   * each line is parsed once with json.loads instead of ~10 eval() calls on
#     external data;
#   * restaurant membership is tested against a set rather than by scanning
#     the `restaurantIds` list for every review;
#   * review_id is assigned only once the whole record has been built, so a
#     skipped review never consumes an id;
#   * the trailing .replace("\'", "'") was dropped because both arguments are
#     the same one-character string.
reviews = []
userid = []
known_restaurants = set(restaurantIds)
for raw_line in train_data['review']:
    parsed = json.loads(raw_line)
    business_id = parsed['business_id']
    if business_id not in known_restaurants:
        continue
    try:
        votes = parsed['votes']
        record = {
            'user_id': userkey[parsed['user_id']],
            'review_id': len(reviews) + 1,
            'date': parsed['date'],
            # Collapse the text onto one line and strip stray backslashes.
            'review': parsed['text'].replace('\n', ' ').replace('\\', ''),
            'rating': parsed['stars'],
            'restaurant_id': reskey[business_id],
            'funny': votes['funny'],
            'cool': votes['cool'],
            'useful': votes['useful'],
        }
    except KeyError:
        continue
    userid.append(parsed['user_id'])
    reviews.append(record)
In [38]:
# Flatten the test-set reviews of known test restaurants into CSV-ready
# records.
#
# Produces:
#   treviews - one dict per kept review
#   tuserid  - the original user_id of every kept review (duplicates included)
#
# A review is skipped when its business is not in `trestaurantIds`, or when a
# lookup raises KeyError (user missing from `userkey`, restaurant missing from
# `reskey`, or an expected field missing from the record).
#
# NOTE(review): ids are looked up in `reskey` / `userkey`, which elsewhere in
# this notebook are built from the TRAINING `restaurants` / `users`. Test
# reviews whose restaurant or user is absent from those tables are therefore
# skipped. Confirm this is intended and not a leftover from copying the
# training cell.
#
# Changes from the original:
#   * the collected user ids go into `tuserid` (matching the t-prefix used by
#     treviews/trestaurants/tusers) instead of rebinding `userid`, which
#     discarded the training list that the later set(userid) and users cells
#     read;
#   * each line is parsed once with json.loads instead of ~10 eval() calls on
#     external data;
#   * restaurant membership is tested against a set rather than by scanning
#     the `trestaurantIds` list for every review;
#   * review_id is assigned only once the whole record has been built, so a
#     skipped review never consumes an id;
#   * the trailing .replace("\'", "'") was dropped because both arguments are
#     the same one-character string.
treviews = []
tuserid = []
known_test_restaurants = set(trestaurantIds)
for raw_line in test_data['review']:
    parsed = json.loads(raw_line)
    business_id = parsed['business_id']
    if business_id not in known_test_restaurants:
        continue
    try:
        votes = parsed['votes']
        record = {
            'user_id': userkey[parsed['user_id']],
            'review_id': len(treviews) + 1,
            'date': parsed['date'],
            # Collapse the text onto one line and strip stray backslashes.
            'review': parsed['text'].replace('\n', ' ').replace('\\', ''),
            'rating': parsed['stars'],
            'restaurant_id': reskey[business_id],
            'funny': votes['funny'],
            'cool': votes['cool'],
            'useful': votes['useful'],
        }
    except KeyError:
        continue
    tuserid.append(parsed['user_id'])
    treviews.append(record)
In [35]:
# Show the first flattened training review record.
reviews[0]
Out[35]:
In [19]:
# Print the first restaurant record whose new_id is 3010; print nothing when
# no record has that id.
hit = next((entry for entry in restaurants if entry['new_id'] == 3010), None)
if hit is not None:
    print(hit)
In [46]:
# Show the first training user record as a parsed dict.
# Parsed with json.loads instead of eval: the line is external data, and eval
# would execute whatever expression it contains and fails on the JSON
# literals true/false/null.
json.loads(train_data['user'][0])
Out[46]:
In [47]:
# De-duplicate the collected user ids. The users cell tests `uid in userid`
# once per user record, which is a hash lookup on a set.
userid=set(userid)
In [ ]:
In [48]:
import unidecode
In [49]:
# Build one record per training user who appears in `userid`: the original
# user_id, a 0-based sequential new_id, and an ASCII-transliterated name.
#
# Changes from the original:
#   * the name is passed to unidecode as text. The original wrapped it as
#     str(name.encode('utf-8')), which in Python 3 is the repr of a bytes
#     object, so every stored name came out as "b'...'" with escaped bytes;
#   * each line is parsed once with json.loads instead of repeated eval()
#     calls on external data.
users = []
for raw_line in train_data['user']:
    parsed = json.loads(raw_line)
    if parsed['user_id'] not in userid:
        continue
    users.append({
        'user_id': parsed['user_id'],
        'new_id': len(users),
        'user_name': unidecode.unidecode(str(parsed['name'])),
    })
In [50]:
# Show the first training user record.
users[0]
Out[50]:
In [51]:
# Number of training user records kept.
len(users)
Out[51]:
In [28]:
# Build one record per test-set user (no filtering): the original user_id, a
# 0-based sequential new_id, and an ASCII-transliterated name.
#
# Changes from the original:
#   * the name is passed to unidecode as text. The original wrapped it as
#     str(name.encode('utf-8')), which in Python 3 is the repr of a bytes
#     object, so every stored name came out as "b'...'" with escaped bytes;
#   * each line is parsed once with json.loads instead of repeated eval()
#     calls on external data.
tusers = []
for raw_line in test_data['user']:
    parsed = json.loads(raw_line)
    tusers.append({
        'user_id': parsed['user_id'],
        'new_id': len(tusers),
        'user_name': unidecode.unidecode(str(parsed['name'])),
    })
In [29]:
# Number of test-set user records.
len(tusers)
Out[29]:
In [30]:
# Lookup table: original user_id -> sequential new_id, one entry per record
# in `users`.
userkey = {entry['user_id']: entry['new_id'] for entry in users}
In [23]:
# Rebuild the user_id -> new_id lookup table from `users`.
# NOTE(review): this cell repeats the one directly above it.
userkey = dict((entry['user_id'], entry['new_id']) for entry in users)
In [42]:
def save_obj(obj, name ):
    """Pickle `obj` to the file `<name>.pkl`.

    Parameters:
        obj: any picklable object.
        name: output path without the '.pkl' extension.

    The file is opened in binary write mode (an existing file is replaced)
    and the object is written with pickle.HIGHEST_PROTOCOL. Returns None.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
In [43]:
import pickle
In [44]:
# Pickle the flattened training reviews to reviews.pkl.
save_obj(reviews, 'reviews')
In [46]:
# Pickle the training restaurant records to restaurants.pkl.
save_obj(restaurants, 'restaurants')
In [1]:
import json
In [3]:
import pandas as pd
In [39]:
# Number of test-set review records kept.
len(treviews)
Out[39]:
In [40]:
# Number of test-set user records.
len(tusers)
Out[40]:
In [52]:
import csv
In [39]:
# Write the restaurant records to restaurants.csv; the header row uses the
# keys of the first record, in that record's key order.
#
# newline='' is what the csv module requires of the file object it writes to;
# without it, platforms that translate '\n' emit an extra blank line after
# every row. The explicit UTF-8 encoding keeps non-ASCII names and addresses
# from failing on systems whose default encoding cannot represent them.
with open('restaurants.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvdoc = csv.DictWriter(csvfile, fieldnames=list(restaurants[0].keys()))
    csvdoc.writeheader()
    csvdoc.writerows(restaurants)
In [40]:
# Write the flattened review records to reviews.csv; the header row uses the
# keys of the first record, in that record's key order.
#
# newline='' is what the csv module requires of the file object it writes to;
# without it, platforms that translate '\n' emit an extra blank line after
# every row. The explicit UTF-8 encoding keeps non-ASCII review text from
# failing on systems whose default encoding cannot represent it.
with open('reviews.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvdoc = csv.DictWriter(csvfile, fieldnames=list(reviews[0].keys()))
    csvdoc.writeheader()
    csvdoc.writerows(reviews)
In [53]:
# Write the user records to users.csv; the header row uses the keys of the
# first record, in that record's key order.
#
# newline='' is what the csv module requires of the file object it writes to;
# without it, platforms that translate '\n' emit an extra blank line after
# every row. The encoding is pinned to UTF-8 so the output does not depend on
# the platform default.
with open('users.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvdoc = csv.DictWriter(csvfile, fieldnames=list(users[0].keys()))
    csvdoc.writeheader()
    csvdoc.writerows(users)
In [ ]: