Data Cleanup



In [1]:

    
import json

Training Data



In [10]:

    
review=open('yelp_training_set_review.json')
business=open('yelp_training_set_business.json')
user=open('yelp_training_set_user.json')



In [13]:

    
review=open('yelp_test_set_review.json')
business=open('yelp_test_set_business.json')
user=open('yelp_test_set_user.json')



In [15]:

    
test_data={}



In [16]:

    
data=[]
for line in review:
    data.append(line)
test_data['review']=data

data=[]
for line in business:
    data.append(line)
test_data['business']=data

data=[]
for line in user:
    data.append(line)
test_data['user']=data



In [3]:

    
train_data={}



In [4]:

    
data=[]
for line in review:
    data.append(line)
train_data['review']=data

data=[]
for line in business:
    data.append(line)
train_data['business']=data

data=[]
for line in user:
    data.append(line)
train_data['user']=data



In [5]:

    
eval(train_data['review'][0]).keys()









    Out[5]:





dict_keys(['type', 'votes', 'review_id', 'user_id', 'text', 'stars', 'date', 'business_id'])



In [18]:

    
eval(train_data['business'][1].replace('true','"true"')).keys()









    Out[18]:





['business_id',
 'full_address',
 'open',
 'categories',
 'city',
 'review_count',
 'name',
 'neighborhoods',
 'longitude',
 'state',
 'stars',
 'latitude',
 'type']



In [10]:

    
for x in train_data['business'][:100]:
    print(eval(x.replace('true','"true"').replace())['categories'])









    



['Accountants', 'Professional Services', 'Tax Services', 'Financial Services']
['Sporting Goods', 'Bikes', 'Shopping']
[]
['Food', 'Grocery']
['Food', 'Bagels', 'Delis', 'Restaurants']
["Women's Clothing", 'Fashion', 'Shopping']
['Music & DVDs', 'Books, Mags, Music & Video', 'Vinyl Records', 'Shopping']
['Event Planning & Services', 'Venues & Event Spaces']
['Art Schools', 'Specialty Schools', 'Shopping', 'Jewelry', 'Accessories', 'Fashion', 'Education']
['Sandwiches', 'Restaurants']
['Music & DVDs', 'Books, Mags, Music & Video', 'Musical Instruments & Teachers', 'Shopping']
['Mexican', 'Restaurants']
['Food', 'Grocery']
['Pizza', 'Restaurants']
['Hotels & Travel', 'Event Planning & Services', 'Hotels']
['Department Stores', 'Fashion', 'Shopping']
['Window Washing', 'Home Services']
['Burgers', 'Restaurants']
['Kitchen & Bath', 'Shopping', 'Home & Garden']
['Food', 'Beer, Wine & Spirits']
['Plumbing', 'Home Services']






    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-90c76df3a6bf> in <module>()
      1 for x in train_data['business'][:100]:
----> 2     print(eval(x.replace('true','"true"'))['categories'])

<string> in <module>()

NameError: name 'false' is not defined



In [21]:

    
eval(test_data['business'][0].replace('true','"True"')).keys()









    Out[21]:





dict_keys(['longitude', 'name', 'review_count', 'business_id', 'city', 'open', 'full_address', 'categories', 'state', 'type', 'latitude', 'neighborhoods'])



In [ ]:



In [ ]:



In [42]:

    
restaurants=[]
restaurantIds = []
i=1
for d in train_data['business']:
    data={}
    d=d.replace('true','"true"')
    d=d.replace('false','"false"')
    categories=eval(d)['categories']
    if('Restaurants' in categories):
        data['restaurant_id']=eval(d)['business_id']
        data['full_address']=eval(d)['full_address'].replace('\\n',',')
        data['city']=eval(d)['city']
        data['review_count']=eval(d)['review_count']
        data['longitude']=eval(d)['longitude']
        data['latitude']=eval(d)['latitude']
        data['state']=eval(d)['state']
        data['rating']=eval(d)['stars']
        data['restaurant_name']=eval(d)['name']
        data['new_id']=i
        i+=1
        restaurants.append(data)
        restaurantIds.append(data['restaurant_id'])



In [33]:

    
trestaurants=[]
trestaurantIds = []
i=1
for d in test_data['business']:
    data={}
    d=d.replace('true','"true"')
    d=d.replace('false','"false"')
    categories=eval(d)['categories']
    if('Restaurants' in categories):
        data['restaurant_id']=eval(d)['business_id']
        data['full_address']=eval(d)['full_address'].replace('\\n',',')
        data['city']=eval(d)['city']
        data['review_count']=eval(d)['review_count']
        data['longitude']=eval(d)['longitude']
        data['latitude']=eval(d)['latitude']
        data['state']=eval(d)['state']
        data['restaurant_name']=eval(d)['name']
        data['new_id']=i
        i+=1
        trestaurants.append(data)
        trestaurantIds.append(data['restaurant_id'])



In [12]:

    
restaurants[0].keys()









    Out[12]:





dict_keys(['review_count', 'new_id', 'city', 'rating', 'restaurant_name', 'longitude', 'restaurant_id', 'full_address', 'state', 'latitude'])



In [13]:

    
restaurants[0]









    Out[13]:





{'city': 'Glendale Az',
 'full_address': '6520 W Happy Valley Rd\nSte 101\nGlendale Az, AZ 85310',
 'latitude': 33.712797,
 'longitude': -112.200264,
 'new_id': 1,
 'rating': 3.5,
 'restaurant_id': 'PzOqRohWw7F7YEPBz6AubA',
 'restaurant_name': 'Hot Bagels & Deli',
 'review_count': 14,
 'state': 'AZ'}



In [43]:

    
reskey={}
for i in restaurants:
    reskey[i['restaurant_id']]=i['new_id']



In [36]:

    
len(trestaurants)









    Out[36]:





325



In [ ]:



In [ ]:



In [30]:

    
train_data['review'][0]









    Out[30]:





'{"votes": {"funny": 0, "useful": 5, "cool": 2}, "user_id": "rLtl8ZkDX5vH5nAx9C3q5Q", "review_id": "fWKvX83p0-ka4JS3dc6E5A", "stars": 5, "date": "2011-01-26", "text": "My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\\n\\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\\n\\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best \\"toast\\" I\'ve ever had.\\n\\nAnyway, I can\'t wait to go back!", "type": "review", "business_id": "9yKzy9PApeiPPOUJEtnvkg"}\n'



In [32]:

    
eval(train_data['review'][0]).keys()









    Out[32]:





['votes',
 'user_id',
 'review_id',
 'stars',
 'date',
 'text',
 'type',
 'business_id']



In [44]:

    
reviews=[]
i=1
userid=[]
for d in train_data['review']:
    data={}
    res_id=eval(d)['business_id']
    if(res_id in restaurantIds):
        try:
            data['user_id']=userkey[eval(d)['user_id']]
            data['review_id']=i
            i+=1
            data['date']=eval(d)['date']
            data['review']=eval(d)['text'].replace('\n',' ').replace('\\','').replace("\'","'")
            data['rating']=eval(d)['stars']
    #         for i in restaurants:
    #             if(i['restaurant_id']==res_id):
    #                 data['restaurant_id']=i['new_id']
    #                 break

            data['restaurant_id']=reskey[eval(d)['business_id']]
            votes=eval(d)['votes']
            data['funny']=votes['funny']
            data['cool']=votes['cool']
            data['useful']=votes['useful']
            userid.append(eval(d)['user_id'])
            reviews.append(data)
        except (KeyError ):
            continue



In [38]:

    
treviews=[]
i=1
userid=[]
for d in test_data['review']:
    data={}
    res_id=eval(d)['business_id']
    if(res_id in trestaurantIds):
        try:
            data['user_id']=userkey[eval(d)['user_id']]
            data['review_id']=i
            i+=1
            data['date']=eval(d)['date']
            data['review']=eval(d)['text'].replace('\n',' ').replace('\\','').replace("\'","'")
            data['rating']=eval(d)['stars']
    #         for i in restaurants:
    #             if(i['restaurant_id']==res_id):
    #                 data['restaurant_id']=i['new_id']
    #                 break

            data['restaurant_id']=reskey[eval(d)['business_id']]
            votes=eval(d)['votes']
            data['funny']=votes['funny']
            data['cool']=votes['cool']
            data['useful']=votes['useful']
            userid.append(eval(d)['user_id'])
            treviews.append(data)
        except (KeyError ):
            continue



In [35]:

    
reviews[0]









    Out[35]:





{'cool': 2,
 'date': '2011-01-26',
 'funny': 0,
 'rating': 5,
 'restaurant_id': 3010,
 'review': 'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.  Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.  While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.  Anyway, I can\'t wait to go back!',
 'review_id': 1,
 'useful': 5,
 'user_id': 24538}



In [19]:

    
for i in restaurants:
    if(i['new_id']==3010):
        print (i)
        break









    



{'review_count': 116, 'new_id': 3010, 'city': 'Phoenix', 'rating': 4.0, 'restaurant_name': 'Morning Glory Cafe', 'longitude': -112.012504, 'restaurant_id': '9yKzy9PApeiPPOUJEtnvkg', 'full_address': '6106 S 32nd St\nPhoenix, AZ 85042', 'state': 'AZ', 'latitude': 33.390792}



In [46]:

    
eval(train_data['user'][0])









    Out[46]:





{'average_stars': 5.0,
 'name': 'Jim',
 'review_count': 6,
 'type': 'user',
 'user_id': 'CR2y7yEm4X035ZMzrTtN9Q',
 'votes': {'cool': 0, 'funny': 0, 'useful': 7}}



In [47]:

    
userid=set(userid)



In [ ]:



In [48]:

    
import unidecode



In [49]:

    
users=[]
i=0
for d in train_data['user']:
    data={}
    uid=eval(d)['user_id']
    if(uid in userid):
        name=str(eval(d)['name'])
    #     print(type(name))
        name = unidecode.unidecode(str(name.encode('utf-8')))
        data['user_id']=eval(d)['user_id']
        data['new_id']=i
        i+=1
        data['user_name']=name
        users.append(data)



In [50]:

    
users[0]









    Out[50]:





{'new_id': 0, 'user_id': 'CR2y7yEm4X035ZMzrTtN9Q', 'user_name': "b'Jim'"}



In [51]:

    
len(users)









    Out[51]:





34789



In [28]:

    
tusers=[]
i=0
for d in test_data['user']:
    data={}
    name=str(eval(d)['name'])
#     print(type(name))
    name = unidecode.unidecode(str(name.encode('utf-8')))
    data['user_id']=eval(d)['user_id']
    data['new_id']=i
    i+=1
    data['user_name']=name
    tusers.append(data)



In [29]:

    
len(tusers)









    Out[29]:





5105



In [30]:

    
userkey={}
for i in users:
    userkey[i['user_id']]=i['new_id']



In [23]:

    
userkey={}
for i in users:
    userkey[i['user_id']]=i['new_id']



In [42]:

    
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)



In [43]:

    
import pickle



In [44]:

    
save_obj(reviews, 'reviews')



In [46]:

    
save_obj(restaurants, 'restaurants')



In [1]:

    
import json



In [3]:

    
import pandas as pd



In [39]:

    
len(treviews)









    Out[39]:





0



In [40]:

    
len(tusers)









    Out[40]:





5105

Json to CSV



In [52]:

    
import csv



In [39]:

    
with open('restaurants.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=restaurants[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(restaurants)



In [40]:

    
with open('reviews.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=reviews[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(reviews)



In [53]:

    
with open('users.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=users[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(users)



In [ ]: