Data Cleanup


In [1]:
import json

Training Data


In [10]:
review=open('yelp_training_set_review.json')
business=open('yelp_training_set_business.json')
user=open('yelp_training_set_user.json')

In [13]:
review=open('yelp_test_set_review.json')
business=open('yelp_test_set_business.json')
user=open('yelp_test_set_user.json')

In [15]:
test_data={}

In [16]:
data=[]
for line in review:
    data.append(line)
test_data['review']=data

data=[]
for line in business:
    data.append(line)
test_data['business']=data

data=[]
for line in user:
    data.append(line)
test_data['user']=data

In [3]:
train_data={}

In [4]:
data=[]
for line in review:
    data.append(line)
train_data['review']=data

data=[]
for line in business:
    data.append(line)
train_data['business']=data

data=[]
for line in user:
    data.append(line)
train_data['user']=data

In [5]:
eval(train_data['review'][0]).keys()


Out[5]:
dict_keys(['type', 'votes', 'review_id', 'user_id', 'text', 'stars', 'date', 'business_id'])

In [18]:
eval(train_data['business'][1].replace('true','"true"')).keys()


Out[18]:
['business_id',
 'full_address',
 'open',
 'categories',
 'city',
 'review_count',
 'name',
 'neighborhoods',
 'longitude',
 'state',
 'stars',
 'latitude',
 'type']

In [10]:
for x in train_data['business'][:100]:
    print(eval(x.replace('true','"true"').replace())['categories'])


['Accountants', 'Professional Services', 'Tax Services', 'Financial Services']
['Sporting Goods', 'Bikes', 'Shopping']
[]
['Food', 'Grocery']
['Food', 'Bagels', 'Delis', 'Restaurants']
["Women's Clothing", 'Fashion', 'Shopping']
['Music & DVDs', 'Books, Mags, Music & Video', 'Vinyl Records', 'Shopping']
['Event Planning & Services', 'Venues & Event Spaces']
['Art Schools', 'Specialty Schools', 'Shopping', 'Jewelry', 'Accessories', 'Fashion', 'Education']
['Sandwiches', 'Restaurants']
['Music & DVDs', 'Books, Mags, Music & Video', 'Musical Instruments & Teachers', 'Shopping']
['Mexican', 'Restaurants']
['Food', 'Grocery']
['Pizza', 'Restaurants']
['Hotels & Travel', 'Event Planning & Services', 'Hotels']
['Department Stores', 'Fashion', 'Shopping']
['Window Washing', 'Home Services']
['Burgers', 'Restaurants']
['Kitchen & Bath', 'Shopping', 'Home & Garden']
['Food', 'Beer, Wine & Spirits']
['Plumbing', 'Home Services']
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-90c76df3a6bf> in <module>()
      1 for x in train_data['business'][:100]:
----> 2     print(eval(x.replace('true','"true"'))['categories'])

<string> in <module>()

NameError: name 'false' is not defined

In [21]:
eval(test_data['business'][0].replace('true','"True"')).keys()


Out[21]:
dict_keys(['longitude', 'name', 'review_count', 'business_id', 'city', 'open', 'full_address', 'categories', 'state', 'type', 'latitude', 'neighborhoods'])

In [ ]:


In [ ]:


In [42]:
restaurants=[]
restaurantIds = []
i=1
for d in train_data['business']:
    data={}
    d=d.replace('true','"true"')
    d=d.replace('false','"false"')
    categories=eval(d)['categories']
    if('Restaurants' in categories):
        data['restaurant_id']=eval(d)['business_id']
        data['full_address']=eval(d)['full_address'].replace('\\n',',')
        data['city']=eval(d)['city']
        data['review_count']=eval(d)['review_count']
        data['longitude']=eval(d)['longitude']
        data['latitude']=eval(d)['latitude']
        data['state']=eval(d)['state']
        data['rating']=eval(d)['stars']
        data['restaurant_name']=eval(d)['name']
        data['new_id']=i
        i+=1
        restaurants.append(data)
        restaurantIds.append(data['restaurant_id'])

In [33]:
trestaurants=[]
trestaurantIds = []
i=1
for d in test_data['business']:
    data={}
    d=d.replace('true','"true"')
    d=d.replace('false','"false"')
    categories=eval(d)['categories']
    if('Restaurants' in categories):
        data['restaurant_id']=eval(d)['business_id']
        data['full_address']=eval(d)['full_address'].replace('\\n',',')
        data['city']=eval(d)['city']
        data['review_count']=eval(d)['review_count']
        data['longitude']=eval(d)['longitude']
        data['latitude']=eval(d)['latitude']
        data['state']=eval(d)['state']
        data['restaurant_name']=eval(d)['name']
        data['new_id']=i
        i+=1
        trestaurants.append(data)
        trestaurantIds.append(data['restaurant_id'])

In [12]:
restaurants[0].keys()


Out[12]:
dict_keys(['review_count', 'new_id', 'city', 'rating', 'restaurant_name', 'longitude', 'restaurant_id', 'full_address', 'state', 'latitude'])

In [13]:
restaurants[0]


Out[13]:
{'city': 'Glendale Az',
 'full_address': '6520 W Happy Valley Rd\nSte 101\nGlendale Az, AZ 85310',
 'latitude': 33.712797,
 'longitude': -112.200264,
 'new_id': 1,
 'rating': 3.5,
 'restaurant_id': 'PzOqRohWw7F7YEPBz6AubA',
 'restaurant_name': 'Hot Bagels & Deli',
 'review_count': 14,
 'state': 'AZ'}

In [43]:
reskey={}
for i in restaurants:
    reskey[i['restaurant_id']]=i['new_id']

In [36]:
len(trestaurants)


Out[36]:
325

In [ ]:


In [ ]:


In [30]:
train_data['review'][0]


Out[30]:
'{"votes": {"funny": 0, "useful": 5, "cool": 2}, "user_id": "rLtl8ZkDX5vH5nAx9C3q5Q", "review_id": "fWKvX83p0-ka4JS3dc6E5A", "stars": 5, "date": "2011-01-26", "text": "My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\\n\\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\\n\\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best \\"toast\\" I\'ve ever had.\\n\\nAnyway, I can\'t wait to go back!", "type": "review", "business_id": "9yKzy9PApeiPPOUJEtnvkg"}\n'

In [32]:
eval(train_data['review'][0]).keys()


Out[32]:
['votes',
 'user_id',
 'review_id',
 'stars',
 'date',
 'text',
 'type',
 'business_id']

In [44]:
reviews=[]
i=1
userid=[]
for d in train_data['review']:
    data={}
    res_id=eval(d)['business_id']
    if(res_id in restaurantIds):
        try:
            data['user_id']=userkey[eval(d)['user_id']]
            data['review_id']=i
            i+=1
            data['date']=eval(d)['date']
            data['review']=eval(d)['text'].replace('\n',' ').replace('\\','').replace("\'","'")
            data['rating']=eval(d)['stars']
    #         for i in restaurants:
    #             if(i['restaurant_id']==res_id):
    #                 data['restaurant_id']=i['new_id']
    #                 break

            data['restaurant_id']=reskey[eval(d)['business_id']]
            votes=eval(d)['votes']
            data['funny']=votes['funny']
            data['cool']=votes['cool']
            data['useful']=votes['useful']
            userid.append(eval(d)['user_id'])
            reviews.append(data)
        except (KeyError ):
            continue

In [38]:
treviews=[]
i=1
userid=[]
for d in test_data['review']:
    data={}
    res_id=eval(d)['business_id']
    if(res_id in trestaurantIds):
        try:
            data['user_id']=userkey[eval(d)['user_id']]
            data['review_id']=i
            i+=1
            data['date']=eval(d)['date']
            data['review']=eval(d)['text'].replace('\n',' ').replace('\\','').replace("\'","'")
            data['rating']=eval(d)['stars']
    #         for i in restaurants:
    #             if(i['restaurant_id']==res_id):
    #                 data['restaurant_id']=i['new_id']
    #                 break

            data['restaurant_id']=reskey[eval(d)['business_id']]
            votes=eval(d)['votes']
            data['funny']=votes['funny']
            data['cool']=votes['cool']
            data['useful']=votes['useful']
            userid.append(eval(d)['user_id'])
            treviews.append(data)
        except (KeyError ):
            continue

In [35]:
reviews[0]


Out[35]:
{'cool': 2,
 'date': '2011-01-26',
 'funny': 0,
 'rating': 5,
 'restaurant_id': 3010,
 'review': 'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.  Do yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.  While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.  Anyway, I can\'t wait to go back!',
 'review_id': 1,
 'useful': 5,
 'user_id': 24538}

In [19]:
for i in restaurants:
    if(i['new_id']==3010):
        print (i)
        break


{'review_count': 116, 'new_id': 3010, 'city': 'Phoenix', 'rating': 4.0, 'restaurant_name': 'Morning Glory Cafe', 'longitude': -112.012504, 'restaurant_id': '9yKzy9PApeiPPOUJEtnvkg', 'full_address': '6106 S 32nd St\nPhoenix, AZ 85042', 'state': 'AZ', 'latitude': 33.390792}

In [46]:
eval(train_data['user'][0])


Out[46]:
{'average_stars': 5.0,
 'name': 'Jim',
 'review_count': 6,
 'type': 'user',
 'user_id': 'CR2y7yEm4X035ZMzrTtN9Q',
 'votes': {'cool': 0, 'funny': 0, 'useful': 7}}

In [47]:
userid=set(userid)

In [ ]:


In [48]:
import unidecode

In [49]:
users=[]
i=0
for d in train_data['user']:
    data={}
    uid=eval(d)['user_id']
    if(uid in userid):
        name=str(eval(d)['name'])
    #     print(type(name))
        name = unidecode.unidecode(str(name.encode('utf-8')))
        data['user_id']=eval(d)['user_id']
        data['new_id']=i
        i+=1
        data['user_name']=name
        users.append(data)

In [50]:
users[0]


Out[50]:
{'new_id': 0, 'user_id': 'CR2y7yEm4X035ZMzrTtN9Q', 'user_name': "b'Jim'"}

In [51]:
len(users)


Out[51]:
34789

In [28]:
tusers=[]
i=0
for d in test_data['user']:
    data={}
    name=str(eval(d)['name'])
#     print(type(name))
    name = unidecode.unidecode(str(name.encode('utf-8')))
    data['user_id']=eval(d)['user_id']
    data['new_id']=i
    i+=1
    data['user_name']=name
    tusers.append(data)

In [29]:
len(tusers)


Out[29]:
5105

In [30]:
userkey={}
for i in users:
    userkey[i['user_id']]=i['new_id']

In [23]:
userkey={}
for i in users:
    userkey[i['user_id']]=i['new_id']

In [42]:
def save_obj(obj, name ):
    with open( name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [43]:
import pickle

In [44]:
save_obj(reviews, 'reviews')

In [46]:
save_obj(restaurants, 'restaurants')

In [1]:
import json

In [3]:
import pandas as pd

In [39]:
len(treviews)


Out[39]:
0

In [40]:
len(tusers)


Out[40]:
5105

Json to CSV


In [52]:
import csv

In [39]:
with open('restaurants.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=restaurants[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(restaurants)

In [40]:
with open('reviews.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=reviews[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(reviews)

In [53]:
with open('users.csv', 'w') as csvfile:
        csvdoc = csv.DictWriter(csvfile, 
                                fieldnames=users[0].keys())
        csvdoc.writeheader()
        csvdoc.writerows(users)

In [ ]: