1. Import the business information from the dataset and filter out data that is not related to USA restaurants


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

# Folder holding the Yelp academic dataset JSON files. The value is
# concatenated directly with file names in later cells, so it must end with
# a path separator.
# Example of a local checkout location:
#       C:\Users\Dino\Documents\GitHub\DataScience_SideProject\yelp_datachallenge
# Set the YELP_DATA_DIR environment variable to point at another location
# instead of editing the notebook; the original path remains the default.
import os

DataFolder = os.environ.get("YELP_DATA_DIR") or "C:/Users/Dino/Documents/GitHub/"
if not DataFolder.endswith(("/", "\\")):
    DataFolder += "/"

In [2]:
# Load the business file by wrapping its lines (each joined as one JSON
# element) into a single JSON array that pandas can parse.
import io

# io.open decodes to text on both Python 2 and Python 3, so the join below
# never mixes bytes and text.
with io.open(DataFolder + "yelp_academic_dataset_business.json", 'r', encoding='utf-8') as f:
    # A real list (not a lazy map object) so later cells can index data[0].
    # Blank lines are dropped because they would leave an empty element in
    # the array and make it invalid JSON.
    data = [line.rstrip() for line in f if line.strip()]
data_json_str = u"[" + u",".join(data) + u"]"
# now, load it into pandas; the buffer wrapper avoids handing pandas a raw
# JSON literal, which newer pandas releases deprecate.
data_df = pd.read_json(io.StringIO(data_json_str))

In [34]:
# Pull one field out of the first raw line by splitting on double quotes.
# NOTE(review): the index 13 is tied to the exact key order of the line and
# shifts if an earlier value contains a double quote; parsing the line as
# JSON and looking the key up by name would be safer.
# NOTE(review): this cell's execution count (34) is out of sequence, and the
# recorded value differs from the business_id shown for row 0 of data_df, so
# `data` may have held different content when it ran -- confirm.
data[0].split('"')[13]


Out[34]:
'tJRDll5yqpZwehenzE2cSg'

In [3]:
# Preview the first five rows of the parsed business table.
data_df.head(5)


Out[3]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 227 E Baseline Rd, Ste J2 [BikeParking: True, BusinessAcceptsBitcoin: Fa... 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping] Tempe [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda... 0 33.378214 -111.936102 Innovative Vapors 85283 17 4.5 AZ business
1 495 S Grand Central Pkwy [BusinessAcceptsBitcoin: False, BusinessAccept... LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser... Las Vegas [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:... 1 36.192284 -115.159272 Cut and Taste 89106 9 5.0 NV business
2 979 Bloor Street W [Alcohol: none, Ambience: {'romantic': False, ... EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza Dufferin Grove M6H 1L5 7 2.5 ON business
3 7014 Steubenville Pike [AcceptsInsurance: False, BusinessAcceptsCredi... cnGIivYRLxpF7tBVR_JwWA [Hair Removal, Beauty & Spas, Blow Dry/Out Ser... Oakdale [Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs... 1 40.444544 -80.174540 Plush Salon and Spa 15071 4 4.0 PA business
4 321 Jarvis Street [BusinessAcceptsCreditCards: True, Restaurants... cdk-qqJ71q6P7TJTww_DSA [Hotels & Travel, Event Planning & Services, H... Toronto None 1 43.659829 -79.375401 Comfort Inn Downtown Core M5B 2C2 8 3.0 ON business

In [8]:
# Summarise the columns: dtypes, non-null counts and memory usage.
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144072 entries, 0 to 144071
Data columns (total 16 columns):
address         144072 non-null object
attributes      127162 non-null object
business_id     144072 non-null object
categories      143747 non-null object
city            144072 non-null object
hours           102464 non-null object
is_open         144072 non-null int64
latitude        144072 non-null float64
longitude       144072 non-null float64
name            144072 non-null object
neighborhood    144072 non-null object
postal_code     144072 non-null object
review_count    144072 non-null int64
stars           144072 non-null float64
state           144072 non-null object
type            144072 non-null object
dtypes: float64(3), int64(2), object(11)
memory usage: 17.6+ MB

In [3]:
# Replace missing `categories` with the sentinel -1; isRestaurant treats any
# non-list value as "not a restaurant", so these rows are simply excluded.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is
# not guaranteed to update the frame across pandas versions.
data_df['categories'] = data_df['categories'].fillna(-1)

In [4]:
def isRestaurant(categories):
    """Return True when `categories` is a list with a restaurant entry.

    An entry matches when it contains the substring 'restaurants',
    compared case-insensitively. Any non-list input (for example the -1
    placeholder used for missing categories) yields False.
    """
    if not isinstance(categories, list):
        return False
    return any('restaurants' in label.lower() for label in categories)

# Two-letter codes for the 50 states plus DC. Built once at definition time
# instead of on every call, and stored as a frozenset for fast membership tests.
_USA_STATES = frozenset([
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
])


def isINUSA(state):
    """Return True when `state` is a US state (or DC) two-letter code.

    The comparison is case-insensitive. Values without an `upper` method
    (None, NaN, numbers) return False instead of raising AttributeError.
    """
    # Duck-typed check so both byte strings and unicode strings are accepted.
    to_upper = getattr(state, "upper", None)
    if to_upper is None:
        return False
    return to_upper() in _USA_STATES

In [5]:
# Flag rows that are both restaurants and located in a US state. `and`
# short-circuits, so isINUSA only runs for rows that passed isRestaurant.
data_df['include'] = [
    isRestaurant(categories) and isINUSA(state)
    for categories, state in zip(data_df['categories'], data_df['state'])
]

In [6]:
# Count how many businesses were flagged (True) versus excluded (False).
data_df['include'].value_counts()


Out[6]:
False    114723
True      29349
Name: include, dtype: int64

In [7]:
# Keep only the rows whose `include` flag is True.
data_res = data_df.loc[data_df['include']]

In [34]:
# Persist the filtered rows as a UTF-8 encoded CSV. The path is relative (no
# DataFolder prefix), so the file lands in the current working directory.
data_res.to_csv("restaurant_yelp_academic_dataset_business.csv",encoding='utf-8')

2. Use 10% of the USA restaurant businesses to generate an experimental dataset for reviews and tips


In [8]:
# Draw a 10% sample of the filtered businesses; random_state is fixed (51)
# so re-running the cell selects the same rows.
exp_business = data_res.sample(frac = 0.1, random_state = 51)
sample_business_id = exp_business.business_id.values
# Single pre-formatted argument: prints the same text under the Python 2
# print statement and the Python 3 print function.
print("for experiments, the number of business_id is: %s" % (sample_business_id.shape,))


for experiments, the number of business_id is: (2935L,)

In [20]:
# NOTE(review): wildcard import placed mid-notebook; `Utils` presumably comes
# from it. Importing the needed names explicitly in the first cell would make
# the dependency visible -- confirm no later cell relies on other names.
from yelpYourDish import *
# Path to the review file inside DataFolder.
inFile = DataFolder+"yelp_academic_dataset_review.json"
# Presumably returns the reviews belonging to the sampled business ids; the
# helper is defined in yelpYourDish and is not shown here -- TODO confirm.
reviews = Utils.getExperimentalReviews(inFile,sample_business_id)

In [22]:
# NOTE(review): `a` is not defined in any visible cell, so this fails on a
# fresh kernel; presumably a list of raw review lines left over from an
# earlier session -- confirm or replace with the intended variable.
a[0]


Out[22]:
'{"review_id":"GPs6pPBaRDbavro8uKjo7g","user_id":"NgDQZBE-hhYFfZzgFvpJog","business_id":"4yGrrjIS0gu5gYAUIFU3fA","stars":4,"date":"2012-05-16","text":"We have been going there weekly for a business network meeting in the wee hours of the mornning.\\nI have also ordered a vege omlette and it\'s pretty amazing!\\nTheir sausage is good too as it\'s not a little link like other places.\\nOur waitress does a good job to our 22 businesses there.","useful":0,"funny":0,"cool":0,"type":"review"}'

In [23]:
# Take the token at index 11 after splitting the first raw line on double
# quotes; for the recorded sample line this is the business_id value.
# NOTE(review): positional splitting depends on key order and on no earlier
# value containing a double quote; `a` is also undefined in the visible cells.
a[0].split('"')[11]


Out[23]:
'4yGrrjIS0gu5gYAUIFU3fA'

In [ ]: