1. Import the business information from the dataset and filter out data irrelevant to USA restaurants



In [1]:

    
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
%matplotlib inline

#       C:\Users\Dino\Documents\GitHub\DataScience_SideProject\yelp_datachallenge
# Folder holding the Yelp JSON dumps. Set the YELP_DATA_DIR environment
# variable to point somewhere else; the original location stays the default
# (also used when the variable is set but empty).
DataFolder = os.environ.get("YELP_DATA_DIR") or "C:/Users/Dino/Documents/GitHub/"
# Later cells build file paths with plain string concatenation, so make sure
# the folder always ends with a path separator.
if not DataFolder.endswith(("/", "\\")):
    DataFolder += "/"



In [2]:

    
# Load the business file, which stores one JSON object per line.
# `pd.read_json` without `lines=True` expects a single JSON document, so the
# per-line records are comma-joined and wrapped in "[...]" to form one array.
with open(DataFolder + "yelp_academic_dataset_business.json", 'rb') as f:
    # Decode explicitly so every line is text on both Python 2 and Python 3
    # (the file is opened in binary mode), and build a real list so that
    # `data` stays indexable for later cells.
    data = [raw.decode('utf-8').rstrip() for raw in f]
# Drop blank lines: an empty entry would leave ",," in the joined string and
# make the assembled array invalid JSON.
data = [line for line in data if line]
data_json_str = "[" + ','.join(data) + "]"
# now, load it into pandas
data_df = pd.read_json(data_json_str)



In [34]:

    
# Exploratory peek: show the 14th quote-delimited fragment of the first raw line.
# NOTE(review): the recorded output of this cell does not match the business_id
# of row 0 displayed by data_df.head(), so `data` presumably held the lines of
# a different file when this was run -- confirm or remove this cell.
first_raw_line = data[0]
first_raw_line.split('"')[13]









    Out[34]:





'tJRDll5yqpZwehenzE2cSg'



In [3]:

    
# Preview the first five rows (five is the default row count of head()).
data_df.head()









    Out[3]:






  
    
      
      address
      attributes
      business_id
      categories
      city
      hours
      is_open
      latitude
      longitude
      name
      neighborhood
      postal_code
      review_count
      stars
      state
      type
    
  
  
    
      0
      227 E Baseline Rd, Ste J2
      [BikeParking: True, BusinessAcceptsBitcoin: Fa...
      0DI8Dt2PJp07XkVvIElIcQ
      [Tobacco Shops, Nightlife, Vape Shops, Shopping]
      Tempe
      [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda...
      0
      33.378214
      -111.936102
      Innovative Vapors
      
      85283
      17
      4.5
      AZ
      business
    
    
      1
      495 S Grand Central Pkwy
      [BusinessAcceptsBitcoin: False, BusinessAccept...
      LTlCaCGZE14GuaUXUGbamg
      [Caterers, Grocery, Food, Event Planning & Ser...
      Las Vegas
      [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:...
      1
      36.192284
      -115.159272
      Cut and Taste
      
      89106
      9
      5.0
      NV
      business
    
    
      2
      979 Bloor Street W
      [Alcohol: none, Ambience: {'romantic': False, ...
      EDqCEAGXVGCH4FJXgqtjqg
      [Restaurants, Pizza, Chicken Wings, Italian]
      Toronto
      [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...
      1
      43.661054
      -79.429089
      Pizza Pizza
      Dufferin Grove
      M6H 1L5
      7
      2.5
      ON
      business
    
    
      3
      7014 Steubenville Pike
      [AcceptsInsurance: False, BusinessAcceptsCredi...
      cnGIivYRLxpF7tBVR_JwWA
      [Hair Removal, Beauty & Spas, Blow Dry/Out Ser...
      Oakdale
      [Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs...
      1
      40.444544
      -80.174540
      Plush Salon and Spa
      
      15071
      4
      4.0
      PA
      business
    
    
      4
      321 Jarvis Street
      [BusinessAcceptsCreditCards: True, Restaurants...
      cdk-qqJ71q6P7TJTww_DSA
      [Hotels & Travel, Event Planning & Services, H...
      Toronto
      None
      1
      43.659829
      -79.375401
      Comfort Inn
      Downtown Core
      M5B 2C2
      8
      3.0
      ON
      business



In [8]:

    
# Print the per-column non-null counts and dtypes plus the memory footprint;
# used here to spot columns with missing values before filtering.
data_df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144072 entries, 0 to 144071
Data columns (total 16 columns):
address         144072 non-null object
attributes      127162 non-null object
business_id     144072 non-null object
categories      143747 non-null object
city            144072 non-null object
hours           102464 non-null object
is_open         144072 non-null int64
latitude        144072 non-null float64
longitude       144072 non-null float64
name            144072 non-null object
neighborhood    144072 non-null object
postal_code     144072 non-null object
review_count    144072 non-null int64
stars           144072 non-null float64
state           144072 non-null object
type            144072 non-null object
dtypes: float64(3), int64(2), object(11)
memory usage: 17.6+ MB



In [3]:

    
# Replace missing category values with the sentinel -1.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selection `data_df['categories']`:
# an in-place call on a selected column is chained assignment and is not
# guaranteed to write through to `data_df` itself.
data_df['categories'] = data_df['categories'].fillna(-1)



In [4]:

    
def isRestaurant(categories):
    """Tell whether a business's category list marks it as a restaurant.

    Returns True when `categories` is a list with at least one entry whose
    lower-cased text contains 'restaurants'. Returns False otherwise,
    including for any non-list input and for an empty list.
    """
    return isinstance(categories, list) and any(
        'restaurants' in label.lower() for label in categories
    )

# Two-letter codes of the 50 US states plus DC. Built once at definition time
# (instead of on every call) and stored as a set for constant-time lookup.
_USA_STATE_CODES = frozenset([
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"])


def isINUSA(state):
    """Tell whether `state` is the code of a US state or DC.

    Parameters
    ----------
    state : str
        State/province code; matched case-insensitively.

    Returns
    -------
    bool
        True when the upper-cased code is one of the 51 known codes.
        False otherwise, including for values without string methods
        (e.g. None or NaN), mirroring how isRestaurant returns False for
        unexpected input instead of raising.
    """
    # Duck-typed rather than isinstance(state, str) so that Python 2
    # `unicode` values are accepted as well.
    try:
        code = state.upper()
    except AttributeError:
        return False
    return code in _USA_STATE_CODES



In [5]:

    
# Flag the rows that are both restaurants and located in a US state.
# Only two columns are needed, so iterate over them directly rather than
# using DataFrame.apply(axis=1), which materialises every row as a Series.
# `and` keeps the short-circuit: isINUSA is evaluated only for rows that
# isRestaurant accepted, exactly as in the row-wise version.
data_df['include'] = [
    isRestaurant(categories) and isINUSA(state)
    for categories, state in zip(data_df['categories'], data_df['state'])
]



In [6]:

    
# Count how many businesses pass (True) and fail (False) the filter.
data_df['include'].value_counts()









    Out[6]:





False    114723
True      29349
Name: include, dtype: int64



In [7]:

    
# Keep only the rows whose boolean 'include' flag is set.
keep_mask = data_df['include']
data_res = data_df.loc[keep_mask]



In [34]:

    
# Persist the filtered businesses as a UTF-8 encoded CSV. The file name is
# relative, so the file is written to the current working directory.
data_res.to_csv("restaurant_yelp_academic_dataset_business.csv",encoding='utf-8')

2. Use 10% of the USA restaurants to generate an experimental dataset for reviews and tips



In [8]:

    
# Draw a 10% sample of the filtered businesses. The fixed random_state (51)
# makes the same rows come out on every run, so the experiment is repeatable.
exp_business = data_res.sample(frac = 0.1, random_state = 51)
sample_business_id = exp_business.business_id.values
# A single pre-formatted argument in parentheses is valid for both the
# Python 2 print statement and the Python 3 print function, and prints the
# same text as before.
print("for experiments, the number of business_id is: {}".format(sample_business_id.shape))









    



for experiments, the number of business_id is: (2935L,)



In [20]:

    
# NOTE(review): wildcard import from the project-local yelpYourDish module,
# placed mid-notebook. `Utils` is the only name this cell uses from it --
# consider an explicit import in the top import cell.
from yelpYourDish import *
# Path of the review dump, located in the same folder as the business file.
inFile = DataFolder+"yelp_academic_dataset_review.json"
# presumably returns only the reviews whose business_id is in
# sample_business_id -- verify against yelpYourDish.Utils
reviews = Utils.getExperimentalReviews(inFile,sample_business_id)



In [22]:

    
# NOTE(review): `a` is not assigned in any visible cell, so this fails on a
# fresh kernel. The recorded output is one raw review JSON line, so `a`
# presumably held the unparsed lines of the review file -- define it
# explicitly or drop this cell.
a[0]









    Out[22]:





'{"review_id":"GPs6pPBaRDbavro8uKjo7g","user_id":"NgDQZBE-hhYFfZzgFvpJog","business_id":"4yGrrjIS0gu5gYAUIFU3fA","stars":4,"date":"2012-05-16","text":"We have been going there weekly for a business network meeting in the wee hours of the mornning.\\nI have also ordered a vege omlette and it\'s pretty amazing!\\nTheir sausage is good too as it\'s not a little link like other places.\\nOur waitress does a good job to our 22 businesses there.","useful":0,"funny":0,"cool":0,"type":"review"}'



In [23]:

    
# Positional extraction: for the raw review line displayed by `a[0]`,
# fragment 11 of a split on '"' is the business_id value.
# NOTE(review): this relies on the exact key order of the line and on `a`,
# which is not assigned in any visible cell -- confirm before relying on it.
a[0].split('"')[11]









    Out[23]:





'4yGrrjIS0gu5gYAUIFU3fA'



In [ ]:

	address	attributes	business_id	categories	city	hours	is_open	latitude	longitude	name	neighborhood	postal_code	review_count	stars	state	type
0	227 E Baseline Rd, Ste J2	[BikeParking: True, BusinessAcceptsBitcoin: Fa...	0DI8Dt2PJp07XkVvIElIcQ	[Tobacco Shops, Nightlife, Vape Shops, Shopping]	Tempe	[Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda...	0	33.378214	-111.936102	Innovative Vapors		85283	17	4.5	AZ	business
1	495 S Grand Central Pkwy	[BusinessAcceptsBitcoin: False, BusinessAccept...	LTlCaCGZE14GuaUXUGbamg	[Caterers, Grocery, Food, Event Planning & Ser...	Las Vegas	[Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:...	1	36.192284	-115.159272	Cut and Taste		89106	9	5.0	NV	business
2	979 Bloor Street W	[Alcohol: none, Ambience: {'romantic': False, ...	EDqCEAGXVGCH4FJXgqtjqg	[Restaurants, Pizza, Chicken Wings, Italian]	Toronto	[Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ...	1	43.661054	-79.429089	Pizza Pizza	Dufferin Grove	M6H 1L5	7	2.5	ON	business
3	7014 Steubenville Pike	[AcceptsInsurance: False, BusinessAcceptsCredi...	cnGIivYRLxpF7tBVR_JwWA	[Hair Removal, Beauty & Spas, Blow Dry/Out Ser...	Oakdale	[Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs...	1	40.444544	-80.174540	Plush Salon and Spa		15071	4	4.0	PA	business
4	321 Jarvis Street	[BusinessAcceptsCreditCards: True, Restaurants...	cdk-qqJ71q6P7TJTww_DSA	[Hotels & Travel, Event Planning & Services, H...	Toronto	None	1	43.659829	-79.375401	Comfort Inn	Downtown Core	M5B 2C2	8	3.0	ON	business