In [1]:
import io
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
%matplotlib inline
# Directory that holds the Yelp academic dataset JSON files.
# The default is the original hard-coded location; set the YELP_DATA_DIR
# environment variable to point the notebook at another copy of the data
# without editing this cell.
# NOTE(review): an earlier comment here referred to
# C:\Users\Dino\Documents\GitHub\DataScience_SideProject\yelp_datachallenge;
# confirm which directory actually contains the dataset files.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths by plain string concatenation (DataFolder + filename).
DataFolder = os.path.join(
    os.environ.get("YELP_DATA_DIR", "C:/Users/Dino/Documents/GitHub/"), "")
In [2]:
# Load the Yelp business file. It holds one JSON object per line rather than a
# single JSON document, so the lines are wrapped into one JSON array before
# being handed to pandas.
business_path = DataFolder + "yelp_academic_dataset_business.json"
with open(business_path, 'rb') as f:
    # Build a real list (not a lazy map object) because later cells index into
    # `data`. Decoding turns every record into text on both Python 2 and
    # Python 3, and blank lines are skipped so they cannot create empty array
    # elements that would make the combined document invalid JSON.
    data = [raw.decode('utf-8').rstrip() for raw in f if raw.strip()]
data_json_str = u"[" + u",".join(data) + u"]"
# now, load it into pandas; the text is wrapped in a file-like object because
# newer pandas releases deprecate passing a literal JSON string to read_json.
data_df = pd.read_json(io.StringIO(data_json_str))
In [34]:
# Eyeball one field of the first raw record: element 13 of the pieces obtained
# by splitting that line on double quotes.
first_record_parts = data[0].split('"')
first_record_parts[13]
Out[34]:
In [3]:
# Preview the first five rows (head() returns five rows by default).
data_df.head()
Out[3]:
In [8]:
# Print a summary of the frame (columns, non-null counts and dtypes).
data_df.info()
In [3]:
# Replace missing `categories` values with the sentinel -1 so that every row
# carries either a category list or that placeholder.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained form operates on a
# temporary object and is not guaranteed to modify data_df (with pandas
# Copy-on-Write it silently does nothing).
data_df['categories'] = data_df['categories'].fillna(-1)
In [4]:
def isRestaurant(categories):
    """Tell whether a business's category list marks it as a restaurant.

    Parameters
    ----------
    categories : object
        Expected to be a list of category-name strings. Any value that is not
        a list (for example a placeholder standing in for a missing value)
        never counts as a restaurant.

    Returns
    -------
    bool
        True if at least one category contains the substring 'restaurants',
        compared case-insensitively; False otherwise.
    """
    if not isinstance(categories, list):
        return False
    return any('restaurants' in label.lower() for label in categories)
# Two-letter postal codes accepted as "in the USA": the 50 states plus DC.
# Built once, as a frozenset, so each lookup is a constant-time membership
# test instead of rebuilding and scanning a list on every call.
_USA_STATE_CODES = frozenset([
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
])


def isINUSA(state):
    """Tell whether `state` is the postal code of a US state or DC.

    Parameters
    ----------
    state : object
        Normally a two-letter state code; matching is case-insensitive.
        Values without an ``upper`` method (None, NaN, numbers) are treated
        as "not in the USA" instead of raising.

    Returns
    -------
    bool
        True when the upper-cased code is one of the known US codes.
    """
    try:
        code = state.upper()
    except AttributeError:
        # Missing / non-text state: it cannot be matched against the codes.
        # Duck-typing (rather than isinstance(state, str)) keeps Python 2
        # unicode values working.
        return False
    return code in _USA_STATE_CODES
In [5]:
# Flag the rows to keep: businesses that are restaurants AND located in a US
# state (or DC). The two columns are walked in lockstep instead of using
# DataFrame.apply(axis=1), which builds a Series object for every row; the
# resulting boolean values are the same, and `and` still short-circuits so
# isINUSA is only consulted for restaurants.
data_df['include'] = [
    isRestaurant(row_categories) and isINUSA(row_state)
    for row_categories, row_state in zip(data_df['categories'], data_df['state'])
]
In [6]:
# Count how many rows carry each value of the `include` flag.
data_df['include'].value_counts()
Out[6]:
In [7]:
# Keep only the rows whose `include` flag is set.
keep_mask = data_df['include']
data_res = data_df.loc[keep_mask]
In [34]:
# Write the filtered businesses to a UTF-8 CSV file (relative path, so it
# lands in the notebook's current working directory).
restaurant_csv_path = "restaurant_yelp_academic_dataset_business.csv"
data_res.to_csv(restaurant_csv_path, encoding='utf-8')
In [8]:
# u know why choose 51....
exp_business = data_res.sample(frac = 0.1, random_state = 51)
sample_business_id = exp_business.business_id.values
print "for experiments, the number of business_id is:",sample_business_id.shape
In [20]:
# NOTE(review): wildcard import of a project-local module in the middle of the
# notebook; `Utils` below presumably comes from it — confirm, and consider
# importing the needed names explicitly in the first cell.
from yelpYourDish import *
# Path of the review file, built from the same data folder as the business file.
inFile = DataFolder+"yelp_academic_dataset_review.json"
# presumably returns only the reviews of the sampled businesses — verify
# against Utils.getExperimentalReviews.
reviews = Utils.getExperimentalReviews(inFile,sample_business_id)
In [22]:
# NOTE(review): `a` is not assigned in any cell visible here, so this cell
# fails on a fresh kernel (Restart & Run All) — define it or drop the cell.
a[0]
Out[22]:
In [23]:
# Shows element 11 of the first item of `a` after splitting it on double quotes.
# NOTE(review): `a` is not assigned in any cell visible here — this relies on
# hidden kernel state; confirm where it comes from.
a[0].split('"')[11]
Out[23]:
In [ ]: