In [1]:
    
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
# NOTE(review): absolute, machine-specific Windows path. The older note kept below names a
# sub-folder of the value actually assigned - confirm which folder really holds the JSON dumps.
#       C:\Users\Dino\Documents\GitHub\DataScience_SideProject\yelp_datachallenge
DataFolder ="C:/Users/Dino/Documents/GitHub/"
    
In [2]:
    
# Read every line of the business dump into memory, trimming trailing whitespace.
# `data` stays a plain list of the raw lines because later cells index into it.
with open(DataFolder+"yelp_academic_dataset_business.json", 'rb') as f:
    data = list(raw_line.rstrip() for raw_line in f.readlines())
# Comma-join the lines and wrap them in brackets, i.e. treat each line as one element
# of a single JSON array, then let pandas parse that array in one call.
data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)
#data = pd.read_json(DataFolder+"yelp_academic_dataset_business.json")
    
In [34]:
    
# Debug peek: the piece at index 13 of the first raw line after splitting on double quotes.
data[0].split('"')[13]
    
    Out[34]:
In [3]:
    
# Preview the first five parsed rows.
data_df.head(5)
    
    Out[3]:
In [8]:
    
# Show the summary that DataFrame.info() reports for the parsed frame.
data_df.info()
    
    
In [3]:
    
#data_df.describe(include = ['object'])
# Replace missing `categories` entries with the sentinel -1.
# The result is assigned back to the column instead of calling fillna(inplace=True) on the
# `data_df['categories']` selection: the in-place form operates on an intermediate object and
# is not guaranteed to write through to `data_df` (it does not under pandas Copy-on-Write).
data_df['categories'] = data_df['categories'].fillna(-1)
    
In [4]:
    
def isRestaurant(categories):
    """Tell whether a category list marks a business as a restaurant.

    Returns True when `categories` is a list and at least one of its entries
    contains the substring 'restaurants' (entries are lower-cased before the
    check). Any non-list value, or a list with no such entry, gives False.
    """
    # Non-list values are never restaurants.
    if not isinstance(categories, list):
        return False
    return any('restaurants' in label.lower() for label in categories)
# Two-letter codes accepted by isINUSA: the 50 states plus DC. Built once instead of on
# every call, since isINUSA is invoked per row.
_USA_STATE_CODES = frozenset([
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"])


def isINUSA(state):
    """Tell whether `state` is the two-letter code of a US state or DC.

    The comparison is case-insensitive. A value that has no `.upper()` method
    (for example None or a float NaN standing in for a missing state) is
    reported as False instead of raising AttributeError.
    """
    try:
        code = state.upper()
    except AttributeError:
        # Missing / non-text state: cannot be a US code.
        return False
    return code in _USA_STATE_CODES
    
In [5]:
    
# Flag a row only when it is a restaurant AND its state is a US code.
# `and` short-circuits, so isINUSA is consulted only for rows that are restaurants.
data_df['include'] = data_df.apply(
    lambda biz: isRestaurant(biz['categories']) and isINUSA(biz['state']),
    axis=1,
)
    
In [6]:
    
# Tally how many rows carry each value of the `include` flag.
data_df['include'].value_counts()
    
    Out[6]:
In [7]:
    
# Keep only the rows whose `include` flag is set, using it as an explicit boolean row mask.
data_res = data_df.loc[data_df['include']]
    
In [34]:
    
# Write the filtered rows out as UTF-8 CSV. The file name is relative, so it is created
# in whatever directory the kernel is currently running from.
data_res.to_csv("restaurant_yelp_academic_dataset_business.csv",encoding='utf-8')
    
In [8]:
    
# Draw a 10% sample of the filtered rows; random_state (51, the author's pick) pins the
# draw so re-running the cell selects the same rows.
exp_business = data_res.sample(frac = 0.1, random_state = 51)
sample_business_id = exp_business.business_id.values
# Single-argument print(...) parses under both Python 2 and Python 3 and writes the same
# text the former print statement did ("<label> <shape tuple>").
print("for experiments, the number of business_id is: %s" % (sample_business_id.shape,))
    
    
In [20]:
    
# NOTE(review): wildcard import of a project-local module, placed mid-notebook. `Utils` used
# below is presumably one of the names it provides - confirm, and consider importing it
# explicitly alongside the other imports.
from yelpYourDish import *
# Path of the review dump, built from the same DataFolder prefix as the business file.
inFile = DataFolder+"yelp_academic_dataset_review.json"
# Hands the review file path and the sampled business ids to the helper. What it returns is
# defined by Utils.getExperimentalReviews, which is not visible here.
reviews = Utils.getExperimentalReviews(inFile,sample_business_id)
    
In [22]:
    
# NOTE(review): `a` is not assigned in any cell visible in this notebook, so this cell depends
# on leftover kernel state and will fail on a fresh run unless `a` is defined first.
a[0]
    
    Out[22]:
In [23]:
    
# Debug peek: the piece at index 11 of a[0] after splitting on double quotes.
# NOTE(review): `a` has no visible assignment in this notebook - confirm where it comes from.
a[0].split('"')[11]
    
    Out[23]:
In [ ]: