In [3]:
    
import json
import os
def clean_states(filename, states):
    """Load a JSON-lines file and keep the records located in wanted states.

    Args:
        filename: Path to a text file holding one JSON object per line.
        states: Collection of state codes to keep; each record's ``'state'``
            value is tested against it with ``in``.

    Returns:
        A list of the decoded records, in file order, whose ``'state'`` value
        is contained in ``states``.

    Raises:
        KeyError: If a decoded record has no ``'state'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(filename, 'r') as fd:
        for line in fd:
            # A blank line (e.g. a stray trailing newline) is not a record;
            # passing it to json.loads would abort the whole load.
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['state'] in states:
                records.append(j_content)

    return records
def clean_restaurants(json_list):
    """Keep only the business records categorised as restaurants.

    A record is kept when ``'Restaurants'`` or ``'Restaurant'`` is found in
    its ``'categories'`` value using ``in``. Note that when ``'categories'``
    is a string this is a substring test; when it is a list it is an exact
    element match.

    Args:
        json_list: Iterable of decoded business records.

    Returns:
        A list of the matching records, in their original order. Records
        whose ``'categories'`` entry is missing, ``None`` or otherwise does
        not support ``in`` are skipped.
    """
    records = []
    # The loop variable is deliberately not called ``json`` so that it does
    # not shadow the imported module inside this function.
    for business in json_list:
        try:
            categories = business['categories']
            is_restaurant = ('Restaurants' in categories
                             or 'Restaurant' in categories)
        except (KeyError, TypeError):
            # No usable category information: a missing key is treated the
            # same way as a null value, i.e. "not a restaurant".
            continue
        if is_restaurant:
            records.append(business)

    return records
def return_ids(data_list, id_type):
    """Collect the value stored under ``id_type`` from every record.

    Args:
        data_list: Iterable of records that can be indexed with ``id_type``.
        id_type: Key to look up in each record, e.g. ``'business_id'``.

    Returns:
        A list with one value per record, in input order (duplicates kept).
    """
    return [entry[id_type] for entry in data_list]
def filter_reviews(filename, business_list):
    """Load a JSON-lines review file, keeping reviews of the given businesses.

    Args:
        filename: Path to a text file holding one JSON object per line.
        business_list: Iterable of business ids to keep (ids must be
            hashable, e.g. strings).

    Returns:
        A list of the decoded review records, in file order, whose
        ``'business_id'`` is one of the ids in ``business_list``.

    Raises:
        KeyError: If a decoded record has no ``'business_id'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    # Hash the ids once so that each per-review membership test is a constant
    # time lookup instead of a scan over the whole list of businesses.
    wanted_ids = frozenset(business_list)

    records = []
    with open(filename, 'r') as fd:
        for line in fd:
            # Skip blank lines instead of letting json.loads fail on them.
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['business_id'] in wanted_ids:
                records.append(j_content)

    return records
def filter_customers(filename,
                     customer_file='/Volumes/Data/yelp_dataset/yelp_academic_dataset_user.json'):
    """Return the user records of everyone who authored one of the reviews.

    Args:
        filename: Path to a review file in which every line decodes to an
            iterable of review records (for instance a single JSON array),
            each record having a ``'user_id'`` key.
        customer_file: Path to a JSON-lines user file with one JSON object
            per line. Defaults to the location that was previously
            hard-coded in this function.

    Returns:
        A list of the decoded user records, in ``customer_file`` order, whose
        ``'user_id'`` appears in at least one review.

    Raises:
        KeyError: If a review or user record has no ``'user_id'`` key.
        ValueError: If a non-blank line of either file is not valid JSON.
    """
    # Build the set of reviewer ids exactly once; rebuilding it for every
    # line of the user file made the second pass needlessly quadratic.
    user_ids = set()
    with open(filename, 'r') as fd:
        for line in fd:
            if not line.strip():
                continue
            for record in json.loads(line):
                user_ids.add(record['user_id'])

    customer_records = []
    with open(customer_file, 'r') as fd:
        for line in fd:
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['user_id'] in user_ids:
                customer_records.append(j_content)
    return customer_records
def output_data(json_data, filename,
                output_directory="/Volumes/Data/yelp_dataset/cleaned_data/"):
    """Serialise ``json_data`` as JSON into ``output_directory/filename``.

    Args:
        json_data: Any JSON-serialisable object.
        filename: Name of the file to create inside ``output_directory``;
            an existing file is overwritten.
        output_directory: Directory to write into. Defaults to the location
            that was previously hard-coded in this function. A trailing
            path separator is optional.

    Returns:
        None.
    """
    # os.path.join inserts the separator only when needed, so the directory
    # may be given with or without a trailing slash.
    output_path = os.path.join(output_directory, filename)
    with open(output_path, 'w') as outfile:
        json.dump(json_data, outfile)
    
In [21]:
    
# Data directory and input JSON file. The directory uses the same
# "/Volumes/Data/" spelling as the paths inside the helper functions, so the
# notebook does not depend on a case-insensitive file system.
data_dir = "/Volumes/Data/yelp_dataset/"
business_jsonFile = data_dir + 'yelp_academic_dataset_business.json'

# States whose businesses are kept
states = ['AZ', 'IL', 'WI', 'OH', 'NC', 'NV']
state_data = clean_states(business_jsonFile, states)

# Keep only restaurants
restaurant_data = clean_restaurants(state_data)

# Write the cleaned business records out as JSON
business_output = "cleaned_business_data.json"
output_data(restaurant_data, business_output)

# List of unique business IDs (order is not significant)
business_list = list(set(return_ids(restaurant_data, 'business_id')))
    
In [22]:
    
# Keep only the reviews written about the businesses selected above.
reviews_jsonFile = data_dir + 'yelp_academic_dataset_review.json'
review_data = filter_reviews(
    filename=reviews_jsonFile,
    business_list=business_list,
)
    
In [28]:
    
# Write the filtered reviews out as JSON.
reviews_output = "cleaned_review_data.json"
output_data(
    json_data=review_data,
    filename=reviews_output,
)
    
In [ ]:
    
# Collect the user records of everyone who wrote one of the filtered reviews.
reviews_file = "/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json"
customers_list = filter_customers(filename=reviews_file)
    
In [4]:
    
# Load the cleaned business records; the context manager closes the file
# handle as soon as the JSON has been parsed.
with open("/Users/robertsonwang/Desktop/Python/Yelp/Yelp_scrapper/cleaned_business_data.json") as business_fd:
    business_json = json.load(business_fd)

# Map every business id to the state the business is located in.
filtered_biz = {biz['business_id']: biz['state'] for biz in business_json}
    
In [5]:
    
# Load the cleaned reviews; the context manager closes the file handle as
# soon as the JSON has been parsed.
with open("/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json") as reviews_fd:
    reviews_json = json.load(reviews_fd)
    
In [6]:
    
# Tag every review, in place, with the state of the business it refers to.
for review in reviews_json:
    review.update(state=filtered_biz[review['business_id']])
    
In [9]:
    
# Persist the state-tagged reviews as JSON in the current working directory.
with open('cleaned_reviews_states', mode='w') as outfile:
    json.dump(obj=reviews_json, fp=outfile)