In [3]:
import json
import os
def clean_states(filename, states):
    """Return the records of a line-delimited JSON file whose state is wanted.

    Each non-blank line of ``filename`` is decoded with ``json.loads`` and
    kept when its ``'state'`` value is one of ``states``.

    Args:
        filename: Path of the file to read; one JSON object per line.
        states: Iterable of hashable state values to keep.

    Returns:
        A list of the decoded records, in file order.

    Raises:
        KeyError: If a decoded record has no ``'state'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    # Build the lookup once so each line costs a hash probe, not a list scan.
    wanted_states = set(states)
    records = []
    with open(filename, 'r') as fd:
        for line in fd:
            # A blank line (e.g. a trailing newline) is not a record; without
            # this guard json.loads would raise and abort the whole scan.
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['state'] in wanted_states:
                records.append(j_content)
    return records
def clean_restaurants(json_list):
    """Return the records whose categories mention a restaurant label.

    A record is kept when ``'Restaurants'`` or ``'Restaurant'`` is ``in`` its
    ``'categories'`` value. When that value is a list this is an exact element
    match; when it is a string it is a substring match.

    Records whose categories are missing, ``None``, or otherwise do not
    support the ``in`` test are treated as "not a restaurant" and skipped.

    Args:
        json_list: Iterable of decoded business records.

    Returns:
        A list of the matching records, in input order.
    """
    labels = ('Restaurants', 'Restaurant')
    records = []
    # The loop variable is deliberately not called ``json`` so the json module
    # is not shadowed inside this function.
    for business in json_list:
        try:
            categories = business['categories']
            is_restaurant = any(label in categories for label in labels)
        except (KeyError, TypeError):
            # KeyError: no 'categories' key. TypeError: categories is None or
            # another value that cannot be searched with ``in``.
            continue
        if is_restaurant:
            records.append(business)
    return records
def return_ids(data_list, id_type):
    """Collect the value stored under ``id_type`` from every item.

    Args:
        data_list: Iterable of subscriptable records (e.g. dicts).
        id_type: Key to look up in each record.

    Returns:
        A list with one value per record, in input order (duplicates kept).
    """
    return [entry[id_type] for entry in data_list]
def filter_reviews(filename, business_list):
    """Return the review records that belong to the given businesses.

    Each non-blank line of ``filename`` is decoded with ``json.loads`` and
    kept when its ``'business_id'`` is one of ``business_list``.

    Args:
        filename: Path of the file to read; one JSON object per line.
        business_list: Iterable of hashable business ids to keep.

    Returns:
        A list of the decoded review records, in file order.

    Raises:
        KeyError: If a decoded record has no ``'business_id'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    # Testing membership against the list would rescan it for every review
    # line; a set built once makes each check constant time.
    wanted_ids = set(business_list)
    records = []
    with open(filename, 'r') as fd:
        for line in fd:
            # Skip blank lines instead of letting json.loads abort the scan.
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['business_id'] in wanted_ids:
                records.append(j_content)
    return records
def filter_customers(filename, customer_file='/Volumes/Data/yelp_dataset/yelp_academic_dataset_user.json'):
    """Return the user records for everyone who wrote a review in ``filename``.

    ``filename`` is read line by line; every non-blank line must decode to an
    iterable of review records, each carrying a ``'user_id'``. ``customer_file``
    is then read as one JSON object per line, and a user is kept when its
    ``'user_id'`` was seen among the reviews.

    Args:
        filename: Path of the reviews file.
        customer_file: Path of the line-delimited user file. The default is
            the location this function previously had hard-coded.

    Returns:
        A list of the decoded user records, in ``customer_file`` order.

    Raises:
        KeyError: If a review or user record has no ``'user_id'`` key.
        ValueError: If a non-blank line is not valid JSON.
    """
    # Collect reviewer ids straight into a set: duplicates collapse and the
    # lookup below does not have to rebuild a set for every user line.
    reviewer_ids = set()
    with open(filename, 'r') as fd:
        for line in fd:
            if not line.strip():
                continue
            for record in json.loads(line):
                reviewer_ids.add(record['user_id'])

    customer_records = []
    with open(customer_file, 'r') as fd:
        for line in fd:
            if not line.strip():
                continue
            j_content = json.loads(line)
            if j_content['user_id'] in reviewer_ids:
                customer_records.append(j_content)
    return customer_records
def output_data(json_data, filename, output_directory="/Volumes/Data/yelp_dataset/cleaned_data/"):
    """Serialize ``json_data`` as JSON into ``output_directory``.

    Args:
        json_data: Any object accepted by ``json.dump``.
        filename: Name of the file to create inside ``output_directory``.
            NOTE: an absolute ``filename`` makes ``os.path.join`` ignore
            ``output_directory`` and write to that absolute path.
        output_directory: Directory to write into. The default is the
            location this function previously had hard-coded. The directory
            must already exist.

    Returns:
        None. An existing file of the same name is overwritten.
    """
    # os.path.join copes with directories given with or without a trailing
    # separator, which plain string concatenation does not.
    output_path = os.path.join(output_directory, filename)
    with open(output_path, 'w') as outfile:
        json.dump(json_data, outfile)
In [21]:
# Declare the data directory and the input JSON file.
# The volume is spelled "/Volumes/Data/" to agree with the paths used inside
# filter_customers and output_data; the previous lowercase "/Volumes/data/"
# only resolved on a case-insensitive file system.
data_dir = "/Volumes/Data/yelp_dataset/"
business_jsonFile = data_dir + 'yelp_academic_dataset_business.json'

# Declare the list of states to be kept.
states = ['AZ', 'IL', 'WI', 'OH', 'NC', 'NV']
state_data = clean_states(business_jsonFile, states)

# Keep only restaurants.
restaurant_data = clean_restaurants(state_data)

# Output the cleaned-up business records as a JSON file.
business_output = "cleaned_business_data.json"
output_data(restaurant_data, business_output)

# Create a list of unique business IDs (order is not meaningful after the
# round trip through a set).
business_list = list(set(return_ids(restaurant_data, 'business_id')))
In [22]:
# Scan the raw review file and keep only the reviews whose business_id is in
# business_list (the restaurants selected in the previous cell).
reviews_jsonFile = data_dir + 'yelp_academic_dataset_review.json'
review_data = filter_reviews(
    filename=reviews_jsonFile,
    business_list=business_list,
)
In [28]:
# Write the filtered reviews out as a single JSON document via output_data.
reviews_output = "cleaned_review_data.json"
output_data(json_data=review_data, filename=reviews_output)
In [ ]:
# Look up the user record of every reviewer found in the cleaned review file.
# NOTE(review): this is an absolute, machine-specific path and differs from
# the directory output_data writes to -- confirm the file was moved here.
reviews_file = "/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json"
customers_list = filter_customers(filename=reviews_file)
In [4]:
# Load the cleaned business records. The file is opened in a ``with`` block
# so the handle is closed as soon as the JSON has been parsed.
with open("/Users/robertsonwang/Desktop/Python/Yelp/Yelp_scrapper/cleaned_business_data.json") as business_fh:
    business_json = json.load(business_fh)

# Lookup table: business_id -> state of that business.
filtered_biz = {business['business_id']: business['state'] for business in business_json}
In [5]:
# Load the cleaned review records. The file is opened in a ``with`` block so
# the handle is closed as soon as the JSON has been parsed.
with open("/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json") as reviews_fh:
    reviews_json = json.load(reviews_fh)
In [6]:
# Annotate every review, in place, with the state of the business it refers
# to. A review whose business_id is absent from filtered_biz raises KeyError.
for review in reviews_json:
    review.update({'state': filtered_biz[review['business_id']]})
In [9]:
# Persist the state-annotated reviews as one JSON document. The path is
# relative, so the file lands in the kernel's current working directory.
with open('cleaned_reviews_states', mode='w') as states_out:
    json.dump(reviews_json, fp=states_out)