JSON Cleanup Process

Main JSON Files:

  1. yelp_academic_dataset_business
  2. yelp_academic_dataset_review
  3. yelp_academic_dataset_user (one record per customer)

For each business entry:

  • Keep if business is restaurant
  • Keep if location is in state list
  • Output list of valid businesses
  • Output csv file for all businesses and business attributes

For each review entry:

  • Keep only reviews of businesses in the valid business list
  • Create a dictionary with each business as a key
  • For each key, create a list of unique customers in that business
  • Find the unique list of customers for all businesses
  • Output csv file for all reviews and review attributes

For each customer entry:

  • Keep if customer in the unique list of customers for all businesses
  • Output csv file of all customers and customer attributes

In [3]:
import json

def clean_states(filename, states):
    """Load the records of a line-delimited JSON file that lie in given states.

    Every line of the file is parsed on its own with ``json.loads``.

    Args:
        filename: Path of the line-delimited JSON file to read.
        states: Container of accepted values for each record's ``'state'``
            key.

    Returns:
        A list, in file order, of the parsed records whose ``'state'`` value
        is contained in ``states``.
    """
    with open(filename, 'r') as source:
        # Lazily parse one line at a time; the list is built before the
        # file is closed.
        parsed_lines = (json.loads(raw_line) for raw_line in source)
        return [entry for entry in parsed_lines if entry['state'] in states]

def clean_restaurants(json_list):
    """Keep only the business records that are categorised as restaurants.

    A record is kept when ``'Restaurants'`` or ``'Restaurant'`` is found in
    its ``'categories'`` value with the ``in`` operator. A list of category
    names is therefore matched element by element, while a single string is
    matched by substring.

    Records whose categories cannot be searched (for example ``None``) and
    records that have no ``'categories'`` key at all are skipped instead of
    aborting the whole filter.

    Args:
        json_list: Iterable of business records (dicts).

    Returns:
        A list with the records identified as restaurants, in input order.
    """
    records = []
    # The loop variable is deliberately not called ``json`` so that it does
    # not shadow the ``json`` module inside this function.
    for business in json_list:
        try:
            categories = business['categories']
            is_restaurant = ('Restaurants' in categories
                             or 'Restaurant' in categories)
        except (TypeError, KeyError):
            # TypeError: categories is None / not searchable.
            # KeyError: the record carries no 'categories' key.
            continue
        if is_restaurant:
            records.append(business)

    return records

def return_ids(data_list, id_type):
    """Collect the value stored under one key from every record.

    Args:
        data_list: Iterable of records that support ``record[id_type]``.
        id_type: Key to look up in each record (e.g. ``'business_id'``).

    Returns:
        A list with one value per record, in input order; duplicates are
        kept.
    """
    return [data_point[id_type] for data_point in data_list]

def filter_reviews(filename, business_list):
    """Load the reviews that belong to one of the given businesses.

    The file is read as line-delimited JSON: every line is parsed on its own
    with ``json.loads``.

    Args:
        filename: Path of the line-delimited JSON review file.
        business_list: Iterable of accepted (hashable) business ids.

    Returns:
        A list, in file order, of the parsed reviews whose ``'business_id'``
        is one of the accepted ids.
    """
    # Build the lookup set once. Testing membership against a list costs a
    # full scan per review line; a set also lets callers pass any iterable,
    # including a one-shot iterator.
    wanted_ids = set(business_list)

    records = []
    with open(filename, 'r') as fd:
        for line in fd:
            j_content = json.loads(line)
            if j_content['business_id'] in wanted_ids:
                records.append(j_content)

    return records

def filter_customers(
        filename,
        customer_file='/Volumes/Data/yelp_dataset/yelp_academic_dataset_user.json'):
    """Load the user records of everyone who wrote one of the given reviews.

    Args:
        filename: Path of the review file. Every line must hold a JSON array
            of review records, each with a ``'user_id'`` key (a file written
            with ``json.dump`` of a list of reviews is a single such line).
        customer_file: Path of the line-delimited JSON user file. Defaults
            to the location of the Yelp user file used by this notebook.

    Returns:
        A list, in user-file order, of the parsed user records whose
        ``'user_id'`` appears in at least one review.
    """
    # Collect the reviewer ids straight into a set: it is built exactly once
    # and gives constant-time lookups while scanning the user file.
    reviewer_ids = set()
    with open(filename, 'r') as fd:
        for line in fd:
            for record in json.loads(line):
                reviewer_ids.add(record['user_id'])

    customer_records = []
    with open(customer_file, 'r') as fd:
        for line in fd:
            j_content = json.loads(line)
            if j_content['user_id'] in reviewer_ids:
                customer_records.append(j_content)

    return customer_records

def output_data(json_data, filename,
                output_directory="/Volumes/Data/yelp_dataset/cleaned_data/"):
    """Serialise ``json_data`` as JSON into a file inside ``output_directory``.

    Args:
        json_data: Any object ``json.dump`` can serialise.
        filename: Name of the file to create, relative to
            ``output_directory``. An absolute path is used as given, because
            ``os.path.join`` then ignores the directory.
        output_directory: Directory that receives the file, with or without
            a trailing separator. Defaults to the notebook's cleaned-data
            directory. The directory is not created by this function.

    Raises:
        IOError/OSError: If the destination cannot be opened for writing.
    """
    # Local import: the notebook's import cell only brings in ``json``.
    import os

    destination = os.path.join(output_directory, filename)
    with open(destination, 'w') as outfile:
        json.dump(json_data, outfile)

In [21]:
#Declare data directory and input JSON file.
#The volume is spelled "/Volumes/Data" exactly as in filter_customers and
#output_data, so the path also resolves on a case-sensitive file system.
data_dir = "/Volumes/Data/yelp_dataset/"
business_jsonFile = data_dir + 'yelp_academic_dataset_business.json'

#Declare the list of states to be kept
states = ['AZ', 'IL', 'WI', 'OH', 'NC', 'NV']
state_data = clean_states(business_jsonFile, states)

#Keep only restaurants
restaurant_data = clean_restaurants(state_data)

#Output cleaned up JSON file
business_output = "cleaned_business_data.json"
output_data(restaurant_data, business_output)

#Create a list of unique business IDs (order is arbitrary after set())
business_list = list(set(return_ids(restaurant_data, 'business_id')))

In [22]:
#Load the reviews whose business_id is in business_list
#(the restaurants kept by the business cleanup above)
reviews_jsonFile = data_dir + 'yelp_academic_dataset_review.json'
review_data = filter_reviews(reviews_jsonFile, business_list)

In [28]:
#Write the filtered reviews as one JSON document; output_data places the
#file inside its output directory
reviews_output = "cleaned_review_data.json"
output_data(review_data, reviews_output)

In [ ]:
#Collect the user records of every customer that appears in the cleaned
#reviews (filter_customers reads the user_id of each review in this file).
#NOTE(review): this path is not the directory output_data writes to;
#presumably the cleaned review file was copied here by hand - confirm.
reviews_file = "/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json"
customers_list = filter_customers(reviews_file)

Merge the state key of each business in the business JSON into the matching reviews in the review JSON


In [4]:
#Load the cleaned business records; the with-block closes the file handle
#as soon as the JSON has been parsed instead of leaving it open.
with open("/Users/robertsonwang/Desktop/Python/Yelp/Yelp_scrapper/cleaned_business_data.json") as business_file:
    business_json = json.load(business_file)

#Map every business_id to the state of that business
filtered_biz = {
    business['business_id']: business['state'] for business in business_json
}

In [5]:
#Load the cleaned reviews; the with-block closes the file handle as soon as
#the JSON has been parsed instead of leaving it open.
with open("/Users/robertsonwang/Desktop/Python/Yelp/cleaned_review_data.json") as reviews_file_handle:
    reviews_json = json.load(reviews_file_handle)

In [6]:
#Tag every review, in place, with the state of the business it was written
#for (looked up by business_id in filtered_biz)
for review in reviews_json:
    review.update(state=filtered_biz[review['business_id']])

In [9]:
#Write the state-tagged reviews as one JSON document. The name is a relative
#path without a .json extension, so the file is created in the current
#working directory.
with open('cleaned_reviews_states', 'w') as outfile:
    json.dump(reviews_json, outfile)