In [1]:
import pandas as pd
import numpy as np
import json
import csv
import re
In [ ]:
def parse_json(file_name):
with open(file_name, 'rb') as f:
data = f.readlines()
data = map(lambda string: string.rstrip(), data)
DATA_SLICE = 750000
i, end = 1, len(data) / DATA_SLICE
json_string = "[" + ','.join(data[0:DATA_SLICE]) + "]"
df = pd.read_json(json_string)
while i <= end:
json_string = "[" + ','.join(data[i * DATA_SLICE:(i + 1) * DATA_SLICE]) + "]"
i += 1
df = df.append(pd.read_json(json_string), ignore_index = True)
return df
BASE_PATH = "/home/alex/TechSphere/yelp_data/yelp_dataset/"
df_business = parse_json(BASE_PATH + "yelp_academic_dataset_business.json")
df_checkin = parse_json(BASE_PATH + "yelp_academic_dataset_checkin.json")
df_review = parse_json(BASE_PATH + "yelp_academic_dataset_review.json")
df_user = parse_json(BASE_PATH + "yelp_academic_dataset_user.json")
df_tip = parse_json(BASE_PATH + "yelp_academic_dataset_tip.json")
In [35]:
print len(df_business)
print len(df_checkin)
print len(df_review)
print len(df_user)
print len(df_tip)
In [36]:
df_business.head(1)
Out[36]:
In [37]:
df_checkin.head(1)
Out[37]:
In [39]:
df_review.head(1)
Out[39]:
In [40]:
df_user.head(1)
Out[40]:
In [41]:
df_tip.head(1)
Out[41]:
In [46]:
def set_prefix_and_underscores(prefix, columns):
underscorizer = lambda string: re.sub(r"[\s.-]", "_", string.lower())
columns[:] = map(underscorizer, columns)
prefix_setter = lambda string: prefix + u"_" + string
columns[:] = map(prefix_setter, columns)
def get_lists_as_dicts(series):
caster = lambda lst: {str(value): 1 for value in lst}
return series.apply(caster)
def get_values_counted(series):
counter = lambda lst: len(lst)
return series.apply(counter)
def get_lists_as_strings(series):
caster = lambda lst: ",".join(lst)
return series.apply(caster)
In [47]:
# SKIP THIS STEP
# TODO: try to normalize attributes...
#df_n_attributes = pd.io.json.json_normalize(df_business['attributes'].tolist())
#print set(type(a) for a in df_n_attributes['Accepts Credit Cards'].tolist())
#df_n_attributes.head()
In [48]:
categories = get_lists_as_dicts(df_business['categories'])
df_n_categories = pd.io.json.json_normalize(categories.tolist())
df_n_categories.fillna(0, inplace = True)
cols = df_n_categories.columns.tolist()
set_prefix_and_underscores(u"category", cols)
df_n_categories.columns = cols
df_n_categories.head(1)
Out[48]:
In [49]:
neighborhoods = get_lists_as_dicts(df_business['neighborhoods'])
df_n_neighborhoods = pd.io.json.json_normalize(neighborhoods.tolist())
df_n_neighborhoods.fillna(0, inplace = True)
cols = df_n_neighborhoods.columns.tolist()
set_prefix_and_underscores(u"neighborhood", cols)
df_n_neighborhoods.columns = cols
df_n_neighborhoods.head(1)
Out[49]:
In [50]:
hours = df_business['hours']
df_n_hours = pd.io.json.json_normalize(hours.tolist())
df_n_hours.fillna("", inplace = True)
cols = df_n_hours.columns.tolist()
set_prefix_and_underscores(u"hour", cols)
df_n_hours.columns = cols
df_n_hours.head(1)
Out[50]:
In [51]:
df_businesses = df_business.merge(df_n_categories, how = "left", left_index = True, right_index = True)
df_businesses = df_businesses.merge(df_n_neighborhoods, how = "left", left_index = True, right_index = True)
df_businesses = df_businesses.merge(df_n_hours, how = "left", left_index = True, right_index = True)
df_businesses['neighborhoods_count'] = get_values_counted(df_business['neighborhoods'])
df_businesses.drop(['attributes', 'categories', 'neighborhoods', 'hours', 'type'], axis = 1, inplace = True)
df_businesses.head(1)
Out[51]:
In [52]:
compliments = df_user['compliments']
df_n_compliments = pd.io.json.json_normalize(compliments.tolist())
df_n_compliments.fillna(0, inplace = True)
cols = df_n_compliments.columns.tolist()
set_prefix_and_underscores(u"compliment", cols)
df_n_compliments.columns = cols
df_n_compliments.head(1)
Out[52]:
In [53]:
votes = df_user['votes']
df_n_votes = pd.io.json.json_normalize(votes.tolist())
df_n_votes.fillna(0, inplace = True)
cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", cols)
df_n_votes.columns = cols
df_n_votes.head(1)
Out[53]:
In [54]:
elite = get_lists_as_dicts(df_user['elite'])
df_n_elite = pd.io.json.json_normalize(elite.tolist())
df_n_elite.fillna(0, inplace = True)
cols = df_n_elite.columns.tolist()
set_prefix_and_underscores(u"elite_at", cols)
df_n_elite.columns = cols
df_n_elite.head(1)
Out[54]:
In [55]:
df_users = df_user.merge(df_n_compliments, how = "left", left_index = True, right_index = True)
df_users = df_users.merge(df_n_votes, how = "left", left_index = True, right_index = True)
df_users = df_users.merge(df_n_elite, how = "left", left_index = True, right_index = True)
df_users['friends_count'] = get_values_counted(df_user['friends'])
df_users['friends'] = get_lists_as_strings(df_user['friends'])
df_users.drop(['compliments', 'votes', 'elite', 'type'], axis = 1, inplace = True)
cols = df_users.columns.tolist()
cols.remove(u"user_id")
cols.insert(0, u"user_id")
cols.remove(u"name")
cols.insert(1, u"name")
cols.remove(u"friends_count")
cols.insert(2, u"friends_count")
cols.remove(u"fans")
cols.insert(3, u"fans")
cols.remove(u"review_count")
cols.insert(4, u"review_count")
cols.remove(u"friends")
cols.append(u"friends")
df_users = df_users[cols]
df_users.head(1)
Out[55]:
In [56]:
checkin_info = df_checkin['checkin_info']
df_n_checkin_info = pd.io.json.json_normalize(checkin_info.tolist())
df_n_checkin_info.fillna(0, inplace = True)
cols = df_n_checkin_info.columns.tolist()
format_unifier = lambda string: u"0" + string if re.search(r"^[0-9]-[0-6]$", string) else string
cols = map(format_unifier, cols)
set_prefix_and_underscores(u"checkin_at", cols)
df_n_checkin_info.columns = cols
cols = sorted(cols)
df_n_checkin_info = df_n_checkin_info[cols]
df_n_checkin_info.head(1)
Out[56]:
In [57]:
df_checkins = df_checkin.merge(df_n_checkin_info, how = "left", left_index = True, right_index = True)
df_checkins.drop(['checkin_info', 'type'], axis = 1, inplace = True)
df_checkins.head(1)
Out[57]:
In [58]:
df_tips = df_tip.drop(['type'], axis = 1)
df_tips.head(1)
Out[58]:
In [59]:
votes = df_review['votes']
df_n_votes = pd.io.json.json_normalize(votes.tolist())
df_n_votes.fillna(0, inplace = True)
cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", cols)
df_n_votes.columns = cols
df_n_votes.head(1)
Out[59]:
In [60]:
df_reviews = df_review.merge(df_n_votes, how = "left", left_index = True, right_index = True)
df_reviews.drop(['votes', 'type'], axis = 1, inplace = True)
df_reviews.head(1)
Out[60]:
In [61]:
#df_users.drop("friends", axis = 1).head(100).to_excel("users.xls", index = False, encoding = "utf-8")
df_users.to_csv("users.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)
In [63]:
#df_businesses.head(100).to_excel("businesses.xls", index = False, encoding = "utf-8")
df_businesses.to_csv("businesses.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)
In [70]:
#df_reviews.head(100).to_excel("reviews.xls", index = False, encoding = "utf-8")
df_reviews.to_csv("reviews.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)
In [65]:
#df_tips.head(100).to_excel("tips.xls", index = False, encoding = "utf-8")
df_tips.to_csv("tips.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)
In [68]:
#df_checkins.head(100).to_excel("checkins.xls", index = False, encoding = "utf-8")
df_checkins.to_csv("checkins.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)
In [ ]: