Parse & Normalize


In [1]:
import pandas as pd
import numpy as np
import json
import csv
import re

In [ ]:
def parse_json(file_name):
    """Parse a newline-delimited JSON (NDJSON) file into one DataFrame.

    The file is converted DATA_SLICE lines at a time so very large dumps
    (the review file has ~1.6M lines) never need a single giant JSON
    string, and the per-chunk frames are concatenated exactly once at the
    end — the original appended inside the loop, which is quadratic and
    relies on DataFrame.append (removed in pandas 2.0). Also fixes the
    spurious empty trailing chunk when the line count divides DATA_SLICE
    evenly, and the float `len(data) / DATA_SLICE` under true division.

    Parameters
    ----------
    file_name : str
        Path to a file with one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per input line; an empty DataFrame for an empty file.
    """
    from io import StringIO  # local import keeps this cell self-contained

    DATA_SLICE = 750000  # lines per chunk; tune to available memory

    with open(file_name) as f:
        lines = [line.rstrip() for line in f]

    frames = []
    for start in range(0, len(lines), DATA_SLICE):
        # Wrap the slice in [...] so each chunk is a valid JSON array.
        json_string = "[" + ",".join(lines[start:start + DATA_SLICE]) + "]"
        frames.append(pd.read_json(StringIO(json_string)))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index = True)

# NOTE(review): absolute local path — consider making this configurable
# (env var or a relative data dir) so the notebook runs on other machines.
BASE_PATH = "/home/alex/TechSphere/yelp_data/yelp_dataset/"

# Load the five Yelp academic dataset dumps (newline-delimited JSON).
df_business = parse_json(BASE_PATH + "yelp_academic_dataset_business.json")
df_checkin  = parse_json(BASE_PATH + "yelp_academic_dataset_checkin.json")
df_review   = parse_json(BASE_PATH + "yelp_academic_dataset_review.json")
df_user     = parse_json(BASE_PATH + "yelp_academic_dataset_user.json")
df_tip      = parse_json(BASE_PATH + "yelp_academic_dataset_tip.json")

In [35]:
# Row counts for each loaded frame. print(...) with parentheses produces
# identical output in Python 2 but also works under Python 3, unlike the
# original `print len(...)` statement form.
print(len(df_business))
print(len(df_checkin))
print(len(df_review))
print(len(df_user))
print(len(df_tip))


61184
45166
1569264
366715
495107

In [36]:
# Peek at one business row: `categories`, `neighborhoods` and `hours`
# hold nested values that get normalized below.
df_business.head(1)


Out[36]:
attributes business_id categories city full_address hours latitude longitude name neighborhoods open review_count stars state type
0 {u'By Appointment Only': True} vcNAWiLM4dR7D2nwwJ7nCA [Doctors, Health & Medical] Phoenix 4840 E Indian School Rd\nSte 101\nPhoenix, AZ ... {u'Thursday': {u'close': u'17:00', u'open': u'... 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business

In [37]:
# Peek at one checkin row: `checkin_info` is a nested dict (normalized below).
df_checkin.head(1)


Out[37]:
business_id checkin_info type
0 cE27W9VPgO88Qxe4ol6y_g {u'9-5': 1, u'7-5': 1, u'13-3': 1, u'17-6': 1,... checkin

In [39]:
# Peek at one review row: `votes` is a nested dict (normalized below).
df_review.head(1)


Out[39]:
business_id date review_id stars text type user_id votes
0 vcNAWiLM4dR7D2nwwJ7nCA 2007-05-17 15SdjuK7DmYqUAj6rjGowg 5 dr. goldberg offers everything i look for in a... review Xqd0DzHaiyRqVH3WRG7hzg {u'funny': 0, u'useful': 2, u'cool': 1}

In [40]:
# Peek at one user row: `compliments`, `votes`, `elite` and `friends`
# hold nested values that get normalized below.
df_user.head(1)


Out[40]:
average_stars compliments elite fans friends name review_count type user_id votes yelping_since
0 4.14 {u'profile': 8, u'cute': 15, u'funny': 11, u'p... [2005, 2006] 69 [rpOyqD_893cqmDAtJLbdog, 4U9kSBLuBDU391x6bxU-Y... Russel 108 user 18kPq7GPye-YQ3LyKyAZPw {u'funny': 166, u'useful': 278, u'cool': 245} 2004-10

In [41]:
# Peek at one tip row: no nested fields; only the constant `type`
# column is dropped later.
df_tip.head(1)


Out[41]:
business_id date likes text type user_id
0 cE27W9VPgO88Qxe4ol6y_g 2013-04-18 0 Don't waste your time. tip -6rEfobYjMxpUWLNxszaxQ

In [46]:
def set_prefix_and_underscores(prefix, columns):
    """Normalize column labels in place.

    Each label is lowercased, whitespace/dots/hyphens become underscores,
    and ``prefix + "_"`` is prepended. The list is mutated via slice
    assignment so callers see the change.
    """
    normalized = []
    for label in columns:
        cleaned = re.sub(r"[\s.-]", "_", label.lower())
        normalized.append(prefix + u"_" + cleaned)
    columns[:] = normalized

def get_lists_as_dicts(series):
    """Map each list-valued entry to an indicator dict {str(item): 1}."""
    def to_indicator(values):
        return dict((str(value), 1) for value in values)
    return series.apply(to_indicator)

def get_values_counted(series):
    """Return the length of each list-valued entry as a Series."""
    return series.apply(len)

def get_lists_as_strings(series):
    """Join each list of strings into one comma-separated string."""
    return series.apply(",".join)

Business dataframe normalization


In [47]:
# SKIP THIS STEP 
# TODO: try to normalize attributes...
#df_n_attributes = pd.io.json.json_normalize(df_business['attributes'].tolist())
#print set(type(a) for a in df_n_attributes['Accepts Credit Cards'].tolist()) 
#df_n_attributes.head()

In [48]:
# One-hot encode `categories`: every distinct category becomes an
# indicator column named category_<normalized name> (0/1).
categories = get_lists_as_dicts(df_business['categories'])
df_n_categories = pd.io.json.json_normalize(categories.tolist())
df_n_categories = df_n_categories.fillna(0)
category_cols = df_n_categories.columns.tolist()
set_prefix_and_underscores(u"category", category_cols)
df_n_categories.columns = category_cols

df_n_categories.head(1)


Out[48]:
category_accessories category_accountants category_active_life category_acupuncture category_adult category_adult_education category_adult_entertainment category_advertising category_afghan category_african ... category_windows_installation category_windshield_installation_&_repair category_wine_bars category_wine_tours category_wineries category_wok category_women's_clothing category_yelp_events category_yoga category_zoos
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 783 columns


In [49]:
# Same indicator-column treatment for the `neighborhoods` lists.
neighborhood_dicts = get_lists_as_dicts(df_business['neighborhoods'])
df_n_neighborhoods = pd.io.json.json_normalize(neighborhood_dicts.tolist())
df_n_neighborhoods = df_n_neighborhoods.fillna(0)
neighborhood_cols = df_n_neighborhoods.columns.tolist()
set_prefix_and_underscores(u"neighborhood", neighborhood_cols)
df_n_neighborhoods.columns = neighborhood_cols

df_n_neighborhoods.head(1)


Out[49]:
neighborhood_allentown neighborhood_allied_dunn's_marsh neighborhood_anthem neighborhood_arboretum neighborhood_aspinwall neighborhood_avalon neighborhood_ballantyne neighborhood_banksville neighborhood_bay_creek neighborhood_beechview ... neighborhood_west_view neighborhood_westchester_gardens neighborhood_westhaven_trails neighborhood_westmorland neighborhood_westside neighborhood_wexford neighborhood_williamson___marquette neighborhood_willy_st neighborhood_windgap neighborhood_worthington_park
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 175 columns


In [50]:
# Flatten the nested `hours` dicts into hour_<day>_<open|close> columns;
# businesses with no entry for a slot get an empty string.
df_n_hours = pd.io.json.json_normalize(df_business['hours'].tolist())
df_n_hours = df_n_hours.fillna("")
hour_cols = df_n_hours.columns.tolist()
set_prefix_and_underscores(u"hour", hour_cols)
df_n_hours.columns = hour_cols

df_n_hours.head(1)


Out[50]:
hour_friday_close hour_friday_open hour_monday_close hour_monday_open hour_saturday_close hour_saturday_open hour_sunday_close hour_sunday_open hour_thursday_close hour_thursday_open hour_tuesday_close hour_tuesday_open hour_wednesday_close hour_wednesday_open
0 17:00 08:00 17:00 08:00 17:00 08:00 17:00 08:00 17:00 08:00

In [51]:
# Assemble the flat business table: base columns plus the category and
# neighborhood indicators and per-day hours, joined on the row index.
df_businesses = (
    df_business
    .merge(df_n_categories, how = "left", left_index = True, right_index = True)
    .merge(df_n_neighborhoods, how = "left", left_index = True, right_index = True)
    .merge(df_n_hours, how = "left", left_index = True, right_index = True)
)

df_businesses['neighborhoods_count'] = get_values_counted(df_business['neighborhoods'])

# The nested source columns (and the constant `type`) are now redundant.
df_businesses = df_businesses.drop(['attributes', 'categories', 'neighborhoods', 'hours', 'type'], axis = 1)

df_businesses.head(1)


Out[51]:
business_id city full_address latitude longitude name open review_count stars state ... hour_saturday_open hour_sunday_close hour_sunday_open hour_thursday_close hour_thursday_open hour_tuesday_close hour_tuesday_open hour_wednesday_close hour_wednesday_open neighborhoods_count
0 vcNAWiLM4dR7D2nwwJ7nCA Phoenix 4840 E Indian School Rd\nSte 101\nPhoenix, AZ ... 33.499313 -111.983758 Eric Goldberg, MD True 9 3.5 AZ ... 17:00 08:00 17:00 08:00 17:00 08:00 0

1 rows × 983 columns

User dataframe normalization


In [52]:
# Flatten the per-user `compliments` dicts into compliment_<kind> counts.
df_n_compliments = pd.io.json.json_normalize(df_user['compliments'].tolist())
df_n_compliments = df_n_compliments.fillna(0)
compliment_cols = df_n_compliments.columns.tolist()
set_prefix_and_underscores(u"compliment", compliment_cols)
df_n_compliments.columns = compliment_cols

df_n_compliments.head(1)


Out[52]:
compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain compliment_profile compliment_writer
0 78 15 11 48 0 3 20 15 25 8 9

In [53]:
# Flatten the per-user `votes` dicts into vote_<kind> counts.
df_n_votes = pd.io.json.json_normalize(df_user['votes'].tolist())
df_n_votes = df_n_votes.fillna(0)
user_vote_cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", user_vote_cols)
df_n_votes.columns = user_vote_cols

df_n_votes.head(1)


Out[53]:
vote_cool vote_funny vote_useful
0 245 166 278

In [54]:
# Indicator columns elite_at_<year> from each user's `elite` year list.
elite = get_lists_as_dicts(df_user['elite'])
df_n_elite = pd.io.json.json_normalize(elite.tolist())
df_n_elite = df_n_elite.fillna(0)
elite_cols = df_n_elite.columns.tolist()
set_prefix_and_underscores(u"elite_at", elite_cols)
df_n_elite.columns = elite_cols

df_n_elite.head(1)


Out[54]:
elite_at_2005 elite_at_2006 elite_at_2007 elite_at_2008 elite_at_2009 elite_at_2010 elite_at_2011 elite_at_2012 elite_at_2013 elite_at_2014 elite_at_2015
0 1 1 0 0 0 0 0 0 0 0 0

In [55]:
# Assemble the flat user table from the normalized pieces, joined on the
# row index.
df_users = (
    df_user
    .merge(df_n_compliments, how = "left", left_index = True, right_index = True)
    .merge(df_n_votes, how = "left", left_index = True, right_index = True)
    .merge(df_n_elite, how = "left", left_index = True, right_index = True)
)

df_users['friends_count'] = get_values_counted(df_user['friends'])
df_users['friends'] = get_lists_as_strings(df_user['friends'])

df_users = df_users.drop(['compliments', 'votes', 'elite', 'type'], axis = 1)

# Reorder for readability: identifying/summary columns first, the long
# comma-separated `friends` string last; everything else keeps its
# existing relative order.
leading = [u"user_id", u"name", u"friends_count", u"fans", u"review_count"]
trailing = [u"friends"]
middle = [c for c in df_users.columns.tolist() if c not in leading and c not in trailing]
df_users = df_users[leading + middle + trailing]

df_users.head(1)


Out[55]:
user_id name friends_count fans review_count average_stars yelping_since compliment_cool compliment_cute compliment_funny ... elite_at_2007 elite_at_2008 elite_at_2009 elite_at_2010 elite_at_2011 elite_at_2012 elite_at_2013 elite_at_2014 elite_at_2015 friends
0 18kPq7GPye-YQ3LyKyAZPw Russel 206 69 108 4.14 2004-10 78 15 11 ... 0 0 0 0 0 0 0 0 0 rpOyqD_893cqmDAtJLbdog,4U9kSBLuBDU391x6bxU-YA,...

1 rows × 33 columns

Checkin dataframe normalization


In [56]:
# Flatten `checkin_info` ({"hour-weekday": count} dicts) into
# checkin_at_HH_D columns. Single-digit hours are zero-padded first so
# the final lexicographic sort puts the columns in chronological order.
df_n_checkin_info = pd.io.json.json_normalize(df_checkin['checkin_info'].tolist())
df_n_checkin_info = df_n_checkin_info.fillna(0)
checkin_cols = df_n_checkin_info.columns.tolist()
checkin_cols = [u"0" + c if re.search(r"^[0-9]-[0-6]$", c) else c for c in checkin_cols]
set_prefix_and_underscores(u"checkin_at", checkin_cols)
df_n_checkin_info.columns = checkin_cols
df_n_checkin_info = df_n_checkin_info[sorted(checkin_cols)]

df_n_checkin_info.head(1)


Out[56]:
checkin_at_00_0 checkin_at_00_1 checkin_at_00_2 checkin_at_00_3 checkin_at_00_4 checkin_at_00_5 checkin_at_00_6 checkin_at_01_0 checkin_at_01_1 checkin_at_01_2 ... checkin_at_22_4 checkin_at_22_5 checkin_at_22_6 checkin_at_23_0 checkin_at_23_1 checkin_at_23_2 checkin_at_23_3 checkin_at_23_4 checkin_at_23_5 checkin_at_23_6
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 168 columns


In [57]:
# Final checkin table: business_id plus the 24x7 grid of checkin counts;
# the nested `checkin_info` and constant `type` columns are dropped.
df_checkins = df_checkin.merge(df_n_checkin_info, how = "left", left_index = True, right_index = True)
df_checkins = df_checkins.drop(['checkin_info', 'type'], axis = 1)

df_checkins.head(1)


Out[57]:
business_id checkin_at_00_0 checkin_at_00_1 checkin_at_00_2 checkin_at_00_3 checkin_at_00_4 checkin_at_00_5 checkin_at_00_6 checkin_at_01_0 checkin_at_01_1 ... checkin_at_22_4 checkin_at_22_5 checkin_at_22_6 checkin_at_23_0 checkin_at_23_1 checkin_at_23_2 checkin_at_23_3 checkin_at_23_4 checkin_at_23_5 checkin_at_23_6
0 cE27W9VPgO88Qxe4ol6y_g 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 169 columns

Tip dataframe (no normalization needed)


In [58]:
# Tips have no nested fields; only the constant `type` column is removed.
df_tips = df_tip.drop('type', axis = 1)

df_tips.head(1)


Out[58]:
business_id date likes text user_id
0 cE27W9VPgO88Qxe4ol6y_g 2013-04-18 0 Don't waste your time. -6rEfobYjMxpUWLNxszaxQ

Review dataframe normalization


In [59]:
# Flatten the per-review `votes` dicts into vote_<kind> counts.
# NOTE: this reuses the `df_n_votes` name from the user section; that
# frame was already merged into df_users above, so shadowing it is safe.
df_n_votes = pd.io.json.json_normalize(df_review['votes'].tolist())
df_n_votes = df_n_votes.fillna(0)
review_vote_cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", review_vote_cols)
df_n_votes.columns = review_vote_cols

df_n_votes.head(1)


Out[59]:
vote_cool vote_funny vote_useful
0 1 0 2

In [60]:
# Final review table: original columns (minus nested `votes` and the
# constant `type`) plus the flattened vote counts.
df_reviews = df_review.merge(df_n_votes, how = "left", left_index = True, right_index = True)
df_reviews = df_reviews.drop(['votes', 'type'], axis = 1)

df_reviews.head(1)


Out[60]:
business_id date review_id stars text user_id vote_cool vote_funny vote_useful
0 vcNAWiLM4dR7D2nwwJ7nCA 2007-05-17 15SdjuK7DmYqUAj6rjGowg 5 dr. goldberg offers everything i look for in a... Xqd0DzHaiyRqVH3WRG7hzg 1 0 2

In [61]:
#df_users.drop("friends", axis = 1).head(100).to_excel("users.xls", index = False, encoding = "utf-8")
# Bug fix: DataFrame.to_csv takes `quoting`, not `quote` — passing the
# unknown keyword `quote` raises TypeError and the file is never written.
df_users.to_csv("users.csv", sep = ",", index = False, encoding = "utf-8", quoting = csv.QUOTE_NONNUMERIC)

In [63]:
#df_businesses.head(100).to_excel("businesses.xls", index = False, encoding = "utf-8")
# Bug fix: the keyword is `quoting`, not `quote` (TypeError otherwise).
df_businesses.to_csv("businesses.csv", sep = ",", index = False, encoding = "utf-8", quoting = csv.QUOTE_NONNUMERIC)

In [70]:
#df_reviews.head(100).to_excel("reviews.xls", index = False, encoding = "utf-8")
# Bug fix: the keyword is `quoting`, not `quote` (TypeError otherwise).
df_reviews.to_csv("reviews.csv", sep = ",", index = False, encoding = "utf-8", quoting = csv.QUOTE_NONNUMERIC)

In [65]:
#df_tips.head(100).to_excel("tips.xls", index = False, encoding = "utf-8")
# Bug fix: the keyword is `quoting`, not `quote` (TypeError otherwise).
df_tips.to_csv("tips.csv", sep = ",", index = False, encoding = "utf-8", quoting = csv.QUOTE_NONNUMERIC)

In [68]:
#df_checkins.head(100).to_excel("checkins.xls", index = False, encoding = "utf-8")
# Bug fix: the keyword is `quoting`, not `quote` (TypeError otherwise).
df_checkins.to_csv("checkins.csv", sep = ",", index = False, encoding = "utf-8", quoting = csv.QUOTE_NONNUMERIC)

In [ ]: