Parse & Normalize



In [1]:

    
import pandas as pd
import numpy as np
import json
import csv
import re



In [ ]:

    
def parse_json(file_name):
    with open(file_name, 'rb') as f:
        data = f.readlines()
    data = map(lambda string: string.rstrip(), data)
    DATA_SLICE = 750000
    i, end = 1, len(data) / DATA_SLICE
    json_string = "[" + ','.join(data[0:DATA_SLICE]) + "]"
    df = pd.read_json(json_string)
    while i <= end:
        json_string = "[" + ','.join(data[i * DATA_SLICE:(i + 1) * DATA_SLICE]) + "]"
        i += 1
        df = df.append(pd.read_json(json_string), ignore_index = True)
    return df

BASE_PATH = "/home/alex/TechSphere/yelp_data/yelp_dataset/"

df_business = parse_json(BASE_PATH + "yelp_academic_dataset_business.json")
df_checkin  = parse_json(BASE_PATH + "yelp_academic_dataset_checkin.json")
df_review   = parse_json(BASE_PATH + "yelp_academic_dataset_review.json")
df_user     = parse_json(BASE_PATH + "yelp_academic_dataset_user.json")
df_tip      = parse_json(BASE_PATH + "yelp_academic_dataset_tip.json")



In [35]:

    
print len(df_business)
print len(df_checkin)
print len(df_review)
print len(df_user)
print len(df_tip)



In [36]:

    
df_business.head(1)









    Out[36]:






  
    
      
      attributes
      business_id
      categories
      city
      full_address
      hours
      latitude
      longitude
      name
      neighborhoods
      open
      review_count
      stars
      state
      type
    
  
  
    
      0
      {u'By Appointment Only': True}
      vcNAWiLM4dR7D2nwwJ7nCA
      [Doctors, Health & Medical]
      Phoenix
      4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...
      {u'Thursday': {u'close': u'17:00', u'open': u'...
      33.499313
      -111.983758
      Eric Goldberg, MD
      []
      True
      9
      3.5
      AZ
      business



In [37]:

    
df_checkin.head(1)









    Out[37]:






  
    
      
      business_id
      checkin_info
      type
    
  
  
    
      0
      cE27W9VPgO88Qxe4ol6y_g
      {u'9-5': 1, u'7-5': 1, u'13-3': 1, u'17-6': 1,...
      checkin



In [39]:

    
df_review.head(1)









    Out[39]:






  
    
      
      business_id
      date
      review_id
      stars
      text
      type
      user_id
      votes
    
  
  
    
      0
      vcNAWiLM4dR7D2nwwJ7nCA
      2007-05-17
      15SdjuK7DmYqUAj6rjGowg
      5
      dr. goldberg offers everything i look for in a...
      review
      Xqd0DzHaiyRqVH3WRG7hzg
      {u'funny': 0, u'useful': 2, u'cool': 1}



In [40]:

    
df_user.head(1)









    Out[40]:






  
    
      
      average_stars
      compliments
      elite
      fans
      friends
      name
      review_count
      type
      user_id
      votes
      yelping_since
    
  
  
    
      0
      4.14
      {u'profile': 8, u'cute': 15, u'funny': 11, u'p...
      [2005, 2006]
      69
      [rpOyqD_893cqmDAtJLbdog, 4U9kSBLuBDU391x6bxU-Y...
      Russel
      108
      user
      18kPq7GPye-YQ3LyKyAZPw
      {u'funny': 166, u'useful': 278, u'cool': 245}
      2004-10



In [41]:

    
df_tip.head(1)









    Out[41]:






  
    
      
      business_id
      date
      likes
      text
      type
      user_id
    
  
  
    
      0
      cE27W9VPgO88Qxe4ol6y_g
      2013-04-18
      0
      Don't waste your time.
      tip
      -6rEfobYjMxpUWLNxszaxQ



In [46]:

    
def set_prefix_and_underscores(prefix, columns):
    underscorizer = lambda string: re.sub(r"[\s.-]", "_", string.lower())
    columns[:] = map(underscorizer, columns)
    prefix_setter = lambda string: prefix + u"_" + string
    columns[:] = map(prefix_setter, columns)

def get_lists_as_dicts(series):
    caster = lambda lst: {str(value): 1 for value in lst}
    return series.apply(caster)

def get_values_counted(series):
    counter = lambda lst: len(lst)
    return series.apply(counter)

def get_lists_as_strings(series):
    caster = lambda lst: ",".join(lst)
    return series.apply(caster)

business dataframe normalization



In [47]:

    
# SKIP THIS STEP 
# TODO: try to normalize attributes...
#df_n_attributes = pd.io.json.json_normalize(df_business['attributes'].tolist())
#print set(type(a) for a in df_n_attributes['Accepts Credit Cards'].tolist()) 
#df_n_attributes.head()



In [48]:

    
categories = get_lists_as_dicts(df_business['categories'])
df_n_categories = pd.io.json.json_normalize(categories.tolist())
df_n_categories.fillna(0, inplace = True)
cols = df_n_categories.columns.tolist()
set_prefix_and_underscores(u"category", cols)
df_n_categories.columns = cols

df_n_categories.head(1)









    Out[48]:






  
    
      
      category_accessories
      category_accountants
      category_active_life
      category_acupuncture
      category_adult
      category_adult_education
      category_adult_entertainment
      category_advertising
      category_afghan
      category_african
      ...
      category_windows_installation
      category_windshield_installation_&_repair
      category_wine_bars
      category_wine_tours
      category_wineries
      category_wok
      category_women's_clothing
      category_yelp_events
      category_yoga
      category_zoos
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

1 rows × 783 columns



In [49]:

    
neighborhoods = get_lists_as_dicts(df_business['neighborhoods'])
df_n_neighborhoods = pd.io.json.json_normalize(neighborhoods.tolist())
df_n_neighborhoods.fillna(0, inplace = True)
cols = df_n_neighborhoods.columns.tolist()
set_prefix_and_underscores(u"neighborhood", cols)
df_n_neighborhoods.columns = cols

df_n_neighborhoods.head(1)









    Out[49]:






  
    
      
      neighborhood_allentown
      neighborhood_allied_dunn's_marsh
      neighborhood_anthem
      neighborhood_arboretum
      neighborhood_aspinwall
      neighborhood_avalon
      neighborhood_ballantyne
      neighborhood_banksville
      neighborhood_bay_creek
      neighborhood_beechview
      ...
      neighborhood_west_view
      neighborhood_westchester_gardens
      neighborhood_westhaven_trails
      neighborhood_westmorland
      neighborhood_westside
      neighborhood_wexford
      neighborhood_williamson___marquette
      neighborhood_willy_st
      neighborhood_windgap
      neighborhood_worthington_park
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

1 rows × 175 columns



In [50]:

    
hours = df_business['hours']
df_n_hours = pd.io.json.json_normalize(hours.tolist())
df_n_hours.fillna("", inplace = True)
cols = df_n_hours.columns.tolist()
set_prefix_and_underscores(u"hour", cols)
df_n_hours.columns = cols

df_n_hours.head(1)









    Out[50]:






  
    
      
      hour_friday_close
      hour_friday_open
      hour_monday_close
      hour_monday_open
      hour_saturday_close
      hour_saturday_open
      hour_sunday_close
      hour_sunday_open
      hour_thursday_close
      hour_thursday_open
      hour_tuesday_close
      hour_tuesday_open
      hour_wednesday_close
      hour_wednesday_open
    
  
  
    
      0
      17:00
      08:00
      17:00
      08:00
      
      
      
      
      17:00
      08:00
      17:00
      08:00
      17:00
      08:00



In [51]:

    
df_businesses = df_business.merge(df_n_categories, how = "left", left_index = True, right_index = True)
df_businesses = df_businesses.merge(df_n_neighborhoods, how = "left", left_index = True, right_index = True)
df_businesses = df_businesses.merge(df_n_hours, how = "left", left_index = True, right_index = True)

df_businesses['neighborhoods_count'] = get_values_counted(df_business['neighborhoods'])

df_businesses.drop(['attributes', 'categories', 'neighborhoods', 'hours', 'type'], axis = 1, inplace = True)

df_businesses.head(1)









    Out[51]:






  
    
      
      business_id
      city
      full_address
      latitude
      longitude
      name
      open
      review_count
      stars
      state
      ...
      hour_saturday_open
      hour_sunday_close
      hour_sunday_open
      hour_thursday_close
      hour_thursday_open
      hour_tuesday_close
      hour_tuesday_open
      hour_wednesday_close
      hour_wednesday_open
      neighborhoods_count
    
  
  
    
      0
      vcNAWiLM4dR7D2nwwJ7nCA
      Phoenix
      4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...
      33.499313
      -111.983758
      Eric Goldberg, MD
      True
      9
      3.5
      AZ
      ...
      
      
      
      17:00
      08:00
      17:00
      08:00
      17:00
      08:00
      0
    
  

1 rows × 983 columns

user dataframe normalization



In [52]:

    
compliments = df_user['compliments']
df_n_compliments = pd.io.json.json_normalize(compliments.tolist())
df_n_compliments.fillna(0, inplace = True)
cols = df_n_compliments.columns.tolist()
set_prefix_and_underscores(u"compliment", cols)
df_n_compliments.columns = cols

df_n_compliments.head(1)









    Out[52]:






  
    
      
      compliment_cool
      compliment_cute
      compliment_funny
      compliment_hot
      compliment_list
      compliment_more
      compliment_note
      compliment_photos
      compliment_plain
      compliment_profile
      compliment_writer
    
  
  
    
      0
      78
      15
      11
      48
      0
      3
      20
      15
      25
      8
      9



In [53]:

    
votes = df_user['votes']
df_n_votes = pd.io.json.json_normalize(votes.tolist())
df_n_votes.fillna(0, inplace = True)
cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", cols)
df_n_votes.columns = cols

df_n_votes.head(1)









    Out[53]:






  
    
      
      vote_cool
      vote_funny
      vote_useful
    
  
  
    
      0
      245
      166
      278



In [54]:

    
elite = get_lists_as_dicts(df_user['elite'])
df_n_elite = pd.io.json.json_normalize(elite.tolist())
df_n_elite.fillna(0, inplace = True)
cols = df_n_elite.columns.tolist()
set_prefix_and_underscores(u"elite_at", cols)
df_n_elite.columns = cols

df_n_elite.head(1)









    Out[54]:






  
    
      
      elite_at_2005
      elite_at_2006
      elite_at_2007
      elite_at_2008
      elite_at_2009
      elite_at_2010
      elite_at_2011
      elite_at_2012
      elite_at_2013
      elite_at_2014
      elite_at_2015
    
  
  
    
      0
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0



In [55]:

    
df_users = df_user.merge(df_n_compliments, how = "left", left_index = True, right_index = True)
df_users = df_users.merge(df_n_votes, how = "left", left_index = True, right_index = True)
df_users = df_users.merge(df_n_elite, how = "left", left_index = True, right_index = True)

df_users['friends_count'] = get_values_counted(df_user['friends'])
df_users['friends'] = get_lists_as_strings(df_user['friends'])

df_users.drop(['compliments', 'votes', 'elite', 'type'], axis = 1, inplace = True)

cols = df_users.columns.tolist()
cols.remove(u"user_id")
cols.insert(0, u"user_id")
cols.remove(u"name")
cols.insert(1, u"name")
cols.remove(u"friends_count")
cols.insert(2, u"friends_count")
cols.remove(u"fans")
cols.insert(3, u"fans")
cols.remove(u"review_count")
cols.insert(4, u"review_count")
cols.remove(u"friends")
cols.append(u"friends")
df_users = df_users[cols]

df_users.head(1)









    Out[55]:






  
    
      
      user_id
      name
      friends_count
      fans
      review_count
      average_stars
      yelping_since
      compliment_cool
      compliment_cute
      compliment_funny
      ...
      elite_at_2007
      elite_at_2008
      elite_at_2009
      elite_at_2010
      elite_at_2011
      elite_at_2012
      elite_at_2013
      elite_at_2014
      elite_at_2015
      friends
    
  
  
    
      0
      18kPq7GPye-YQ3LyKyAZPw
      Russel
      206
      69
      108
      4.14
      2004-10
      78
      15
      11
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      rpOyqD_893cqmDAtJLbdog,4U9kSBLuBDU391x6bxU-YA,...
    
  

1 rows × 33 columns

checkin dataframe normalization



In [56]:

    
checkin_info = df_checkin['checkin_info']
df_n_checkin_info = pd.io.json.json_normalize(checkin_info.tolist())
df_n_checkin_info.fillna(0, inplace = True)
cols = df_n_checkin_info.columns.tolist()
format_unifier = lambda string: u"0" + string if re.search(r"^[0-9]-[0-6]$", string) else string
cols = map(format_unifier, cols)
set_prefix_and_underscores(u"checkin_at", cols)
df_n_checkin_info.columns = cols
cols = sorted(cols)
df_n_checkin_info = df_n_checkin_info[cols]

df_n_checkin_info.head(1)









    Out[56]:






  
    
      
      checkin_at_00_0
      checkin_at_00_1
      checkin_at_00_2
      checkin_at_00_3
      checkin_at_00_4
      checkin_at_00_5
      checkin_at_00_6
      checkin_at_01_0
      checkin_at_01_1
      checkin_at_01_2
      ...
      checkin_at_22_4
      checkin_at_22_5
      checkin_at_22_6
      checkin_at_23_0
      checkin_at_23_1
      checkin_at_23_2
      checkin_at_23_3
      checkin_at_23_4
      checkin_at_23_5
      checkin_at_23_6
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

1 rows × 168 columns



In [57]:

    
df_checkins = df_checkin.merge(df_n_checkin_info, how = "left", left_index = True, right_index = True)

df_checkins.drop(['checkin_info', 'type'], axis = 1, inplace = True)

df_checkins.head(1)









    Out[57]:






  
    
      
      business_id
      checkin_at_00_0
      checkin_at_00_1
      checkin_at_00_2
      checkin_at_00_3
      checkin_at_00_4
      checkin_at_00_5
      checkin_at_00_6
      checkin_at_01_0
      checkin_at_01_1
      ...
      checkin_at_22_4
      checkin_at_22_5
      checkin_at_22_6
      checkin_at_23_0
      checkin_at_23_1
      checkin_at_23_2
      checkin_at_23_3
      checkin_at_23_4
      checkin_at_23_5
      checkin_at_23_6
    
  
  
    
      0
      cE27W9VPgO88Qxe4ol6y_g
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

1 rows × 169 columns

tip dataframe



In [58]:

    
df_tips = df_tip.drop(['type'], axis = 1)

df_tips.head(1)









    Out[58]:






  
    
      
      business_id
      date
      likes
      text
      user_id
    
  
  
    
      0
      cE27W9VPgO88Qxe4ol6y_g
      2013-04-18
      0
      Don't waste your time.
      -6rEfobYjMxpUWLNxszaxQ

review dataframe normalization



In [59]:

    
votes = df_review['votes']
df_n_votes = pd.io.json.json_normalize(votes.tolist())
df_n_votes.fillna(0, inplace = True)
cols = df_n_votes.columns.tolist()
set_prefix_and_underscores(u"vote", cols)
df_n_votes.columns = cols

df_n_votes.head(1)









    Out[59]:






  
    
      
      vote_cool
      vote_funny
      vote_useful
    
  
  
    
      0
      1
      0
      2



In [60]:

    
df_reviews = df_review.merge(df_n_votes, how = "left", left_index = True, right_index = True)

df_reviews.drop(['votes', 'type'], axis = 1, inplace = True)

df_reviews.head(1)









    Out[60]:






  
    
      
      business_id
      date
      review_id
      stars
      text
      user_id
      vote_cool
      vote_funny
      vote_useful
    
  
  
    
      0
      vcNAWiLM4dR7D2nwwJ7nCA
      2007-05-17
      15SdjuK7DmYqUAj6rjGowg
      5
      dr. goldberg offers everything i look for in a...
      Xqd0DzHaiyRqVH3WRG7hzg
      1
      0
      2



In [61]:

    
#df_users.drop("friends", axis = 1).head(100).to_excel("users.xls", index = False, encoding = "utf-8")
df_users.to_csv("users.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)



In [63]:

    
#df_businesses.head(100).to_excel("businesses.xls", index = False, encoding = "utf-8")
df_businesses.to_csv("businesses.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)



In [70]:

    
#df_reviews.head(100).to_excel("reviews.xls", index = False, encoding = "utf-8")
df_reviews.to_csv("reviews.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)



In [65]:

    
#df_tips.head(100).to_excel("tips.xls", index = False, encoding = "utf-8")
df_tips.to_csv("tips.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)



In [68]:

    
#df_checkins.head(100).to_excel("checkins.xls", index = False, encoding = "utf-8")
df_checkins.to_csv("checkins.csv", sep = ",", index = False, encoding = "utf-8", quote = csv.QUOTE_NONNUMERIC)



In [ ]: