In [1]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import array_contains

In [2]:
business_df = spark.read.json(
    '../data/yelp_dataset_challenge_round9/yelp_academic_dataset_business.json'
)

In [3]:
business_df.printSchema()


root
 |-- address: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- business_id: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- city: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- type: string (nullable = true)


In [4]:
restaurant_df = (
    business_df
    .filter(array_contains('categories', 'Restaurants'))
    .persist()
)

In [ ]:
# Number of restaurants
# restaurant_df.count()
# 48485

In [ ]:
# (
#     restaurant_df
#     .groupby('state', 'city')
#     .count()
#     .orderBy('state', 'city')
#     .collect()
# )

In [5]:
#############
# Fix typos #
#############
fix_typos_city = {
    'Central City Village' : 'Central City',
    'Gelndale' : 'Glendale',
    'Glenndale' : 'Glendale',
    'Laveen Village' : 'Laveen',
    'MESA' : 'Mesa',
    'Mesa AZ' : 'Mesa',
    'Pheonix' : 'Phoenix',
    'Pheonix AZ' : 'Phoenix',
    'Scottdale' : 'Scottsdale',
    'Stuttgart-Vaihingen' : 'Stuttgart',
    'City of Edinburgh' : 'Edinburgh',
    'Edimbourg' : 'Edinburgh',
    'Concord Mills' : 'Concord',
    'Harrisbug' : 'Harrisburg',
    'Mattews' : 'Matthews',
    'Mint  Hill' : 'Mint Hill',
    'Las  Vegas' : 'Las Vegas',
    'LasVegas' : 'Las Vegas',
    'N Las Vegas' : 'North Las Vegas',
    'N. Las Vegas' : 'North Las Vegas',
    'Nellis AFB' : 'Nellis Air Force Base',
    'Nellis Afb' : 'Nellis Air Force Base',
    'las vegas' : 'Las Vegas',
    'Bainbridge Township' : 'Chagrin Falls',
    'Bedford Hts.' : 'Bedford Heights',
    'Brookpark' : 'Brook Park',
    'Concord Twp' : 'Mentor',
    'Cuyahoga Fls' : 'Cuyahoga Falls',
    'Medina Township' : 'Medina',
    'Mentor On the' : 'Mentor-on-the-Lake',
    'Mentor On the Lake' : 'Mentor-on-the-Lake',
    'N. Olmsted' : 'North Olmsted',
    'North Olmstead' : 'North Olmsted',
    'WICKLIFFE' : 'Wickliffe',
    'Warrensvile Heights' : 'Warrensville Heights',
    'Warrensville Hts.' : 'Warrensville Heights',
    'columbia station' : 'Columbia Station',
    'AGINCOURT' : 'Agincourt',
    'E Gwillimbury' : 'East Gwillimbury',
    'Missisauga' : 'Mississauga',
    'Mississuaga' : 'Mississauga',
    'NORTH YORK' : 'North York',
    'Richmond Hil' : 'Richmond Hill',
    'Scaroborough' : 'Scarborough',
    'Scarobrough' : 'Scarborough',
    'TORONTO' : 'Toronto',
    'Thornhil' : 'Thornhill',
    'Vaughn' : 'Vaughan',
    'Bellvue' : 'Bellevue',
    'East Mc Keesport' : 'East McKeesport',
    'Elizabeth Township' : 'Elizabeth',
    'Mc Donald' : 'McDonald',
    'Mc Murray' : 'McMurray',
    'Moon Township' : 'Moon',
    'Moon Twp' : 'Moon',
    'Moon Twp.' : 'Moon',
    'Mt. Lebanon' : 'Mount Lebanon',
    'Robinson Township' : 'Robinson',
    'Robinson Twp.' : 'Robinson',
    'South Park Township' : 'South Park',
    'Stowe Township' : 'Stowe',
    'Upper St Clair' : 'Upper Saint Clair',
    'Upper St. Clair' : 'Upper Saint Clair',
    'Chatauguay' : 'Châteauguay',
    'Chateauguay' : 'Châteauguay',
    'Cote Saint-Luc' : 'Côte-Saint-Luc',
    'Cote-Saint-Luc' : 'Côte-Saint-Luc',
    'Dollard-Des Ormeaux' : 'Dollard-Des-Ormeaux',
    'Dollard-des-Ormeaux' : 'Dollard-Des-Ormeaux',
    "L'assomption" : "L'Assomption",
    "L'ile-Perrot" : "L'Île-Perrot",
    "L'Île-Perrôt" : "L'Île-Perrot",
    'La Salle' : 'LaSalle',
    'Montreal' : 'Montréal',
    'Montreal-Nord' : 'Montréal-Nord',
    'Montreal-Ouest' : 'Montréal-Ouest',
    'Montreal-West' : 'Montréal-Ouest',
    'Montéal' : 'Montréal',
    'Saint Laurent' : 'Saint-Laurent',
    'Saint Leonard' : 'Saint-Léonard',
    'Saint-Bruno' : 'Saint-Bruno-de-Montarville',
    'Saint-Jean-Sur-Richelieu' : 'Saint-Jean-sur-Richelieu',  
    'Saint-Jerome' : 'Saint-Jérôme',
    'Saint-Leonard' : 'Saint-Léonard',
    'Saint-Marc-Sur-Richelieu' : 'Saint-Marc-sur-Richelieu',
    'Saint-Sauveur-des-Monts' : 'Saint-Sauveur',
    'Sainte-Adele' : 'Sainte-Adèle',
    'Sainte-Anne-De-Bellevue' : 'Sainte-Anne-de-Bellevue',
    'Sainte-Therese' : 'Sainte-Thérèse',
    'Sainte-Therese-de-Blainville' : 'Sainte-Thérèse',
    'Sainte-thérèse' : 'Sainte-Thérèse',
    'Salaberry-De-Valleyfield' : 'Salaberry-de-Valleyfield',
    'St Leonard' : 'Saint-Léonard',
    'St-Benoît de Mirabel' : 'Mirabel',
    'St-Jerome' : 'Saint-Jérôme',
    'St-Laurent' : 'Saint-Laurent',
    'St-Leonard' : 'Saint-Léonard',
    'Ste-Therese-de-Blainville' : 'Sainte-Thérèse',
    'Fort  Mill' : 'Fort Mill',
    'Ft. Mill' : 'Fort Mill',
    'De Forest' : 'DeForest',
    'Mc Farland' : 'McFarland',
}
restaurant_df2 = restaurant_df.na.replace(fix_typos_city, 'city')

fix_typos_state = {
    'KHL' : 'MLN',
    'PKN' : 'EDH'
}
restaurant_df3 = restaurant_df2.na.replace(fix_typos_state, 'state')

In [ ]:
# (
#     restaurant_df3
#     .groupby('state', 'city')
#     .count()
#     .orderBy('state', 'city')
#     .collect()
# )

In [6]:
restaurant_df3.write.parquet(
    path='../data/restaurants',
    mode='overwrite',
    compression='gzip'
)

In [ ]: