In [1]:
import jq
import json
import pandas as pd

In [2]:
data = []
with open('../data/yelp_academic_dataset_business.json') as the_dat:
    for line in the_dat:
        data.append(json.loads(line))

In [3]:
data[0]


Out[3]:
{u'attributes': {u'By Appointment Only': True},
 u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
 u'categories': [u'Doctors', u'Health & Medical'],
 u'city': u'Phoenix',
 u'full_address': u'4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 u'hours': {u'Friday': {u'close': u'17:00', u'open': u'08:00'},
  u'Monday': {u'close': u'17:00', u'open': u'08:00'},
  u'Thursday': {u'close': u'17:00', u'open': u'08:00'},
  u'Tuesday': {u'close': u'17:00', u'open': u'08:00'},
  u'Wednesday': {u'close': u'17:00', u'open': u'08:00'}},
 u'latitude': 33.499313,
 u'longitude': -111.983758,
 u'name': u'Eric Goldberg, MD',
 u'neighborhoods': [],
 u'open': True,
 u'review_count': 9,
 u'stars': 3.5,
 u'state': u'AZ',
 u'type': u'business'}

In [5]:
test_data = data[0:10]

In [7]:
test_data[0]


Out[7]:
{u'attributes': {u'By Appointment Only': True},
 u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
 u'categories': [u'Doctors', u'Health & Medical'],
 u'city': u'Phoenix',
 u'full_address': u'4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 u'hours': {u'Friday': {u'close': u'17:00', u'open': u'08:00'},
  u'Monday': {u'close': u'17:00', u'open': u'08:00'},
  u'Thursday': {u'close': u'17:00', u'open': u'08:00'},
  u'Tuesday': {u'close': u'17:00', u'open': u'08:00'},
  u'Wednesday': {u'close': u'17:00', u'open': u'08:00'}},
 u'latitude': 33.499313,
 u'longitude': -111.983758,
 u'name': u'Eric Goldberg, MD',
 u'neighborhoods': [],
 u'open': True,
 u'review_count': 9,
 u'stars': 3.5,
 u'state': u'AZ',
 u'type': u'business'}

In [8]:
df = pd.io.json.json_normalize(data)

In [11]:
df.head(3)


Out[11]:
attributes.Accepts Credit Cards attributes.Accepts Insurance attributes.Ages Allowed attributes.Alcohol attributes.Ambience.casual attributes.Ambience.classy attributes.Ambience.divey attributes.Ambience.hipster attributes.Ambience.intimate attributes.Ambience.romantic ... hours.Wednesday.open latitude longitude name neighborhoods open review_count stars state type
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 08:00 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business
1 True NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 40.350519 -79.886930 Clancy's Pub [] True 4 3.5 PA business
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 40.356896 -80.015910 Cool Springs Golf Center [] False 5 2.5 PA business

3 rows × 105 columns


In [12]:
df.columns


Out[12]:
Index([u'attributes.Accepts Credit Cards', u'attributes.Accepts Insurance', u'attributes.Ages Allowed', u'attributes.Alcohol', u'attributes.Ambience.casual', u'attributes.Ambience.classy', u'attributes.Ambience.divey', u'attributes.Ambience.hipster', u'attributes.Ambience.intimate', u'attributes.Ambience.romantic', u'attributes.Ambience.touristy', u'attributes.Ambience.trendy', u'attributes.Ambience.upscale', u'attributes.Attire', u'attributes.BYOB', u'attributes.BYOB/Corkage', u'attributes.By Appointment Only', u'attributes.Caters', u'attributes.Coat Check', u'attributes.Corkage', u'attributes.Delivery', u'attributes.Dietary Restrictions.dairy-free', u'attributes.Dietary Restrictions.gluten-free', u'attributes.Dietary Restrictions.halal', u'attributes.Dietary Restrictions.kosher', u'attributes.Dietary Restrictions.soy-free', u'attributes.Dietary Restrictions.vegan', u'attributes.Dietary Restrictions.vegetarian', u'attributes.Dogs Allowed', u'attributes.Drive-Thru', u'attributes.Good For Dancing', u'attributes.Good For Groups', u'attributes.Good For Kids', u'attributes.Good For.breakfast', u'attributes.Good For.brunch', u'attributes.Good For.dessert', u'attributes.Good For.dinner', u'attributes.Good For.latenight', u'attributes.Good For.lunch', u'attributes.Good for Kids', u'attributes.Hair Types Specialized In.africanamerican', u'attributes.Hair Types Specialized In.asian', u'attributes.Hair Types Specialized In.coloring', u'attributes.Hair Types Specialized In.curly', u'attributes.Hair Types Specialized In.extensions', u'attributes.Hair Types Specialized In.kids', u'attributes.Hair Types Specialized In.perms', u'attributes.Hair Types Specialized In.straightperms', u'attributes.Happy Hour', u'attributes.Has TV', u'attributes.Music.background_music', u'attributes.Music.dj', u'attributes.Music.jukebox', u'attributes.Music.karaoke', u'attributes.Music.live', u'attributes.Music.playlist', u'attributes.Music.video', u'attributes.Noise Level', u'attributes.Open 24 Hours', u'attributes.Order at Counter', u'attributes.Outdoor Seating', u'attributes.Parking.garage', u'attributes.Parking.lot', u'attributes.Parking.street', u'attributes.Parking.valet', u'attributes.Parking.validated', u'attributes.Payment Types.amex', u'attributes.Payment Types.cash_only', u'attributes.Payment Types.discover', u'attributes.Payment Types.mastercard', u'attributes.Payment Types.visa', u'attributes.Price Range', u'attributes.Smoking', u'attributes.Take-out', u'attributes.Takes Reservations', u'attributes.Waiter Service', u'attributes.Wheelchair Accessible', u'attributes.Wi-Fi', u'business_id', u'categories', u'city', u'full_address', u'hours.Friday.close', u'hours.Friday.open', u'hours.Monday.close', u'hours.Monday.open', u'hours.Saturday.close', u'hours.Saturday.open', u'hours.Sunday.close', u'hours.Sunday.open', u'hours.Thursday.close', u'hours.Thursday.open', u'hours.Tuesday.close', u'hours.Tuesday.open', u'hours.Wednesday.close', u'hours.Wednesday.open', u'latitude', u'longitude', u'name', u'neighborhoods', ...], dtype='object')

In [19]:
df.head(3)


Out[19]:
attributes.Accepts Credit Cards attributes.Accepts Insurance attributes.Ages Allowed attributes.Alcohol attributes.Ambience.casual attributes.Ambience.classy attributes.Ambience.divey attributes.Ambience.hipster attributes.Ambience.intimate attributes.Ambience.romantic ... hours.Wednesday.open latitude longitude name neighborhoods open review_count stars state type
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 08:00 33.499313 -111.983758 Eric Goldberg, MD [] True 9 3.5 AZ business
1 True NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 40.350519 -79.886930 Clancy's Pub [] True 4 3.5 PA business
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 40.356896 -80.015910 Cool Springs Golf Center [] False 5 2.5 PA business

3 rows × 105 columns


In [22]:
df['categories'].head(3)


Out[22]:
0       [Doctors, Health & Medical]
1                       [Nightlife]
2    [Active Life, Mini Golf, Golf]
Name: categories, dtype: object

In [70]:
def get_hotels(cats):
    #return 'Hotel' in cats
    if True in [True for s in cats if 'Hotel' in s]:
        return True
    else:
        return False

In [71]:
cats = ['Hotels', 'Travel & Shit', 'Fucker', 'Hotel & Travel']

In [72]:
get_hotels(cats)


Out[72]:
True

In [73]:
cats2 = ['Travel', 'Fucker']
get_hotels(cats2)


Out[73]:
False

In [74]:
hoteldf = df[df['categories'].apply(get_hotels)]

In [75]:
hoteldf['categories']


Out[75]:
14       [Hotels & Travel, Event Planning & Services, H...
43       [Hotels & Travel, Airport Shuttles, Limos, Tra...
80       [Hotels & Travel, Event Planning & Services, H...
121                     [Hotels & Travel, Travel Services]
169      [Hotels & Travel, Bed & Breakfast, Event Plann...
225      [Hotels & Travel, Event Planning & Services, H...
286                               [Hotels & Travel, Tours]
292      [Hotels & Travel, Event Planning & Services, H...
295      [Hotels & Travel, Event Planning & Services, V...
296      [Hotels & Travel, Event Planning & Services, H...
310      [Hotels & Travel, Public Transportation, Trans...
314                          [Hotels & Travel, Car Rental]
340      [Hotels & Travel, Airport Shuttles, Limos, Tra...
381      [Hotels & Travel, Event Planning & Services, H...
396                      [Train Stations, Hotels & Travel]
411      [Hotels & Travel, Event Planning & Services, H...
414      [Hotels & Travel, Event Planning & Services, H...
457      [Hotels & Travel, Event Planning & Services, H...
524      [Hotels & Travel, Event Planning & Services, V...
535      [Food, Tea Rooms, Breakfast & Brunch, Bed & Br...
562      [Food, Tea Rooms, Bed & Breakfast, Event Plann...
615      [Hotels & Travel, Event Planning & Services, H...
652      [Hotels & Travel, Event Planning & Services, H...
654      [Hotels & Travel, Event Planning & Services, V...
660      [Hotels & Travel, Event Planning & Services, H...
667      [Hotels & Travel, Event Planning & Services, H...
711      [Hotels & Travel, Event Planning & Services, H...
716      [Hotels & Travel, Event Planning & Services, H...
728      [Hotels & Travel, Event Planning & Services, H...
729      [Hotels & Travel, Event Planning & Services, H...
                               ...                        
59699    [Hotels & Travel, Event Planning & Services, H...
59732    [Hotels & Travel, Taxis, Automotive, Transport...
59733             [Hotels & Travel, Limos, Transportation]
59763                   [Hotels & Travel, Travel Services]
59800    [Tours, Party Bus Rentals, Event Planning & Se...
59805    [Hotels & Travel, Transportation, Airport Shut...
59808                             [Hotels & Travel, Tours]
59817                        [Hotels & Travel, Car Rental]
59818                        [Hotels & Travel, Car Rental]
59886    [Hotels & Travel, Event Planning & Services, H...
59894    [Hotels & Travel, Event Planning & Services, H...
59991                   [Hotels & Travel, Bed & Breakfast]
60001          [Hotels & Travel, Airlines, Transportation]
60013    [Hotels & Travel, Event Planning & Services, H...
60128                             [Hotels & Travel, Tours]
60216    [Hotels & Travel, Taxis, Limos, Transportation...
60227                        [Hotels & Travel, Car Rental]
60241    [Hotels & Travel, Event Planning & Services, H...
60243                             [Hotels & Travel, Tours]
60494                   [Hotels & Travel, Travel Services]
60501    [Hotels & Travel, Event Planning & Services, H...
60592    [Hotels & Travel, Arts & Entertainment, Casino...
60649    [Hotels & Travel, Event Planning & Services, H...
60714                        [Hotels & Travel, Car Rental]
60813    [Hotels & Travel, Event Planning & Services, H...
60846    [Arts & Entertainment, Resorts, Casinos, Event...
60920    [Hotels & Travel, Event Planning & Services, H...
60955    [Hotels & Travel, Event Planning & Services, V...
61086             [Hotels & Travel, Taxis, Transportation]
61137    [Hotels & Travel, Event Planning & Services, H...
Name: categories, dtype: object

In [76]:
hoteldf.reset_index(inplace=True)

In [86]:
for col in hoteldf.columns:
    print(col)


index
attributes.Accepts Credit Cards
attributes.Accepts Insurance
attributes.Ages Allowed
attributes.Alcohol
attributes.Ambience.casual
attributes.Ambience.classy
attributes.Ambience.divey
attributes.Ambience.hipster
attributes.Ambience.intimate
attributes.Ambience.romantic
attributes.Ambience.touristy
attributes.Ambience.trendy
attributes.Ambience.upscale
attributes.Attire
attributes.BYOB
attributes.BYOB/Corkage
attributes.By Appointment Only
attributes.Caters
attributes.Coat Check
attributes.Corkage
attributes.Delivery
attributes.Dietary Restrictions.dairy-free
attributes.Dietary Restrictions.gluten-free
attributes.Dietary Restrictions.halal
attributes.Dietary Restrictions.kosher
attributes.Dietary Restrictions.soy-free
attributes.Dietary Restrictions.vegan
attributes.Dietary Restrictions.vegetarian
attributes.Dogs Allowed
attributes.Drive-Thru
attributes.Good For Dancing
attributes.Good For Groups
attributes.Good For Kids
attributes.Good For.breakfast
attributes.Good For.brunch
attributes.Good For.dessert
attributes.Good For.dinner
attributes.Good For.latenight
attributes.Good For.lunch
attributes.Good for Kids
attributes.Hair Types Specialized In.africanamerican
attributes.Hair Types Specialized In.asian
attributes.Hair Types Specialized In.coloring
attributes.Hair Types Specialized In.curly
attributes.Hair Types Specialized In.extensions
attributes.Hair Types Specialized In.kids
attributes.Hair Types Specialized In.perms
attributes.Hair Types Specialized In.straightperms
attributes.Happy Hour
attributes.Has TV
attributes.Music.background_music
attributes.Music.dj
attributes.Music.jukebox
attributes.Music.karaoke
attributes.Music.live
attributes.Music.playlist
attributes.Music.video
attributes.Noise Level
attributes.Open 24 Hours
attributes.Order at Counter
attributes.Outdoor Seating
attributes.Parking.garage
attributes.Parking.lot
attributes.Parking.street
attributes.Parking.valet
attributes.Parking.validated
attributes.Payment Types.amex
attributes.Payment Types.cash_only
attributes.Payment Types.discover
attributes.Payment Types.mastercard
attributes.Payment Types.visa
attributes.Price Range
attributes.Smoking
attributes.Take-out
attributes.Takes Reservations
attributes.Waiter Service
attributes.Wheelchair Accessible
attributes.Wi-Fi
business_id
categories
city
full_address
hours.Friday.close
hours.Friday.open
hours.Monday.close
hours.Monday.open
hours.Saturday.close
hours.Saturday.open
hours.Sunday.close
hours.Sunday.open
hours.Thursday.close
hours.Thursday.open
hours.Tuesday.close
hours.Tuesday.open
hours.Wednesday.close
hours.Wednesday.open
latitude
longitude
name
neighborhoods
open
review_count
stars
state
type

In [78]:
len(hoteldf)


Out[78]:
2131

In [84]:
doghotelsbig = hoteldf[hoteldf['attributes.Dogs Allowed'] == True]

In [87]:
good_cats = [
    'name',
    'stars',
    'city',
    'state',
    'latitude',
    'longitude',
    'review_count',
    'full_address',
    'business_id',
    'attributes.Dogs Allowed']

In [115]:
doghotels = doghotelsbig[good_cats].copy()

In [116]:
doghotels['hotel_id'] = 0

In [117]:
doghotels.columns


Out[117]:
Index([u'name', u'stars', u'city', u'state', u'latitude', u'longitude', u'review_count', u'full_address', u'business_id', u'attributes.Dogs Allowed', u'hotel_id'], dtype='object')

In [118]:
len(doghotels)


Out[118]:
128

In [119]:
doghotels.columns = ['hotel_name',
                     'hotel_rating',
                     'hotel_city',
                     'hotel_state',
                     'hotel_latitude',
                     'hotel_longitude',
                     'review_count',
                     'hotel_address',
                     'business_id',
                     'dogs_allowed',
                     'hotel_id']

Now write to SQL


In [100]:
import connect_aws_db as cadb

In [103]:
import sqlalchemy

In [101]:
engine = cadb.connect_aws_db()

In [104]:
conn = engine.connect()

In [125]:
cmd = "DROP TABLE yelp_hotels"

In [126]:
result = conn.execute(cmd)

In [127]:
cmd = """
        CREATE TABLE yelp_hotels
        (
        hotel_id MEDIUMINT AUTO_INCREMENT,
        hotel_name VARCHAR(256),
        hotel_rating FLOAT,
        hotel_city VARCHAR(256),
        hotel_state VARCHAR(64),
        hotel_latitude FLOAT,
        hotel_longitude FLOAT,
        review_count INT,
        hotel_address VARCHAR(512),
        business_id VARCHAR(256),
        dogs_allowed BINARY,
        PRIMARY KEY (hotel_id)
        )
        """

In [128]:
result = conn.execute(cmd)

In [129]:
doghotels.to_sql('yelp_hotels', engine, if_exists='append', index=False)

In [ ]: