In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

Python package imports


In [2]:
import numpy as np
import os
import glob
import pandas as pd
import yelp_utils
# NOTE: the star import is load-bearing -- the %run line in the conversion cell
# expands $YELP_DATA_CSV_DIR, which comes into the notebook namespace from here.
from yelp_utils import *
import yelp_utils
# Parenthesised form prints the same text under Python 2 and also runs on Python 3.
print("Pandas version: " + pd.__version__)


Pandas version: 0.17.0

In [3]:
# Show up to 200 columns when a wide frame is displayed.
pd.set_option('display.max_columns', 200)

In [4]:
# Prepare the CSV output directory before any cell writes into it.
# NOTE(review): presumably creates the directory when it is missing -- confirm in yelp_utils.
yelp_utils.make_sure_path_exists(yelp_utils.YELP_DATA_CSV_DIR)

DATA WRANGLING

Convert the raw JSON files to flattened CSV files for clarity and ease of access.


In [5]:
# http://stackoverflow.com/questions/3207219/how-to-list-all-files-of-a-directory-in-python
# Run the converter for every raw *.json file that has no matching .csv in the CSV directory yet.
for json_file in glob.glob(os.path.join(yelp_utils.YELP_DATA_RAW_DIR, "*.json")):
    # Expected CSV path: <CSV dir>/<text of the file name before '.json'>.csv
    csv_file = '{0}.csv'.format(os.path.join(yelp_utils.YELP_DATA_CSV_DIR,os.path.basename(json_file).split('.json')[0]))
    if not os.path.isfile(csv_file):
        # IPython expands $json_file and $YELP_DATA_CSV_DIR from the notebook namespace;
        # YELP_DATA_CSV_DIR is available unqualified because of `from yelp_utils import *`.
        # NOTE(review): assumes the converter writes <base name>.csv into the given directory -- confirm.
        %run json_to_csv_converter.py $json_file $YELP_DATA_CSV_DIR

Read the CSV files into pandas DataFrames


In [6]:
# Load the flattened business table.
business_csv_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'yelp_academic_dataset_business.csv')
# low_memory=False: parse the whole file at once instead of in chunks, avoiding mixed-dtype inference.
df_business = pd.read_csv(business_csv_file, encoding='utf-8', engine='c', low_memory=False)
# Prints the frame's shape and the per-column fraction of nulls (see the output below).
yelp_utils.getDfInfo(df_business)
df_business.head()


*****SHAPE********
(77445, 98)
*****NULL PERCENTAGE*********
attributes.Ambience.divey                               0.732068
attributes.Dietary Restrictions.vegan                   0.997986
attributes.Happy Hour                                   0.940965
hours.Thursday.open                                     0.363019
attributes.Order at Counter                             0.995145
attributes.Hair Types Specialized In.africanamerican    0.994435
attributes.Hair Types Specialized In.kids               0.994306
attributes.BYOB                                         0.989037
hours.Friday.open                                       0.365653
attributes.Good For.latenight                           0.715127
attributes.Outdoor Seating                              0.656518
attributes.Alcohol                                      0.698780
attributes.Ambience.classy                              0.723158
attributes.By Appointment Only                          0.848357
attributes.Parking.lot                                  0.424275
business_id                                             0.000000
attributes.Ambience.touristy                            0.723158
attributes.Corkage                                      0.991852
hours.Tuesday.open                                      0.372613
attributes.Good For.brunch                              0.715863
categories                                              0.000000
attributes.Waiter Service                               0.724553
hours.Monday.open                                       0.409813
name                                                    0.000000
attributes.Parking.street                               0.424275
attributes.Ambience.hipster                             0.725263
attributes.BYOB/Corkage                                 0.983020
attributes.Hair Types Specialized In.straightperms      0.994435
attributes.Music.live                                   0.963406
attributes.Dietary Restrictions.dairy-free              0.997986
                                                          ...   
attributes.Coat Check                                   0.943211
longitude                                               0.000000
hours.Monday.close                                      0.409813
attributes.Hair Types Specialized In.extensions         0.994306
hours.Tuesday.close                                     0.372613
hours.Saturday.close                                    0.432991
attributes.Good for Kids                                0.608393
attributes.Parking.validated                            0.429350
hours.Sunday.open                                       0.581651
attributes.Accepts Insurance                            0.992149
attributes.Music.dj                                     0.951863
attributes.Dietary Restrictions.soy-free                0.997986
attributes.Has TV                                       0.706850
hours.Sunday.close                                      0.581651
attributes.Ambience.casual                              0.723158
attributes.Hair Types Specialized In.perms              0.994306
attributes.Dogs Allowed                                 0.949241
attributes.Drive-Thru                                   0.957066
attributes.Dietary Restrictions.vegetarian              0.997986
hours.Wednesday.open                                    0.365731
attributes.Noise Level                                  0.724902
attributes.Smoking                                      0.938059
attributes.Attire                                       0.696727
attributes.Hair Types Specialized In.curly              0.994306
attributes.Good For Groups                              0.664136
neighborhoods                                           0.000000
attributes.Open 24 Hours                                0.995855
attributes.Ambience.romantic                            0.723158
attributes.Music.jukebox                                0.963368
attributes.Ambience.upscale                             0.724837
dtype: float64
Out[6]:
attributes.Ambience.divey attributes.Dietary Restrictions.vegan attributes.Happy Hour hours.Thursday.open attributes.Order at Counter attributes.Hair Types Specialized In.africanamerican attributes.Hair Types Specialized In.kids attributes.BYOB hours.Friday.open attributes.Good For.latenight attributes.Outdoor Seating attributes.Alcohol attributes.Ambience.classy attributes.By Appointment Only attributes.Parking.lot business_id attributes.Ambience.touristy attributes.Corkage hours.Tuesday.open attributes.Good For.brunch categories attributes.Waiter Service hours.Monday.open name attributes.Parking.street attributes.Ambience.hipster attributes.BYOB/Corkage attributes.Hair Types Specialized In.straightperms attributes.Music.live attributes.Dietary Restrictions.dairy-free attributes.Music.background_music attributes.Price Range attributes.Good For.breakfast attributes.Parking.garage attributes.Music.karaoke attributes.Good For Dancing review_count attributes.Hair Types Specialized In.asian state attributes.Accepts Credit Cards hours.Friday.close attributes.Good For.lunch attributes.Parking.valet attributes.Take-out full_address hours.Thursday.close attributes.Hair Types Specialized In.coloring attributes.Good For.dessert attributes.Music.video attributes.Dietary Restrictions.halal attributes.Takes Reservations hours.Saturday.open attributes.Ages Allowed attributes.Ambience.trendy attributes.Delivery hours.Wednesday.close attributes.Wi-Fi open city attributes.Wheelchair Accessible attributes.Dietary Restrictions.gluten-free stars attributes.Dietary Restrictions.kosher type attributes.Caters attributes.Ambience.intimate latitude attributes.Good For.dinner attributes.Coat Check longitude hours.Monday.close attributes.Hair Types Specialized In.extensions hours.Tuesday.close hours.Saturday.close attributes.Good for Kids attributes.Parking.validated hours.Sunday.open attributes.Accepts Insurance attributes.Music.dj attributes.Dietary Restrictions.soy-free attributes.Has TV 
hours.Sunday.close attributes.Ambience.casual attributes.Hair Types Specialized In.perms attributes.Dogs Allowed attributes.Drive-Thru attributes.Dietary Restrictions.vegetarian hours.Wednesday.open attributes.Noise Level attributes.Smoking attributes.Attire attributes.Hair Types Specialized In.curly attributes.Good For Groups neighborhoods attributes.Open 24 Hours attributes.Ambience.romantic attributes.Music.jukebox attributes.Ambience.upscale
0 False NaN NaN 11:00 NaN NaN NaN NaN 11:00 False False none False NaN False 5UmKMjUEUNdYWqANhGckJw False NaN 11:00 False ['Fast Food', 'Restaurants'] False 11:00 Mr Hoagie False False NaN NaN NaN NaN NaN 1 False False NaN NaN 4 NaN PA True 21:00 False False True 4734 Lebanon Church Rd\nDravosburg, PA 15034 21:00 NaN False NaN NaN False NaN NaN False False 21:00 NaN True Dravosburg NaN NaN 4.5 NaN business False False 40.354327 False NaN -79.900706 21:00 NaN 21:00 NaN True False NaN NaN NaN NaN False NaN False NaN NaN False NaN 11:00 average NaN casual NaN True [] NaN False NaN False
1 NaN NaN True NaN NaN NaN NaN NaN NaN NaN False NaN NaN NaN NaN UsFtqoBl7naz8AVUBZMjQQ NaN NaN NaN NaN ['Nightlife'] NaN NaN Clancy's Pub NaN NaN NaN NaN NaN NaN NaN 1 NaN NaN NaN NaN 4 NaN PA True NaN NaN NaN NaN 202 McClure St\nDravosburg, PA 15034 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN True Dravosburg NaN NaN 3.5 NaN business NaN NaN 40.350553 NaN NaN -79.886814 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN True [] NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3eu6MEFlq2Dg7bQh8QbdOg NaN NaN NaN NaN ['Auto Repair', 'Automotive'] NaN NaN Joe Cislo's Auto NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 NaN PA NaN NaN NaN NaN NaN 1 Ravine St\nDravosburg, PA 15034 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN True Dravosburg NaN NaN 5.0 NaN business NaN NaN 40.350956 NaN NaN -79.889059 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN cE27W9VPgO88Qxe4ol6y_g NaN NaN NaN NaN ['Active Life', 'Mini Golf', 'Golf'] NaN NaN Cool Springs Golf Center NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5 NaN PA NaN NaN NaN NaN NaN 1530 Hamilton Rd\nBethel Park, PA 15234 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN False Bethel Park NaN NaN 2.5 NaN business NaN NaN 40.354116 NaN NaN -80.014660 NaN NaN NaN NaN True NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [] NaN NaN NaN NaN
4 NaN NaN NaN 10:00 NaN NaN NaN NaN 10:00 NaN NaN NaN NaN NaN False HZdLhv6COCleJMo7nPl-RA NaN NaN 10:00 NaN ['Shopping', 'Home Services', 'Internet Servic... NaN 10:00 Verizon False NaN NaN NaN NaN NaN NaN 2 NaN False NaN NaN 5 NaN PA False 17:00 NaN False NaN 301 South Hills Village\nPittsburgh, PA 15241 17:00 NaN NaN NaN NaN NaN 10:00 NaN NaN NaN 21:00 NaN True Pittsburgh NaN NaN 2.5 NaN business NaN NaN 40.357620 NaN NaN -80.059980 21:00 NaN 21:00 21:00 NaN False 11:00 NaN NaN NaN NaN 18:00 NaN NaN NaN NaN NaN 10:00 NaN NaN NaN NaN NaN [] NaN NaN NaN NaN

In [7]:
# Load the flattened review table.
review_csv_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'yelp_academic_dataset_review.csv')
# low_memory=False is deliberately not passed here (unlike the business read): the file is too large.
df_review = pd.read_csv(review_csv_file, encoding='utf-8', engine='c')
# Call the helper through the module, as the business cell does, rather than via the star import.
yelp_utils.getDfInfo(df_review)
df_review.head()


*****SHAPE********
(2225213, 10)
*****NULL PERCENTAGE*********
user_id         0
review_id       0
text            0
votes.cool      0
business_id     0
votes.funny     0
stars           0
date            0
type            0
votes.useful    0
dtype: float64
Out[7]:
user_id review_id text votes.cool business_id votes.funny stars date type votes.useful
0 PUFPaY9KxDAcGqfsorJp3Q Ya85v4eqdd6k9Od8HbQjyA Mr Hoagie is an institution. Walking in, it do... 0 5UmKMjUEUNdYWqANhGckJw 0 4 2012-08-01 review 0
1 Iu6AxdBYGR4A0wspR9BYHA KPvLNJ21_4wbYNctrOwWdQ Excellent food. Superb customer service. I mis... 0 5UmKMjUEUNdYWqANhGckJw 0 5 2014-02-13 review 0
2 auESFwWvW42h6alXgFxAXQ fFSoGV46Yxuwbr3fHNuZig Yes this place is a little out dated and not o... 0 5UmKMjUEUNdYWqANhGckJw 0 5 2015-10-31 review 0
3 uK8tzraOp4M5u3uYrqIBXg Di3exaUCFNw1V4kSNW5pgA All the food is great here. But the best thing... 0 UsFtqoBl7naz8AVUBZMjQQ 0 5 2013-11-08 review 0
4 I_47G-R2_egp7ME5u_ltew 0Lua2-PbqEQMjD9r89-asw We checked this place out this past Monday for... 0 UsFtqoBl7naz8AVUBZMjQQ 0 3 2014-03-29 review 0

In [8]:
# Load the flattened user table.
user_csv_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'yelp_academic_dataset_user.csv')
df_user = pd.read_csv(user_csv_file, encoding='utf-8', engine='c', low_memory=False)
# Call the helper through the module, as the business cell does, rather than via the star import.
yelp_utils.getDfInfo(df_user)
df_user.head()


*****SHAPE********
(552339, 23)
*****NULL PERCENTAGE*********
yelping_since          0.000000
compliments.plain      0.801376
review_count           0.000000
friends                0.000000
compliments.cute       0.956192
compliments.writer     0.868468
fans                   0.000000
compliments.note       0.835413
type                   0.000000
compliments.hot        0.876686
compliments.cool       0.832554
compliments.profile    0.959652
average_stars          0.000000
compliments.more       0.904267
elite                  0.000000
name                   0.000002
user_id                0.000000
votes.cool             0.000000
compliments.list       0.977780
votes.funny            0.000000
compliments.photos     0.933347
compliments.funny      0.884603
votes.useful           0.000000
dtype: float64
Out[8]:
yelping_since compliments.plain review_count friends compliments.cute compliments.writer fans compliments.note type compliments.hot compliments.cool compliments.profile average_stars compliments.more elite name user_id votes.cool compliments.list votes.funny compliments.photos compliments.funny votes.useful
0 2004-10 25 108 ['rpOyqD_893cqmDAtJLbdog', '4U9kSBLuBDU391x6bx... 15 9 70 20 user 48 78 8 4.14 3 [2005, 2006] Russel 18kPq7GPye-YQ3LyKyAZPw 245 NaN 167 14 11 280
1 2004-10 959 1274 ['18kPq7GPye-YQ3LyKyAZPw', '4U9kSBLuBDU391x6bx... 206 327 1179 611 user 1094 1642 116 3.66 134 [2005, 2006, 2007, 2008, 2009, 2010, 2011, 201... Jeremy rpOyqD_893cqmDAtJLbdog 11093 38 7681 330 580 14199
2 2004-10 89 442 ['18kPq7GPye-YQ3LyKyAZPw', 'rpOyqD_893cqmDAtJL... 23 24 100 83 user 101 145 9 3.60 19 [2005, 2006, 2007, 2008, 2009, 2010, 2011, 201... Michael 4U9kSBLuBDU391x6bxU-YA 732 4 908 24 120 1483
3 2004-10 NaN 11 ['18kPq7GPye-YQ3LyKyAZPw', 'rpOyqD_893cqmDAtJL... 2 NaN 2 NaN user NaN NaN NaN 4.64 NaN [] Ken fHtTaujcyKvXglE33Z5yIw 5 NaN 1 NaN NaN 11
4 2004-10 2 66 ['rpOyqD_893cqmDAtJLbdog', 'HDQixQ-WZEV0LVPJlI... 2 2 4 1 user 1 1 NaN 3.80 1 [2005] Katherine SIBCL7HBkrP4llolm4SC2A 13 NaN 11 NaN NaN 34

DATA CLEANING

Remove columns that contain NAs and prefix the remaining column names with their table name


In [9]:
def renameRemoveNACols(df, string):
    '''
    Function to drop every column that contains at least one NA value and
    prefix the names of the remaining columns with `string`.
    The input data frame is left unchanged; a new data frame is returned.
    Input: 
        df: Pandas data frame
        string: The prefix that is put at the beginning of every remaining column name
    Output: Pandas data frame 
    '''
    # add_prefix stringifies each label, so non-string labels (e.g. default
    # integer column names) are handled as well as plain string names.
    return df.dropna(axis='columns', how='any').add_prefix(string)

def removeNACols(df):
    '''
    Function to keep only the columns that have no NA value at all.
    The result is a new data frame; the input is not modified.
    Input: 
        df: Pandas data frame
    Output: Pandas data frame     
    '''
    complete_columns_only = df.dropna(axis=1, how='any')
    return complete_columns_only

In [10]:
# Drop NA-containing columns and prefix every column with its table name, then
# restore the join keys to their bare names so the merges can use `on=`.
# NOTE: re-running this cell prefixes the non-key columns a second time; reload the
# CSVs first if it has to be executed again.
df_business = (renameRemoveNACols(df_business, 'business_')
               .rename(columns={'business_business_id': 'business_id'}))
df_review = (renameRemoveNACols(df_review, 'review_')
             .rename(columns={'review_review_id': 'review_id',
                              'review_user_id': 'user_id',
                              'review_business_id': 'business_id'}))
df_user = (renameRemoveNACols(df_user, 'user_')
           .rename(columns={'user_user_id': 'user_id'}))

Merge tables


In [11]:
# Attach the business columns to every review (inner join on the shared business_id key).
df_merged_business_review = pd.merge(df_business, df_review, how='inner', on='business_id')
# Call the helper through the module, as the business cell does, rather than via the star import.
yelp_utils.getDfInfo(df_merged_business_review)


*****SHAPE********
(2225213, 20)
*****NULL PERCENTAGE*********
business_id               0
business_categories       0
business_name             0
business_review_count     0
business_full_address     0
business_open             0
business_stars            0
business_type             0
business_latitude         0
business_longitude        0
business_neighborhoods    0
user_id                   0
review_id                 0
review_text               0
review_votes.cool         0
review_votes.funny        0
review_stars              0
review_date               0
review_type               0
review_votes.useful       0
dtype: float64

In [12]:
# Attach the reviewer's user columns to every review (inner join on the shared user_id key).
df_merged_business_review_user = pd.merge(df_merged_business_review, df_user, how='inner', on='user_id')
# Call the helper through the module, as the business cell does, rather than via the star import.
yelp_utils.getDfInfo(df_merged_business_review_user)
df_merged_business_review_user.head()


*****SHAPE********
(2225213, 30)
*****NULL PERCENTAGE*********
business_id               0
business_categories       0
business_name             0
business_review_count     0
business_full_address     0
business_open             0
business_stars            0
business_type             0
business_latitude         0
business_longitude        0
business_neighborhoods    0
user_id                   0
review_id                 0
review_text               0
review_votes.cool         0
review_votes.funny        0
review_stars              0
review_date               0
review_type               0
review_votes.useful       0
user_yelping_since        0
user_review_count         0
user_friends              0
user_fans                 0
user_type                 0
user_average_stars        0
user_elite                0
user_votes.cool           0
user_votes.funny          0
user_votes.useful         0
dtype: float64
Out[12]:
business_id business_categories business_name business_review_count business_full_address business_open business_stars business_type business_latitude business_longitude business_neighborhoods user_id review_id review_text review_votes.cool review_votes.funny review_stars review_date review_type review_votes.useful user_yelping_since user_review_count user_friends user_fans user_type user_average_stars user_elite user_votes.cool user_votes.funny user_votes.useful
0 5UmKMjUEUNdYWqANhGckJw ['Fast Food', 'Restaurants'] Mr Hoagie 4 4734 Lebanon Church Rd\nDravosburg, PA 15034 True 4.5 business 40.354327 -79.900706 [] PUFPaY9KxDAcGqfsorJp3Q Ya85v4eqdd6k9Od8HbQjyA Mr Hoagie is an institution. Walking in, it do... 0 0 4 2012-08-01 review 0 2009-05 60 [] 0 user 3.08 [] 12 25 166
1 WaHXyBwljbKNPmmJZn5j8Q ['Food', 'Grocery'] Shop 'n Save 3 1886 Homeville Rd\nWest Mifflin, PA 15122 True 3.0 business 40.373677 -79.873734 [] PUFPaY9KxDAcGqfsorJp3Q 58tIa8PfEnUnPsIV_BUS3w It is now a shop and save. 0 0 3 2012-08-01 review 0 2009-05 60 [] 0 user 3.08 [] 12 25 166
2 u22QDAON1kk0wjoTliyftw ['Pizza', 'Restaurants'] Latina Pizza 3 4426 Kennywood Blvd\nWest Mifflin, PA 15122 True 4.0 business 40.389192 -79.868305 [] PUFPaY9KxDAcGqfsorJp3Q Vn4Uxe0sLHDSAllDa2iWDw I have to admit, the first time I hate here, I... 0 0 4 2012-08-05 review 4 2009-05 60 [] 0 user 3.08 [] 12 25 166
3 4ykgzzzGEWjMD5lwk1-l9A ['Sporting Goods', 'Bikes', 'Shopping'] Big Bang Bicycles 16 347 Lebanon Rd\nWest Mifflin, PA 15122 True 4.5 business 40.371539 -79.926492 [] PUFPaY9KxDAcGqfsorJp3Q iQdqWF_bgU61BZVcU4iHDQ I've had mixed results here. I would definitel... 0 0 3 2009-05-20 review 0 2009-05 60 [] 0 user 3.08 [] 12 25 166
4 2PfavOTufsPCRdYm-bFcpw ['Food', 'Ice Cream & Frozen Yogurt', 'Desserts'] Page Dairy Mart 74 4600 E Carson St\nSouth Side\nPittsburgh, PA 1... True 4.5 business 40.411509 -79.955886 ['South Side'] PUFPaY9KxDAcGqfsorJp3Q mINMFt512EchGlQ-CAKd0g This place is a great Ice Cream Shop. My Favor... 0 0 5 2012-08-01 review 0 2009-05 60 [] 0 user 3.08 [] 12 25 166

In [13]:
# Sanity check: each review_id must occur exactly once after the two merges.
df_merged_business_review_user['review_id'].is_unique


Out[13]:
True

Keep only reviews of businesses whose coordinates fall inside a US bounding box


In [14]:
# Latitude/longitude bounding box used to approximate "located in the US".
# Source: https://answers.yahoo.com/question/index?qid=20070729220301AA6Ct4s
US_LATITUDE_NORTH = 48.987386     # northernmost latitude
US_LATITUDE_SOUTH = 18.005611     # southernmost latitude
US_LONGITUDE_WEST = -124.626080   # westernmost longitude
US_LONGITUDE_EAST = -62.361014    # easternmost longitude

# NOTE(review): a rectangular box also admits non-US locations that lie inside it
# (e.g. parts of southern Canada and Mexico) -- confirm this is acceptable for the analysis.
condition_latitude = (df_merged_business_review_user.business_latitude < US_LATITUDE_NORTH) \
                    & (df_merged_business_review_user.business_latitude > US_LATITUDE_SOUTH)
condition_longitude = (df_merged_business_review_user.business_longitude > US_LONGITUDE_WEST) \
                    & (df_merged_business_review_user.business_longitude < US_LONGITUDE_EAST)
# Keep the rows whose business lies strictly inside the box on both axes.
df_merged_business_review_user_only_us = df_merged_business_review_user.loc[condition_latitude & condition_longitude]

In [15]:
# Row/column counts before and after the bounding-box filter.
# Parenthesised form prints the same text under Python 2 and also runs on Python 3.
print(df_merged_business_review_user.shape)
print(df_merged_business_review_user_only_us.shape)


(2225213, 30)
(2192840, 30)

In [16]:
# Write business-review-user file
write_filename = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user.csv')
if not os.path.isfile(write_filename):
    # `engine` is a read_csv option only; it is not a documented to_csv parameter
    # and current pandas rejects it, so it is not passed here.
    df_merged_business_review_user_only_us.to_csv(write_filename, encoding='utf-8', index=False)
else:
    # A previously written copy exists: use it in place of the frame built above.
    # NOTE(review): the cached file is never invalidated -- delete it after changing
    # any upstream cell, otherwise stale data is loaded here.
    df_merged_business_review_user_only_us = pd.read_csv(write_filename, encoding='utf-8', engine='c')
    print("read file")

Create a sample dataset


In [17]:
# The sample file name carries the subset label defined in yelp_utils.
sample_file = os.path.join(yelp_utils.YELP_DATA_CSV_DIR, 'business_review_user'+ yelp_utils.data_subset +'.csv')
# Sample without replacement using the seed from yelp_utils.
df_sample = df_merged_business_review_user_only_us.sample(frac = yelp_utils.data_frac, replace=False, random_state=yelp_utils.SEED_VAL)
if not os.path.isfile(sample_file):
    # `engine` is a read_csv option only; it is not a documented to_csv parameter
    # and current pandas rejects it, so it is not passed here.
    df_sample.to_csv(sample_file, encoding='utf-8', index=False)
df_sample.head()


Out[17]:
business_id business_categories business_name business_review_count business_full_address business_open business_stars business_type business_latitude business_longitude business_neighborhoods user_id review_id review_text review_votes.cool review_votes.funny review_stars review_date review_type review_votes.useful user_yelping_since user_review_count user_friends user_fans user_type user_average_stars user_elite user_votes.cool user_votes.funny user_votes.useful
902304 FoHJy_ucYarA2DkCrAVIgw ['Automotive', 'Smog Check Stations', 'Registr... Jiffy Smog 11 3061 St Rose Pkwy\nSoutheast\nLas Vegas, NV 89130 True 2.5 business 35.999862 -115.123105 ['Southeast', 'Anthem'] ZPnjxhKUsBDhXEOBKY_n2g t4_5FZu9lrW0GbFvtHDVRQ It's located in the Chevron gas station lot. Q... 1 0 4 2015-03-09 review 1 2011-02 349 ['KtNT8biTJlbAFh-APgO7-w', 'EhcZKrQJIS226xdm_q... 31 user 3.50 [2013, 2014, 2015] 767 781 1219
1308203 FqzgT9Y-Yu7jiWdHnGW-kQ ['Bars', 'American (Traditional)', 'Pubs', 'Ni... The Vig 522 4041 N 40th St\nPhoenix, AZ 85018 True 4.0 business 33.494489 -111.995106 [] D_rLArbCVIDib94qEnLqrg uQ_8mcs2GoZxQuOQ7_1_DQ Great outdoor patio dining area. Great happy h... 0 0 4 2013-12-17 review 0 2008-01 859 ['8_RB3fs5Ywn_ECTmg139Gg', 'pz97SxRe1Vk-5_K6EB... 34 user 3.68 [2009, 2010, 2011, 2012, 2013, 2014, 2015] 622 463 1227
138374 n5eQnMnVVt3FfrFENYoU0g ['Latin American', 'Mexican', 'Restaurants'] Cabo Fish Taco 672 3201 N Davidson St\nNoDa\nCharlotte, NC 28205 True 3.5 business 35.247251 -80.805827 ['NoDa'] GgO4Q6d3mnHFwjSmpOg3Vw uSP0VvT0YcHM1cyoMN3L1w for the record... cabo fish taco has never bee... 0 0 5 2015-07-05 review 2 2012-09 6 [] 0 user 3.40 [] 0 0 3
1269747 E2XPkjbbhdNY2yXBTwScQw ['Gluten-Free', 'Sandwiches', 'Salad', 'Restau... Muscle Maker Grill 31 8386 W Thunderbird Rd\nSte 103\nPeoria, AZ 85381 True 4.0 business 33.610449 -112.239434 [] 7OrWLA2qylmcC1smi9LoQw 536JzbYMBl8jKA85_s8t3w Came in with a groupon but will definitely ret... 0 0 5 2015-10-03 review 1 2011-04 20 ['_mdXw4vDCsCW7VqRQWxDzA', 'k8Jv-CUjlB5L27h0VA... 0 user 3.76 [] 7 6 24
867797 hfUdBRgTTPTR4s4MOqux8Q ['Arts & Entertainment', 'Stadiums & Arenas'] Thomas & Mack Center 96 Swenson & Thomas and Mack Dr\nUniversity\nLas ... True 4.0 business 36.104761 -115.144418 ['University'] TfLjkiUNHWksThxZOHrcqw dRbScHqVlEpr-MRn9_vipA I love the Mack!\n\nI've come to the Mack for ... 1 0 4 2011-07-08 review 0 2010-07 213 ['R7-OHW_M_V4lYGbIT2hetQ', 'p24ECnwNOfdQMs6SDt... 4 user 4.00 [2011, 2012, 2013, 2014] 180 207 414

In [18]:
# Debugging aid: list every name currently defined in the notebook namespace.
# NOTE(review): the output exposes absolute local paths; consider clearing it before sharing.
%whos


Variable                                                                                                Type                    Data/Info
-----------------------------------------------------------------------------------------------------------------------------------------
CountVectorizer                                                                                         type                    <class 'sklearn.feature_e<...>on.text.CountVectorizer'>
NLTK_STOPWORDS                                                                                          set                     set([u'all', u'just', u'b<...>he', u'having', u'once'])
PorterStemmer                                                                                           type                    <class 'nltk.stem.porter.PorterStemmer'>
SEED_VAL                                                                                                int                     200
TfidfVectorizer                                                                                         type                    <class 'sklearn.feature_e<...>on.text.TfidfVectorizer'>
WORK_DIR                                                                                                str                     D:\_Active_Projects\yelp\yelp
YELP_DATA_CSV_DIR                                                                                       str                     D:\_Active_Projects\yelp\yelp\data\csv
YELP_DATA_RAW_DIR                                                                                       str                     D:\_Active_Projects\yelp\yelp\data\raw
YELP_DATA_SPARSE_MATRIX_DIR                                                                             str                     D:\_Active_Projects\yelp\yelp\data\sparse_matrix
YELP_DATA_WORD_2_VEC_MODEL_DIR                                                                          str                     D:\_Active_Projects\yelp\yelp\data\word2vec_model
business_csv_file                                                                                       str                     D:\_Active_Projects\yelp\<...>emic_dataset_business.csv
condition_latitude                                                                                      Series                  0          True\n1       <...>ess_latitude, dtype: bool
condition_longitude                                                                                     Series                  0          True\n1       <...>ss_longitude, dtype: bool
csr_matrix                                                                                              type                    <class 'scipy.sparse.csr.csr_matrix'>
csv_file                                                                                                str                     D:\_Active_Projects\yelp\<...>academic_dataset_user.csv
data_frac                                                                                               float                   0.001
data_subset                                                                                             str                     _0_0_1Percent
df_business                                                                                             DataFrame                                 busines<...>[77445 rows x 11 columns]
df_merged_business_review                                                                               DataFrame                                   busin<...>225213 rows x 20 columns]
df_merged_business_review_user                                                                          DataFrame                                   busin<...>225213 rows x 30 columns]
df_merged_business_review_user_only_us                                                                  DataFrame                                   busin<...>192840 rows x 30 columns]
df_review                                                                                               DataFrame                                       u<...>225213 rows x 10 columns]
df_sample                                                                                               DataFrame                                   busin<...>n[2193 rows x 30 columns]
df_user                                                                                                 DataFrame                      user_yelping_since<...>552339 rows x 11 columns]
filter_out_more_stopwords                                                                               function                <function filter_out_more<...>ds at 0x000000001A55E128>
getDfInfo                                                                                               function                <function getDfInfo at 0x000000001A54BD68>
glob                                                                                                    module                  <module 'glob' from 'C:\Anaconda\lib\glob.pyc'>
json_file                                                                                               str                     D:\_Active_Projects\yelp\<...>cademic_dataset_user.json
load_sparse_csr                                                                                         function                <function load_sparse_csr at 0x000000001A54BE48>
lowercase_remove_punctuation                                                                            function                <function lowercase_remov<...>on at 0x000000001A54BF28>
lowercase_remove_punctuation_and_numbers_and_tokenize_and_filter_more_stopwords_and_stem_and_restring   function                <function lowercase_remov<...>ng at 0x000000001A55E278>
make_sure_path_exists                                                                                   function                <function make_sure_path_<...>ts at 0x000000001A54BCF8>
nltk                                                                                                    module                  <module 'nltk' from 'C:\A<...>kages\nltk\__init__.pyc'>
np                                                                                                      module                  <module 'numpy' from 'C:\<...>ages\numpy\__init__.pyc'>
os                                                                                                      module                  <module 'os' from 'C:\Anaconda\lib\os.pyc'>
pd                                                                                                      module                  <module 'pandas' from 'C:<...>ges\pandas\__init__.pyc'>
removeNACols                                                                                            function                <function removeNACols at 0x00000000270DEB38>
remove_number_map                                                                                       dict                    n=10
remove_numbers_in_string                                                                                function                <function remove_numbers_<...>ng at 0x000000001A54BEB8>
remove_punctuation_map                                                                                  dict                    n=32
remove_stopwords                                                                                        function                <function remove_stopwords at 0x000000001A55E048>
renameRemoveNACols                                                                                      function                <function renameRemoveNAC<...>ls at 0x00000000270DEBA8>
restring_tokens                                                                                         function                <function restring_tokens at 0x000000001A55E208>
review_csv_file                                                                                         str                     D:\_Active_Projects\yelp\<...>ademic_dataset_review.csv
sample_file                                                                                             str                     D:\_Active_Projects\yelp\<...>iew_user_0_0_1Percent.csv
save_sparse_csr                                                                                         function                <function save_sparse_csr at 0x000000001A54BDD8>
stem_token_list                                                                                         function                <function stem_token_list at 0x000000001A55E198>
stopwords                                                                                               WordListCorpusReader    <WordListCorpusReader in <...>ata\\corpora\\stopwords'>
string                                                                                                  module                  <module 'string' from 'C:<...>Anaconda\lib\string.pyc'>
user_csv_file                                                                                           str                     D:\_Active_Projects\yelp\<...>academic_dataset_user.csv
word_tokenize                                                                                           function                <function word_tokenize at 0x000000001A1B59E8>
write_filename                                                                                          str                     D:\_Active_Projects\yelp\<...>\business_review_user.csv
yelp_utils                                                                                              module                  <module 'yelp_utils' from 'yelp_utils.py'>

In [ ]: