In [4]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
import sys

# Python 2 only: `reload` is a builtin there and `sys.setdefaultencoding` is
# re-exposed by reloading `sys`.  Python 3 has neither name and already
# defaults to UTF-8, so skip the hack instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

In [12]:
business = pd.read_json('../datasets/yelp_academic_dataset_business.json', lines=True, encoding='utf8')

In [13]:
checkin = pd.read_json('../datasets/yelp_academic_dataset_checkin.json', lines=True, encoding='utf8')

In [14]:
review = pd.read_json('../datasets/yelp_academic_dataset_review.json', lines=True, encoding='utf8')

In [15]:
tip = pd.read_json('../datasets/yelp_academic_dataset_tip.json', lines=True, encoding='utf8')

In [16]:
user = pd.read_json('../datasets/yelp_academic_dataset_user.json', lines=True, encoding='utf8')

In [ ]:


In [ ]:


In [ ]:


In [ ]:

Business Dataset


In [ ]:
{
    "business_id":"encrypted business id",
    "name":"business name",
    "neighborhood":"hood name",
    "address":"full address",
    "city":"city",
    "state":"state -- if applicable --",
    "postal_code":"postal code",
    "latitude":latitude,
    "longitude":longitude,
    "stars":star rating, ***rounded to half-stars***,
    "review_count":number of reviews,
    "is_open":0/1 (closed/open),
    "attributes":["an array of strings: each array element is an attribute"],
    "categories":["an array of strings of business categories"],
    "hours":["an array of strings of business hours"],
    "type": "business"
}

In [53]:
'Size of the business dataset: ' + str(len(business))


Out[53]:
'Size of the business dataset: 144072'

In [66]:
business.columns


Out[66]:
Index([u'address', u'attributes', u'business_id', u'categories', u'city',
       u'hours', u'is_open', u'latitude', u'longitude', u'name',
       u'neighborhood', u'postal_code', u'review_count', u'stars', u'state',
       u'type'],
      dtype='object')

In [115]:
business['attributes'][12]


Out[115]:
[u'BikeParking: True', u"BusinessParking: {'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}", u'RestaurantsPriceRange2: 1']

In [121]:
business['categories'][20]


Out[121]:
[u'Sports Clubs', u'Active Life']

In [73]:
business.head()


Out[73]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
0 227 E Baseline Rd, Ste J2 [BikeParking: True, BusinessAcceptsBitcoin: Fa... 0DI8Dt2PJp07XkVvIElIcQ [Tobacco Shops, Nightlife, Vape Shops, Shopping] Tempe [Monday 11:0-21:0, Tuesday 11:0-21:0, Wednesda... 0 33.378214 -111.936102 Innovative Vapors 85283 17 4.5 AZ business
1 495 S Grand Central Pkwy [BusinessAcceptsBitcoin: False, BusinessAccept... LTlCaCGZE14GuaUXUGbamg [Caterers, Grocery, Food, Event Planning & Ser... Las Vegas [Monday 0:0-0:0, Tuesday 0:0-0:0, Wednesday 0:... 1 36.192284 -115.159272 Cut and Taste 89106 9 5.0 NV business
2 979 Bloor Street W [Alcohol: none, Ambience: {'romantic': False, ... EDqCEAGXVGCH4FJXgqtjqg [Restaurants, Pizza, Chicken Wings, Italian] Toronto [Monday 11:0-2:0, Tuesday 11:0-2:0, Wednesday ... 1 43.661054 -79.429089 Pizza Pizza Dufferin Grove M6H 1L5 7 2.5 ON business
3 7014 Steubenville Pike [AcceptsInsurance: False, BusinessAcceptsCredi... cnGIivYRLxpF7tBVR_JwWA [Hair Removal, Beauty & Spas, Blow Dry/Out Ser... Oakdale [Tuesday 10:0-21:0, Wednesday 10:0-21:0, Thurs... 1 40.444544 -80.174540 Plush Salon and Spa 15071 4 4.0 PA business
4 321 Jarvis Street [BusinessAcceptsCreditCards: True, Restaurants... cdk-qqJ71q6P7TJTww_DSA [Hotels & Travel, Event Planning & Services, H... Toronto None 1 43.659829 -79.375401 Comfort Inn Downtown Core M5B 2C2 8 3.0 ON business

Open/Closed


In [60]:
# NOTE: despite the "Percentage" label, the value is a fraction of rows
# (sum of `is_open` divided by the row count), not a number scaled to 0-100.
'Percentage of open businesses: ' + str(business['is_open'].sum() / float(len(business)))


Out[60]:
'Percentage of open businesses: 0.850304014659'

City and State


In [20]:
len(business.city.unique())


Out[20]:
878

In [30]:
business['city'].value_counts().head(10)


Out[30]:
Las Vegas     22892
Toronto       14540
Phoenix       14468
Scottsdale     6917
Charlotte      6912
Pittsburgh     5275
Montréal       4785
Mesa           4714
Henderson      3788
Tempe          3703
Name: city, dtype: int64

In [31]:
business['city'].value_counts().tail(10)


Out[31]:
Buckeye - Shaker          1
East Mesa                 1
Broadlands                1
Vilas                     1
Pencaitland               1
Morriston                 1
Charlotte (University)    1
Malton                    1
Reminderville             1
                          1
Name: city, dtype: int64

In [81]:
len(business.state.unique())


Out[81]:
29

In [82]:
business['state'].value_counts().head(10)


Out[82]:
AZ     43492
NV     28214
ON     24507
NC     10177
OH      9966
PA      8091
QC      6668
WI      3899
EDH     3539
BW      2905
Name: state, dtype: int64

In [83]:
business['state'].value_counts().tail(10)


Out[83]:
ESX    11
SCB     3
FLN     1
PKN     1
STG     1
KHL     1
VT      1
NLK     1
NTH     1
FAL     1
Name: state, dtype: int64

In [79]:
# Scatter of each business's star rating against its review count.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(business['review_count'], business['stars'])
ax.set_xlabel('Review Counts')
ax.set_ylabel('Stars')
plt.show()



In [93]:
# Select the column before aggregating so the median is computed for
# `review_count` only, not for every column of the frame.  The frame also holds
# list-valued columns (e.g. `attributes`), which recent pandas releases raise
# on instead of silently dropping.
business.groupby('state')['review_count'].median()


Out[93]:
state
AZ      9.0
BW      6.0
EDH     7.0
ELN     4.0
ESX     5.0
FAL    11.0
FIF     4.0
FLN     5.0
HLD     5.0
IL      8.0
KHL     7.0
MLN     5.0
NC      8.0
NI      4.5
NLK     3.0
NTH    17.0
NV     12.0
NY      5.0
OH      7.0
ON      8.0
PA      8.0
PKN    24.0
QC      7.0
SC      6.5
SCB     6.0
STG     3.0
VT      4.0
WI      8.0
WLN     4.0
Name: review_count, dtype: float64

In [95]:
# Select the column before aggregating so the median is computed for `stars`
# only, not for every column of the frame.  The frame also holds list-valued
# columns (e.g. `attributes`), which recent pandas releases raise on instead of
# silently dropping.
business.groupby('state')['stars'].median()


Out[95]:
state
AZ     4.0
BW     4.0
EDH    4.0
ELN    4.0
ESX    4.0
FAL    4.0
FIF    3.5
FLN    4.5
HLD    4.0
IL     3.5
KHL    3.5
MLN    4.0
NC     3.5
NI     4.0
NLK    3.5
NTH    2.0
NV     4.0
NY     3.5
OH     3.5
ON     3.5
PA     3.5
PKN    3.5
QC     4.0
SC     3.5
SCB    4.0
STG    4.5
VT     5.0
WI     4.0
WLN    3.5
Name: stars, dtype: float64

In [107]:
business[business['business_id'] == '2LfIuF3_sX6uwe-IR-P0jQ']


Out[107]:
address attributes business_id categories city hours is_open latitude longitude name neighborhood postal_code review_count stars state type
70810 4610 N 7th Ave [BikeParking: True, BusinessAcceptsCreditCards... 2LfIuF3_sX6uwe-IR-P0jQ [Home & Garden, Antiques, Home Decor, Shopping] Phoenix [Monday 11:0-17:0, Tuesday 11:0-17:0, Wednesda... 0 33.504054 -112.082813 Modern On Melrose 85013 15 4.0 AZ business

In [140]:
business.describe()


Out[140]:
is_open latitude longitude review_count stars
count 144072.000000 144072.000000 144072.000000 144072.000000 144072.000000
mean 0.850304 38.644758 -92.690002 28.825511 3.637709
std 0.356774 5.344079 26.898355 92.574711 0.969585
min 0.000000 32.998019 -122.397206 3.000000 1.000000
25% 1.000000 33.627346 -112.136907 4.000000 3.000000
50% 1.000000 36.143417 -89.523207 9.000000 3.500000
75% 1.000000 43.611192 -79.649855 22.000000 4.500000
max 1.000000 57.592285 115.086769 6414.000000 5.000000

In [ ]:


In [ ]:


In [ ]:

Review


In [ ]:
{
    "review_id":"encrypted review id",
    "user_id":"encrypted user id",
    "business_id":"encrypted business id",
    "stars":star rating, rounded to half-stars,
    "date":"date formatted like 2009-12-19",
    "text":"review text",
    "useful":number of useful votes received,
    "funny":number of funny votes received,
    "cool": number of cool review votes received,
    "type": "review"
}

https://www.yelp.com/dataset_challenge https://www.yelp-support.com/Recommended_Reviews

  1. Why is the user review count different from the actual number of reviews returned for that user? The review count represents the total number of reviews a user had posted at the time of data collection, whether Yelp recommended them or not. The review data, by contrast, includes only the reviews that were recommended at the time of data collection. Also, we only include businesses that have had at least 3 reviews older than 14 days. So the review count may differ from the number of actual reviews for any given user.

In [102]:
len(review)


Out[102]:
4153150

In [98]:
review.head()


Out[98]:
business_id cool date funny review_id stars text type useful user_id
0 2aFiy99vNLklCx3T_tGS9A 0 2011-10-10 0 NxL8SIC5yqOdnlXCg18IBg 5 If you enjoy service by someone who is as comp... review 0 KpkOkG6RIf4Ra25Lhhxf1A
1 2aFiy99vNLklCx3T_tGS9A 0 2010-12-29 0 pXbbIgOXvLuTi_SPs1hQEQ 5 After being on the phone with Verizon Wireless... review 1 bQ7fQq1otn9hKX-gXRsrgA
2 2aFiy99vNLklCx3T_tGS9A 0 2011-04-29 0 wslW2Lu4NYylb1jEapAGsw 5 Great service! Corey is very service oriented.... review 0 r1NUhdNmL6yU9Bn-Yx6FTw
3 2LfIuF3_sX6uwe-IR-P0jQ 1 2014-07-14 0 GP6YEearUWrzPtQYSF1vVg 5 Highly recommended. Went in yesterday looking ... review 0 aW3ix1KNZAvoM8q-WghA3Q
4 2LfIuF3_sX6uwe-IR-P0jQ 0 2014-01-15 0 25RlYGq2s5qShi-pn3ufVA 4 I walked in here looking for a specific piece ... review 0 YOo-Cip8HqvKp_p9nEGphw

In [101]:
review['useful'].max()


Out[101]:
1125

In [123]:
review[review['business_id'] == '2LfIuF3_sX6uwe-IR-P0jQ']['stars'].mean()


Out[123]:
4.2000000000000002

In [125]:
review[review['business_id'] == '2aFiy99vNLklCx3T_tGS9A']


Out[125]:
business_id cool date funny review_id stars text type useful user_id
0 2aFiy99vNLklCx3T_tGS9A 0 2011-10-10 0 NxL8SIC5yqOdnlXCg18IBg 5 If you enjoy service by someone who is as comp... review 0 KpkOkG6RIf4Ra25Lhhxf1A
1 2aFiy99vNLklCx3T_tGS9A 0 2010-12-29 0 pXbbIgOXvLuTi_SPs1hQEQ 5 After being on the phone with Verizon Wireless... review 1 bQ7fQq1otn9hKX-gXRsrgA
2 2aFiy99vNLklCx3T_tGS9A 0 2011-04-29 0 wslW2Lu4NYylb1jEapAGsw 5 Great service! Corey is very service oriented.... review 0 r1NUhdNmL6yU9Bn-Yx6FTw

In [128]:
len(review['review_id'].unique())


Out[128]:
4153150

In [131]:
# Review star rating against the number of "cool" votes the review received.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['cool'])
ax.set_xlabel('Star')
ax.set_ylabel('Cool')
plt.show()



In [133]:
# Review star rating against the number of "useful" votes the review received.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['useful'])
ax.set_xlabel('Star')
ax.set_ylabel('Useful')
plt.show()



In [135]:
# Review star rating against the number of "funny" votes the review received.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['funny'])
ax.set_xlabel('Star')
ax.set_ylabel('Funny')
plt.show()



In [141]:
review.describe()


Out[141]:
cool funny stars useful
count 4.153150e+06 4.153150e+06 4.153150e+06 4.153150e+06
mean 5.262232e-01 4.194826e-01 3.722765e+00 1.008271e+00
std 1.914351e+00 1.721744e+00 1.405131e+00 2.585058e+00
min 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 3.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 4.000000e+00 0.000000e+00
75% 0.000000e+00 0.000000e+00 5.000000e+00 1.000000e+00
max 5.130000e+02 6.320000e+02 5.000000e+00 1.125000e+03

In [ ]:


In [ ]:

User


In [ ]:
{
    "user_id":"encrypted user id",
    "name":"first name",
    "review_count":number of reviews,
    "yelping_since": date formatted like "2009-12-19",
    "friends":["an array of encrypted ids of friends"],
    "useful":"number of useful votes sent by the user",
    "funny":"number of funny votes sent by the user",
    "cool":"number of cool votes sent by the user",
    "fans":"number of fans the user has",
    "elite":["an array of years the user was elite"],
    "average_stars":floating point average like 4.31,
    "compliment_hot":number of hot compliments received by the user,
    "compliment_more":number of more compliments received by the user,
    "compliment_profile": number of profile compliments received by the user,
    "compliment_cute": number of cute compliments received by the user,
    "compliment_list": number of list compliments received by the user,
    "compliment_note": number of note compliments received by the user,
    "compliment_plain": number of plain compliments received by the user,
    "compliment_cool": number of cool compliments received by the user,
    "compliment_funny": number of funny compliments received by the user,
    "compliment_writer": number of writer compliments received by the user,
    "compliment_photos": number of photo compliments received by the user,
    "type":"user"
}

In [142]:
len(user)


Out[142]:
1029432

In [138]:
user.columns


Out[138]:
Index([u'average_stars', u'compliment_cool', u'compliment_cute',
       u'compliment_funny', u'compliment_hot', u'compliment_list',
       u'compliment_more', u'compliment_note', u'compliment_photos',
       u'compliment_plain', u'compliment_profile', u'compliment_writer',
       u'cool', u'elite', u'fans', u'friends', u'funny', u'name',
       u'review_count', u'type', u'useful', u'user_id', u'yelping_since'],
      dtype='object')

In [136]:
user.head()


Out[136]:
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain ... elite fans friends funny name review_count type useful user_id yelping_since
0 3.59 4192 79 4192 3904 19 305 4705 1347 2617 ... [2017, 2015, 2016, 2014, 2011, 2013, 2012] 298 [iJg9ekPzF9lkMuvjKYX6uA, ctWAuzS04Xu0lke2Rop4l... 12316 Rob 761 user 18456 EZmocAborM6z66rTzeZxzQ 2009-09-12
1 4.29 144 11 144 64 1 4 97 24 129 ... [None] 34 [r2UUCzGxqI6WPsiWPgqG2A, qewG3X2O4X6JKskxyyqFw... 28 Vivian 80 user 117 myql3o3x22_ygECb8gVo7A 2009-06-27
2 4.15 36 1 36 14 2 10 21 4 23 ... [2011, 2016, 2017, 2013, 2014, 2015, 2012] 48 [qewG3X2O4X6JKskxyyqFwQ, -50XWnmQGqBgEI-9ANvLl... 6 Carol 841 user 58 FIk4lQQu1eTe2EpzQ4xhBA 2010-08-26
3 3.82 54 6 54 32 0 5 13 1 29 ... [2014, 2010, 2017, 2015, 2011, 2016, 2013, 201... 28 [AIhfuFmX62k7a22gXXAB2Q, kQ1jU2rDpFD_q54edEmwI... 51 Miss Maggie 376 user 42 ojovtd9c8GIeDiB8e0mq2w 2008-05-31
4 2.97 0 0 0 0 0 0 0 0 0 ... [None] 2 [bvZkBCY-AvCsxRjd6Q-vEQ, KNEVNYeyNZApXG6HqOdsR... 3 Jeremy 28 user 8 uVEoZmmL9yK0NMgadLL0CQ 2013-01-17

5 rows × 23 columns


In [158]:
user.select_dtypes(include=['number']).columns


Out[158]:
Index([u'average_stars', u'compliment_cool', u'compliment_cute',
       u'compliment_funny', u'compliment_hot', u'compliment_list',
       u'compliment_more', u'compliment_note', u'compliment_photos',
       u'compliment_plain', u'compliment_profile', u'compliment_writer',
       u'cool', u'fans', u'funny', u'review_count', u'useful'],
      dtype='object')

In [159]:
user.select_dtypes(include=['number']).corr()


Out[159]:
average_stars compliment_cool compliment_cute compliment_funny compliment_hot compliment_list compliment_more compliment_note compliment_photos compliment_plain compliment_profile compliment_writer cool fans funny review_count useful
average_stars 1.000000 0.004955 0.002208 0.004955 0.004424 0.001194 0.002281 0.002279 0.001948 0.004772 0.001850 0.004204 0.004169 0.010477 0.002888 0.010328 0.003633
compliment_cool 0.004955 1.000000 0.637884 1.000000 0.922982 0.524139 0.698514 0.626849 0.692400 0.892175 0.690345 0.925280 0.663084 0.569677 0.689398 0.351760 0.658446
compliment_cute 0.002208 0.637884 1.000000 0.637884 0.645190 0.943909 0.934712 0.448215 0.853470 0.539704 0.892148 0.692343 0.315016 0.277186 0.328745 0.153434 0.314600
compliment_funny 0.004955 1.000000 0.637884 1.000000 0.922982 0.524139 0.698514 0.626849 0.692400 0.892175 0.690345 0.925280 0.663084 0.569677 0.689398 0.351760 0.658446
compliment_hot 0.004424 0.922982 0.645190 0.922982 1.000000 0.488109 0.654516 0.708887 0.657912 0.766171 0.651110 0.849750 0.561949 0.528990 0.589673 0.303549 0.555599
compliment_list 0.001194 0.524139 0.943909 0.524139 0.488109 1.000000 0.951548 0.397535 0.848836 0.458001 0.911373 0.633854 0.243560 0.148062 0.261302 0.103294 0.239819
compliment_more 0.002281 0.698514 0.934712 0.698514 0.654516 0.951548 1.000000 0.511596 0.882017 0.615690 0.964811 0.798392 0.383817 0.274604 0.419339 0.194481 0.381040
compliment_note 0.002279 0.626849 0.448215 0.626849 0.708887 0.397535 0.511596 1.000000 0.483627 0.586514 0.484896 0.603333 0.417652 0.384312 0.440480 0.247417 0.430591
compliment_photos 0.001948 0.692400 0.853470 0.692400 0.657912 0.848836 0.882017 0.483627 1.000000 0.603898 0.853324 0.775588 0.384438 0.224731 0.448318 0.117788 0.370907
compliment_plain 0.004772 0.892175 0.539704 0.892175 0.766171 0.458001 0.615690 0.586514 0.603898 1.000000 0.609914 0.840796 0.677567 0.596729 0.681456 0.352080 0.694519
compliment_profile 0.001850 0.690345 0.892148 0.690345 0.651110 0.911373 0.964811 0.484896 0.853324 0.609914 1.000000 0.795236 0.391188 0.226154 0.431158 0.144913 0.384007
compliment_writer 0.004204 0.925280 0.692343 0.925280 0.849750 0.633854 0.798392 0.603333 0.775588 0.840796 0.795236 1.000000 0.606207 0.527343 0.654503 0.348556 0.601251
cool 0.004169 0.663084 0.315016 0.663084 0.561949 0.243560 0.383817 0.417652 0.384438 0.677567 0.391188 0.606207 1.000000 0.433918 0.844814 0.260627 0.920161
fans 0.010477 0.569677 0.277186 0.569677 0.528990 0.148062 0.274604 0.384312 0.224731 0.596729 0.226154 0.527343 0.433918 1.000000 0.381613 0.595719 0.476943
funny 0.002888 0.689398 0.328745 0.689398 0.589673 0.261302 0.419339 0.440480 0.448318 0.681456 0.431158 0.654503 0.844814 0.381613 1.000000 0.240260 0.843471
review_count 0.010328 0.351760 0.153434 0.351760 0.303549 0.103294 0.194481 0.247417 0.117788 0.352080 0.144913 0.348556 0.260627 0.595719 0.240260 1.000000 0.298556
useful 0.003633 0.658446 0.314600 0.658446 0.555599 0.239819 0.381040 0.430591 0.370907 0.694519 0.384007 0.601251 0.920161 0.476943 0.843471 0.298556 1.000000

In [163]:
def correlation_matrix(df):
    """Plot the pairwise column correlations of ``df`` as a heat map.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose correlation matrix (``df.corr()``) is drawn.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Compute the matrix once and take the tick labels from it, so the labels
    # always match the rows/columns actually drawn.  (They used to be read from
    # the global ``user`` frame regardless of which ``df`` was passed in.)
    corr = df.corr()
    labels = corr.columns

    fig = plt.figure(figsize=(16, 16))
    ax1 = fig.add_subplot(111)
    # 'jet' colormap resampled to 30 discrete levels.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Numeric Feature Correlation')
    ax1.set_xticks(np.arange(len(labels)))
    ax1.set_yticks(np.arange(len(labels)))
    ax1.set_xticklabels(labels, fontsize=10, rotation=90)
    ax1.set_yticklabels(labels, fontsize=10)
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    fig.colorbar(cax, ticks=[.75, .8, .85, .90, .95, 1])
    plt.show()

correlation_matrix(user.select_dtypes(include=['number']))



In [165]:
# Per-user average star rating against number of reviews written; axes are
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(user['average_stars'], user['review_count'])
ax.set_xlabel('Average Stars')
ax.set_ylabel('Review Count')
plt.show()



In [166]:
# Per-user average star rating against the user's `useful` count; axes are
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(user['average_stars'], user['useful'])
ax.set_xlabel('Average Stars')
ax.set_ylabel('Useful')
plt.show()



In [167]:
# Per-user review count against the user's `useful` count; axes are labelled
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(user['review_count'], user['useful'])
ax.set_xlabel('Review Count')
ax.set_ylabel('Useful')
plt.show()



In [168]:
# Per-user `useful` count against number of fans; axes are labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(user['useful'], user['fans'])
ax.set_xlabel('Useful')
ax.set_ylabel('Fans')
plt.show()



In [ ]:


In [ ]:

Checkin


In [ ]:
{
    "time":["an array of check ins with the format day-hour:number of check ins from hour to hour+1"],
    "business_id":"encrypted business id",
    "type":"checkin"
}

In [169]:
len(checkin)


Out[169]:
125532

In [170]:
checkin.columns


Out[170]:
Index([u'business_id', u'time', u'type'], dtype='object')

In [171]:
checkin.head()


Out[171]:
business_id time type
0 7KPBkxAOEtb3QeIL9PEErg [Fri-0:2, Sat-0:1, Sun-0:1, Wed-0:2, Sat-1:2, ... checkin
1 kREVIrSBbtqBhIYkTccQUg [Mon-13:1, Thu-13:1, Sat-16:1, Wed-17:1, Sun-1... checkin
2 tJRDll5yqpZwehenzE2cSg [Thu-0:1, Mon-1:1, Mon-12:1, Sat-16:1] checkin
3 nhZ1HGWD8lMErdn3FuWuTQ [Fri-0:1, Sat-0:1, Sun-0:1, Thu-0:1, Wed-0:1, ... checkin
4 8bY6M2yiWOF2ilfmGS34Fw [Sat-11:1, Fri-13:1, Thu-14:1] checkin

In [172]:
checkin['time'][0]


Out[172]:
[u'Fri-0:2', u'Sat-0:1', u'Sun-0:1', u'Wed-0:2', u'Sat-1:2', u'Thu-1:1', u'Wed-1:1', u'Sat-2:1', u'Sun-2:2', u'Thu-2:1', u'Wed-2:1', u'Fri-3:1', u'Sun-3:3', u'Thu-4:1', u'Tue-4:1', u'Sun-6:1', u'Wed-6:1', u'Fri-10:1', u'Sat-10:1', u'Mon-11:1', u'Wed-11:2', u'Mon-12:1', u'Sat-12:1', u'Tue-12:1', u'Sat-13:2', u'Thu-13:1', u'Tue-13:2', u'Wed-13:2', u'Fri-14:2', u'Sat-14:1', u'Wed-14:1', u'Fri-15:1', u'Sat-15:1', u'Thu-15:1', u'Tue-15:1', u'Fri-16:1', u'Sat-16:2', u'Sun-16:1', u'Tue-16:1', u'Sat-17:3', u'Sun-17:1', u'Fri-18:1', u'Mon-18:1', u'Sat-18:2', u'Sun-18:1', u'Tue-18:2', u'Wed-18:1', u'Fri-19:2', u'Mon-19:1', u'Sun-19:2', u'Thu-19:1', u'Wed-19:1', u'Mon-20:1', u'Sun-20:5', u'Thu-20:1', u'Tue-20:1', u'Wed-20:2', u'Fri-21:2', u'Sun-21:1', u'Thu-21:4', u'Tue-21:1', u'Wed-21:1', u'Fri-22:1', u'Thu-22:1', u'Fri-23:1', u'Mon-23:1', u'Sat-23:3', u'Sun-23:1', u'Thu-23:2', u'Tue-23:1']

In [ ]:


In [ ]:

Tip


In [ ]:
{
    "text":"text of the tip",
    "date":"date formatted like 2009-12-19",
    "likes":compliment count,
    "business_id":"encrypted business id",
    "user_id":"encrypted user id",
    "type":"tip"
}

In [173]:
len(tip)


Out[173]:
946600

In [174]:
tip.columns


Out[174]:
Index([u'business_id', u'date', u'likes', u'text', u'type', u'user_id'], dtype='object')

In [175]:
tip.head()


Out[175]:
business_id date likes text type user_id
0 tJRDll5yqpZwehenzE2cSg 2012-07-15 0 Get here early enough to have dinner. tip zcTZk7OG8ovAmh_fenH21g
1 jH19V2I9fIslnNhDzPmdkA 2015-08-12 0 Great breakfast large portions and friendly wa... tip ZcLKXikTHYOnYt5VYRO5sg
2 dAa0hB2yrnHzVmsCkN4YvQ 2014-06-20 0 Nice place. Great staff. A fixture in the tow... tip oaYhjqBbh18ZhU0bpyzSuw
3 dAa0hB2yrnHzVmsCkN4YvQ 2016-10-12 0 Happy hour 5-7 Monday - Friday tip ulQ8Nyj7jCUR8M83SUMoRQ
4 SqW3igh1_Png336VIb5DUA 2016-07-03 0 Come early on Sunday's to avoid the rush tip ulQ8Nyj7jCUR8M83SUMoRQ

In [176]:
# `likes` of every tip plotted against the frame's row index; axes are
# labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(tip['likes'])
ax.set_xlabel('Tip (row index)')
ax.set_ylabel('Likes')
plt.show()



In [ ]:


In [ ]:


In [ ]:


In [ ]: