In [4]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [10]:
import sys

# Python 2 only: make UTF-8 the implicit str<->unicode codec (presumably so
# the Yelp text fields load without UnicodeDecodeError -- the data files are
# also read with encoding='utf8').
# Python 3 has no builtin `reload` and no `sys.setdefaultencoding`, and its
# default encoding is already UTF-8, so running these two calls there raised
# NameError. Guard on the interpreter version instead.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- re-exposes setdefaultencoding on Python 2
    sys.setdefaultencoding('utf8')
In [12]:
# Business table: the file holds one JSON object per line, decoded as UTF-8.
business = pd.read_json(
    '../datasets/yelp_academic_dataset_business.json',
    encoding='utf8',
    lines=True,
)
In [13]:
# Check-in table: the file holds one JSON object per line, decoded as UTF-8.
checkin = pd.read_json(
    '../datasets/yelp_academic_dataset_checkin.json',
    encoding='utf8',
    lines=True,
)
In [14]:
# Review table: the file holds one JSON object per line, decoded as UTF-8.
review = pd.read_json(
    '../datasets/yelp_academic_dataset_review.json',
    encoding='utf8',
    lines=True,
)
In [15]:
# Tip table: the file holds one JSON object per line, decoded as UTF-8.
tip = pd.read_json(
    '../datasets/yelp_academic_dataset_tip.json',
    encoding='utf8',
    lines=True,
)
In [16]:
# User table: the file holds one JSON object per line, decoded as UTF-8.
user = pd.read_json(
    '../datasets/yelp_academic_dataset_user.json',
    encoding='utf8',
    lines=True,
)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
{
"business_id":"encrypted business id",
"name":"business name",
"neighborhood":"hood name",
"address":"full address",
"city":"city",
"state":"state -- if applicable --",
"postal code":"postal code",
"latitude":latitude,
"longitude":longitude,
"stars":star rating, ***rounded to half-stars***,
"review_count":number of reviews,
"is_open":0/1 (closed/open),
"attributes":["an array of strings: each array element is an attribute"],
"categories":["an array of strings of business categories"],
"hours":["an array of strings of business hours"],
"type": "business"
}
In [53]:
# Row count of the business table, rendered as a labelled string.
'Size of the business dataset: ' + str(business.shape[0])
Out[53]:
In [66]:
# Show the column labels of the business table.
business.columns
Out[66]:
In [115]:
# Inspect the raw `attributes` value of the row labelled 12.
business.loc[12, 'attributes']
Out[115]:
In [121]:
# Inspect the raw `categories` value of the row labelled 20.
business.loc[20, 'categories']
Out[121]:
In [73]:
# Preview the first five business rows.
business.head(n=5)
Out[73]:
In [60]:
# Share of businesses with is_open == 1, reported as an actual percentage.
# The earlier expression showed the raw 0-1 fraction under a "Percentage"
# label. The mean of the 0/1 flag is that same fraction, and it gives NaN
# instead of ZeroDivisionError when the frame is empty.
'Percentage of open businesses: {:.2f}%'.format(business['is_open'].mean() * 100)
Out[60]:
In [20]:
# Number of distinct values in the `city` column.
len(pd.unique(business['city']))
Out[20]:
In [30]:
# Ten most frequent cities (value_counts sorts by descending frequency).
business['city'].value_counts().iloc[:10]
Out[30]:
In [31]:
# Ten least frequent cities (the end of the descending frequency table).
business['city'].value_counts().iloc[-10:]
Out[31]:
In [81]:
# Number of distinct values in the `state` column.
len(pd.unique(business['state']))
Out[81]:
In [82]:
# Ten most frequent states (value_counts sorts by descending frequency).
business['state'].value_counts().iloc[:10]
Out[82]:
In [83]:
# Ten least frequent states (the end of the descending frequency table).
business['state'].value_counts().iloc[-10:]
Out[83]:
In [79]:
# Review volume against star rating, one point per business.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(business['review_count'], business['stars'])
ax.set_xlabel('Review Counts')
ax.set_ylabel('Stars')
plt.show()
In [93]:
# Median review_count per state. Select the column before aggregating:
# taking the median of every column first is wasted work, and on pandas >= 2.0
# it raises TypeError for the frame's non-numeric columns.
business.groupby('state')['review_count'].median()
Out[93]:
In [95]:
# Median star rating per state. Select the column before aggregating:
# taking the median of every column first is wasted work, and on pandas >= 2.0
# it raises TypeError for the frame's non-numeric columns.
business.groupby('state')['stars'].median()
Out[95]:
In [107]:
# Show the row(s) for one specific business id.
business.loc[business['business_id'].eq('2LfIuF3_sX6uwe-IR-P0jQ')]
Out[107]:
In [140]:
# Summary statistics for the business table.
business.describe()
Out[140]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
{
"review_id":"encrypted review id",
"user_id":"encrypted user id",
"business_id":"encrypted business id",
"stars":star rating, rounded to half-stars,
"date":"date formatted like 2009-12-19",
"text":"review text",
"useful":number of useful votes received,
"funny":number of funny votes received,
"cool": number of cool review votes received,
"type": "review"
}
Sources: Yelp Dataset Challenge (https://www.yelp.com/dataset_challenge); Yelp's Recommended Reviews help page (https://www.yelp-support.com/Recommended_Reviews)
In [102]:
# Number of rows in the review table.
review.shape[0]
Out[102]:
In [98]:
# Preview the first five review rows.
review.head(n=5)
Out[98]:
In [101]:
# Largest `useful` value found on any single review.
review['useful'].max()
Out[101]:
In [123]:
# Mean of the `stars` values over all reviews of one specific business id.
review.loc[review['business_id'] == '2LfIuF3_sX6uwe-IR-P0jQ', 'stars'].mean()
Out[123]:
In [125]:
# Show every review row for one specific business id.
review.loc[review['business_id'].eq('2aFiy99vNLklCx3T_tGS9A')]
Out[125]:
In [128]:
# Number of distinct review ids.
len(pd.unique(review['review_id']))
Out[128]:
In [131]:
# `cool` votes against the star rating, one point per review.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['cool'])
ax.set_xlabel('Star')
ax.set_ylabel('Cool')
plt.show()
In [133]:
# `useful` votes against the star rating, one point per review.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['useful'])
ax.set_xlabel('Star')
ax.set_ylabel('Useful')
plt.show()
In [135]:
# `funny` votes against the star rating, one point per review.
fig, ax = plt.subplots()
ax.scatter(review['stars'], review['funny'])
ax.set_xlabel('Star')
ax.set_ylabel('Funny')
plt.show()
In [141]:
# Summary statistics for the review table.
review.describe()
Out[141]:
In [ ]:
In [ ]:
In [ ]:
{
"user_id":"encrypted user id",
"name":"first name",
"review_count":number of reviews,
"yelping_since": date formatted like "2009-12-19",
"friends":["an array of encrypted ids of friends"],
"useful":"number of useful votes sent by the user",
"funny":"number of funny votes sent by the user",
"cool":"number of cool votes sent by the user",
"fans":"number of fans the user has",
"elite":["an array of years the user was elite"],
"average_stars":floating point average like 4.31,
"compliment_hot":number of hot compliments received by the user,
"compliment_more":number of more compliments received by the user,
"compliment_profile": number of profile compliments received by the user,
"compliment_cute": number of cute compliments received by the user,
"compliment_list": number of list compliments received by the user,
"compliment_note": number of note compliments received by the user,
"compliment_plain": number of plain compliments received by the user,
"compliment_cool": number of cool compliments received by the user,
"compliment_funny": number of funny compliments received by the user,
"compliment_writer": number of writer compliments received by the user,
"compliment_photos": number of photo compliments received by the user,
"type":"user"
}
In [142]:
# Number of rows in the user table.
user.shape[0]
Out[142]:
In [138]:
# Show the column labels of the user table.
user.columns
Out[138]:
In [136]:
# Preview the first five user rows.
user.head(n=5)
Out[136]:
In [158]:
# Labels of the numeric user columns (np.number is the dtype behind 'number').
user.select_dtypes(include=[np.number]).columns
Out[158]:
In [159]:
# Pairwise correlation table over the numeric user columns.
user.select_dtypes(include=[np.number]).corr()
Out[159]:
In [163]:
def correlation_matrix(df):
    """Draw a heat map of the pairwise correlations between df's columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated with ``df.corr()``. The call
        below passes only the numeric columns of ``user``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    # Compute the matrix once; it feeds both the image and the tick labels.
    corr = df.corr()
    # Take the labels from the matrix being drawn. They used to be read from
    # the global `user` frame, so any other input frame was mislabelled.
    labels = corr.columns

    fig = plt.figure(figsize=(16, 16))
    ax1 = fig.add_subplot(111)
    # pyplot.get_cmap stands in for matplotlib.cm.get_cmap, which current
    # Matplotlib releases no longer provide. 30 = number of colour steps.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Numeric Feature Correlation')

    ax1.set_xticks(np.arange(len(labels)))
    ax1.set_yticks(np.arange(len(labels)))
    ax1.set_xticklabels(labels, fontsize=10, rotation=90)
    ax1.set_yticklabels(labels, fontsize=10)

    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    # NOTE(review): these ticks only annotate the 0.75-1 end of the scale;
    # lower correlations are drawn but get no tick on the colour bar.
    fig.colorbar(cax, ticks=[.75, .8, .85, .90, .95, 1])
    plt.show()


correlation_matrix(user.select_dtypes(include=['number']))
In [165]:
# Average rating against review count, one point per user. Axis labels
# added so the figure can be read on its own, like the other scatter cells
# in this notebook.
fig, ax = plt.subplots()
ax.scatter(user['average_stars'], user['review_count'])
ax.set_xlabel('Average Stars')
ax.set_ylabel('Review Count')
plt.show()
In [166]:
# Average rating against `useful` votes, one point per user. Axis labels
# added so the figure can be read on its own, like the other scatter cells
# in this notebook.
fig, ax = plt.subplots()
ax.scatter(user['average_stars'], user['useful'])
ax.set_xlabel('Average Stars')
ax.set_ylabel('Useful')
plt.show()
In [167]:
# Review count against `useful` votes, one point per user. Axis labels
# added so the figure can be read on its own, like the other scatter cells
# in this notebook.
fig, ax = plt.subplots()
ax.scatter(user['review_count'], user['useful'])
ax.set_xlabel('Review Count')
ax.set_ylabel('Useful')
plt.show()
In [168]:
# `useful` votes against fan count, one point per user. Axis labels added
# so the figure can be read on its own, like the other scatter cells in
# this notebook.
fig, ax = plt.subplots()
ax.scatter(user['useful'], user['fans'])
ax.set_xlabel('Useful')
ax.set_ylabel('Fans')
plt.show()
In [ ]:
In [ ]:
In [ ]:
{
"time":["an array of check ins with the format day-hour:number of check ins from hour to hour+1"],
"business_id":"encrypted business id",
"type":"checkin"
}
In [169]:
# Number of rows in the check-in table.
checkin.shape[0]
Out[169]:
In [170]:
# Show the column labels of the check-in table.
checkin.columns
Out[170]:
In [171]:
# Preview the first five check-in rows.
checkin.head(n=5)
Out[171]:
In [172]:
# Inspect the raw `time` value of the row labelled 0.
checkin.loc[0, 'time']
Out[172]:
In [ ]:
In [ ]:
In [ ]:
{
"text":"text of the tip",
"date":"date formatted like 2009-12-19",
"likes":compliment count,
"business_id":"encrypted business id",
"user_id":"encrypted user id",
"type":"tip"
}
In [173]:
# Number of rows in the tip table.
tip.shape[0]
Out[173]:
In [174]:
# Show the column labels of the tip table.
tip.columns
Out[174]:
In [175]:
# Preview the first five tip rows.
tip.head(n=5)
Out[175]:
In [176]:
# `likes` per tip, drawn in row order against the frame's index. Axis labels
# added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(tip['likes'])
ax.set_xlabel('Tip (row index)')
ax.set_ylabel('Likes')
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]: