In [95]:
import gzip, urllib2, re
import dill as pickle
import simplejson as json
import pandas as pd
import numpy as np
import collections
from scipy import stats
from StringIO import StringIO
from sklearn import datasets, decomposition, linear_model, grid_search, utils, preprocessing, cross_validation, neighbors, ensemble
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
The goal of this project is to use the Yelp dataset to predict a new venue's popularity from the information available when the venue opens. The dataset contains metadata about each venue (location, type of food served, venue attributes, etc.) along with its star rating.
There are two parts to this project. The first is importing the data and splitting it into training and test sets. The second is building and testing predictive models; for this part I use scikit-learn to train and evaluate various machine learning algorithms on the data.
In [3]:
class HandleGZippedJSON:
    """Downloads a gzipped JSON-lines file and parses it into a list of records."""
    def __init__(self, url):
        self.url = url
        self.json_data = None

    def run(self):
        request = urllib2.Request(self.url)
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        f = opener.open(request)
        c_data = f.read()
        c_stream = StringIO(c_data)
        gzipper = gzip.GzipFile(fileobj=c_stream)
        data = gzipper.read()
        output = data.splitlines()
        datastr = []
        for line in output:
            try:
                datastr.append(json.loads(line))
            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                print 'Decoding JSON has failed'
        return datastr

fileurl = "http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_business.json.gz"
out = HandleGZippedJSON(fileurl)
xfile = out.run()
yout = [record['stars'] for record in xfile]

#splitting dataset into testing and training sets
x_train, x_test, y_train, y_test = cross_validation.train_test_split(xfile, yout)
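For reference, this is a quick way to inspect the fields available in each business record (the field list in the comment reflects what the extractors below expect):

In [ ]:
# peek at the fields of one business record
print sorted(xfile[0].keys())
# expected fields include: attributes, business_id, categories, city, full_address,
# hours, latitude, longitude, name, neighborhoods, open, review_count, stars, state, type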
I test the predictive power of various features using scikit-learn's Transformer and Estimator classes, FeatureUnion (which runs several transformers in parallel and concatenates their outputs), and Pipeline (which chains together transformers, FeatureUnions, and a final estimator into a single workflow).
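As a minimal sketch of how these pieces compose (the toy transformer, column indices, and data below are purely illustrative, not part of the actual model):

In [ ]:
class PickColumn(TransformerMixin):
    # toy transformer: select one column from a list of feature rows
    def __init__(self, idx):
        self.idx = idx
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return [[row[self.idx]] for row in X]

# FeatureUnion runs both selectors and concatenates their outputs column-wise;
# the Pipeline then feeds the resulting matrix to the final estimator.
toy = Pipeline([
    ('features', FeatureUnion([
        ('first', PickColumn(0)),
        ('second', PickColumn(1)),
    ])),
    ('estimate', linear_model.LinearRegression())
])
toy.fit([[1., 2.], [2., 1.], [3., 3.]], [3., 3., 6.])
print toy.predict([[4., 4.]])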
In [99]:
class ModelTransformer(BaseEstimator, TransformerMixin):
    """Wraps a model so its predictions can serve as a feature column in a FeatureUnion."""
    def __init__(self, model, name):
        self.model = model
        self.name = name
    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self
    def transform(self, X, **transform_params):
        # one prediction per record, forming a single feature column
        return [self.model.predict(x) for x in X]
class ColumnSelector(TransformerMixin):
    def __init__(self, namecol):
        self.namecol = namecol
    def fit(self, data, y=None):
        return self
    def transform(self, data):
        if type(data) is list:
            df = pd.DataFrame(data)
            D = df[self.namecol]
        elif type(data) is dict:
            df = pd.DataFrame(columns=[self.namecol], index=['x'])
            df.loc['x'] = pd.Series({self.namecol: data[self.namecol]})
            D = df[self.namecol]
        return D
class DictList(TransformerMixin):
    """Flattens nested attribute dictionaries into single-level dictionaries."""
    def fit(self, data, y=None):
        return self
    def transform(self, data):
        def flatten(d, parent_key='', sep='_'):
            # e.g. {'Ambience': {'casual': False}} -> {'Ambience_casual': False}
            items = []
            for k, v in d.items():
                new_key = parent_key + sep + k if parent_key else k
                if isinstance(v, collections.MutableMapping):
                    items.extend(flatten(v, new_key, sep=sep).items())
                else:
                    items.append((new_key, v))
            return dict(items)
        return [flatten(d) for d in data]
class NestExtractor(TransformerMixin):
    """Casts booleans to 0/1 and one-hot encodes string values
    (e.g. {'Attire': 'casual'} becomes {'Attire_casual': 1})."""
    def fit(self, data, y=None):
        return self
    def transform(self, data):
        dictlist = []
        for record in data:
            # .items() returns a list in Python 2, so mutating record inside the loop is safe
            for key, value in record.items():
                if type(value) is bool:
                    record[key] = int(value)
                if isinstance(value, basestring):  # JSON strings are unicode in Python 2
                    record[key + "_" + value] = 1
                    del record[key]
            dictlist.append(record)
        return dictlist
class CategoriesExtractor(TransformerMixin):
    """One-hot encodes each venue's list of categories via a DictVectorizer."""
    def __init__(self):
        self.t = DictVectorizer()
    def _to_dicts(self, data):
        # extract the 'categories' column, then mark each category present with a 1
        if type(data) is dict:
            df = pd.DataFrame(columns=['categories'], index=['x'])
            df.loc['x'] = pd.Series({'categories': data['categories']})
        else:
            df = pd.DataFrame(data)
        return [dict((word, 1) for word in line) for line in df['categories']]
    def fit(self, data, y=None):
        self.t.fit(self._to_dicts(data))
        return self
    def transform(self, data):
        return self.t.transform(self._to_dicts(data))
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        return X.todense()
class LatLongExtractor(TransformerMixin):
    """Keeps only the latitude/longitude columns of the data."""
    def fit(self, *_):
        return self
    def transform(self, X):
        if type(X) is dict:
            df = pd.DataFrame(columns=['latitude', 'longitude'], index=['x'])
            df.loc['x'] = pd.Series({'latitude': X['latitude'], 'longitude': X['longitude']})
            return df
        # list (or other tabular) input: drop every column except latitude/longitude
        df = pd.DataFrame(X)
        df.drop(['attributes', 'business_id', 'categories', 'city', 'full_address',
                 'hours', 'name', 'neighborhoods', 'open', 'review_count', 'stars',
                 'state', 'type'], inplace=True, axis=1)
        return df
class ColumnExtractor(TransformerMixin):
    def __init__(self, namecols):
        self.namecols = namecols
    def fit(self, *_):
        return self
    def transform(self, X, *args, **transform_params):
        if isinstance(X, dict):
            return [X[arg] for arg in self.namecols]
        elif isinstance(X, list):
            return [[rec[arg] for arg in self.namecols] for rec in X]
class LinearRegressor2(BaseEstimator, RegressorMixin):
    """Linear regression that returns plain floats (and a bare scalar for a single record)."""
    def __init__(self):
        self.linear_regression = linear_model.LinearRegression()
    def fit(self, X, y):
        self.linear_regression.fit(X, y)
        return self
    def transform(self, X):
        return X
    def predict(self, X):
        Xy = self.linear_regression.predict(X)
        if type(Xy) is np.ndarray:
            Xy = [float(record) for record in Xy]
            if len(Xy) < 2:
                Xy = Xy[0]
        return Xy
class EnsembleRegressor(BaseEstimator, RegressorMixin):
    """Random forest regressor that returns its predictions as a plain list."""
    def fit(self, X, y):
        self.random_forest = ensemble.RandomForestRegressor(min_samples_leaf=20).fit(X, y)
        return self  # sklearn convention: fit returns self, not the inner model
    def transform(self, X):
        return X
    def predict(self, X):
        Xy = self.random_forest.predict(X)
        return Xy if type(Xy) is list else Xy.tolist()
(1) I first test the predictive power of the city of the venue (perhaps some cities have higher average ratings than others). The score (the R² returned by scikit-learn's score method) is very low, indicating that city alone is not predictive.
In [49]:
#BASED ON CITY
class EstimatorKL(BaseEstimator, RegressorMixin):
    """Predicts a venue's rating as the mean rating of its city."""
    def fit(self, X, y):
        df = pd.DataFrame(X)
        grouped = df.groupby('city', group_keys=True)
        cities_avg = {}
        for k, gp in grouped:
            cities_avg[str(k)] = gp['stars'].mean()
        self.ds = cities_avg
        return self  # sklearn convention: fit returns self
    def predict(self, X):
        akey = []
        for record in X:
            try:
                akey.append(self.ds[record['city']])
            except KeyError:
                # city not seen in training: fall back to roughly the overall mean rating
                akey.append(3.67)
        return akey

estimator = EstimatorKL()        # initialize
estimator.fit(x_train, y_train)  # fit data
print estimator.score(x_test, y_test)
(2) Perhaps the city-based model isn't fine-grained enough. I therefore run both a Random Forest and a K-Nearest-Neighbors model on the venue's location (latitude/longitude) to test how predictive neighborhoods are. The scores show that the predictive power of location, while better than city, is still very weak. Moreover, the Random Forest regressor and the KNN regressor have about the same predictive power.
In [91]:
#BASED ON LATITUDE AND LONGITUDE
#Random Forest Regressor
pipeline = Pipeline([
    ('latlong', LatLongExtractor()),
    ('rforest', ensemble.RandomForestRegressor(min_samples_leaf=20))
])
pipeline.fit(x_train, y_train)
print "The score for the Random Forest Regressor is:"
print pipeline.score(x_test, y_test)

#K-Nearest Neighbors Regressor, with the number of neighbors chosen by grid search
x_latlong = [[xa['latitude'], xa['longitude']] for xa in x_train]
cv = cross_validation.ShuffleSplit(len(y_train), n_iter=20, test_size=0.2, random_state=40)
param_grid = {"n_neighbors": range(4, 100, 5)}
nearest_neighbors_cv = grid_search.GridSearchCV(neighbors.KNeighborsRegressor(), param_grid=param_grid, cv=cv)
nearest_neighbors_cv.fit(x_latlong, y_train)
print "\n"
print nearest_neighbors_cv.best_estimator_

pipeline2 = Pipeline([
    ('latlong', LatLongExtractor()),
    ('knn', nearest_neighbors_cv.best_estimator_)
])
pipeline2.fit(x_train, y_train)
print "\nThe score for the KNN Regressor is:"
print pipeline2.score(x_test, y_test)
(3) Venues have categories with varying degrees of specificity, such as:

[American (Traditional), Restaurants]
[Restaurants]
[Doctors, Health & Medical]
For this part, (a) I built a custom transformer that massages the data and feeds it into a DictVectorizer, generating a large matrix via one-hot encoding. (b) I then used a TfidfTransformer to normalize the categories so that the influence of common, high-frequency categories (e.g., Restaurants) is lessened and that of uncommon ones increased. (c) Finally, I ran a ridge regression. I didn't use a nonlinear predictor such as KNN for these category features because of the high dimensionality, and I avoided a plain linear regression because of concerns about overfitting.
The score (.17) shows that categories are weak predictors of Yelp ratings.
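As a small illustration of step (a), here is how the example category lists above become a one-hot matrix (a standalone sketch; the actual model uses the CategoriesExtractor class defined earlier):

In [ ]:
# the three example category lists from above
categories = [['American (Traditional)', 'Restaurants'],
              ['Restaurants'],
              ['Doctors', 'Health & Medical']]

# mark each category present in a record with a 1...
catdicts = [dict((c, 1) for c in cats) for cats in categories]

# ...and let DictVectorizer one-hot encode them into a matrix
v = DictVectorizer(sparse=False)
print v.fit_transform(catdicts)    # 3 rows x 5 category columns
print v.get_feature_names()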
In [109]:
#BASED ON CATEGORIES
pipeline = Pipeline([
    ('catextract', CategoriesExtractor()),
    ('tfidf', TfidfTransformer()),
    ('estimate', linear_model.Ridge())
])
pipeline.fit(x_train, y_train)
print pipeline.score(x_test, y_test)
I will now check the predictive power of venue attributes (Wi-Fi, good for groups, etc.).

(1) Attributes are a mixture of nested and unnested:

{'Attire': 'casual',
 'Accepts Credit Cards': True,
 'Ambience': {'casual': False, 'classy': False}}

I flatten the nested attributes and transform them into a feature matrix via one-hot encoding, so that the above becomes:

{'Attire_casual': 1,
 'Accepts Credit Cards': 1,
 'Ambience_casual': 0,
 'Ambience_classy': 0}

(2) I repeat the steps above: running the result through a DictVectorizer, a TfidfTransformer, and finally a Ridge regression.

As the score shows, venue attributes are a very weak predictor of Yelp ratings.
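Before running the full pipeline, here is a quick sketch of the flattening step on the example above (the flatten helper repeats the logic inside DictList; the NestExtractor step then converts the booleans to 0/1 and expands strings into keys like 'Attire_casual'):

In [ ]:
def flatten(d, parent_key='', sep='_'):
    # same flattening logic as in DictList above
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

attrs = {'Attire': 'casual',
         'Accepts Credit Cards': True,
         'Ambience': {'casual': False, 'classy': False}}
print flatten(attrs)
# -> {'Attire': 'casual', 'Accepts Credit Cards': True,
#     'Ambience_casual': False, 'Ambience_classy': False}  (key order may vary)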
In [110]:
#BASED ON ATTRIBUTES
pipeline = Pipeline([
    ('att_extractor', ColumnSelector('attributes')),
    ('unnestor', DictList()),
    ('nobooler', NestExtractor()),
    ('dictv', DictVectorizer(sparse=False)),  # sparse expects a boolean, not the string 'FALSE'
    ('tfidf', TfidfTransformer()),
    ('estimate', linear_model.Ridge())
])
pipeline.fit(x_train, y_train)
print pipeline.score(x_test, y_test)
So far we have three weak predictors: location, categories, and attributes. I will now build a model combining these predictors using scikit-learn's FeatureUnion. Since FeatureUnion only accepts transformers, I turn the final estimator of each predictor into a transformer using the ModelTransformer class created above.

While this combined model (score: .15) fares better than the lat/long model (score: .02) or the attributes model (score: .08), it is outscored by the categories predictor (score: .17) alone.
In [112]:
pipeline_all = Pipeline([
    ('festimators', FeatureUnion([
        ('categories', Pipeline([
            ('catextract', CategoriesExtractor()),
            ('tfidf', TfidfTransformer()),
            ('lregcat', ModelTransformer(linear_model.Ridge(), 'cat'))
        ])),
        ('latlong', Pipeline([
            ('latlong', ColumnExtractor(['latitude', 'longitude'])),
            ('rforest', ModelTransformer(ensemble.RandomForestRegressor(min_samples_leaf=20), 'rforest'))
        ])),
        ('attributes', Pipeline([
            ('att_extractor', ColumnSelector('attributes')),
            ('unnestor', DictList()),
            ('nobooler', NestExtractor()),
            ('dictv', DictVectorizer(sparse=False)),  # boolean, not the string 'FALSE'
            ('tfidf', TfidfTransformer()),
            ('linearize', ModelTransformer(linear_model.Ridge(), 'att'))
        ]))
    ])),
    ('estimate', linear_model.LinearRegression())
])
pipeline_all.fit(x_train, y_train)
print pipeline_all.score(x_test, y_test)
Next, I will explore whether applying NLP to the text of Yelp reviews yields better predictions.
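A minimal sketch of the kind of text pipeline this would involve (review_texts, review_stars, and their test-set counterparts are hypothetical placeholders for the review dataset, which would be loaded the same way as the business data above):

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),  # bag of words/bigrams with tf-idf weighting
    ('estimate', linear_model.Ridge())
])
# review_texts: list of review strings; review_stars: their star ratings (placeholders)
text_pipeline.fit(review_texts, review_stars)
print text_pipeline.score(review_texts_test, review_stars_test)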