In [95]:
import gzip, urllib2, re

import dill as pickle
import simplejson as json
import pandas as pd
import numpy as np
import collections

from scipy import stats
from StringIO import StringIO

from sklearn import datasets, decomposition, linear_model, grid_search, utils, preprocessing, cross_validation, neighbors, ensemble
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

PREDICTING A VENUE'S POPULARITY

The goal of this project is to use the Yelp dataset to predict a new venue's popularity from information available when the venue opens. The dataset contains metadata about each venue (where it is located, the type of food served, venue attributes, etc.) along with its star rating.

There are two parts to this project. The first is importing the data and splitting it into training and test sets. The second is building and testing predictive models, using scikit-learn to train and evaluate various machine learning algorithms on the data.

IMPORTING THE DATA


In [3]:
class HandleGZippedJSON:
    """Downloads a gzipped JSON-lines file and parses it into a list of records."""

    def __init__(self, url):
        self.url = url
        self.json_data = None

    def run(self):
        # Request the gzipped file and decompress it in memory
        request = urllib2.Request(self.url)
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        f = opener.open(request)
        c_data = f.read()
        c_stream = StringIO(c_data)
        gzipper = gzip.GzipFile(fileobj=c_stream)
        data = gzipper.read()

        # Parse one JSON record per line, skipping malformed lines
        datastr = []
        for line in data.splitlines():
            try:
                datastr.append(json.loads(line))
            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                print 'Decoding JSON has failed'

        return datastr
    
fileurl = "http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_business.json.gz"
xfile = HandleGZippedJSON(fileurl).run()

# The target variable: each venue's star rating
yout = [record['stars'] for record in xfile]

# Splitting the dataset into training and test sets
x_train, x_test, y_train, y_test = cross_validation.train_test_split(xfile, yout)

ANALYTICS

I test the predictive power of various features using scikit-learn's Transformer and Estimator classes, FeatureUnion (which combines the outputs of several transformers side by side), and Pipeline (which chains transformers, FeatureUnions, and a final estimator into a single workflow).
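
To make this pattern concrete, here is a minimal, self-contained sketch (the Col0/Col1 transformers and the toy data are hypothetical, purely illustrative): the FeatureUnion lays the two transformers' outputs side by side as columns, and the Pipeline feeds that combined matrix into a final estimator.

class Col0(TransformerMixin):
    # Toy transformer: emit the first element of each record as a feature
    def fit(self, X, y=None): return self
    def transform(self, X): return [[row[0]] for row in X]

class Col1(TransformerMixin):
    # Toy transformer: emit the second element of each record as a feature
    def fit(self, X, y=None): return self
    def transform(self, X): return [[row[1]] for row in X]

toy_pipeline = Pipeline([
    ('features', FeatureUnion([('c0', Col0()), ('c1', Col1())])),  # columns stacked side by side
    ('estimate', linear_model.LinearRegression())                  # final estimator
])
toy_pipeline.fit([[0, 1], [1, 0], [2, 3]], [1, 1, 5])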


In [99]:
class ModelTransformer(BaseEstimator, TransformerMixin):
    """Wraps an estimator so that its predictions can serve as features
    inside a FeatureUnion."""

    def __init__(self, model, name):
        self.model = model
        self.name = name  # label only; not used in fitting

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        # Emit the wrapped model's prediction for each sample as a feature
        return [self.model.predict(x) for x in X]

class ColumnSelector(TransformerMixin):
    """Selects a single named column from a list of records or a single record."""

    def __init__(self, namecol):
        self.namecol = namecol

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        if type(data) is list:
            df = pd.DataFrame(data)
        elif type(data) is dict:
            df = pd.DataFrame(columns=[self.namecol], index=['x'])
            df.loc['x'] = pd.Series({self.namecol: data[self.namecol]})
        return df[self.namecol]

class DictList(TransformerMixin):
    """Flattens nested dicts, joining keys with '_'
    (e.g. {'Ambience': {'casual': False}} -> {'Ambience_casual': False})."""

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        def flatten(d, parent_key='', sep='_'):
            items = []
            for k, v in d.items():
                new_key = parent_key + sep + k if parent_key else k
                if isinstance(v, collections.MutableMapping):
                    # Recurse into nested dicts
                    items.extend(flatten(v, new_key, sep=sep).items())
                else:
                    items.append((new_key, v))
            return dict(items)

        return [flatten(d) for d in data]
    
class NestExtractor(TransformerMixin):
    """Converts booleans to 0/1 and one-hot encodes string values
    (e.g. {'Attire': 'casual'} -> {'Attire_casual': 1})."""

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        dictlist = []
        for d in data:
            # .items() returns a copy of the pairs in Python 2,
            # so mutating d inside the loop is safe
            for key, value in d.items():
                if type(value) is bool:
                    d[key] = int(value)
                # basestring also catches the unicode values simplejson returns
                if isinstance(value, basestring):
                    d[key + "_" + value] = 1
                    del d[key]
            dictlist.append(d)
        return dictlist
    
class CategoriesExtractor(TransformerMixin):
    """One-hot encodes each venue's 'categories' list via DictVectorizer."""

    def __init__(self):
        self.t = DictVectorizer()

    def _category_dicts(self, data):
        # Normalize the input (list of records, single record, or DataFrame)
        # and map each venue's category list to a {category: 1} dict
        if type(data) is dict:
            df = pd.DataFrame(columns=['categories'], index=['x'])
            df.loc['x'] = pd.Series({'categories': data['categories']})
        else:
            df = pd.DataFrame(data)
        return [dict((word, 1) for word in line) for line in df['categories']]

    def fit(self, data, y=None):
        self.t.fit(self._category_dicts(data))
        return self

    def transform(self, data):
        return self.t.transform(self._category_dicts(data))
    
class DenseTransformer(TransformerMixin):
    """Converts a sparse matrix to a dense one."""

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.todense()
    
class LatLongExtractor(TransformerMixin):
    """Keeps only the latitude and longitude columns."""

    def fit(self, *_):
        return self

    def transform(self, X):
        if type(X) is list:
            return pd.DataFrame(X)[['latitude', 'longitude']]
        elif type(X) is dict:
            df = pd.DataFrame(columns=['latitude', 'longitude'], index=['x'])
            df.loc['x'] = pd.Series({'latitude': X['latitude'], 'longitude': X['longitude']})
            return df

class ColumnExtractor(TransformerMixin):
    """Extracts the named fields from a record or a list of records."""

    def __init__(self, namecols):
        self.namecols = namecols

    def fit(self, *_):
        return self

    def transform(self, X, *args, **transform_params):
        if isinstance(X, dict):
            return [X[arg] for arg in self.namecols]
        elif isinstance(X, list):
            return [[rec[arg] for arg in self.namecols] for rec in X]
        
class LinearRegressor2(BaseEstimator, RegressorMixin):
    """Linear regression that returns plain Python floats
    (a scalar for a single sample, a list otherwise)."""

    def __init__(self):
        self.linear_regression = linear_model.LinearRegression()

    def fit(self, X, y):
        self.linear_regression.fit(X, y)
        return self

    def transform(self, X):
        return X

    def predict(self, X):
        Xy = self.linear_regression.predict(X)
        if type(Xy) is np.ndarray:
            Xy = [float(record) for record in Xy]
        if len(Xy) < 2:
            return Xy[0]
        return Xy

class EnsembleRegressor(BaseEstimator, RegressorMixin):
    """Random forest regressor that returns predictions as a plain list."""

    def fit(self, X, y):
        self.random_forest = ensemble.RandomForestRegressor(min_samples_leaf=20).fit(X, y)
        return self  # sklearn convention: fit returns self

    def transform(self, X):
        return X

    def predict(self, X):
        Xy = self.random_forest.predict(X)
        return Xy if type(Xy) is list else Xy.tolist()

INVESTIGATING HOW PREDICTIVE THE CITY-FEATURE IS

(1) I first test the predictive power of the venue's city (perhaps some cities have higher average ratings than others). I get a very low score, indicating that the city alone is not predictive.


In [49]:
#BASED ON CITY

class EstimatorKL(BaseEstimator, RegressorMixin):
    """Predicts a venue's rating as the mean star rating of its city."""

    def fit(self, X, y):
        # Map each city to the average star rating of its venues
        df = pd.DataFrame(X)
        self.ds = df.groupby('city')['stars'].mean().to_dict()
        return self

    def predict(self, X):
        akey = []
        for record in X:
            try:
                akey.append(self.ds[record['city']])
            except KeyError:
                # Fall back to a global average rating for unseen cities
                akey.append(3.67)
        return akey
    
estimator = EstimatorKL()  # initialize
estimator.fit(x_train,y_train)  # fit data
print estimator.score(x_test,y_test)


0.00492804346835

INCORPORATING SPATIAL INFORMATION

(2) Perhaps the city-based model isn't fine-grained enough. I therefore run both a Random Forest and a K-Nearest Neighbors model on the venue's location (latitude/longitude) to test how predictive neighborhoods are. The scores show that the predictive power of location, while better than that of city, is still very weak, and the Random Forest and KNN regressors have roughly the same predictive power.


In [91]:
#BASED ON LATITUDE AND LONGITUDE

#RandomForest Regressor
pipeline = Pipeline([
    ('latlong', LatLongExtractor()),
    ('rforest', ensemble.RandomForestRegressor(min_samples_leaf=20))
])

pipeline.fit(x_train, y_train)
print " The score for the Random Forest Regressor is: \n"
print pipeline.score(x_test, y_test)

#K-Nearest Neighbors Regressor

#cross-validate over k to pick the best number of neighbors
x_latlong = [[xa['latitude'], xa['longitude']] for xa in x_train]
cv = cross_validation.ShuffleSplit(len(y_train), n_iter=20, test_size=0.2, random_state=40)
param_grid = {"n_neighbors": range(4, 100, 5)}
nearest_neighbors_cv = grid_search.GridSearchCV(neighbors.KNeighborsRegressor(), param_grid=param_grid, cv=cv)
nearest_neighbors_cv.fit(x_latlong, y_train)
print("\n")
print(nearest_neighbors_cv.best_estimator_)

pipeline2 = Pipeline([
    ('latlong', LatLongExtractor()),
    ('knn', nearest_neighbors_cv.best_estimator_)
])

pipeline2.fit(x_train, y_train)
print "\n The score for the KNN Regressor is: \n"
print pipeline2.score(x_test, y_test)


 The score for the Random Forest Regressor is: 

0.0173717385617


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          n_neighbors=59, p=2, weights='uniform')

 The score for the KNN Regressor is: 

0.0179731781387

MODELING CATEGORY-FEATURES

(3) Venues have categories with varying degrees of specificity, for example:

[American (Traditional), Restaurants]
[Restaurants]
[Doctors, Health & Medical]

For this part, (a) I build a custom transformer that massages the data and feeds it into a DictVectorizer, generating a large sparse matrix via one-hot encoding. (b) I then use the TfidfTransformer to normalize the categories so that the influence of common, high-frequency categories (e.g. Restaurants) is lessened and the influence of uncommon categories increased. (c) Finally, I run a ridge regression. I avoid a nonlinear predictor such as KNN for these category features because of the high dimensionality, and I avoid an unregularized linear regression because of concerns about overfitting.

The score (.17) shows that categories are weak predictors of Yelp ratings.
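
To make steps (a) and (b) concrete, here is a minimal sketch on made-up category data (the toy records are mine, not drawn from the dataset): DictVectorizer one-hot encodes the category dicts, and TfidfTransformer then downweights columns that occur in many venues.

# Toy illustration of steps (a)-(b); the data here is invented
toy_categories = [{'Restaurants': 1, 'American (Traditional)': 1},
                  {'Restaurants': 1},
                  {'Doctors': 1, 'Health & Medical': 1}]
onehot = DictVectorizer().fit_transform(toy_categories)   # sparse one-hot matrix
weighted = TfidfTransformer().fit_transform(onehot)
# 'Restaurants' appears in 2 of the 3 toy venues, so its idf (and hence
# its tf-idf weight) is lower than that of the rarer categories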


In [109]:
#BASED ON CATEGORIES

pipeline = Pipeline([
    ('catextract', CategoriesExtractor()),  # one-hot encode the categories
    ('tfidf', TfidfTransformer()),          # downweight frequent categories
    ('estimate', linear_model.Ridge())      # regularized linear model
])

pipeline.fit(x_train, y_train)
print pipeline.score(x_test, y_test)


0.167279104669

MODELING ATTRIBUTE-FEATURES

I now check the predictive power of venue attributes (Wi-Fi, Good for Groups, etc.).

(1) Attributes are a mixture of nested and unnested:
{'Attire': 'casual', 'Accepts Credit Cards': True, 'Ambience': {'casual': False, 'classy': False }}

I flatten the nested attributes and transform them into a feature matrix via one-hot encoding, so that the above becomes:

{'Attire_casual' : 1,
 'Accepts Credit Cards': 1,
 'Ambience_casual': 0,
 'Ambience_classy': 0 }
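
As a quick sanity check of this flattening (the toy dict below simply mirrors the example above), the DictList and NestExtractor transformers defined earlier produce exactly this encoding:

toy_attrs = [{'Attire': 'casual',
              'Accepts Credit Cards': True,
              'Ambience': {'casual': False, 'classy': False}}]
flat = DictList().transform(toy_attrs)     # unnests: 'Ambience_casual', ...
encoded = NestExtractor().transform(flat)  # booleans -> 0/1, strings -> one-hot keys
print encoded
# [{'Attire_casual': 1, 'Accepts Credit Cards': 1,
#   'Ambience_casual': 0, 'Ambience_classy': 0}]  (key order may vary)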

(2) I repeat the steps above: running the result through a DictVectorizer, a TF-IDF transformer, and finally a ridge regression.

As the score shows, venue attributes are a very weak predictor of Yelp ratings.


In [110]:
#BASED ON ATTRIBUTES

pipeline = Pipeline([
    ('att_extractor', ColumnSelector('attributes')),
    ('unnestor', DictList()),
    ('nobooler', NestExtractor()),
    ('dictv', DictVectorizer()),  # sparse output feeds straight into TfidfTransformer
    ('tfidf', TfidfTransformer()),
    ('estimate', linear_model.Ridge())
])

pipeline.fit(x_train, y_train)
print pipeline.score(x_test, y_test)


0.0761642437615

COMBINED MODEL

So far we have three weak predictors: location, categories, and attributes. I now build a model combining these predictors using scikit-learn's FeatureUnion. Since FeatureUnion only accepts transformers, I wrap each predictor's final estimator in the ModelTransformer class created above.

While this combined model (score: .15) fares better than the latitude/longitude model (score: .02) or the attributes model (score: .08), it is outscored by the categories predictor alone (score: .17).


In [112]:
pipeline_all = Pipeline([
    ('festimators', FeatureUnion([
        ('categories', Pipeline([
            ('catextract', CategoriesExtractor()),
            ('tfidf', TfidfTransformer()),
            ('lregcat', ModelTransformer(linear_model.Ridge(), 'cat'))
        ])),
        ('latlong', Pipeline([
            ('latlong', ColumnExtractor(['latitude', 'longitude'])),
            ('rforest', ModelTransformer(ensemble.RandomForestRegressor(min_samples_leaf=20), 'rforest'))
        ])),
        ('attributes', Pipeline([
            ('att_extractor', ColumnSelector('attributes')),
            ('unnestor', DictList()),
            ('nobooler', NestExtractor()),
            ('dictv', DictVectorizer()),  # sparse output feeds straight into TfidfTransformer
            ('tfidf', TfidfTransformer()),
            ('linearize', ModelTransformer(linear_model.Ridge(), 'att'))
        ]))
    ])),
    # a linear regression stacks the three sub-models' predictions
    ('estimate', linear_model.LinearRegression())
])

pipeline_all.fit(x_train,y_train)
print pipeline_all.score(x_test,y_test)


0.154383437543

Next, I will explore whether running NLP on the text of Yelp reviews yields better predictions.