In [9]:
import dill as pickle
import gzip
import simplejson as json
import urllib2
from StringIO import StringIO
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
import re
import pickle as pk

import seaborn as sns
import numpy as np
import nltk.tokenize as tokenize

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn import cross_validation, grid_search
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import linear_model
import nltk

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords, words

PREDICTING A VENUE'S RATING USING NLP

My objective is to accurately extract the sentiment (positive or negative) from Yelp review text and use it to predict the Yelp star ratings. There are over one million records in this dataset, and one of the constraints I faced was producing a model streamlined enough to fit under Heroku's memory limits. For this reason, I used HashingVectorizer where I could (in place of CountVectorizer) and stayed away from memory-intensive search and validation procedures (such as GridSearchCV). Even within these limitations, I was able to build models orders of magnitude more accurate than those in the previous lesson.

IMPORTING THE DATA


In [2]:
class HandleGZippedJSON:
    """Download a gzipped JSON-lines file and return a list of parsed records."""

    def __init__(self, url):
        self.url = url
        self.json_data = None
 
    def run(self):
        request = urllib2.Request(self.url)
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        f = opener.open(request)
        c_data = f.read()
        c_stream = StringIO(c_data)
        gzipper = gzip.GzipFile(fileobj=c_stream)
        data = gzipper.read()
        
        # parse one JSON record per line, skipping any malformed lines
        output = data.splitlines()
        datastr = []

        for line in output:
            try:
                r = json.loads(line)
                datastr.append(r)
            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                print('Decoding JSON has failed')
        
        return datastr

fileurl="http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_review.json.gz"
out=HandleGZippedJSON(fileurl)
xfile=out.run()
df = pd.DataFrame(xfile)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(xfile,df['stars'],test_size=0.2)
print(len(xfile))
print(xfile[0])


1012913
{'votes': {'funny': 0, 'useful': 2, 'cool': 1}, 'user_id': 'Xqd0DzHaiyRqVH3WRG7hzg', 'review_id': '15SdjuK7DmYqUAj6rjGowg', 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.", 'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA', 'stars': 5, 'date': '2007-05-17', 'type': 'review'}

ANALYSIS

In this section, I create a bag of words. I then build a (regularized) linear model based on the count of the words in each document (review).

Some comments:

1) Tokenization is important for good performance, but it is also the most expensive step. For this reason, I vectorize as the first step.

2) 'CountVectorizer' has to memorize the mapping between each word and the index to which it is assigned; the size of this mapping is linear in the size of the vocabulary. 'HashingVectorizer' doesn't have to remember the mapping, which leads to much smaller models (see the short sketch after these comments). For this reason, I use the hashing vectorizer for the first two parts of this problem (predicting ratings).

3) I used ridge regression for predicting ratings. Stochastic gradient descent (SGDRegressor) was also an option, but its results were much poorer (a score of 0.46 vs. 0.63).

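To make comment (2) concrete, here is a minimal sketch on a toy two-document corpus (not the Yelp data): CountVectorizer must store a word-to-column mapping, while HashingVectorizer simply hashes tokens into a fixed number of columns and stores nothing.

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

docs = ["great food and great service", "terrible service slow food"]

cv = CountVectorizer()
cv.fit(docs)
print(len(cv.vocabulary_))   # size of the stored mapping; grows with the vocabulary

hv = HashingVectorizer(n_features=2**20, norm='l2')
X = hv.transform(docs)       # no fit needed -- the vectorizer is stateless
print(X.shape)               # (2, 1048576) regardless of how many unique words appear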

In [3]:
class ColumnSelector(TransformerMixin):
    """Pull a single named column out of the incoming records for the next pipeline step."""

    def __init__(self, namecol):
        self.namecol = namecol

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        import pandas as pd

        if type(data) is list:
            # list of review dicts -> extract the named column as a Series
            df = pd.DataFrame(data)
            D = df[self.namecol]
        elif type(data) is dict:
            # single review dict -> wrap it in a one-row frame first
            df = pd.DataFrame(columns=[self.namecol], index=['x'])
            df.loc['x'] = pd.Series({self.namecol: data[self.namecol]})
            D = df[self.namecol]
        return D
        
class RidgeRegressor2(BaseEstimator, RegressorMixin):
    """Thin wrapper around sklearn's Ridge that returns plain Python floats."""

    def __init__(self):
        pass

    def fit(self, X, y):
        from sklearn import linear_model
        self.ridge_regression = linear_model.Ridge().fit(X, y)
        return self

    def predict(self, X):
        import numpy as np

        Xy = self.ridge_regression.predict(X)
        if type(Xy) is np.ndarray:
            # convert the numpy array into a plain list of floats
            Xyz = [float(record) for record in Xy]
        else:
            Xyz = list(Xy)
        if len(Xyz) < 2:
            # a single prediction is returned as a scalar
            Xyz = Xyz[0]
        return Xyz

In [7]:
### JUST USING ONE-GRAMS ####

mypipeline=Pipeline([
  ('text_extractor', ColumnSelector('text')),
  ('hvect', HashingVectorizer(norm='l2',stop_words=nltk.corpus.stopwords.words('english'))),
  ('ridgefit', RidgeRegressor2())
])

mypipeline.fit(x_train,y_train)

print(mypipeline.score(x_test,y_test))


0.627198492197

In [8]:
#### ALSO USING BIGRAMS ####

mypipeline2=Pipeline([
    ('text_extractor', ColumnSelector('text')),
    ('hvect', HashingVectorizer(norm='l2', ngram_range=(1, 2), stop_words=nltk.corpus.stopwords.words('english'))),
    ('ridgefit', RidgeRegressor2())
])

mypipeline2.fit(x_train,y_train)

print(mypipeline2.score(x_test,y_test))


0.662585007173

FOOD BIGRAMS

In this section, I identify word pairs in restaurant reviews that occur together far more often than the frequencies of the individual words would suggest. These pairs tend to be the "foodie" phrases you might expect to find in a Yelp review, such as "huevos rancheros".

For this part, we need to use a CountVectorizer instead of a HashingVectorizer, because we have to map the counts back to the actual words; this slows down the calculations.

We can find word pairs that occur together far more often than the underlying probabilities of their individual words would predict.

Mathematically, if p(w) is the probability of a word w and p(w1 w2) is the probability of the bigram w1 w2, then we want to look at word pairs w1 w2 where the statistic

p(w1 w2) / (p(w1) * p(w2))

is high.

Because this metric is problematic when p(w1) and/or p(w2) are small, I add a Bayesian prior to all the word probabilities (implemented below as a pseudo-count of 5 added to every raw count).
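As a toy illustration (hypothetical counts, using the same +5 pseudo-count that get_probs() applies below), the statistic rewards pairs of individually rare words that almost always appear together:

pseudo = 5.0
n_words = 1000.0     # total one-gram tokens in the toy corpus
n_bigrams = 999.0    # total bigram tokens

def p(count, total):
    # smoothed probability estimate with an additive pseudo-count
    return (count + pseudo) / (total + pseudo)

# rare words that almost always occur together (e.g. a dish name)
score_pair = p(40, n_bigrams) / (p(42, n_words) * p(45, n_words))
# common words whose co-occurrence is unsurprising
score_common = p(30, n_bigrams) / (p(400, n_words) * p(500, n_words))

print(score_pair)     # roughly 19
print(score_common)   # roughly 0.2 -- the rare-but-linked pair scores far higher

Note that the get_probs() helper further down returns only the smoothed counts and skips the constant denominators: they scale every pair's statistic by the same factor and so do not change the ranking.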

There are two steps to this analysis.

First, I load in the previous Yelp data set to identify which businesses are restaurants, and do an SQL-style join on my two pandas dataframes (a merge-based sketch of this step follows below). This allows me to select only the reviews that correspond to restaurants and, by extension, food-specific bigrams.

Then, I clean and parse the gathered data and run the analysis.
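For reference, here is a minimal sketch of the join step expressed as a pandas merge, assuming df is the review frame from above and dfrest is the business frame built in the next cell (the actual code below does the same filtering with a list comprehension):

import pandas as pd

# businesses tagged as restaurants or food venues
is_food = dfrest['categories'].apply(
    lambda cats: ('Restaurants' in cats) or ('Food' in cats))
restaurant_ids = dfrest.loc[is_food, ['business_id']]

# SQL-style inner join on business_id keeps only the restaurant reviews
restaurant_reviews = pd.merge(df, restaurant_ids, on='business_id', how='inner')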

IMPORTING AND MERGING THE DATA


In [4]:
#import and merge data from previous dataset to identify restaurants

fileurl2="http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_business.json.gz"
out=HandleGZippedJSON(fileurl2)
xfile_rests=out.run()
dfrest=pd.DataFrame(xfile_rests)
# business_ids of venues tagged as Restaurants or Food (a set for fast membership tests)
xrests = set(myrests['business_id'] for myrests in xfile_rests
             if ('Restaurants' in myrests['categories']) or ('Food' in myrests['categories']))
print(len(xfile))
xfile2 = [review for review in xfile if review['business_id'] in xrests]  # keep only restaurant reviews
print(len(xfile2))
dfonlyrests = pd.DataFrame(xfile2)


1012913
634495

CLEANING, PROCESSING THE DATA


In [133]:
reviewtext=dfonlyrests['text']
estopwords=stopwords.words('english')

xonegramst = CountVectorizer(ngram_range=(1,1),stop_words=estopwords)
xonegrams = xonegramst.fit_transform(reviewtext)
xbigramst = CountVectorizer(ngram_range=(2,2),stop_words=estopwords)
xbigrams = xbigramst.fit_transform(reviewtext)
all_onegrams=xonegramst.get_feature_names()
all_bigrams=xbigramst.get_feature_names()

tot_words = xonegrams.sum()  # total words in corpus (~50 million)
unique_words = xonegramst.get_feature_names()
tot_unique = len(unique_words)  # total unique words in corpus (~322k)
new_tot_words = tot_words + tot_unique
wordcount_list = np.array(xonegrams.sum(axis=0))[0]  # array of occurrences of each word in corpus
wordloc = xonegramst.vocabulary_  # column index of each unique word
wc_list = [wordcount_list[wordloc[key]] for key in all_onegrams]  # occurrences of each unique word

###BIGRAMS###
unique_biwords = xbigramst.get_feature_names()
bi_keys_split = [re.split(r'\s', key) for key in unique_biwords]  # split each bigram into its two words
biwordcount_list = np.array(xbigrams.sum(axis=0))[0]
biwordloc = xbigramst.vocabulary_
bi_wc_list = [biwordcount_list[biwordloc[key]] for key in unique_biwords]

In [64]:
def get_probs(xgrams, xgramst):
    # Returns a smoothed count (raw count + 5 pseudo-count) for every feature.
    # The constant normalizing totals are omitted: they only scale the final
    # statistic by a constant factor and do not affect the ranking.
    wordloc = xgramst.vocabulary_
    unique_words = xgramst.get_feature_names()
    wordcount_list = np.array(xgrams.sum(axis=0))[0]
    prob_word = {}
    for xword in unique_words:
        prob_word[xword] = float(wordcount_list[wordloc[xword]] + 5)
    return prob_word

arb_cutoff = 35  # minimum number of occurrences for a bigram to be kept
biprobs = get_probs(xbigrams, xbigramst)
monoprobs = get_probs(xonegrams, xonegramst)

# smoothed bigram count divided by the product of the smoothed word counts
bigram_prob = [biprobs[b] / (monoprobs[s[0]] * monoprobs[s[1]]) for b, s in zip(unique_biwords, bi_keys_split)]
dfbiprob = pd.DataFrame({'biprob': bigram_prob, 'bigram': unique_biwords})
dfbiprob = dfbiprob.sort('biprob', ascending=False)
dfbiprob = dfbiprob[dfbiprob['biprob'] != np.inf]

# keep the highest-scoring bigrams that clear the occurrence cutoff
blist = []
for x in dfbiprob['bigram']:
    if bi_wc_list[biwordloc[x]] > arb_cutoff:
        blist.append(x)

print(blist[0:100])


[u'pel meni', u'laan xang', u'f_5_unx wrafcxuakbzrdw', u'roka akor', u'grana padano', u'chicha morada', u'dol sot', u'innis gunn', u'himal chuli', u'hodge podge', u'ferrero rocher', u'hoity toity', u'celine dion', u'perrier jouet', u'mille feuille', u'luc lac', u'ore ida', u'hy vee', u'riff raff', u'reina pepiada', u'alain ducasse', u'cien agaves', u'rustler rooste', u'ezzyujdouig4p gyb3pv_a', u'khai hoan', u'deja vu', u'dueling pianos', u'feng shui', u'nooks crannies', u'hu tieu', u'nanay gloria', u'hors oeuvres', u'mccormick schmick', u'leaps bounds', u'haagen dazs', u'homer simpson', u'barnes noble', u'connective tissue', u'tutti santi', u'palo alto', u'marche bacchus', u'fru fru', u'knick knacks', u'bla bla', u'ak yelpcdn', u'jap chae', u'itty bitty', u'nuoc mam', u'khao soi', u'horny toad', u'haricot vert', u'bai thong', u'highs lows', u'holyrood 9a', u'porta alba', u'gulab jamun', u'shiner bock', u'molecular gastronomy', u'ropa vieja', u'org wiki', u'yadda yadda', u'visa mastercard', u'rx boiler', u'lloyd wright', u'dun dun', u'womp womp', u'pura vida', u'190 octane', u'haricot verts', u'lou malnati', u'ritz carlton', u'harley davidson', u'rula bula', u'turo turo', u'estiatorio milos', u'snicker doodle', u'bubba gump', u'tammie coe', u'salo salo', u'sous vide', u'quattro formaggi', u'dac biet', u'fritto misto', u'penn teller', u'vice versa', u'hob nobs', u'dolce vita', u'ba reeba', u'kao tod', u'goi cuon', u'malai kofta', u'lomo saltado', u'fra diavolo', u'wells fargo', u'wayne gorsek', u'jw marriott', u'upward projects', u'bradley ogden', u'chino bandido', u'baskin robbins']

EXTRA CODE


In [ ]:
# Grid-search SGDRegressor's regularization strength (alpha) with 10-fold CV on hashed counts,
# then pickle the fitted grid with dill.
hv = HashingVectorizer(norm='l2', stop_words=nltk.corpus.stopwords.words('english'))
hvcounts = hv.fit_transform(df['text'])
cv = cross_validation.KFold(len(df['stars']), n_folds=10, shuffle=True)
params = {'alpha':np.logspace(-6,-3,10)}
grid = grid_search.GridSearchCV(linear_model.SGDRegressor(),cv=cv,param_grid=params)
grid.fit(hvcounts,df['stars'])

with open('/home/vagrant/miniprojects/questions/nlp1.pkl', 'wb') as handle:
    pickle.dump(grid, handle)

In [ ]:
# LSA-style pipeline: hashed counts -> tf-idf -> truncated SVD -> normalization.
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

mypipeline3 = Pipeline([
        ('text_extractor', ColumnSelector('text')),
        ('hvect', HashingVectorizer(norm=None, ngram_range=(1, 2), non_negative=True, stop_words=nltk.corpus.stopwords.words('english'))),
        ('tfidft', TfidfTransformer()),
        ('svd', TruncatedSVD(n_components=100)),
        ('normdata', Normalizer(copy=False)),
        ('compatibility', Compatibility())  # Compatibility: custom final estimator, not defined in this notebook
])

mypipeline3.fit(xfile, yout)  # yout: target vector, not defined in this notebook

with open('/home/vagrant/miniprojects/nlp3.pkl', 'wb') as handle2:
    pickle.dump(mypipeline3, handle2)

mypipeline3.predict(xfile[0:10])