In [9]:
import dill as pickle
import gzip
import simplejson as json
import urllib2
from StringIO import StringIO
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
import re
import pickle as pk
import seaborn as sns
import numpy as np
import nltk.tokenize as tokenize
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn import cross_validation, grid_search
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn import linear_model
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords, words
My objective is to accurately extract the sentiment (positive or negative) from Yelp review text and predict the Yelp star ratings. There are over one million records in this dataset, and one of the constraints I faced was producing a model streamlined enough to fit within Heroku's memory limits. For this reason, I used HashingVectorizer where I could (in place of CountVectorizer) and stayed away from memory-intensive search and validation procedures (such as GridSearchCV). However, even within these limitations, I was able to build models orders of magnitude more accurate than those in the previous lesson.
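As an aside, a quick way to check whether a fitted model will stay inside a deployment budget is to measure its serialized size. The sketch below is illustrative only and not part of the original analysis: the 512 MB figure is an assumed Heroku dyno limit, and the pipeline name refers to the models built later in this notebook.
In [ ]:
import dill

def serialized_mb(model):
    # size of the dill-pickled model in megabytes -- a rough proxy for its memory footprint
    return len(dill.dumps(model)) / (1024.0 * 1024.0)

# e.g., after fitting one of the pipelines below:
# assert serialized_mb(mypipeline) < 512  # assumed dyno memory limit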
In [2]:
class HandleGZippedJSON:
    """Download a gzipped JSON-lines file and return a list of parsed records."""
    def __init__(self, url):
        self.url = url
        self.json_data = None

    def run(self):
        request = urllib2.Request(self.url)
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener()
        f = opener.open(request)
        c_data = f.read()
        c_stream = StringIO(c_data)
        gzipper = gzip.GzipFile(fileobj=c_stream)
        data = gzipper.read()
        output = data.splitlines()
        datastr = []
        for lines in output:
            try:
                r = json.loads(lines)
                datastr.append(r)
            except ValueError:  # includes simplejson.decoder.JSONDecodeError
                print('Decoding JSON has failed')
        return datastr
fileurl="http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_review.json.gz"
out=HandleGZippedJSON(fileurl)
xfile=out.run()
df = pd.DataFrame(xfile)
x_train, x_test, y_train, y_test = cross_validation.train_test_split(xfile,df['stars'],test_size=0.2)
print(len(xfile))
print(xfile[0])
In this section, I create a bag of words and then build a (regularized) linear model based on the counts of the words in each document (review).
Some comments:
1) Tokenization is important for good performance, but it is also the most expensive step. For this reason, I vectorize as the first step of the pipeline.
2) CountVectorizer has to memorize the mapping between each word and the index to which it is assigned, which is linear in the size of the vocabulary. HashingVectorizer doesn't have to remember this mapping and leads to much smaller models. For this reason, for the first two parts of this problem (predicting the rating), I use the hashing vectorizer (a short sketch below illustrates this).
3) I used Ridge Regression for predicting ratings. Stochastic Gradient Descent (SGDRegressor) was also an option, but its results were much poorer (score of .46 vs. .63).
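To make point 2 concrete, here is a small sketch (mine, not part of the original analysis) showing that CountVectorizer memorizes a vocabulary_ mapping while HashingVectorizer keeps no fitted state:
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

toy_docs = ["great food and great service", "terrible service and cold food"]
cv = CountVectorizer().fit(toy_docs)
print(len(cv.vocabulary_))         # 6: the term-to-index mapping grows with the corpus
hv = HashingVectorizer().fit(toy_docs)
print(hasattr(hv, 'vocabulary_'))  # False: nothing is memorized, so the model stays small
The tradeoff is that hashed features cannot be mapped back to words, which is why the bigram analysis later in this notebook switches to CountVectorizer.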
In [3]:
class ColumnSelector(TransformerMixin):
    """Select a single column (e.g. the review text) from a list of dicts, a dict, or a DataFrame."""
    def __init__(self, namecol):
        self.namecol = namecol

    def fit(self, data, y=None):
        return self

    def transform(self, data):
        import pandas as pd  # local import so the class still works when unpickled in a fresh process
        if type(data) is list:
            df = pd.DataFrame(data)
            D = df[self.namecol]
        elif type(data) is dict:
            df = pd.DataFrame(columns=[self.namecol], index=['x'])
            df.loc['x'] = pd.Series({self.namecol: data[self.namecol]})
            D = df[self.namecol]
        else:  # already a DataFrame
            D = data[self.namecol]
        return D


class RidgeRegressor2(BaseEstimator, RegressorMixin):
    """Thin wrapper around sklearn's Ridge that returns plain floats (or a scalar for a single record)."""
    def __init__(self):
        pass

    def fit(self, X, y):
        from sklearn import linear_model  # local import so the estimator still works when unpickled
        self.ridge_regression = linear_model.Ridge().fit(X, y)
        return self

    def predict(self, X):
        import numpy as np
        Xy = self.ridge_regression.predict(X)
        if type(Xy) is list:
            Xyz = Xy
        elif type(Xy) is np.ndarray:
            Xyz = [float(record) for record in Xy]
        if len(Xyz) < 2:
            Xyz = Xyz[0]
        return Xyz
In [7]:
### JUST USING ONE-GRAMS ####
mypipeline = Pipeline([
    ('text_extractor', ColumnSelector('text')),
    ('hvect', HashingVectorizer(norm='l2', stop_words=nltk.corpus.stopwords.words('english'))),
    ('ridgefit', RidgeRegressor2())
])
mypipeline.fit(x_train, y_train)
print(mypipeline.score(x_test, y_test))
In [8]:
#### ALSO USING BIGRAMS ####
mypipeline2 = Pipeline([
    ('text_extractor', ColumnSelector('text')),
    ('hvect', HashingVectorizer(norm='l2', ngram_range=(1, 2), stop_words=nltk.corpus.stopwords.words('english'))),
    ('ridgefit', RidgeRegressor2())
])
mypipeline2.fit(x_train, y_train)
print(mypipeline2.score(x_test, y_test))
In this section, I identify word pairs in restaurant reviews that are much more likely to occur together than the individual word frequencies alone would predict. These are likely to be strongly indicative "foodie"-type phrases of the kind you would expect to find in a Yelp review, such as "huevos rancheros".
For this part, we need to use a CountVectorizer instead of a HashingVectorizer, because we have to map feature indices back to the actual words and bigrams; this will slow down our calculations.
We can find word pairs whose consecutive occurrence would be unlikely based on the underlying probabilities of their individual words, i.e., collocations that are not there by chance.
Mathematically, if p(w1) is the probability of a word w1 and p(w1 w2) is the probability of the bigram w1 w2, then we want to look at word pairs w1 w2 where the statistic
p(w1 w2) / (p(w1) p(w2))
is high.
Because this metric is problematic when p(w1) and/or p(w2) are small, I add a Bayesian prior (a pseudo-count) to all the word counts.
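To make the effect of the prior concrete, here is a toy calculation with made-up counts (illustration only; the prior of 5 matches the get_probs function below). Note that get_probs works with smoothed counts rather than true probabilities: dividing by the corpus totals would rescale every score by the same constant and leave the ranking unchanged.
In [ ]:
prior = 5.0

def smoothed_score(c_bigram, c_w1, c_w2):
    # the bigram statistic with a pseudo-count added to every count
    return (c_bigram + prior) / ((c_w1 + prior) * (c_w2 + prior))

# a pair of rare misspellings that appear next to each other exactly once:
print(smoothed_score(1, 1, 1))        # ~0.167, down from 1.0 without the prior
# a genuine collocation ("huevos rancheros"): bigram seen 100 times, each word ~120 times:
print(smoothed_score(100, 120, 120))  # ~0.0067, barely changed from ~0.0069 without the prior
Very rare pairs can still score highly, which the minimum-count cutoff (arb_cutoff below) guards against.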
There are two steps to this analysis.
First, I load the previous Yelp business dataset to identify which businesses are restaurants and do an SQL-style join on my two pandas dataframes. This restricts the reviews to those for restaurants and, by extension, to food-specific bigrams.
Then, I clean and parse the gathered data and run the analysis.
In [4]:
#import and merge data from previous dataset to identify restaurants
fileurl2="http://thedataincubator.s3.amazonaws.com/coursedata/mldata/yelp_train_academic_dataset_business.json.gz"
out=HandleGZippedJSON(fileurl2)
xfile_rests=out.run()
dfrest=pd.DataFrame(xfile_rests)
xrests = set(myrests['business_id'] for myrests in xfile_rests if ('Restaurants' in myrests['categories']) or ('Food' in myrests['categories']))  # set for fast membership tests
print(len(xfile))
xfile2 = [review for review in xfile if review['business_id'] in xrests]
print(len(xfile2))
dfonlyrests=pd.DataFrame(xfile2)
In [133]:
reviewtext=dfonlyrests['text']
estopwords=stopwords.words('english')
xonegramst = CountVectorizer(ngram_range=(1,1),stop_words=estopwords)
xonegrams = xonegramst.fit_transform(reviewtext)
xbigramst = CountVectorizer(ngram_range=(2,2),stop_words=estopwords)
xbigrams = xbigramst.fit_transform(reviewtext)
all_onegrams=xonegramst.get_feature_names()
all_bigrams=xbigramst.get_feature_names()
tot_words=xonegrams.sum() #total words in corpus (~50 million)
unique_words=xonegramst.get_feature_names()
tot_unique=len(unique_words) #total unique words in corpus (~322k)
new_tot_words=tot_words+tot_unique
wordcount_list=np.array(xonegrams.sum(axis=0))[0] #array of occurrences of each word in corpus
wordloc=xonegramst.vocabulary_ #location of each unique word in list
wc_list = [wordcount_list[wordloc[key]] for key in all_onegrams] #occurrences of each particular unique word
###BIGRAMS###
unique_biwords=xbigramst.get_feature_names()
bi_keys_split = [re.split(r'\s', key) for key in unique_biwords]
biwordcount_list=np.array(xbigrams.sum(axis=0))[0]
biwordloc=xbigramst.vocabulary_
bi_wc_list=[biwordcount_list[biwordloc[key]] for key in unique_biwords]
In [64]:
def get_probs(xgrams, xgramst):
    # smoothed count (+5 Bayesian prior) for every feature, used as an unnormalized probability
    wordloc = xgramst.vocabulary_
    unique_words = xgramst.get_feature_names()
    tot_words = xgrams.sum()  # total words in corpus (~50 million); not needed for the ranking
    wordcount_list = np.array(xgrams.sum(axis=0))[0]
    prob_word = {}
    for xword in unique_words:
        prob_word[xword] = float(wordcount_list[wordloc[xword]] + 5)
    return prob_word

arb_cutoff = 35  # require a bigram to occur more than 35 times
biprobs = get_probs(xbigrams, xbigramst)
monoprobs = get_probs(xonegrams, xonegramst)
bigram_prob = [biprobs[b] / (monoprobs[s[0]] * monoprobs[s[1]]) for b, s in zip(unique_biwords, bi_keys_split)]
dfbiprob = pd.DataFrame({'biprob': bigram_prob, 'bigram': unique_biwords})
dfbiprob = dfbiprob.sort('biprob', ascending=False)
dfbiprob = dfbiprob[dfbiprob['biprob'] != np.inf]
blist = []
for x in dfbiprob['bigram']:
    if bi_wc_list[biwordloc[x]] > arb_cutoff:
        blist.append(x)
print(blist[0:100])
In [ ]:
hv = HashingVectorizer(norm='l2',stop_words=nltk.corpus.stopwords.words('english'))
hvcounts = hv.fit_transform(df['text'])
cv = cross_validation.KFold(len(df['stars']), n_folds=10, shuffle=True)
params = {'alpha':np.logspace(-6,-3,10)}
grid = grid_search.GridSearchCV(linear_model.SGDRegressor(),cv=cv,param_grid=params)
grid.fit(hvcounts,df['stars'])
with open('/home/vagrant/miniprojects/questions/nlp1.pkl', 'wb') as handle:
    pickle.dump(grid, handle)
In [ ]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# Note: 'Compatibility' is assumed to be a custom final estimator defined elsewhere
# (it wraps the output for the deployed app); it is not shown in this section.
mypipeline3 = Pipeline([
    ('text_extractor', ColumnSelector('text')),
    ('hvect', HashingVectorizer(norm=None, ngram_range=(1, 2), non_negative=True, stop_words=nltk.corpus.stopwords.words('english'))),
    ('tfidft', TfidfTransformer()),
    ('svd', TruncatedSVD(n_components=100)),
    ('normdata', Normalizer(copy=False)),
    ('compatibility', Compatibility())
])
mypipeline3.fit(xfile, df['stars'])
with open('/home/vagrant/miniprojects/nlp3.pkl', 'wb') as handle2:
    pickle.dump(mypipeline3, handle2)
mypipeline3.predict(xfile[0:10])