In [95]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import json
import csv
import os
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats
from statsmodels.sandbox.regression.predstd import wls_prediction_std
Below is a helper function that we'll use later to plot the largest coefficients of a linear SVM classifier.
In [3]:
def plot_coefficients(classifier, feature_names, top_features=20):
    #For a multiclass classifier, coef_ has one row per class;
    #ravel()[0:200] grabs the 200 coefficients belonging to the first class
    coef = classifier.coef_.ravel()[0:200]
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    #Create plot: negative coefficients in red, positive in blue
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    #Tick positions must match the bar positions above
    plt.xticks(np.arange(2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
    plt.show()
In [ ]:
def bayesian_average():
    pass #left unfinished here; see the sketch below
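A standard way to fill this in is the shrinkage form of the Bayesian average, which pulls an item's mean rating toward a prior mean m with prior weight C. The sketch below is one assumed implementation; the parameters m and C and the example values are illustrative, not from the original notebook.
In [ ]:
def bayesian_average(ratings, m, C):
    #m is the prior mean (e.g. the global mean rating across all businesses)
    #C is the prior weight, i.e. how many 'virtual' ratings the prior counts for
    return (C * m + sum(ratings)) / float(C + len(ratings))

#Example: two 5-star reviews shrunk toward an assumed global mean of 3.7
print bayesian_average([5, 5], m=3.7, C=10)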
In [9]:
#This is the main folder where all the modules and JSON files are stored on my computer.
#You need to change this to the folder path specific to your computer
file_directory = "/Users/robertsonwang/Desktop/Python/Yelp_class/yelp-classification/"
reviews_file = "cleaned_reviews_states_2010.json"
biz_file = "cleaned_business_data.json"
In [10]:
#This is a smaller subset of our overall Yelp data
#I randomly chose 5000 reviews from each state and filed them into the JSON file
#Note that for the overall dataset, we have about 2 million reviews.
#That's why we need to use a data management system like MongoDB in order to hold all our data
#and to more efficiently manipulate it
reviews_json = json.load(open(file_directory+reviews_file))
biz_json = json.load(open(file_directory+biz_file))
In [11]:
for key in reviews_json.keys():
    reviews_json[key] = reviews_json[key][0:5000]
In [4]:
#Let's see how reviews_json is set up
print reviews_json.keys()
In [29]:
#We can see that on the highest level, the dictionary keys are the different states
#Let's look at the 'useful' attribute of the first entry under Ohio
print reviews_json['OH'][0]['useful']
In [6]:
#So for each review filed under Ohio, we have many different attributes to choose from
#Let's look at what the review and rating was for the first review filed under Ohio
print reviews_json['OH'][0]['text']
print reviews_json['OH'][0]['stars']
In [41]:
#We want to split up reviews between text and labels for each state
reviews = []
stars = []
for key in reviews_json.keys():
    for review in reviews_json[key]:
        reviews.append(review['text'])
        stars.append(review['stars'])
#Just for demonstration, let's pick out the same review example as above but from our respective lists
print reviews[0]
print stars[0]
Let's take a look at the following regression (the intuition being that a review's information content is correlated with its length):
$Rating = \beta_{0} + \beta_{neg}\,neg + \beta_{pos}\,pos + \beta_{num}\,\text{Std\_NumWords} + \epsilon$
Where:
$neg = \frac{\text{Number of Negative Words}}{\text{Total Number of Words}}$
$pos = \frac{\text{Number of Positive Words}}{\text{Total Number of Words}}$
In [86]:
harvard_dict = pd.read_csv('HIV-4.csv')
negative_words = list(harvard_dict.loc[harvard_dict['Negativ'] == 'Negativ']['Entry'])
positive_words = list(harvard_dict.loc[harvard_dict['Positiv'] == 'Positiv']['Entry'])
In [25]:
#Use word dictionary from Hu and Liu (2004)
negative_words = open('negative-words.txt', 'r').read()
negative_words = negative_words.split('\n')
positive_words = open('positive-words.txt', 'r').read()
positive_words = positive_words.split('\n')
total_words = negative_words + positive_words
total_words = list(set(total_words))
In [53]:
review_length = []
negative_percent = []
positive_percent = []
for review in reviews:
    length_words = len(review.split())
    #Lowercase each word before checking membership, since the sentiment word lists are lowercase
    neg_words = [x.lower() for x in review.split() if x.lower() in negative_words]
    pos_words = [x.lower() for x in review.split() if x.lower() in positive_words]
    negative_percent.append(float(len(neg_words))/float(length_words))
    positive_percent.append(float(len(pos_words))/float(length_words))
    review_length.append(length_words)
In [140]:
regression_df = pd.DataFrame({'stars':stars, 'review_length':review_length, 'neg_percent': negative_percent, 'positive_percent': positive_percent})
In [156]:
#Standardize the explanatory variables
#(these are the regressors; the dependent variable is stars)
std_vars = ['neg_percent', 'positive_percent', 'review_length']
for var in std_vars:
    len_std = regression_df[var].std()
    len_mu = regression_df[var].mean()
    regression_df[var] = [(x - len_mu)/len_std for x in regression_df[var]]
Let's try using the Harvard-IV sentiment categories as explanatory variables
In [163]:
#The R-squared using the Harvard dictionary is about 0.1; compare this with the fit obtained from the Hu & Liu word dictionary
X = np.column_stack((regression_df.review_length,regression_df.neg_percent, regression_df.positive_percent))
y = regression_df.stars
X = sm.add_constant(X)
est = sm.OLS(y, X)
est2 = est.fit()
print(est2.summary())
NOTE: a BLUE estimator does not require normality of errors. The Gauss-Markov theorem states that the ordinary least squares estimate is the best linear unbiased estimator (BLUE) of the regression coefficients ('best' meaning minimum variance among all linear unbiased estimators) as long as the errors:
(1) have mean zero
(2) are uncorrelated
(3) have constant variance
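One quick way to eyeball condition (3) is to plot the residuals against the fitted values; a funnel shape would suggest non-constant variance. Here is a minimal sketch using the est2 fit from the previous cell:
In [ ]:
#Residuals vs. fitted values for the OLS fit est2 defined above
plt.scatter(est2.fittedvalues, est2.resid, alpha=0.1)
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.show()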
In [164]:
x = np.array(regression_df.stars)
#beta = [3.3648, -0.3227 , 0.5033]
y = [int(round(i)) for i in list(est2.fittedvalues)]
y = np.array(y)
errors = np.subtract(x,y)
np.sum(errors)
In [ ]:
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(x, x, 'b', label="data")
ax.plot(x, y, 'o', label="ols")
#ax.plot(x, est2.fittedvalues, 'r--.', label="OLS")
#ax.plot(x, iv_u, 'r--')
#ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');
In [91]:
#Do a QQ plot of the data
fig = sm.qqplot(errors)
plt.show()
In [12]:
star_hist = pd.DataFrame({'Ratings':stars})
star_hist.plot.hist()
In [13]:
df_list = []
states = list(reviews_json.keys())
for state in states:
    stars_state = []
    for review in reviews_json[state]:
        stars_state.append(review['stars'])
    star_hist = pd.DataFrame({'Ratings':stars_state})
    df_list.append(star_hist)
for i in range(0, len(df_list)):
    print states[i] + " Rating Distribution"
    df_list[i].plot.hist()
    plt.show()
Note that the support vector machine algorithm relies on drawing a separating hyperplane between the different classes, and such a hyperplane is not guaranteed to exist. For a complete set of conditions under which this is an appropriate algorithm to use, please see:
http://www.unc.edu/~normanp/890part4.pdf
The following is also a good, and more general, introduction to Support Vector Machines:
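As a toy illustration of the separating-hyperplane idea, here is a minimal sketch that fits a LinearSVC on four made-up, linearly separable 2-D points and prints the hyperplane it finds:
In [ ]:
#Four linearly separable points in 2-D, two per class (invented data)
toy_X = np.array([[0.0, 0.0], [0.0, 1.0], [3.0, 3.0], [3.0, 4.0]])
toy_y = np.array([0, 0, 1, 1])
toy_svm = LinearSVC().fit(toy_X, toy_y)
#The separating hyperplane is the set of points x with w.x + b = 0
print "w =", toy_svm.coef_, "b =", toy_svm.intercept_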
In [36]:
#First let's separate out our dataset into a training sample and a test sample
#We specify a training sample percentage of 80% of our total dataset. This is just a rule of thumb
#Note that the reviews list was built state by state, so ideally we would shuffle before splitting
training_percent = 0.8
train_reviews = reviews[0:int(len(reviews)*training_percent)]
test_reviews = reviews[int(len(reviews)*training_percent):len(reviews)]
train_ratings = stars[0:int(len(stars)*training_percent)]
test_ratings = stars[int(len(stars)*training_percent):len(stars)]
In order to use the machine learning algorithms in scikit-learn, we first have to initialize a CountVectorizer object. This object creates a matrix representation of our corpus, with one row per document and one column per vocabulary word. There are many options that we can specify when we initialize our CountVectorizer object (see the documentation for the full list), but they essentially all relate to how the words are represented in the final matrix.
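To make that concrete, here is a minimal sketch on two made-up sentences; each row counts the vocabulary words in one document:
In [ ]:
toy_vectorizer = CountVectorizer()
toy_counts = toy_vectorizer.fit_transform(["the food was great", "the service was bad"])
print toy_vectorizer.get_feature_names()
print toy_counts.toarray()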
In [37]:
vectorizer = CountVectorizer(analyzer = "word", \
                             tokenizer = None, \
                             preprocessor = None, \
                             stop_words = None, \
                             vocabulary = total_words, \
                             max_features = 200)
#Note: max_features is ignored when an explicit vocabulary is supplied
train_data_features = vectorizer.fit_transform(train_reviews)
#Use transform (not fit_transform) on the test set so it is encoded with the same vocabulary
test_data_features = vectorizer.transform(test_reviews)
In [38]:
output = pd.DataFrame( data={"Reviews": test_reviews, "Rating": test_ratings} )
Let's create a linear SVM instance from scikit-learn and have it train on our subset of reviews. We'll write the results to an output dataframe and then calculate a total accuracy percentage.
In [26]:
#Let's do the same exercise as above but use TF-IDF, you can learn more about TF-IDF here:
#https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html
tf_transformer = TfidfTransformer(use_idf=True)
train_data_features = tf_transformer.fit_transform(train_data_features)
#Again, use transform (not fit_transform) so the test set gets the weights learned from training
test_data_features = tf_transformer.transform(test_data_features)
lin_svm = LinearSVC()
lin_svm = lin_svm.fit(train_data_features, train_ratings)
lin_svm_result = lin_svm.predict(test_data_features)
output['lin_svm'] = lin_svm_result
output['Accurate'] = np.where(output['Rating'] == output['lin_svm'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
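To see what the transformer is doing, here is a minimal sketch of TF-IDF on a made-up count matrix; the rarer second word gets up-weighted relative to the common first word:
In [ ]:
#Rows are documents, columns are raw word counts (invented numbers)
toy_counts = np.array([[3, 0], [2, 1], [3, 0]])
print TfidfTransformer(use_idf=True).fit_transform(toy_counts).toarray()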
In [51]:
#Here we plot the features with the highest absolute value coefficient weight
plot_coefficients(lin_svm, vectorizer.get_feature_names())
scikit-learn also provides what's known as a pipeline. Instead of having to declare each of these objects on its own and pass them into one another, we can create a single object with all the necessary options specified and then use that to run the algorithm. For each pipeline below, we specify the vectorizer to be the CountVectorizer object we defined above, set it to use TF-IDF, and then specify the classifier that we want to use.
Below, we create a separate pipeline for a Bagged Decision Tree, Multinomial Naive Bayes, and a Random Forest. We then append the results to the dataframe that we've already created.
In [39]:
bagged_dt = Pipeline([('vect', vectorizer),
                      ('tfidf', TfidfTransformer()),
                      ('clf', BaggingClassifier())])
bagged_dt.set_params(clf__n_estimators=100, clf__n_jobs=1).fit(train_reviews, train_ratings)
output['bagged_dt'] = bagged_dt.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['bagged_dt'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
multi_logit = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer()),
('clf', MultinomialNB())])
multi_logit.set_params(clf__alpha=1, clf__fit_prior = True, clf__class_prior = None).fit(train_reviews, train_ratings)
output['multi_logit'] = multi_logit.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['multi_logit'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
In [40]:
random_forest = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer()),
('clf', RandomForestClassifier())])
random_forest.set_params(clf__n_estimators=100, clf__criterion='entropy').fit(train_reviews, train_ratings)
output['random_forest'] = random_forest.predict(test_reviews)
output['Accurate'] = np.where(output['Rating'] == output['random_forest'], 1, 0)
accurate_percentage = float(sum(output['Accurate']))/float(len(output))
print accurate_percentage
As you can see, none of the above classifiers performs significantly better than a fair coin toss. This is most likely due to the heavily skewed distribution of review ratings: most reviews receive 4 or 5 stars, so the language associated with those two ratings is easily confused. We can confirm this by looking at the "confusion matrix" of our predictions.
In [70]:
print metrics.confusion_matrix(test_ratings, bagged_dt.predict(test_reviews), labels = [1, 2, 3, 4, 5])
Each row and column corresponds to a rating. For example, element (1,1) is the number of 1-star reviews that were correctly classified, and element (1,2) is the number of 1-star reviews that were incorrectly classified as 2 stars. The sum of the diagonal is therefore the total number of correctly classified reviews. As you can see, the bagged decision tree classifier labels many four-star reviews as five-star reviews and vice versa.
This indicates that we can improve our results by using more aggregated categories, for example labeling all four- and five-star reviews as "good" and all other ratings as "bad", as sketched below.
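As a minimal sketch of that aggregation (the four-star threshold follows the suggestion above), the cell below collapses the ratings into two classes and retrains the multinomial Naive Bayes pipeline:
In [ ]:
#Collapse ratings into two classes: 'good' (4-5 stars) vs. 'bad' (1-3 stars)
train_binary = ['good' if r >= 4 else 'bad' for r in train_ratings]
test_binary = ['good' if r >= 4 else 'bad' for r in test_ratings]
binary_clf = Pipeline([('vect', vectorizer),
                       ('tfidf', TfidfTransformer()),
                       ('clf', MultinomialNB())])
binary_clf.fit(train_reviews, train_binary)
print np.mean(binary_clf.predict(test_reviews) == np.array(test_binary))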
In [12]:
for review in reviews_json[reviews_json.keys()[0]]:
    print type(review['date'])
    break
In [16]:
latitude_list = []
longitude_list = []
stars_list = []
count_list = []
state_list = []
for biz in biz_json:
    stars_list.append(biz['stars'])
    latitude_list.append(biz['latitude'])
    longitude_list.append(biz['longitude'])
    count_list.append(biz['review_count'])
    state_list.append(biz['state'])
In [17]:
biz_df = pd.DataFrame({'ratings':stars_list, 'latitude':latitude_list, 'longitude': longitude_list, 'review_count': count_list, 'state':state_list})
We draw a heat map for each state below. Longitude is on the x-axis and latitude is on the y-axis. The color coding is as follows: red for ratings in [1, 2), orange for [2, 3), yellow for [3, 4), green for [4, 5), and blue for ratings of 5.
In [109]:
states = [u'OH', u'NC', u'WI', u'IL', u'AZ', u'NV']
cmap, norm = mpl.colors.from_levels_and_colors([1, 2, 3, 4, 5], ['red', 'orange', 'yellow', 'green', 'blue'], extend = 'max')
for state in states:
    state_df = biz_df[biz_df.state == state]
    state_df_filt = state_df[(np.abs(state_df.longitude-state_df.longitude.mean()) <= 2*state_df.longitude.std()) \
                             & (np.abs(state_df.latitude-state_df.latitude.mean()) <= 2*state_df.latitude.std())]
    plt.ylim(min(state_df_filt.latitude), max(state_df_filt.latitude))
    plt.xlim(min(state_df_filt.longitude), max(state_df_filt.longitude))
    plt.scatter(state_df_filt.longitude, state_df_filt.latitude, c=state_df_filt.ratings, cmap=cmap, norm=norm)
    plt.show()
    print state
We run the following linear regression model for each of the states:
$Rating = \beta_{1}\,\text{Longitude} + \beta_{2}\,\text{Latitude} + \beta_{3}\,\text{Num of Reviews} + \epsilon$
In [126]:
for state in states:
    state_df = biz_df[biz_df.state == state]
    #Take a copy so that overwriting columns below doesn't trigger pandas chained-assignment warnings
    state_df_filt = state_df[(np.abs(state_df.longitude-state_df.longitude.mean()) <= 2*state_df.longitude.std()) \
                             & (np.abs(state_df.latitude-state_df.latitude.mean()) <= 2*state_df.latitude.std())].copy()
    state_df_filt['longitude'] = (state_df_filt.longitude - state_df.longitude.mean())/state_df.longitude.std()
    state_df_filt['latitude'] = (state_df_filt.latitude - state_df.latitude.mean())/state_df.latitude.std()
    state_df_filt['review_count'] = (state_df_filt.review_count - state_df.review_count.mean())/state_df.review_count.std()
    X = np.column_stack((state_df_filt.longitude, state_df_filt.latitude, state_df_filt.review_count))
    y = state_df_filt.ratings
    est = sm.OLS(y, X)
    est2 = est.fit()
    print(est2.summary())
    print state