In [1]:
__author__ = "Billy Yuan, Nikita Lakhotia, Stuti Maddan, Tyler Nicolas, Wenduo Wang"
__copyright__ = "Well, knowledge is open to curious minds."
__license__ = "GPL-3.0"
__version__ = "0.3"
__maintainer__ = "Wenduo Wang"
__email__ = "wenduo.wang@utexas.edu"
__status__ = "development"
__date__ = "Sep/21/2016"
In [2]:
import pandas as pd
import numpy as np
import random as rd
import time, re, math, functools, cProfile
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from patsy import dmatrices
from nltk import pos_tag, bigrams
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stpwds
from apiclient.discovery import build
In [3]:
# initialize a lemmatizer in case lemmatization is requested later
lmtz = WordNetLemmatizer().lemmatize
In [4]:
def timer(func):
    '''This is a decorator to report a function's running time'''
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{:>10}:{:>10.3f} seconds".format(func.__name__, t2-t1)
        return result
    return wrapper
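A quick sanity check of the decorator (the sleeper function below is made up purely for illustration):
In [ ]:
@timer
def snooze(seconds):
    '''Illustrative helper: just sleeps for the given number of seconds'''
    time.sleep(seconds)

snooze(0.5)  # should print something like "    snooze:     0.500 seconds"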
In [5]:
@timer
def readData(portion, random_state=None):
    '''Read in a certain portion of the data in a random manner.
    Note: random_state defaults to None, so each call reseeds from system time;
    a default of time.time() would be evaluated only once, at definition time.'''
    rd.seed(random_state)
    skip = rd.sample(xrange(1, 19999), int(math.ceil(19999*(1-portion))))
    data = pd.read_csv("yelp.csv", skiprows=skip)
    data["target"] = data.stars.map(lambda v: 1 if v > 3 else 0)
    return data
In [6]:
@timer
def generateTrainTest(data, portion, random_state=None):
    '''Split the data into train and test sets'''
    rd.seed(random_state)
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data))) - set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data
In [7]:
@timer
def generateFormula(data):
    '''A helper function to generate a formula for regression'''
    formula = "target~0"
    for var in data.columns.values.tolist():
        if data[var].dtype in ["int64", "float64"] and var not in ["stars", "target", "wc", "Review", "prediction"]:
            formula += "+" + var
    return formula
In [8]:
def splitXY(data):
    '''Split independent and dependent variables, returning X as a DataFrame and y as a flat array'''
    Y, X = dmatrices(generateFormula(data), data=data, return_type="dataframe")
    return X, np.ravel(Y)
In [9]:
def logistic_model(X, y):
    '''A wrapper to generate and fit a logistic regression model'''
    model = LogisticRegression(random_state=128)
    model.fit(X, y)
    return model
In [10]:
def printAccuracy(prediction, target):
    '''Calculate and format accuracy of prediction against target'''
    print "Accuracy: {:>6.4f}".format((prediction == target).mean())
In [11]:
def review2wc(text, lem=False):
    '''Decompose a review into tokens, removing stopwords and optionally lemmatizing'''
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = set(stpwds.words("english"))  # set membership checks are O(1)
    if lem:
        tokens = map(lmtz, tokens)
    while "" in tokens:
        tokens.remove("")
    for token in tokens:
        if token not in stopwords:
            try:
                wc[token] += 1
            except KeyError:
                wc[token] = 1
    return wc
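A minimal sketch of what the tokenizer returns, on a made-up sentence rather than the data set:
In [ ]:
print review2wc("The food was great, really great service too!")
# "the"/"was"/"too" are NLTK stopwords, so this should yield
# {'food': 1, 'great': 2, 'really': 1, 'service': 1}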
In [12]:
@timer
def term_prob(corpus, subset):
    '''Given a corpus and a subset, calculate the Laplace-smoothed probability
    of each word from the corpus appearing in the subset'''
    prob_dict = {}
    N = sum(corpus.values())
    for key in corpus:
        if key not in subset:
            prob_dict[key] = 1.0 / N
        else:
            prob_dict[key] = (subset[key] + 1.0) / N
    return prob_dict

@timer
def log_prob(term_prob_high, term_prob_low):
    '''Given 2 subsets, calculate the log relative probability of
    a word appearing in subset 1 against subset 2'''
    term_log_prob = {}
    log = math.log
    for key in term_prob_high:
        term_log_prob[key] = log(term_prob_high[key] / term_prob_low[key])
    return term_log_prob
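For reference, in symbols: with $N$ the total token count of the corpus and $n_w$ the count of word $w$ in the subset, `term_prob` computes the smoothed estimate $P(w \mid \text{subset}) = (n_w + 1)/N$, and `log_prob` the ratio $\log\left(P(w \mid \text{high}) / P(w \mid \text{low})\right)$; the $+1$ (Laplace smoothing) keeps words unseen in one subset from producing $\log 0$.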
In [13]:
@timer
def token_count(wc):
    '''Given a list of dictionaries in the form of "word:count",
    aggregate word:count into a single dictionary'''
    tc = {}
    for dic in wc.tolist():
        if len(dic) == 0:
            continue
        for token, count in dic.items():
            try:
                tc[token] += count
            except KeyError:
                tc[token] = count
    return tc
In [14]:
def totalscore(wc, prior, benchmark):
    '''Given a dictionary in the form of "word:count",
    and a reference dictionary in the form of "word:log relative probability",
    calculate the sum of count*log relative probability,
    then add the log-odds of the prior.'''
    prob = 0
    for word, count in wc.items():
        try:
            prob += count * benchmark[word]
        except KeyError:
            continue  # words absent from the reference dictionary contribute nothing
    prob += math.log(prior/(1-prior+0.00001))
    return prob
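Putting the pieces together, `totalscore` is the familiar Naive Bayes log-odds of a review $d$:

$$\text{score}(d) = \sum_{w \in d} c_w(d)\,\log\frac{P(w \mid \text{high})}{P(w \mid \text{low})} + \log\frac{p}{1-p},$$

where $c_w(d)$ is the count of word $w$ in the review and $p$ is the prior share of high-rated reviews (the $10^{-5}$ in the denominator only guards against $p = 1$).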
In [15]:
class NBClassifier(object):
    '''A Naive Bayes classifier object with methods to fit on training data and
    predict on test data'''
    def __init__(self):
        self.X = None
        self.y = None
        self.x_label = None
        self.y_label = None
        self.term_log_prob = None
        self.prior = None

    def fit(self, data, x_label, y_label):
        '''The core of this method is to keep a dictionary of "word:log relative probability"'''
        self.X = data[x_label]
        self.y = data[y_label]
        self.x_label = x_label
        self.y_label = y_label
        token_count_total = token_count(data[x_label])
        token_count_high = token_count(data[data[y_label] == 1][x_label])
        token_count_low = token_count(data[data[y_label] == 0][x_label])
        term_prob_high = term_prob(token_count_total, token_count_high)
        term_prob_low = term_prob(token_count_total, token_count_low)
        self.term_log_prob = log_prob(term_prob_high, term_prob_low)
        self.prior = len(data[data[y_label] == 1])*1.0/len(data)

    def predict(self, test, threshold=None):
        '''Prediction can be tuned by adjusting threshold.
        If threshold is set to None, return the raw score instead.'''
        totalscore_partial = functools.partial(totalscore,
                                               prior=self.prior,
                                               benchmark=self.term_log_prob)
        score = test[self.x_label].map(totalscore_partial)
        if threshold is None:
            return score
        return score.map(lambda x: 1 if x > threshold else 0)
In [16]:
def positiveness(test, positive, negative, threshold=1):
    '''Given a positive vector and a negative vector,
    calculate the cosine projection of a test vector onto each,
    and return 1 if the positive/negative ratio exceeds threshold.'''
    product_positive = 0.1  # small offsets keep the ratio defined when there is no overlap
    product_negative = 0.1
    len_positive = math.sqrt(sum(map(lambda x: x*x, positive.values())))
    len_negative = math.sqrt(sum(map(lambda x: x*x, negative.values())))
    for key in positive.keys():
        try:
            product_positive += positive[key] * test[key]
        except KeyError:
            continue
    product_positive = product_positive*1.0/len_positive
    for key in negative.keys():
        try:
            product_negative += negative[key] * test[key]
        except KeyError:
            continue
    product_negative = product_negative*1.0/len_negative
    return ((product_positive*1.0/product_negative) > threshold)*1
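In vector terms, with $v$ the test review's count vector, `positiveness` thresholds the ratio of two cosine projections: $\dfrac{v \cdot v_{+} / \lVert v_{+} \rVert}{v \cdot v_{-} / \lVert v_{-} \rVert} > \text{threshold}$. The norm of $v$ itself cancels in the ratio, which is why the function never computes it; the 0.1 added to each dot product is smoothing against reviews that share no words with a reference vector.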
In [17]:
def review2pairs(text, pattern_1, pattern_2, no_match):
    '''Decompose a review into (token, pos_tag) pairs.
    Filter the pairs based on the adjacent pos tags specified in pattern_1, pattern_2 and no_match.'''
    wc = []
    append = wc.append
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = set(stpwds.words("english"))
    # filter with a comprehension: removing items from a list while iterating over it skips elements
    tokens = [token for token in tokens if token and token not in stopwords]
    token_pos = pos_tag(tokens)
    for i in xrange(len(token_pos)-1):
        tag_pair = (token_pos[i][1], token_pos[i+1][1])
        if tag_pair in pattern_1:
            append((token_pos[i][0], token_pos[i+1][0]))
        elif tag_pair in pattern_2:
            try:
                if token_pos[i+2][1] not in no_match:  # compare the tag, not the (word, tag) pair
                    append((token_pos[i][0], token_pos[i+1][0]))
            except IndexError:
                append((token_pos[i][0], token_pos[i+1][0]))
    return wc
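A quick illustration on a toy sentence (a minimal sketch with a single tag pattern; the exact tags depend on the NLTK tagger shipped with your installation):
In [ ]:
demo_pairs = review2pairs("the food was absolutely delicious",
                          pattern_1=[("RB", "JJ")], pattern_2=[], no_match=[])
print demo_pairs  # typically [('absolutely', 'delicious')]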
In [18]:
# Initialize an instance of the google custom search api
# Useful link: https://developers.google.com/custom-search/json-api/v1/reference/cse/list
with open("/home/wenduowang/.googleapi/customsearch.key") as f:
    customsearch_key = f.readline().strip()
service = build("customsearch", "v1", developerKey=customsearch_key).cse()

def semanticOrientation(phrases,
                        engine_id,
                        positive="excellent",
                        negative="poor",
                        service=service,
                        prior=1,
                        distance=5,
                        threshold=None):
    '''Warning: this function depends on the google custom search api, with a rate limit of 20 queries/sec.
    engine_id is the identification # of a google custom search engine, see: https://cse.google.com/all.
    Here the search is restricted to yelp.com.
    Given a list of phrases in the form of (word1, word2), find the average semantic orientation
    hits(phrase AROUND(distance) positive)/hits(phrase AROUND(distance) negative)/prior,
    where hits is the number of search results returned by the engine.
    By default prior is set to 1; strictly it should be the ratio hits(positive)/hits(negative).
    '''
    so_positive = 0.01  # small offsets keep the log ratio defined when hits are zero
    so_negative = 0.01
    so_avg = 0
    if len(phrases) == 0:  # reviews with no extracted pairs would otherwise divide by zero
        return so_avg if threshold is None else int(so_avg > threshold)
    for phrase in phrases:
        term = "%22{}+{}%22+AROUND({})+%22{}%22".format(phrase[0], phrase[1], distance, positive)
        response = service.list(q=term, cx=engine_id).execute()
        try:
            rtr_pos = int(response["searchInformation"]["totalResults"].encode("utf-8"))
        except KeyError:
            rtr_pos = 0
        term = "%22{}+{}%22+AROUND({})+%22{}%22".format(phrase[0], phrase[1], distance, negative)
        response = service.list(q=term, cx=engine_id).execute()
        try:
            rtr_neg = int(response["searchInformation"]["totalResults"].encode("utf-8"))
        except KeyError:
            rtr_neg = 0
        so_positive += rtr_pos
        so_negative += rtr_neg
        so_avg += math.log(so_positive/so_negative/prior)
    so_avg = so_avg*1.0/len(phrases)
    if threshold is None:
        return so_avg
    return int(so_avg > threshold)
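This is essentially a hits-based variant of Turney's semantic orientation: for each extracted phrase, $\text{SO} = \log\dfrac{\text{hits}(\text{phrase AROUND positive})}{\text{hits}(\text{phrase AROUND negative}) \cdot \text{prior}}$, averaged over a review's phrases (note that, as implemented above, the smoothed hit counts accumulate across phrases rather than resetting for each one).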
In [19]:
data = readData(0.2, random_state=8)
train_1, test_1 = generateTrainTest(data, 0.7, random_state=8)
X_train, y_train = splitXY(train_1)
model_1 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_1)
prediction = model_1.predict(X_test)
printAccuracy(prediction, y_test)
In [20]:
data["wc"] = data.Review.map(review2wc)
train_2, test_2 = generateTrainTest(data, 0.7, random_state=8)
In [21]:
classifier = NBClassifier()
classifier.fit(train_2, "wc", "target")
prediction = classifier.predict(test_2, threshold=3.5)
printAccuracy(prediction, test_2.target)
The accuracy of Naive Bayes is similar to that of logistic regression.
In [22]:
data["total_score"] = classifier.predict(data, threshold=3.5)
train_3, test_3 = generateTrainTest(data, 0.7, random_state=8)
X_train, y_train = splitXY(train_3)
model_2 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_3)
prediction = model_2.predict(X_test)
printAccuracy(prediction, y_test)
Adding the Naive Bayes score as an extra feature does not improve accuracy significantly.
In [23]:
totally_positive = "This restaurant is very good. It is actually the best one that I have ever been to.\
The queue could be long, but if you have booked well in advance it would not be a problem.\
Everyone smiles and their service is definitely professional. The foods are fantastic,\
and the price is low, I mean affordable. The wines are very nice, and there is a good collection\
of desserts which tastes phenomenal. The waiter and waitress are attentive and helpful.\
I believe they have been trained very well. Tables are clean, dishes\
served in time and they taste absolutely delicious. I totally recommend it."
totally_negative = "I can't believe this restaurant could be so bad. We waited for a long time before we were attended\
to by a waiter, who was so crude, maybe because he thought I couldn't afford the meal, the price of\
which by the way is ridiculously high. We each ordered 3 courses, but nothing showed up in the following\
30 minutes. Nobody even explained that to us. Finally I called the manager, and he just said they were\
busy. Well, I could see they were busy, but it doesn't make sense that other people were served better\
than us. At the end, we decided to give a smaller tip to the waiter (I preferred not at all), and\
I can still remember his face -- disgusting. Please don't go there!"
In [24]:
positive_vec = review2wc(totally_positive)
negative_vec = review2wc(totally_negative)
In [25]:
full_data = pd.read_csv("yelp.csv", usecols=["stars", "Review"])
full_data["target"] = full_data.Review.map(lambda s: 1 if s>3 else 0)
full_data["wc"] = full_data.Review.map(review2wc)
In [26]:
positiveness_partial = functools.partial(positiveness, positive=positive_vec, negative=negative_vec, threshold=.5)
unsupervised_prediction = full_data.wc.map(positiveness_partial)
printAccuracy(unsupervised_prediction, full_data.target)
In [27]:
pattern_1 = [("JJ", "NN"), ("JJ", "NNS"),
("RB", "VB"), ("RB", "VBD"), ("RB", "VBN"), ("RB", "VBG"),
("RBR", "VB"), ("RBR", "VBD"), ("RBR", "VBN"), ("RBR", "VBG"),
("RBS", "VB"), ("RBS", "VBD"), ("RBS", "VBN"), ("RBS", "VBG")]
pattern_2 = [("RB", "JJ"), ("RBR", "JJ"), ("RBS", "JJ"),
("JJ", "JJ"),
("NN", "JJ"), ("NNS", "JJ")]
no_match = ["NN", "NNS"]
Filter the reviews and keep the word pairs whose POS tags match the above specification.
In [28]:
r2p = functools.partial(review2pairs, pattern_1=pattern_1, pattern_2=pattern_2, no_match=no_match)
pairs = data.Review.map(r2p)
In [29]:
# Initialize the custom search engine id
with open("/home/wenduowang/.googleapi/customsearch.engine") as f:
    engine_id = f.readline().strip()

positive = "excellent"
negative = "poor"

res1 = service.list(q=positive, cx=engine_id).execute()  # search for the positive word
try:
    p = int(res1["searchInformation"]["totalResults"].encode("utf-8"))  # retrieve the result count
except KeyError:
    p = 0
del res1
# if the count is 0, pick a different positive word before computing the prior
if p == 0:
    print "Reset positive"

res2 = service.list(q=negative, cx=engine_id).execute()  # search for the negative word
try:
    n = int(res2["searchInformation"]["totalResults"].encode("utf-8"))  # retrieve the result count
except KeyError:
    n = 0
del res2
# if the count is 0, pick a different negative word before computing the prior
if n == 0:
    print "Reset negative"

prior = p*1.0/n
In [33]:
so = functools.partial(semanticOrientation,
                       engine_id=engine_id,
                       positive="excellent",
                       negative="poor",
                       prior=prior,
                       distance=5,
                       threshold=None)
prediction = pairs[:10].map(so)
In [31]:
printAccuracy(prediction, data.target[:10])
In [ ]:
food = "delicious, food, salad, dessert, tasty, melt, wine, disgusting, dish, flavor, crunchy, yummy, appetizing,\
course, cheese, meat, beef, steak, lamb, soup, cooked, cook, charred, burn, burned, burnt, burning, hot,\
spicy, fresh, seasoning, seasoned, marinated, crisp, tender, crust"
food = review2wc(food)
service = "waitor, waitress, serve, served, service, serving, polite, considerate, careful, attentative, warm,\
attention, rude, waiting, wait, long, smile, smiling, smiled, patient, inpatient, angry, anger, annoy,\
annoying, responsive, response"
service = review2wc(service)
ambiance = "ambiance, decoration, lighting, light, lights, music, piano, candle, candles, environment, decorated,\
table, cloth, glass, violin, performance, comfort, comfortable, soft, sofa, chair, relax, relaxing,\
romance, romantic, classic, noisy, boyfriend, girlfriend"
ambiance = review2wc(ambiance)
price = "price, affordable, expensive, cheap, worth, worthwhile, worthy, dear, charge, charged, fee, tip, tips,\
ripped"
price = review2wc(price)
convenience = "parking, drive, convenience, convenient, commute, highway, road, street, crowded, queue, line,\
traffic, every, week, month, days, day, everyday"
convenience = review2wc(convenience)
hygiene = "hygiene, clean, dirty, safe, tidy, hygienic, cloth, toilet, restroom, washroom, dress, dressed, hair,\
bug, fly, flies, smelly, stink, rotten, bad, wash, washed, hand, hands"
hygiene = review2wc(hygiene)
health = "healthy, health, calorie, heavy, light, sweet, sugar, fat, oil, salad, salty, salt, energy, refresh,\
refreshing, heart, body, portion, size, material, materials, ingredient"
health = review2wc(health)
family = "kid, kids, play, playground, child, children, plays, played, baby, seat, son, girl, daughter, dad, mom,\
mother, father, grandma, grandpa"
family = review2wc(family)
party = "friends, friend, together, party, celebrate, celebration, celebrated, space, room, big, birthday"
party = review2wc(party)
In [ ]:
def matchAttribute(wc, attribute):
    '''Cosine similarity between a review's word-count vector and an attribute lexicon's vector'''
    if len(wc) == 0:
        return 0  # guard against reviews left empty after stopword removal
    len_wc = math.sqrt(sum(map(lambda x: x*x, wc.values())))
    len_attr = math.sqrt(sum(map(lambda x: x*x, attribute.values())))
    match = 0
    for key in attribute.keys():
        try:
            match += abs(attribute[key] * wc[key])
        except KeyError:
            continue
    match = match*1.0/len_wc/len_attr
    return match
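In other words, `matchAttribute` scores how heavily a review's vocabulary overlaps with a topic lexicon: with $c$ the review's count vector and $a$ the lexicon's, $\text{match} = \sum_w |a_w c_w| \,/\, (\lVert a \rVert\,\lVert c \rVert)$.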
In [ ]:
# Populate the dataframe with matching scores.
for attr in {"food":food, "service":service, "ambiance":ambiance, "price":price, "convenience":convenience,
"hygiene":hygiene, "health":health, "family":family, "party":party}.items():
matchAttribute_partial = functools.partial(matchAttribute, attribute=attr[1])
data["match_{}".format(attr[0])] = data.wc.map(matchAttribute_partial)
In [ ]:
# Extract matching scores and target.
match_cols = [col for col in data.columns.values if "match" in col] + ["target"]
match_df = data[match_cols]
# Sort topics by matching score.
print "Sort by higher rated restaurants"
print match_df.groupby("target").mean().T.sort_values([1], ascending=False)
print "\n"
print "Sort by lower rated restaurants"
print match_df.groupby("target").mean().T.sort_values([0], ascending=False)
In [ ]: