In [1]:
__author__ = "Billy Yuan, Nikita Lakhotia, Stuti Maddan, Tyler Nicolas, Wenduo Wang"
__copyright__ = "Well, knowledge is open to curious minds."
__license__ = "GPL-3.0"
__version__ = "0.3"
__maintainer__ = "Wenduo Wang"
__email__ = "wenduo.wang@utexas.edu"
__status__ = "development"
__date__ = "Sep/21/2016"

In [2]:
import pandas as pd
import numpy as np
import random as rd
import time, re, math, functools, cProfile
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from patsy import dmatrices
from nltk import pos_tag, bigrams
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stpwds
from apiclient.discovery import build

In [3]:
# initialize a lemmatizer just in case it will be used
lmtz = WordNetLemmatizer().lemmatize

In [4]:
def timer(func):
    '''This is a decorator to return a function's running time'''
    def wrapper(*args, **kwargs):
        t1 = time.time()
        result = func(*args, **kwargs)
        t2 = time.time()
        print "{:>10}:{:>10.3f} seconds".format(func.__name__, t2-t1)
        return result
    return wrapper

In [5]:
@timer
def readData(portion, random_state=time.time()):
    '''Read in a certain portion of data in a random manner'''
    rd.seed(random_state)
    skip = rd.sample(xrange(1, 19999), int(math.ceil(19999*(1-portion))))
    data = pd.read_csv("yelp.csv", skiprows=skip)
    data["target"]=data.stars.map(lambda v: 1 if v>3 else 0)
    return data

In [6]:
@timer
def generateTrainTest(data, portion, random_state=time.time()):
    '''Split train and test data set'''
    rd.seed(random_state)
    train_index = rd.sample(xrange(len(data)), int(math.ceil(len(data)*portion)))
    test_index = list(set(xrange(len(data)))-set(train_index))
    train_data = data.ix[train_index]
    test_data = data.ix[test_index]
    return train_data, test_data

In [7]:
@timer
def generateFormula(data):
    '''A helper function to generate formula for regression'''
    formula = "target~0"
    for var in data.columns.values.tolist():
        if data[var].dtype in ["int64", "float64"] and var not in ["stars", "target", "wc", "Review", "prediction"]:
                
            formula += "+"+var
            
        else:
            continue
    return formula

In [8]:
def splitXY(data):
    '''Split independent and dependent variables, and return X as DataFrame Y as Series'''
    Y, X = dmatrices(generateFormula(data), data=data, return_type="dataframe")
    return X, np.ravel(Y)

In [9]:
def logistic_model(X, y):
    '''A wrapper to generate and fit a logistic regression model'''
    model = LogisticRegression(random_state=128)
    model.fit(X, y)
    return model

In [10]:
def printAccuracy(prediction, target):
    '''Calculate and format accuracy of prediction against target'''
    print "Accuracy: {:>6.4f}".format((prediction == target).mean())

In [11]:
def review2wc(text, lem=False):
    '''Decompose a review into tokens, removing stopwords and optionally do lemmatization'''
    wc = {}
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    if lem:
        lmtzi = lmtz
        tokens = map(lmtz, tokens)
    while "" in tokens:
        tokens.remove("")
        
    for token in tokens:
        if token not in stopwords:
            try:
                wc[token] =+ 1
            except KeyError:
                wc[token] = 1
    return wc

In [12]:
@timer
def term_prob(corpus, subset):
    '''Given a corpus and a subset, calculate the probability of each word
    from the corpus appearing in the subset'''
    prob_dict = {}
    N = sum([i for (_, i) in list(corpus.items())])
    for key in corpus:
        if key not in subset:
            prob_dict[key] = 1.0 / N
        else:
            prob_dict[key] = subset[key] + 1.0 / N
    return prob_dict

@timer
def log_prob(term_prob_high, term_prob_low):
    '''Given 2 subsets, calculate log relative probability o
    a word appearing in subset 1 against in subset 2'''
    term_log_prob = {}
    log = math.log
    for key in term_prob_high:
        term_log_prob[key] = log(term_prob_high[key]/term_prob_low[key])
    return term_log_prob

In [13]:
@timer
def token_count(wc):
    '''Given a list of dictionaries in the form of "word:count",
    aggregate word:count in to a single dictionary'''
    tc = {}
    for dic in wc.tolist():
        if len(dic) == 0: continue
        for token, count in dic.items():
            try:
                tc[token] += count
            except KeyError:
                tc[token] = 1
                
    return tc

In [14]:
def totalscore(wc, prior, benchmark):
    '''Given a dictionary in the form of "word:count", 
    and reference dictionary in the form of "word:log relative probability",
    calculate the sum of count*log relative probability,
    and at the end add a prior.'''
    prob = 0
    for word, count in wc.items():
        try:
            prob += count * benchmark[word]
        except KeyError:
            prob += 0
    prob += math.log(prior/(1-prior+0.00001))
    return prob

In [15]:
class NBClassifier(object):
    '''A Naive Bayes classifier object with methods to fit on training data and 
    predict on test data'''
    
    def __init__(self):
        self.X = None
        self.y = None
        self.term_log_prob = None
        self.prior = None
    
    def fit(self, data, x_label, y_label):
        '''The core of this method is to keep a dictionary of "word:log relative probability"'''
        self.X = data[x_label]
        self.y = data[y_label]
        self.x_label = x_label
        self.y_label = y_label
        token_count_total = token_count(data[x_label])
        token_count_high = token_count(data[data[y_label]==1][x_label])
        token_count_low = token_count(data[data[y_label]==0][x_label])
        term_prob_high = term_prob(token_count_total, token_count_high)
        term_prob_low = term_prob(token_count_total, token_count_low)
        self.term_log_prob = log_prob(term_prob_high, term_prob_low)
        self.prior = len(data[data[y_label]==1])*1.0/len(data)
        
    def predict(self, test, threshold=None):
        '''Prediction can be tuned by adjusting threshold.
        If threshold is set to None, then return actual score.'''
        totalscore_partial = functools.partial(totalscore, 
                                               prior= self.prior,
                                               benchmark=self.term_log_prob)
        score = test[self.x_label].map(totalscore_partial)
        if threshold == None:
            return score
        else:
            prediction = score.map(lambda x: 1 if x>threshold else 0)
            return prediction

In [16]:
def positiveness(test, positive, negative, threshold=1):
    '''Given a positive vector and a negative vector, 
    calculate the cosine value between the two and a test vector,
    and return the ratio of positive/negative.
    Prediction is tuned by threshold.'''
    product_positive = 0.1
    product_negative = 0.1
    len_positive = math.sqrt(sum(map(lambda x: x*x, positive.values())))
    len_negative = math.sqrt(sum(map(lambda x: x*x, negative.values())))
    for key in positive.keys():
        try:
            product_positive += positive[key] * test[key]
        except KeyError:
            continue
    product_positive = product_positive*1.0/len_positive    
        
    for key in negative.keys():
        try:
            product_negative += negative[key] * test[key]
        except KeyError:
            continue
    product_negative = product_negative*1.0/len_negative
    
    return ((product_positive*1.0/product_negative)>threshold)*1

In [17]:
def review2pairs(text, pattern_1, pattern_2, no_match):
    '''Decompose a review to (token,pos_tag) pairs.
    Filter the pairs based on adjacent pos tag specified in pattern_1, pattern_2 and no_match.'''
    wc = []
    append = wc.append
    text = text.lower()
    tokens = re.split("\W+", text)
    stopwords = stpwds.words("english")
    remove = tokens.remove
    while "" in tokens:
        remove("")
    for token in tokens:
        if token in stopwords:
            remove(token)
    token_pos = pos_tag(tokens)
    
    for i in xrange(len(token_pos)-1):
        if (token_pos[i][1], token_pos[i+1][1]) not in pattern_1 and\
            (token_pos[i][1], token_pos[i+1][1]) not in pattern_2:
                continue
        elif (token_pos[i][1], token_pos[i+1][1]) in pattern_1:
            append((token_pos[i][0], token_pos[i+1][0]))
        elif (token_pos[i][1], token_pos[i+1][1]) in pattern_2:
            try:
                if token_pos[i+2] not in no_match:
                    append((token_pos[i][0], token_pos[i+1][0]))
            except IndexError:
                append((token_pos[i][0], token_pos[i+1][0]))
        else:
            continue

    return wc

In [18]:
# Initialize an instance of google custom search api
# Useful link: https://developers.google.com/custom-search/json-api/v1/reference/cse/list
with open("/home/wenduowang/.googleapi/customsearch.key") as f:
    customsearch_key = f.readline().strip()
    
service = build("customsearch", "v1", developerKey=customsearch_key).cse()

def semanticOrientation(phrases,
                        engine_id,
                        positive="excellent", 
                        negative="poor", 
                        service=service,
                        prior = 1,
                        distance=5,
                        threshold=None):
    '''Warning: this function depends on google custom search api, with a rate limit of 20 queries/sec.
    engine_id is the identification # of google custom search engine, see: https://cse.google.com/all.
    Here the search is specified within yelp.com
    Given a list of phrases in the form of (word1, word2), find the average semantic orientation
    of hits(phrase AROUND(distance) positive)/hits(phrase AROUND(distance) negative)/prior
    Hits is the number of search result from the engine.
    By default prior is set to 1, which should strictly be the ratio of hits(positive)/hits(negative).
    '''
    so_positive = 0.01
    so_negative = 0.01
    so_avg = 0
    
    for phrase in phrases:
        term = "%22{}+{}%22+AROUND({})+%22{}%22".format(phrase[0], distance, phrase[1], positive)
        response = service.list(q=term, cx=engine_id).execute()
        try:
            rtr_pos = int(response["searchInformation"]["totalResults"].encode("utf-8"))
        except KeyError:
            rtr_pos = 0
            
        if rtr_pos == None:
            rtr_pos = 0
        
        term = "%22{}+{}%22+AROUND({})+%22{}%22".format(phrase[0], distance, phrase[1], negative)
        response = service.list(q=term, cx=engine_id).execute()
        try:
            rtr_neg = int(response["searchInformation"]["totalResults"].encode("utf-8"))
        except KeyError:
            rtr_neg = 0
            
        if rtr_neg == None:
            rtr_neg = 0
            
        so_positive += rtr_pos
        so_negative += rtr_neg
        
        so_avg += math.log(so_positive/so_negative/prior)
    
    so_avg = so_avg*1.0/len(phrases)
    
    if threshold == None:
        return so_avg
    else:
        return int(so_avg>threshold)

Task A. Ignore the text (reviews) and run a classification model with the numeric data (you can use standard methods like logistic regression, k-nearest neighbors or anything else). What is the best accuracy of your model with numeric data?

Just try logistic regression

In [19]:
data = readData(0.2, random_state=8)
train_1, test_1 = generateTrainTest(data, 0.7, random_state=8)
X_train, y_train = splitXY(train_1)
model_1 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_1)
prediction = model_1.predict(X_test)
printAccuracy(prediction, y_test)


  readData:     0.121 seconds
generateTrainTest:     0.006 seconds
generateFormula:     0.001 seconds
generateFormula:     0.001 seconds
Accuracy: 0.6706

Task B. Perform a supervised classification on a subset of the corpus using the reviews only. You can write your code in Python or R. What accuracy do you get from this text mining exercise?


In [20]:
data["wc"] = data.Review.map(review2wc)
train_2, test_2 = generateTrainTest(data, 0.7, random_state=8)


generateTrainTest:     0.004 seconds

In [21]:
classifier = NBClassifier()
classifier.fit(train_2, "wc", "target")
prediction = classifier.predict(test_2, threshold=3.5)
printAccuracy(prediction, test_2.target)


token_count:     0.092 seconds
token_count:     0.033 seconds
token_count:     0.019 seconds
 term_prob:     0.009 seconds
 term_prob:     0.055 seconds
  log_prob:     0.008 seconds
Accuracy: 0.6822

The accuracy of NB is similar to logistic regression.

Task C. Combine the numeric data and the text classification model (in task B) to create a “hybrid” model. It is your task to figure out how to do this. Now run this hybrid classification model and compare the results with those in A and B. Does the numeric data add to the predictive power relative to text?

Add a column of scores generated from the Naive Bayes classifier to the data, and rerun logistic regression on all numeric variables

In [22]:
data["total_score"] = classifier.predict(data, threshold=3.5)
train_3, test_3 = generateTrainTest(data, 0.7, random_state=8)
X_train, y_train = splitXY(train_3)
model_2 = logistic_model(X_train, y_train)
X_test, y_test = splitXY(test_3)
prediction = model_2.predict(X_test)
printAccuracy(prediction, y_test)


generateTrainTest:     0.004 seconds
generateFormula:     0.001 seconds
generateFormula:     0.001 seconds
Accuracy: 0.6822

The accuracy is not significantly higher.

Task D. Use unsupervised sentiment analysis on the reviews (with SentiStrength or any other tool) and use the sentiment scores to predict high/low rating. Compare and contrast the results of tasks B and D. What can you conclude from your analysis?

Vector angle approach

Compare each review with a totally biased positive/negative review, and compare the ratio of $\frac{\cos \langle review, positive \rangle}{\cos \langle review, positive \rangle}$ with a user specified threshold.


In [23]:
totally_positive = "This restaurant is very good. It is actually the best on that I have ever been to.\
                    The queue could be long, but if you have booked well in advance it would not be a problem.\
                    Everyone smiles and their service is definitely professional. The foods are fantastic,\
                    and the price is low, I mean affordable. The wines are very nice, and there is a good collection\
                    of desserts which tastes phenomenal. The waiter and waitress are attentative and helpful.\
                    I believe they have been trained very well. Tables are clean, dishes\
                    served in time and they taste absolutely delicious. I totally recommend it."

totally_negative = "I can't believe this restaurant could be so bad. We waited for a long time before we were attended\
                    to by a waiter, who was so crude, maybe because he thought I couldn't afford the meal, the price of\
                    which by the way is riculously high. We each ordered 3 courses, but nothing showed up in the following\
                    30 minutes. Nobody even explained that to us. Finally I called the manager, and he just said they were\
                    busy. Well, I could see they were busy, but it doesn't make sense that other people were served better\
                    than us. And the end, we decided to give a smaller tip to the waitor (I preferred not at all), and\
                    I can still remember his face -- disgusting. Please don't go there!"

In [24]:
positive_vec = review2wc(totally_positive)
negative_vec = review2wc(totally_negative)

In [25]:
full_data = pd.read_csv("yelp.csv", usecols=["stars", "Review"])
full_data["target"] = full_data.Review.map(lambda s: 1 if s>3 else 0)
full_data["wc"] = full_data.Review.map(review2wc)

In [26]:
positiveness_partial = functools.partial(positiveness, positive=positive_vec, negative=negative_vec, threshold=.5)
unsupervised_prediction = full_data.wc.map(positiveness_partial)
printAccuracy(unsupervised_prediction, full_data.target)


Accuracy: 0.8887

Task E. Implement the PMI approach to sentiment analysis (in either Python or R), and run the classification model with the sentiment scores. How do your results compare with those in Task D?

Find word pairs and calculate average semantic orientation based on online search results

In [27]:
pattern_1 = [("JJ", "NN"), ("JJ", "NNS"), 
           ("RB", "VB"), ("RB", "VBD"), ("RB", "VBN"), ("RB", "VBG"),
          ("RBR", "VB"), ("RBR", "VBD"), ("RBR", "VBN"), ("RBR", "VBG"),
          ("RBS", "VB"), ("RBS", "VBD"), ("RBS", "VBN"), ("RBS", "VBG")]
pattern_2 = [("RB", "JJ"), ("RBR", "JJ"), ("RBS", "JJ"),
            ("JJ", "JJ"),
            ("NN", "JJ"), ("NNS", "JJ")]
no_match = ["NN", "NNS"]

Filter the review and keep word pairs whose pos tags match the above specification


In [28]:
r2p = functools.partial(review2pairs, pattern_1=pattern_1, pattern_2=pattern_2, no_match=no_match)
pairs=data.Review.map(r2p)

In [29]:
# Initialize custom search engine id
with open("/home/wenduowang/.googleapi/customsearch.engine") as f:
    engine_id = f.readline().strip()

positive = "excellent"
negative = "poor"
res1 = service.list(q=positive, cx=engine_id).execute() # search for positive word
try:
    p = int(res1["searchInformation"]["totalResults"].encode("utf-8")) # retrieve result number
except KeyError:
    p = 0
del res1

# if the result is 0 then reset the word
if p == 0:
    print "Reset positive"
    del p

res2 = service.list(q=negative, cx=engine_id).execute() # search for negative word
try:
    n = int(res2["searchInformation"]["totalResults"].encode("utf-8")) # retrieve result number
except KeyError:
    n = 0

# if the result is 0 then reset the word
if n == 0:
    print "Reset negative"
    del n
    
del res2

prior = p*1.0/n

In [30]:
so = functools.partial(semanticOrientation,
                        engine_id=engine_id,
                        positive="excellent", 
                        negative="poor", 
                        prior = prior,
                        distance=5,
                        threshold=None)
# prediction = so([("restaurant", "good")])

In [31]:
printAccuracy(prediction, data.target[:10])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-31-d9c41805deb2> in <module>()
----> 1 printAccuracy(prediction, data.target[:10])

<ipython-input-10-f23bb6e6d01a> in printAccuracy(prediction, target)
      1 def printAccuracy(prediction, target):
      2     '''Calculate and format accuracy of prediction against target'''
----> 3     print "Accuracy: {:>6.4f}".format((prediction == target).mean())

/usr/local/lib/python2.7/dist-packages/pandas/core/ops.pyc in wrapper(self, other, axis)
    740         elif isinstance(other, (np.ndarray, pd.Index)):
    741             if len(self) != len(other):
--> 742                 raise ValueError('Lengths must match to compare')
    743             return self._constructor(na_op(self.values, np.asarray(other)),
    744                                      index=self.index).__finalize__(self)

ValueError: Lengths must match to compare

Task F. What are the top 5 “attributes” of a restaurant that are associated with (i) high and (ii) low ratings? That is, when people rate a restaurant high or low, are they more likely to mention service, ambiance, etc.?

Vector angle approach

Define several topics with common words, and compare the review with each topic. Calculate the cosine value as a proximity indicator.


In [ ]:
food = "delicious, food, salad, dessert, tasty, melt, wine, disgusting, dish, flavor, crunchy, yummy, appetizing,\
        course, cheese, meat, beef, steak, lamb, soup, cooked, cook, charred, burn, burned, burnt, burning, hot,\
        spicy, fresh, seasoning, seasoned, marinated, crisp, tender, crust"
food = review2wc(food)
service = "waitor, waitress, serve, served, service, serving, polite, considerate, careful, attentative, warm,\
            attention, rude, waiting, wait, long, smile, smiling, smiled, patient, inpatient, angry, anger, annoy,\
            annoying, responsive, response"
service = review2wc(service)
ambiance = "ambiance, decoration, lighting, light, lights, music, piano, candle, candles, environment, decorated,\
            table, cloth, glass, violin, performance, comfort, comfortable, soft, sofa, chair, relax, relaxing,\
            romance, romantic, classic, noisy, boyfriend, girlfriend"
ambiance = review2wc(ambiance)
price = "price, affordable, expensive, cheap, worth, worthwhile, worthy, dear, charge, charged, fee, tip, tips,\
            ripped"
price = review2wc(price)
convenience = "parking, drive, convenience, convenient, commute, highway, road, street, crowded, queue, line,\
                traffic, every, week, month, days, day, everyday"
convenience = review2wc(convenience)
hygiene = "hygiene, clean, dirty, safe, tidy, hygienic, cloth, toilet, restroom, washroom, dress, dressed, hair,\
            bug, fly, flies, smelly, stink, rotten, bad, wash, washed, hand, hands"
hygiene = review2wc(hygiene)
health = "healthy, health, calorie, heavy, light, sweet, sugar, fat, oil, salad, salty, salt, energy, refresh,\
            refreshing, heart, body, portion, size, material, materials, ingredient"
health = review2wc(health)
family = "kid, kids, play, playground, child, children, plays, played, baby, seat, son, girl, daughter, dad, mom,\
        mother, father, grandma, grandpa"
family = review2wc(family)
party = "friends, friend, together, party, celebrate, celebration, celebrated, space, room, big, birthday"
party = review2wc(party)

In [ ]:
def matchAttribute(wc, attribute):
    attr_score = {}
    len_wc = math.sqrt(sum(map(lambda x: x*x, wc.values())))
    len_attr = math.sqrt(sum(map(lambda x: x*x, attribute.values())))
    match = 0
    for key in attribute.keys():
        try:
            match += abs(attribute[key] * wc[key])
        except KeyError:
            continue
    match = match*1.0/len_wc/len_attr
    
    return match

In [ ]:
# Populate the dataframe with matching scores.
for attr in {"food":food, "service":service, "ambiance":ambiance, "price":price, "convenience":convenience,
            "hygiene":hygiene, "health":health, "family":family, "party":party}.items():
    
    matchAttribute_partial = functools.partial(matchAttribute, attribute=attr[1])
    data["match_{}".format(attr[0])] = data.wc.map(matchAttribute_partial)

In [ ]:
# Extract matching scores and target.
match_cols = [col for col in data.columns.values if "match" in col] + ["target"]
match_df = data[match_cols]
match_df = data[match_cols]
# Sort topics by matching score.
print "Sort by higher rated restaurants"
print match_df.groupby("target").mean().T.sort_values([1], ascending=False)
print "\n"
print "Sort by lower rated restaurants"
print match_df.groupby("target").mean().T.sort_values([0], ascending=False)

In [ ]: