In [41]:
import pickle
from collections import defaultdict 
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.neighbors import kneighbors_graph
import time
import timeit

In [35]:
#read places

file  = open ("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close();
print "done"


done

Cross Computing Users


In [36]:
#read places

file  = open ("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close();
print "done"


done

In [37]:
print reviews[0], len(reviews)


(5000, [u'Mexican Restaurant', u'Latin American Restaurant'], "You won't be disappointed in the food.  They do business lunches and groups (6 to 10) very well.  Service always fast and helpful.  This is one of my top 4 Mexican restaurants in Akron area, the only detractor is the age of the building and the environment.  Again not back, nothing to stay away from, but their business is serving Mexican food to their customers and they do that well.  Lunch is the majority of times I have been there.", '101280967457665576418', '103173356293785774089') 1610014

In [42]:
import collections
import re

occursPos = defaultdict(int)
occursNeg = defaultdict(int)

for review in reviews:
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()01234566789\t\n]+", string))
    if review[0] >= 4000:
        for occ in occs:
            occursPos[occ] += 1
    elif review[0] <= 2000:
        for occ in occs:
            occursNeg[occ] += 1

In [43]:
odPos = sorted(occursPos, key=occursPos.get)[::-1]
odNeg = sorted(occursNeg, key=occursNeg.get)[::-1]
#odPos = collections.OrderedDict(sorted(occursPos,key=lambda key: occursPos[key]))
#print odPos[0:100]

In [44]:
print odPos[0], occursPos[odPos[0]]
print odNeg[0], occursNeg[odNeg[0]]


the 1503865
the 671701

In [63]:
xxx = filter(lambda x : occursPos[x] >= 1000, odPos)
yyy = filter(lambda y : occursNeg[y] >= 1000, odNeg)

diff = lambda l1,l2: [x for x in l1 if x not in l2]
inter = set(xxx).intersection(set(yyy))

pos = diff(xxx, inter)
neg = diff(yyy, inter)

pos = [(x, occursPos[x]) for x in pos]
neg = [(x, occursNeg[x]) for x in neg]

In [65]:
print [x for (x, y) in pos[:50]]
print [y for (y, r) in neg[:50]]


['wonderful', 'fantastic', 'loved', 'perfect', 'reasonable', 'attentive', 'outstanding', 'variety', 'pricey', 'specials', 'yummy', 'brunch', 'helpful', 'unique', 'die', 'beautiful', 'ambiance', 'incredible', 'perfectly', 'beers', 'vegetarian', 'neighborhood', 'homemade', 'patio', 'flavors', 'cuisine', 'miss', 'yum', 'chicago', 'glad', 'beat', 'packed', 'chocolate', 'affordable', 'flavorful', 'reasonably', 'pho', 'environment', 'easy', 'healthy', 'gem', 'downtown', 'comfortable', 'crispy', 'cozy', 'interesting', 'favorites', 'consistently', 'generous', 'tender']
['awful', 'disgusting', 'nasty', 'waste', 'worse', 'charged', 'sucks', 'sick', 'gross', 'hair', 'burnt', 'soggy', 'barely', 'employee', 'tasteless', 'refused', 'overcooked', 'sent', 'refund', 'supposed', 'crap', 'cashier', 'mess', 'driver', 'stale', 'apparently', 'bother', 'ridiculous', 'health', 'poisoning', 'unprofessional', 'returned', 'ignored', 'joke', 'spoke', 'receipt', 'nor', 'threw', 'hung', 'messed', 'answer', 'telling', 'complained', 'shame', 'needless', 'calling', 'standing', 'upset', 'disappointment', 'poorly']

In [47]:
i = 0
posDict = {}
for p in pos[:50]:
    posDict[p[0]] = i
    i += 1

In [48]:
i = 0
negDict = {}
for p in neg[:50]:
    negDict[p[0]] = i
    i += 1

In [49]:
print posDict


{'beautiful': 15, 'fantastic': 1, 'neighborhood': 21, 'outstanding': 6, 'ambiance': 16, 'comfortable': 42, 'tender': 49, 'miss': 26, 'packed': 31, 'cozy': 44, 'perfect': 3, 'cuisine': 25, 'generous': 48, 'attentive': 5, 'yummy': 10, 'variety': 7, 'crispy': 43, 'environment': 37, 'consistently': 47, 'wonderful': 0, 'gem': 40, 'homemade': 22, 'perfectly': 18, 'affordable': 33, 'flavors': 24, 'pho': 36, 'easy': 38, 'beat': 30, 'flavorful': 34, 'chocolate': 32, 'downtown': 41, 'pricey': 8, 'beers': 19, 'favorites': 46, 'brunch': 11, 'unique': 13, 'chicago': 28, 'glad': 29, 'loved': 2, 'helpful': 12, 'healthy': 39, 'die': 14, 'vegetarian': 20, 'incredible': 17, 'specials': 9, 'yum': 27, 'reasonable': 4, 'interesting': 45, 'reasonably': 35, 'patio': 23}

In [50]:
posReviewMap = defaultdict(list)

for review in reviews:
    restId = review[4]
    pot = posReviewMap[restId]
    if pot == []:
        pot = [0] * 50
        
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()01234566789\t\n]+", string))
    for occ in occs:
        if occ in posDict:
            pot[posDict[occ]] += 1    
            
    posReviewMap[restId] = pot

In [51]:
negReviewMap = defaultdict(list)

for review in reviews:
    restId = review[4]
    pot = negReviewMap[restId]
    if pot == []:
        pot = [0] * 50
        
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()01234566789\t\n]+", string))
    for occ in occs:
        if occ in negDict:
            pot[negDict[occ]] += 1    
    
    negReviewMap[restId] = pot

In [61]:
for obj in posReviewMap.values():
    if sum(obj) > 50:
        i = 0
        for o in obj:
            if o > 90:
                print pos[i]
            i += 1


('chocolate', 5610)

In [279]:
file  = open ("/home/iizhaki/oasis/CSE255/reviewToPositiveMap.pck", "w")
pickle.dump(posReviewMap, file)
file.close();
print "done"


done

In [280]:
file  = open ("/home/iizhaki/oasis/CSE255/reviewToNegativeMap.pck", "w")
pickle.dump(negReviewMap, file)
file.close();
print "done"


done

In [281]:
file  = open ("/home/iizhaki/oasis/CSE255/positiveDict.pck", "w")
pickle.dump(pos[:50], file)
file.close();
print "done"

file  = open ("/home/iizhaki/oasis/CSE255/negativeDict.pck", "w")
pickle.dump(neg[:50], file)
file.close();
print "done"


done
done

Compare least-square performance


In [16]:
import numpy as np

file = open("/home/iizhaki/oasis/CSE255/MatrixD.pck")
A = np.load(file)
file.close()
print "done"

file = open("/home/iizhaki/oasis/CSE255/YsD.pck")
B = np.load(file)
file.close()
print "done"


done
done

In [17]:
import time
import scipy
from scipy.sparse import csr_matrix
from scipy.optimize import leastsq

In [18]:
start = time.time()
X1, _, _, _ = numpy.linalg.lstsq(A[:2000], B[:2000]);
end = time.time()
print end - start


2.31600308418

In [19]:
start = time.time()
X2, _, _, _ = scipy.linalg.lstsq(A[:2000], B[:2000], False, False);
end = time.time()
print end - start


3.16320014

In [32]:
print X1
print X2


[ -1.05684097e+11   1.05684096e+11   1.05684097e+11 ...,   0.00000000e+00
   0.00000000e+00   0.00000000e+00]
[  1.37792230e+14  -6.11870999e+43   1.52128245e+44 ...,   0.00000000e+00
   0.00000000e+00   0.00000000e+00]

In [26]:


In [ ]: