In [41]:
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
import time
import timeit
In [35]:
#read reviews
file = open ("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close();
print "done"
In [37]:
print reviews[0], len(reviews)
In [42]:
import collections
import re
occursPos = defaultdict(int)
occursNeg = defaultdict(int)
for review in reviews:
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()0-9\t\n]+", string))
    if review[0] >= 4000:
        for occ in occs:
            occursPos[occ] += 1
    elif review[0] <= 2000:
        for occ in occs:
            occursNeg[occ] += 1
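A more compact alternative to the tallying cell above, as a sketch only (count_words and token_re are hypothetical names): collections.Counter does the same positive/negative word counting, assuming the review tuple layout used above (scaled rating in review[0], text in review[2]).
In [ ]:
import re
from collections import Counter

token_re = re.compile(r"[,. !?:()0-9\t\n]+")

def count_words(revs, keep):
    # keep: predicate on the scaled rating stored in review[0]
    counts = Counter()
    for r in revs:
        if keep(r[0]):
            counts.update(t for t in token_re.split(r[2].lower()) if t)
    return counts

# occursPos = count_words(reviews, lambda rating: rating >= 4000)
# occursNeg = count_words(reviews, lambda rating: rating <= 2000)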
In [43]:
odPos = sorted(occursPos, key=occursPos.get)[::-1]
odNeg = sorted(occursNeg, key=occursNeg.get)[::-1]
#odPos = collections.OrderedDict(sorted(occursPos,key=lambda key: occursPos[key]))
#print odPos[0:100]
In [44]:
print odPos[0], occursPos[odPos[0]]
print odNeg[0], occursNeg[odNeg[0]]
In [63]:
xxx = filter(lambda x : occursPos[x] >= 1000, odPos)
yyy = filter(lambda y : occursNeg[y] >= 1000, odNeg)
diff = lambda l1,l2: [x for x in l1 if x not in l2]
inter = set(xxx).intersection(set(yyy))
pos = diff(xxx, inter)
neg = diff(yyy, inter)
pos = [(x, occursPos[x]) for x in pos]
neg = [(x, occursNeg[x]) for x in neg]
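The "frequent on one side only" selection above can also be written with set arithmetic; a sketch (the *Alt names are hypothetical), assuming the same 1000-occurrence threshold:
In [ ]:
posWords = set(w for w in occursPos if occursPos[w] >= 1000)
negWords = set(w for w in occursNeg if occursNeg[w] >= 1000)
# keep words frequent in one polarity but not the other, ordered by frequency
posAlt = sorted(posWords - negWords, key=occursPos.get, reverse=True)
negAlt = sorted(negWords - posWords, key=occursNeg.get, reverse=True)
posAlt = [(w, occursPos[w]) for w in posAlt]
negAlt = [(w, occursNeg[w]) for w in negAlt]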
In [65]:
print [x for (x, y) in pos[:50]]
print [x for (x, y) in neg[:50]]
In [47]:
i = 0
posDict = {}
for p in pos[:50]:
    posDict[p[0]] = i
    i += 1
In [48]:
i = 0
negDict = {}
for p in neg[:50]:
    negDict[p[0]] = i
    i += 1
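Both index-building cells can be collapsed with enumerate; a minimal sketch, assuming pos and neg hold (word, count) pairs as built above:
In [ ]:
posDict = dict((w, i) for i, (w, _) in enumerate(pos[:50]))
negDict = dict((w, i) for i, (w, _) in enumerate(neg[:50]))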
In [49]:
print posDict
In [50]:
posReviewMap = defaultdict(list)
for review in reviews:
    restId = review[4]
    pot = posReviewMap[restId]
    if pot == []:
        pot = [0] * 50
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()0-9\t\n]+", string))
    for occ in occs:
        if occ in posDict:
            pot[posDict[occ]] += 1
    posReviewMap[restId] = pot
In [51]:
negReviewMap = defaultdict(list)
for review in reviews:
    restId = review[4]
    pot = negReviewMap[restId]
    if pot == []:
        pot = [0] * 50
    string = review[2].lower()
    occs = filter(None, re.split("[,. !?:()0-9\t\n]+", string))
    for occ in occs:
        if occ in negDict:
            pot[negDict[occ]] += 1
    negReviewMap[restId] = pot
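The two mapping cells above differ only in which word-index dictionary they use; a hedged refactoring sketch (build_feature_map is a hypothetical helper) that produces the same per-restaurant 50-dimensional count vectors:
In [ ]:
import re
from collections import defaultdict

def build_feature_map(revs, word_index, dim=50):
    # per-restaurant counts restricted to the 50-word vocabulary in word_index
    fmap = defaultdict(lambda: [0] * dim)
    for r in revs:
        vec = fmap[r[4]]  # restaurant id, as in the cells above
        for tok in filter(None, re.split("[,. !?:()0-9\t\n]+", r[2].lower())):
            if tok in word_index:
                vec[word_index[tok]] += 1
    return fmap

# posReviewMap = build_feature_map(reviews, posDict)
# negReviewMap = build_feature_map(reviews, negDict)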
In [61]:
for obj in posReviewMap.values():
    if sum(obj) > 50:
        i = 0
        for o in obj:
            if o > 90:
                print pos[i]
            i += 1
In [279]:
file = open ("/home/iizhaki/oasis/CSE255/reviewToPositiveMap.pck", "w")
pickle.dump(posReviewMap, file)
file.close();
print "done"
In [280]:
file = open ("/home/iizhaki/oasis/CSE255/reviewToNegativeMap.pck", "w")
pickle.dump(negReviewMap, file)
file.close();
print "done"
In [281]:
file = open ("/home/iizhaki/oasis/CSE255/positiveDict.pck", "w")
pickle.dump(pos[:50], file)
file.close();
print "done"
file = open ("/home/iizhaki/oasis/CSE255/negativeDict.pck", "w")
pickle.dump(neg[:50], file)
file.close();
print "done"
In [16]:
import numpy as np
file = open("/home/iizhaki/oasis/CSE255/MatrixD.pck")
A = np.load(file)
file.close()
print "done"
file = open("/home/iizhaki/oasis/CSE255/YsD.pck")
B = np.load(file)
file.close()
print "done"
In [17]:
import time
import scipy
import scipy.linalg
from scipy.sparse import csr_matrix
from scipy.optimize import leastsq
In [18]:
start = time.time()
X1, _, _, _ = np.linalg.lstsq(A[:2000], B[:2000])
end = time.time()
print end - start
In [19]:
start = time.time()
X2, _, _, _ = scipy.linalg.lstsq(A[:2000], B[:2000], overwrite_a=False, overwrite_b=False)
end = time.time()
print end - start
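For a steadier comparison than a single time.time() delta, timeit (imported at the top of the notebook) can average several runs; a sketch, assuming repeating the 2000-row solves a few times is affordable:
In [ ]:
import timeit
print(timeit.timeit(lambda: np.linalg.lstsq(A[:2000], B[:2000]), number=3) / 3)
print(timeit.timeit(lambda: scipy.linalg.lstsq(A[:2000], B[:2000]), number=3) / 3)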
In [32]:
print X1
print X2
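As a sanity check on the two solvers, the coefficient vectors should agree to numerical tolerance, and the residual norm summarizes fit quality on the 2000-row slice; a sketch:
In [ ]:
print(np.allclose(X1, X2, atol=1e-6))
print(np.linalg.norm(A[:2000].dot(X1) - B[:2000]))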