In [1]:
import pickle
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
In [2]:
# read places
file = open("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck", "rb")
places = pickle.load(file)
file.close()
print "done"
In [3]:
start = time.time()  # timeit.timeit() benchmarks a statement, it is not a wall clock
gps = [tuple(p[2]) for p in places]
# keep only points inside a rough continental-US bounding box
# (g is a tuple, so compare against (0, 0), not [0, 0])
#GPS = [g for g in gps if (g != (0, 0) and ((24 <= g[0] <= 49 and -128 <= g[1] <= -47) or (35 <= g[0] <= 58 and -11 <= g[1] <= 45)))]
GPS = [g for g in gps if g != (0, 0) and 24 <= g[0] <= 49 and -128 <= g[1] <= -47]
print len(gps), len(GPS)
end = time.time()
print end - start
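For readability the bounding box can be named; a sketch using the same continental-US limits as above (the constants and the in_us helper are illustrative, not from the original):
In [ ]:
US_LAT = (24, 49)     # rough continental-US latitude range
US_LON = (-128, -47)  # rough continental-US longitude range

def in_us(g):
    return (g != (0, 0)
            and US_LAT[0] <= g[0] <= US_LAT[1]
            and US_LON[0] <= g[1] <= US_LON[1])

GPS = [g for g in gps if in_us(g)]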
In [4]:
K = 100
km = KMeans(n_clusters=K, n_jobs=-1)
km.fit(GPS)
Out[4]:
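As a quick sanity check on the fit (a sketch, not part of the original run): KMeans.transform returns each point's distance to every centroid, so the row minimum measures how far each restaurant sits from its assigned center.
In [ ]:
# distances to all K centroids, shape (n_points, K)
dists = km.transform(GPS)
# distance to the assigned (nearest) centroid
assigned = dists.min(axis=1)
print assigned.mean(), assigned.max()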
In [5]:
# save places cluster labels
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + ".pck", "wb")
pickle.dump(km.labels_, file)
file.close()
print "done"
In [6]:
# save places cluster centers
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_cent.pck", "wb")
pickle.dump(km.cluster_centers_, file)
file.close()
print "done"
In [73]:
# save the kmeans object itself
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck", "wb")
pickle.dump(km, file)
file.close()
print "done"
In [27]:
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck", "rb")
kmm = pickle.load(file)
file.close()
print "done"
In [ ]:
print type(km)
print [sum(km.labels_ == i) for i in range(K)]
print km.cluster_centers_
print gps[0]
In [ ]:
# X must be a numpy array for the X[:, 0] indexing below
X = np.array([tuple(p[2]) for p in places])
knn_graph = kneighbors_graph(X, 30)
print(X[0])
for connectivity in (None, knn_graph):
    for n_clusters in (30, 3):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.spectral)  # nipy_spectral in newer matplotlib
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                      fontdict=dict(verticalalignment='top'))
            plt.axis('equal')
            plt.axis('off')
        plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                            left=0, right=1)
        plt.suptitle('n_cluster=%i, connectivity=%r' %
                     (n_clusters, connectivity is not None), size=17)
plt.show()
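AgglomerativeClustering builds pairwise structures that grow quadratically with the number of points, so fitting every US restaurant at once may exhaust memory; one common workaround (a sketch under that assumption, with an arbitrary subsample size) is to cluster a random subsample first:
In [ ]:
# fit on a random subsample to keep the pairwise computations tractable
idx = np.random.choice(len(X), size=5000, replace=False)
X_sub = X[idx]
model = AgglomerativeClustering(linkage='ward', n_clusters=30)
model.fit(X_sub)
print np.bincount(model.labels_)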
In [ ]:
# centroids come back as (lat, lon); swap to (lon, lat) so x is longitude
test = [(y, x) for (x, y) in km.cluster_centers_]
plt.plot(*np.transpose(test), marker='o', color='r', ls='')
In [5]:
# take the first centroid and print it as a Google Maps marker call
(x, y) = km.cluster_centers_[0]
print "addMarker(new google.maps.LatLng(", x, ", ", y, "), \"(", x, ", ", y, ")\");"
In [119]:
restReviews = np.load("/home/iizhaki/oasis/CSE255/reviewMatrix.pck")
print restReviews.shape
print km.labels_.max()
def rankVector():
    # group review rows by the cluster of their restaurant:
    # restLocations[l] = (row indices in cluster l, stacked review rows)
    restLocations = [()] * K
    for i, l in enumerate(km.labels_):
        potential = restLocations[l]
        if potential == ():
            potential = ([i], np.array(restReviews[i], copy=True))
        else:
            idxs, arr = potential
            potential = (idxs + [i], np.vstack([arr, restReviews[i]]))
        restLocations[l] = potential
    return restLocations
restLocations = rankVector()
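rankVector re-stacks a cluster's whole matrix on every append, which is quadratic in cluster size. An equivalent pattern (a sketch assuming restReviews is a 2-D array; rankVectorFast is a hypothetical name) collects the row indices per cluster first and slices the review matrix once per cluster:
In [ ]:
def rankVectorFast():
    # gather row indices per cluster, then slice once per cluster
    byCluster = [[] for _ in range(K)]
    for i, l in enumerate(km.labels_):
        byCluster[l].append(i)
    return [(idxs, restReviews[idxs]) for idxs in byCluster]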
In [73]:
#shadow = np.array(restLocations, copy=True)
# restore restLocations from the shadow backup (run the line above first)
restLocations = np.array(shadow, copy=True)
print len(restLocations[1][1])
In [120]:
#for i in range(len(restLocations)):
#    restLocations[i] = (restLocations[i][0], restLocations[i][1].tolist())
file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck", "wb")
pickle.dump(restLocations, file)
file.close()
print "done"
In [63]:
print len(restLocations[0][1]), len(restLocations[0][1][0]), shadow[0][1].shape
In [46]:
restLabel = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
In [52]:
padding = np.ones((len(places), 1))
print padding.shape, restLocations.shape, restReviews.shape, restScore.shape
restFeatures = np.concatenate((padding, restLocations, restReviews), axis=1)
print restFeatures.shape
In [55]:
file = open("/home/iizhaki/oasis/CSE255/reviewFeatures.pck", "wb")
np.save(file, restFeatures)
file.close()
In [148]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
    for l in urllib.urlopen(fname):
        yield eval(l)

print "Reading data..."
#data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x, y):
    return sum([x[i] * y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

# NEGATIVE log-likelihood
def f(theta, X, y, lam):
    logit = np.dot(X, theta)
    loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
    # examples with y == 0 contribute an extra -logit term
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
    print "ll =", loglikelihood
    return -loglikelihood

# NEGATIVE derivative of the log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)))
    dl -= y_spec  # precomputed sum of the X rows with y == 0
    dl -= 2 * lam * theta
    # negate since L-BFGS minimizes and the math above is for ascent
    return -dl

X = np.ones((10, 20))
y = np.array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1])
print y.shape

# Training data
X_train = X
y_train = y
dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

# Test data
#X_test = X[1000:]
#y_test = y[1000:]

theta, l, info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args=(X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l
In [128]:
len(data)
Out[128]:
In [131]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
    for l in urllib.urlopen(fname):
        yield eval(l)

print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x, y):
    return sum([x[i] * y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

# NEGATIVE log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= log(1 + exp(-logit))
        if not y[i]:
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k] * theta[k]
    print "ll =", loglikelihood
    return -loglikelihood

# NEGATIVE derivative of the log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)))
    dl -= y_spec  # precomputed sum of the X rows with y == 0
    dl -= 2 * lam * theta
    # negate since L-BFGS minimizes and the math above is for ascent
    return -dl

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
ntraining = int(len(X) * 0.7)  # slice indices must be integers

# Training data
X_train = X[:ntraining]
y_train = y[:ntraining]

# Test data
X_test = X[ntraining:]
y_test = y[ntraining:]

dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

theta, l, info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args=(X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l
In [2]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[5,5,6],[1,3,3],[9,8,9]])
print np.append(A, B, axis=0)
In [3]:
A = [1,2,3]
A[1] = 5
print A
In [34]:
A = [1,2,3]
B = [2,3,4]
print [A, B]
In [121]:
import pickle
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
    for l in urllib.urlopen(fname):
        yield eval(l)

print "Reading data..."
file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck", "rb")
data = pickle.load(file)
file.close()
print "done"
print data.shape
In [158]:
def inner(x, y):
    return sum([x[i] * y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

# NEGATIVE log-likelihood
def f(theta, X, y, lam):
    logit = np.dot(X, theta)
    loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
    #print "ll =", loglikelihood
    return -loglikelihood

# NEGATIVE derivative of the log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)))
    dl -= y_spec
    dl -= 2 * lam * theta
    # negate since L-BFGS minimizes and the math above is for ascent
    return -dl

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")

# Training data
X_train = X
y_train = y

# Test data
X_test = X[1000:]
y_test = y[1000:]

K = 300
thetas = [None] * K
for k in range(K):
    X_train = np.array(data[k][1])
    #print X_train.shape
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    #print X_train.shape
    y_train = np.array([y[i] for i in data[k][0]])
    #dummy = np.zeros((X_train.shape[1]))
    #y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
    #theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 1.0))
    # per-cluster linear least squares (numpy is imported as np above)
    theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train)
    #print "Final log likelihood for ", k, " = ", -l
    thetas[k] = theta
print "done"
In [144]:
print y_spec.shape
k = 2
X_train = np.array(data[k][1])
y_train = np.array([int(y[i] / 1000) for i in data[k][0]])
print data[k][0][0]
print sum(data[k][1][0] == restReviews[23])
In [160]:
def predict(data, theta):
    theta = np.matrix(theta)
    # each product is a 1x1 matrix; float() unwraps it to a scalar
    prediction = [float(theta * np.matrix(d).T) for d in data]
    return prediction

def MSE(prediction, real):
    # mean of squared errors (the original averaged raw residuals)
    squares = [(p - r) ** 2 for p, r in zip(prediction, real)]
    return np.mean(squares)

num = 0
for k in range(K):
    X_train = np.array(data[k][1])
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    y_train = np.array([y[i] for i in data[k][0]])
    y_pred = predict(X_train, thetas[k])
    #print len(y_pred), len(X_train)
    mse = MSE(y_pred, y_train)
    num = num + mse
    #print " MSE training ", k, " is: ", mse
print num / K
In [126]:
import os
statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
#statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixD.pck")
statinfo.st_size
#13240755216
Out[126]:
In [28]:
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "rb")
usersGps = pickle.load(file)
file.close()
print "done"
file = open("/home/iizhaki/oasis/CSE255/users_id_US.pck", "rb")
userIds = pickle.load(file)
file.close()
print "done"
In [38]:
userPerRegion = {}
# map each user id to the KMeans region of their GPS location
for i, gps in enumerate(usersGps):
    pred = km.predict(gps)
    userPerRegion[userIds[i]] = pred[0]
In [40]:
print list(userPerRegion)[0]
In [41]:
# read reviews
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck", "rb")
reviews = pickle.load(file)
file.close()
print "done"
In [42]:
print reviews[0]
In [ ]:
from collections import defaultdict
import re
occurs = defaultdict(int)
for review in reviews:
    # use each review's own text (the original always read reviews[0])
    string = review[2].lower()
    # split on punctuation, whitespace, and digits
    occs = filter(None, re.split("[,. \-!?:()0-9]+", string))
    for occ in occs:
        occurs[occ] += 1
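The same count can be expressed with collections.Counter, which also reports the most frequent tokens directly; a sketch (equivalent to the loop above once the reviews[0] bug is fixed):
In [ ]:
from collections import Counter
import re
token_re = re.compile(r"[,. \-!?:()0-9]+")
counts = Counter()
for review in reviews:
    counts.update(t for t in token_re.split(review[2].lower()) if t)
print counts.most_common(20)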
In [ ]:
print 2
In [1]:
import numpy as np
# load matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck", "rb")
matrix = np.load(file)
file.close()
print "done"
# load rankings
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck", "rb")
rankingR = np.load(file)
file.close()
print "done"
In [9]:
len(matrix)
Out[9]:
In [13]:
a = np.ndarray([])
In [3]:
# drop rows with a zero ranking, then shuffle the remaining indices
matrix_n = []
rankingR_n = []
for i in range(len(matrix)):
    if rankingR[i] != 0:
        matrix_n.append(matrix[i])
        rankingR_n.append(rankingR[i])
import random
del matrix
del rankingR
matrix = matrix_n
rankingR = rankingR_n
indexes = range(len(matrix))
random.shuffle(indexes)
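Once the rows are in a numpy array, the same shuffle can be done with a single random permutation and fancy indexing, which keeps the matrix, the rankings, and the saved index file consistent by construction; a sketch (m_shuf/ranking_shuf are illustrative names):
In [ ]:
perm = np.random.permutation(len(matrix))
m_shuf = np.array(matrix)[perm]
ranking_shuf = np.array(rankingR)[perm]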
In [7]:
m = np.array(matrix)
In [8]:
import numpy as np
# save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck", "wb")
np.save(file, m)
file.close()
print "done"
# save rankings
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck", "wb")
np.save(file, rankingR)
file.close()
print "done"
# save shuffle indexes
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck", "wb")
np.save(file, indexes)
file.close()
print "done"
In [ ]: