In [1]:
    
import pickle
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.neighbors import kneighbors_graph
import time
import timeit
    
In [2]:
    
# read places
file = open("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
places = pickle.load(file)
file.close()
print "done"
    
    
In [3]:
    
start = time.time()
gps = [tuple(p[2]) for p in places]
# keep only plausible continental-US coordinates; drop (0, 0) placeholders
#GPS = [g for g in gps if (g != (0, 0) and ((24 <= g[0] <= 49 and -128 <= g[1] <= -47) or (35 <= g[0] <= 58 and -11 <= g[1] <= 45)))]
GPS = [g for g in gps if g != (0, 0) and 24 <= g[0] <= 49 and -128 <= g[1] <= -47]
print len(gps), len(GPS)
end = time.time()
print end - start
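    
In [ ]:
    
# Aside (sketch, not from the original notebook): `timeit.timeit()` times
# an empty statement, so it cannot serve as a clock; `time.time()` above is
# the fix. `timeit.default_timer` picks the best wall clock per platform:
from timeit import default_timer
t0 = default_timer()
nonzero = [g for g in gps if g != (0, 0)]
print "filtered", len(nonzero), "points in", default_timer() - t0, "s"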
    
    
In [4]:
    
K = 100
km = KMeans(n_clusters=K, n_jobs=-1)
km.fit(GPS)
    
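In [ ]:
    
# Aside (sketch, not part of the original analysis): K = 100 is fixed
# above. An inertia ("elbow") scan is one way to sanity-check that choice;
# the candidate list here is illustrative.
inertias = []
for k in (10, 25, 50, 100, 200):
    m = KMeans(n_clusters=k, n_jobs=-1).fit(GPS)
    inertias.append((k, m.inertia_))
print inertias  # look for the k where the drop in inertia flattens out
    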
In [5]:
    
# save per-place cluster labels
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + ".pck", "wb")
pickle.dump(km.labels_, file)
file.close()
print "done"
    
    
In [6]:
    
# save cluster centers
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_cent.pck", "wb")
pickle.dump(km.cluster_centers_, file)
file.close()
print "done"
    
    
In [73]:
    
# save the fitted KMeans object itself
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck", "wb")
pickle.dump(km, file)
file.close()
print "done"
    
    
In [27]:
    
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck")
kmm = pickle.load(file)
file.close()
print "done"
    
    
In [ ]:
    
print type(km)
print [sum(km.labels_ == i) for i in range(K)]
print km.cluster_centers_
print gps[0]
    
In [ ]:
    
import numpy as np
import matplotlib.pyplot as plt
# the scatter plots below index X by column, so build an array up front
X = np.array([tuple(p[2]) for p in places])
knn_graph = kneighbors_graph(X, 30)
print(X[0])
for connectivity in (None, knn_graph):
    for n_clusters in (30, 3):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.spectral)
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                      fontdict=dict(verticalalignment='top'))
            plt.axis('equal')
            plt.axis('off')
        plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                            left=0, right=1)
        plt.suptitle('n_cluster=%i, connectivity=%r' %
                     (n_clusters, connectivity is not None), size=17)
plt.show()
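    
In [ ]:
    
# Aside (sketch, not from the original run): AgglomerativeClustering needs
# O(n^2) memory without connectivity constraints, which is painful at this
# dataset's size. Running the comparison on a random subsample keeps it
# tractable; sample_size is an illustrative value.
import numpy as np
sample_size = 20000
idx = np.random.choice(len(X), sample_size, replace=False)
X_small = X[idx]
model = AgglomerativeClustering(linkage='ward', n_clusters=30).fit(X_small)
print np.bincount(model.labels_)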
    
In [ ]:
    
# swap (lat, lon) -> (lon, lat) so longitude runs along the x-axis
test = [(y, x) for (x, y) in km.cluster_centers_]
plt.plot(*np.transpose(test), marker='o', color='r', ls='')
    
In [5]:
    
for (x, y) in km.cluster_centers_:
    break  # disabled: remove this break to regenerate the Google Maps markers
    print "addMarker(new google.maps.LatLng(", x, ", ", y, "), \"(", x, ", ", y, ")\");"
    
In [119]:
    
restReviews = np.load("/home/iizhaki/oasis/CSE255/reviewMatrix.pck")
print restReviews.shape 
print km.labels_.max()
def rankVector():
    # group review rows by the cluster label of their place
    restLocations = [()] * K
    for i, l in enumerate(km.labels_):
        potential = restLocations[l]
        if potential == ():
            potential = ([i], np.array(restReviews[i], copy=True))
        else:
            idxs, arr = potential
            potential = (idxs + [i], np.vstack([arr, restReviews[i]]))
        restLocations[l] = potential
    return restLocations
restLocations = rankVector()
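    
In [ ]:
    
# Aside (sketch, not from the original notebook): the repeated np.vstack in
# rankVector copies each cluster's array on every append, which is quadratic
# overall. Collecting indices first and slicing once per cluster is linear:
def rankVectorFast():
    groups = [[] for _ in range(K)]
    for i, l in enumerate(km.labels_):
        groups[l].append(i)
    return [(idxs, restReviews[idxs]) if idxs else () for idxs in groups]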
    
    
In [73]:
    
#shadow = np.array(restLocations, copy=True)
# restore restLocations from the backup copy `shadow` made in an earlier run
restLocations = np.array(shadow, copy=True)
print len(restLocations[1][1])
    
    
In [120]:
    
#for i in range(len(restLocations)):
#    restLocations[i] = (restLocations[i][0], restLocations[i][1].tolist())
file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck", "wb")
pickle.dump(restLocations, file)
file.close()
print "done"
    
    
In [63]:
    
print len(restLocations[0][1]), len(restLocations[0][1][0]), shadow[0][1].shape
    
    
In [46]:
    
restLabel = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
    
    
In [52]:
    
# here restLocations is assumed to be an (n_places, K) matrix (e.g. a
# one-hot encoding of each place's cluster), not the grouped tuples above
padding = np.ones((len(places), 1))
print padding.shape, restLocations.shape, restReviews.shape
restFeatures = np.concatenate((padding, restLocations, restReviews), axis=1)
print restFeatures.shape
    
    
In [55]:
    
file = open("/home/iizhaki/oasis/CSE255/reviewFeatures.pck", "wb")
np.save(file, restFeatures)
file.close()
    
In [148]:
    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log
def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)
print "Reading data..."
#data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])
def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))
# NEGATIVE log-likelihood (vectorized):
# ll = sum_i [ -log(1 + exp(-logit_i)) - (1 - y_i) * logit_i ] - lam * theta.theta
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  print "ll =", loglikelihood
  return -loglikelihood
# NEGATIVE derivative of the log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, 1 - sigmoid(logit))
    # y_spec is a precomputed global: sum of X[i] over negative (y = 0) examples
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate since L-BFGS minimizes but the log-likelihood is maximized
    return -dl
X = np.ones((10, 20))
y = np.array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1])
print y.shape
# Training data
X_train = X
y_train = y
dummy = np.zeros((X_train.shape[1]))
# sum of the feature vectors of all negative (y = 0) training examples
y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
# Test data
#X_test = X[1000:]
#y_test = y[1000:]
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l
    
    
In [128]:
    
len(data)
    
In [131]:
    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log
def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)
print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])
def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  print "ll =", loglikelihood
  return -loglikelihood
# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, 1 - sigmoid(logit))
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate since L-BFGS minimizes but the log-likelihood is maximized
    return -dl
X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
ntraining = int(len(X) * 0.7)
# Training data
X_train = X[:ntraining]
y_train = y[:ntraining]
# Test data
X_test = X[ntraining:]
y_test = y[ntraining:]
dummy = np.zeros((X_train.shape[1]))
# sum of the feature vectors of all negative (y = 0) training examples
y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l
    
    
    
In [2]:
    
import numpy as np
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[5,5,6],[1,3,3],[9,8,9]])
print np.append(A, B, axis=0)
    
    
In [3]:
    
A = [1,2,3]
A[1] = 5
print A
    
    
In [34]:
    
A = [1,2,3]
B = [2,3,4]
print [A, B]
    
    
In [121]:
    
import numpy as np
import pickle
import urllib
import scipy.optimize
import random
from math import exp
from math import log
def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)
print "Reading data..."
file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck")
data = pickle.load(file)
file.close()
print "done"
print data.shape
    
    
    
In [158]:
    
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])
def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  #print "ll =", loglikelihood
  return -loglikelihood
# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, 1 - sigmoid(logit))
    # y_spec is a precomputed global: sum of X[i] over negative (y = 0) examples
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate since L-BFGS minimizes but the log-likelihood is maximized
    return -dl
X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
# Training data
X_train = X
y_train = y
# Test data
X_test = X[1000:]
y_test = y[1000:]
K = 300
thetas = [None] * K
for k in range(K):
    X_train = np.array(data[k][1])
    #print X_train.shape
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    #print X_train.shape
    y_train = np.array([y[i] for i in data[k][0]])
    
    #dummy = np.zeros((X_train.shape[1]))
    #y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
    #theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 1.0))
    theta, residuals, rank, s = np.linalg.lstsq(X_train, y_train)
    #print "Final log likelihood for ", k, " = ", -l
    
    thetas[k] = theta
    
print "done"
    
    
In [144]:
    
print y_spec.shape
k = 2
X_train = np.array(data[k][1])
y_train = np.array([int(y[i] / 1000) for i in data[k][0]])
print data[k][0][0]
print sum(data[k][1][0] == restReviews[23])
    
    
In [160]:
    
def predict(data, theta):
    theta = np.matrix(theta)
    prediction = [theta * np.matrix(d).T for d in data]
    return prediction
def MSE(prediction, real):
    squares = [(p - r) ** 2 for p, r in zip(prediction, real)]
    return np.mean(squares)
num = 0
for k in range(K):
    X_train = data[k][1]
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    y_train = np.array([y[i] for i in data[k][0]])
    
    y_pred = predict(X_train, thetas[k])
    #print len(y_pred), len(X_train)
    
    mse = MSE(y_pred, y_train)
    num = num + mse
    #print " MSE training ", k, " is: ", mse
    
print num / K
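    
In [ ]:
    
# Aside (cross-check sketch, not from the original notebook): the training
# MSE of one cluster via plain numpy arrays, avoiding np.matrix:
k = 0
Xk = np.hstack([np.ones((data[k][1].shape[0], 1)), data[k][1]])
yk = np.array([y[i] for i in data[k][0]])
resid = np.dot(Xk, thetas[k]) - yk
print np.mean(resid ** 2)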
    
    
In [126]:
    
import os
statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
#statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixD.pck")
statinfo.st_size
#13240755216
    
In [28]:
    
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck")
usersGps = pickle.load(file)
file.close()
print "done"
file = open("/home/iizhaki/oasis/CSE255/users_id_US.pck")
userIds = pickle.load(file)
file.close()
print "done"
    
    
In [38]:
    
# assign each user to the nearest KMeans region of their GPS position
userPerRegion = {}
for uid, gps in zip(userIds, usersGps):
    userPerRegion[uid] = km.predict([gps])[0]
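    
In [ ]:
    
# Aside (sketch, not from the original notebook): one vectorized predict
# call is far faster than a per-user loop; assumes usersGps is a list of
# (lat, lon) pairs.
import numpy as np
preds = km.predict(np.array(usersGps))
userPerRegion = dict(zip(userIds, preds))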
    
In [40]:
    
print list(userPerRegion)[0]
    
    
In [41]:
    
# read reviews
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close()
print "done"
    
    
In [42]:
    
print reviews[0]
    
    
In [ ]:
    
from collections import defaultdict
import re
occurs = defaultdict(int)
for review in reviews:
    # tokenize each review's text on punctuation, whitespace, and digits
    string = review[2].lower()
    occs = filter(None, re.split("[,. \-!?:()0-9]+", string))
    for occ in occs:
        occurs[occ] += 1
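    
In [ ]:
    
# Aside (sketch, not from the original notebook): the most frequent tokens,
# via collections.Counter:
from collections import Counter
print Counter(occurs).most_common(20)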
    
In [ ]:
    
print 2
    
In [1]:
    
import numpy as np
# load the reordered review matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix = np.load(file)
file.close()
print "done"
# load the reordered ranking vector
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"
    
    
In [9]:
    
len(matrix)
    
In [13]:
    
a = np.ndarray([])
    
    
In [3]:
    
# drop rows whose ranking is zero, then shuffle the surviving indices
matrix_n = []
rankingR_n = []
for i in range(len(matrix)):
    if rankingR[i] != 0:
        matrix_n.append(matrix[i])
        rankingR_n.append(rankingR[i])
import random
del matrix
del rankingR
matrix = matrix_n
rankingR = rankingR_n
indexes = range(len(matrix))
random.shuffle(indexes)
    
In [7]:
    
m = np.array(matrix)
    
In [8]:
    
import numpy as np
# save the filtered matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck", "wb")
np.save(file, m)
file.close()
print "done"
# save the filtered ranking vector
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck", "wb")
np.save(file, rankingR)
file.close()
print "done"
# save the shuffled index order
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck", "wb")
np.save(file, indexes)
file.close()
print "done"
    
    
In [ ]: