In [1]:
import pickle
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.neighbors import kneighbors_graph
import time
import timeit

In [2]:
#read places

file  = open ("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
places = pickle.load(file)
file.close();
print "done"


done

In [3]:
start = timeit.timeit()
gps = [tuple(p[2]) for p in places]

#GPS = [g for g in gps if (g != [0, 0] and ((g[0] >= 24 and g[0] <= 49 and g[1] >= -128 and g[1] <= -47) or (g[0] >= 35 and g[0] <= 58 and g[1] >= -11 and g[1] <= 45)))]
GPS = [g for g in gps if (g != [0, 0] and (g[0] >= 24 and g[0] <= 49 and g[1] >= -128 and g[1] <= -47))]
print len(gps), len(GPS)
end = timeit.timeit()
print end - start


357191 357191
-0.00137424468994

In [4]:
K = 100
km =  KMeans (n_clusters=K, n_jobs=-1)
km.fit(GPS)


Out[4]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=-1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [5]:
#save places clusters
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + ".pck", "w")
pickle.dump(km.labels_, file)
file.close()
print "done"


done

In [6]:
#save places clusters
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_cent.pck", "w")
pickle.dump(km.cluster_centers_, file)
file.close()
print "done"


done

In [73]:
#save kmeans object itself
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck", "w")
pickle.dump(km, file)
file.close()
print "done"


done

In [27]:
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck")
kmm = pickle.load(file)
file.close()
print "done"


done

In [ ]:
print type (km)
print [sum(km.labels_ == i) for i in range(K)]
print km.cluster_centers_
print gps[0]

In [ ]:
X = [tuple(p[2]) for p in places]
knn_graph = kneighbors_graph(X, 30)
print(X[0])

for connectivity in (None, knn_graph):
    for n_clusters in (30, 3):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.spectral)
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                      fontdict=dict(verticalalignment='top'))
            plt.axis('equal')
            plt.axis('off')

            plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                                left=0, right=1)
            plt.suptitle('n_cluster=%i, connectivity=%r' %
                         (n_clusters, connectivity is not None), size=17)


plt.show()

In [ ]:
test = [(y, x) for (x, y) in km.cluster_centers_]
plt.plot(*np.transpose(test), marker='o', color='r', ls='')

In [5]:
for (x, y) in km.cluster_centers_:
    break
    print "addMarker(new google.maps.LatLng(", x, ", ",  y, "), \"(", x, ", ", y,  ")\");"

In [119]:
restReviews = np.load("/home/iizhaki/oasis/CSE255/reviewMatrix.pck")
print restReviews.shape 

print km.labels_.max()

def rankVector():
    total = len(places)
    i = -1
    restLocations = [()] * K

    for l in km.labels_:
        i = i + 1
        
        potential = restLocations[l]
        if potential == ():
            potential = ([i], np.array(restReviews[i], copy=True))
        else:
            idxs, arr = potential
            potential = (idxs + [i], np.vstack([arr, restReviews[i]]))
        restLocations[l] = potential
            
    return restLocations

restLocations = rankVector()


(357191, 363)
299

In [73]:
#shadow = np.array(restLocations, copy=True)
restLocations = np.array(shadow, copy=True)
print len(restLocations[1][1])


673

In [120]:
#for i in range (len(restLocations)):
 #   restLocations[i] = (restLocations[i][0], restLocations[i][1].tolist())

file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck", "w")
pickle.dump(restLocations, file)
file.close()
print "done"


done

In [63]:
print len(restLocations[0][1]), len(restLocations[0][1][0]), shadow[0][1].shape


1662 363 (1662, 363)

In [46]:
restLabel = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")




In [52]:
padding = np.ones((len(places), 1))
print padding.shape, restLocations.shape, restReviews.shape, restScore.shape
restFeatures = np.concatenate((padding, restLocations, restReviews), axis = 1)
print restFeatures.shape


(357191, 1) (357191, 300) (357191, 363) (357191,)
(357191, 664)

In [55]:
file = open("/home/iizhaki/oasis/CSE255/reviewFeatures.pck", "w")
np.save(file, restFeatures)
file.close()

In [148]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
#data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])


X = np.ones((10, 20))
y = np.array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1])
print y.shape

# Training data
X_train = X
y_train = y

dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

# Test data
#X_test = X[1000:]
#y_test = y[1000:]

theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l


Reading data...
done
(10,)
ll = 6.9314718056
ll = -27.7192343061
ll = 5.74386274729
ll = 6.91421450357
ll = 6.93122858857
ll = 6.9314683793
ll = 6.93147175733
ll = 6.93147180492
ll = 6.93147180559
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
(20,)
Final log likelihood = 6.9314718056

In [128]:
len(data
    )


Out[128]:
357191

In [131]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")


ntraining = len(X) * 0.7
# Training data
X_train = X[:ntraining]
y_train = y[:ntraining]

# Test data
X_test = X[ntraining:]
y_test = y[ntraining:]

dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l


Reading data...
done
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-131-563325817cde> in <module>()
     56 
     57 dummy = np.zeros((X_train.shape[1]))
---> 58 y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
     59 
     60 theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))

KeyboardInterrupt: 

In [2]:
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[5,5,6],[1,3,3],[9,8,9]])
print np.append(A, B, axis=0)


[[1 2 3]
 [4 5 6]
 [7 8 9]
 [5 5 6]
 [1 3 3]
 [9 8 9]]

In [3]:
A = [1,2,3]
A[1] = 5
print A


[1, 5, 3]

In [34]:
A = [1,2,3]
B = [2,3,4];
print [A, B]


[[1, 2, 3], [2, 3, 4]]

In [121]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
file  = open ("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck")
data = pickle.load(file)
file.close();
print "done"
print data.shape


Reading data...
done
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-121-bacc95db270b> in <module>()
     15 file.close();
     16 print "done"
---> 17 print data.shape

AttributeError: 'list' object has no attribute 'shape'

In [158]:
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  #print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")

# Training data
X_train = X
y_train = y

# Test data
X_test = X[1000:]
y_test = y[1000:]
K = 300

thetas = [None] * K

for k in range(K):
    X_train = np.array(data[k][1])
    #print X_train.shape
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    #print X_train.shape
    y_train = np.array([y[i] for i in data[k][0]])
    
    #dummy = np.zeros((X_train.shape[1]))
    #y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

    #theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 1.0))
    theta, residuals, rank, s = numpy.linalg.lstsq(X_train, y_train)
    #print "Final log likelihood for ", k, " = ", -l
    
    thetas[k] = theta
    
print "done"


done

In [144]:
print y_spec.shape

k = 2
X_train = np.array(data[k][1])
y_train = np.array([(int)(y[i] / 1000) for i in data[k][0]])

print data[k][0][0]
print sum(data[k][1][0] == restReviews[23])


(363,)
23
363

In [160]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta * numpy.matrix(d).T  for d in data]
    return prediction

def MSE(prediction, real):
    squares = [ (p - r) for p,r in zip (prediction, real) ]
    return numpy.mean(squares)

num = 0
for k in range(K):
    X_train = data[k][1]
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    y_train = np.array([y[i] for i in data[k][0]])
    
    y_pred = predict(X_train, thetas[k])
    #print len(y_pred), len(X_train)
    
    mse = MSE(y_pred, y_train)
    num = num + mse
    #print " MSE training ", k, " is: ", mse
    
print num / K


-0.259466274409

In [126]:
import os

statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
#statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixD.pck")
statinfo.st_size
#13240755216


Out[126]:
5504766416

Cross Computing Users


In [28]:
file  = open ("/home/iizhaki/oasis/CSE255/users_GPS_US.pck")
usersGps = pickle.load(file)
file.close();
print "done"

file  = open ("/home/iizhaki/oasis/CSE255/users_id_US.pck")
userIds = pickle.load(file)
file.close();
print "done"


done
done

In [38]:
userPerRegion = {}
i = 0
for gps in usersGps:
    pred = km.predict(gps)
    userPerRegion[userIds[i]] = pred[0]
    i = i + 1

In [40]:
print (list(userPerRegion))[0]


108437601487080769932

In [41]:
#read places

file  = open ("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close();
print "done"


done

In [42]:
print reviews[0]


(5000, [u'Mexican Restaurant', u'Latin American Restaurant'], "You won't be disappointed in the food.  They do business lunches and groups (6 to 10) very well.  Service always fast and helpful.  This is one of my top 4 Mexican restaurants in Akron area, the only detractor is the age of the building and the environment.  Again not back, nothing to stay away from, but their business is serving Mexican food to their customers and they do that well.  Lunch is the majority of times I have been there.", '101280967457665576418', '103173356293785774089')

In [ ]:
from collections import defaultdict
import re

occurs = defaultdict(int)
for review in reviews:
    string = reviews[0][2].lower()
    occs = filter(None, re.split("[,. \-!?:()01234566789]+", string))
    for occ in occs:
        occurs[occ] += 1

In [ ]:
print 2

In [1]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"


done
done

In [9]:
len(matrix)


Out[9]:
1609992

In [13]:
a=  np.ndarray([])


2.03512200714e-316

In [3]:
matrix_n   = []
rankingR_n = []
for i in range(len(matrix)):
    if rankingR[i]!=0:
        matrix_n.append(matrix[i])
        rankingR_n.append(rankingR[i])
import random
del matrix
del rankingR
matrix   = matrix_n
rankingR = rankingR_n
indexes = range(len(matrix))
random.shuffle(indexes)

In [7]:
m = np.array(matrix)

In [8]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck", "w")
np.save(file,m)
file.close()
print "done"


#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck", "w")
np.save(file,rankingR)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck", "w")
np.save(file,indexes)
file.close()
print "done"


done
done
done

In [ ]: