notebook.community

Edit and run



In [1]:

    
import pickle
from sklearn.cluster import KMeans 
from sklearn.cluster import AgglomerativeClustering 
from sklearn.neighbors import kneighbors_graph
import time
import timeit



In [2]:

    
#read places

file  = open ("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
places = pickle.load(file)
file.close();
print "done"









    



done



In [3]:

    
start = timeit.timeit()
gps = [tuple(p[2]) for p in places]

#GPS = [g for g in gps if (g != [0, 0] and ((g[0] >= 24 and g[0] <= 49 and g[1] >= -128 and g[1] <= -47) or (g[0] >= 35 and g[0] <= 58 and g[1] >= -11 and g[1] <= 45)))]
GPS = [g for g in gps if (g != [0, 0] and (g[0] >= 24 and g[0] <= 49 and g[1] >= -128 and g[1] <= -47))]
print len(gps), len(GPS)
end = timeit.timeit()
print end - start









    



357191 357191
-0.00137424468994



In [4]:

    
K = 100
km =  KMeans (n_clusters=K, n_jobs=-1)
km.fit(GPS)









    Out[4]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=-1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)



In [5]:

    
#save places clusters
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + ".pck", "w")
pickle.dump(km.labels_, file)
file.close()
print "done"









    



done



In [6]:

    
#save places clusters
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_cent.pck", "w")
pickle.dump(km.cluster_centers_, file)
file.close()
print "done"









    



done



In [73]:

    
#save kmeans object itself
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck", "w")
pickle.dump(km, file)
file.close()
print "done"









    



done



In [27]:

    
file = open("/home/iizhaki/oasis/CSE255/gpsUS" + str(K) + "_obj.pck")
kmm = pickle.load(file)
file.close()
print "done"









    



done



In [ ]:

    
print type (km)
print [sum(km.labels_ == i) for i in range(K)]
print km.cluster_centers_
print gps[0]



In [ ]:

    
X = [tuple(p[2]) for p in places]
knn_graph = kneighbors_graph(X, 30)
print(X[0])

for connectivity in (None, knn_graph):
    for n_clusters in (30, 3):
        plt.figure(figsize=(10, 4))
        for index, linkage in enumerate(('average', 'complete', 'ward')):
            plt.subplot(1, 3, index + 1)
            model = AgglomerativeClustering(linkage=linkage,
                                            connectivity=connectivity,
                                            n_clusters=n_clusters)
            t0 = time.time()
            model.fit(X)
            elapsed_time = time.time() - t0
            plt.scatter(X[:, 0], X[:, 1], c=model.labels_,
                        cmap=plt.cm.spectral)
            plt.title('linkage=%s (time %.2fs)' % (linkage, elapsed_time),
                      fontdict=dict(verticalalignment='top'))
            plt.axis('equal')
            plt.axis('off')

            plt.subplots_adjust(bottom=0, top=.89, wspace=0,
                                left=0, right=1)
            plt.suptitle('n_cluster=%i, connectivity=%r' %
                         (n_clusters, connectivity is not None), size=17)


plt.show()



In [ ]:

    
test = [(y, x) for (x, y) in km.cluster_centers_]
plt.plot(*np.transpose(test), marker='o', color='r', ls='')



In [5]:

    
for (x, y) in km.cluster_centers_:
    break
    print "addMarker(new google.maps.LatLng(", x, ", ",  y, "), \"(", x, ", ", y,  ")\");"



In [119]:

    
restReviews = np.load("/home/iizhaki/oasis/CSE255/reviewMatrix.pck")
print restReviews.shape 

print km.labels_.max()

def rankVector():
    total = len(places)
    i = -1
    restLocations = [()] * K

    for l in km.labels_:
        i = i + 1
        
        potential = restLocations[l]
        if potential == ():
            potential = ([i], np.array(restReviews[i], copy=True))
        else:
            idxs, arr = potential
            potential = (idxs + [i], np.vstack([arr, restReviews[i]]))
        restLocations[l] = potential
            
    return restLocations

restLocations = rankVector()









    



(357191, 363)
299



In [73]:

    
#shadow = np.array(restLocations, copy=True)
restLocations = np.array(shadow, copy=True)
print len(restLocations[1][1])



In [120]:

    
#for i in range (len(restLocations)):
 #   restLocations[i] = (restLocations[i][0], restLocations[i][1].tolist())

file = open("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck", "w")
pickle.dump(restLocations, file)
file.close()
print "done"









    



done



In [63]:

    
print len(restLocations[0][1]), len(restLocations[0][1][0]), shadow[0][1].shape









    



1662 363 (1662, 363)



In [46]:

    
restLabel = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")



In [52]:

    
padding = np.ones((len(places), 1))
print padding.shape, restLocations.shape, restReviews.shape, restScore.shape
restFeatures = np.concatenate((padding, restLocations, restReviews), axis = 1)
print restFeatures.shape









    



(357191, 1) (357191, 300) (357191, 363) (357191,)
(357191, 664)



In [55]:

    
file = open("/home/iizhaki/oasis/CSE255/reviewFeatures.pck", "w")
np.save(file, restFeatures)
file.close()



In [148]:

    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
#data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])


X = np.ones((10, 20))
y = np.array([0, 1, 1, 1, 0, 0, 1, 0, 1, 1])
print y.shape

# Training data
X_train = X
y_train = y

dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

# Test data
#X_test = X[1000:]
#y_test = y[1000:]

theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l









    



Reading data...
done
(10,)
ll = 6.9314718056
ll = -27.7192343061
ll = 5.74386274729
ll = 6.91421450357
ll = 6.93122858857
ll = 6.9314683793
ll = 6.93147175733
ll = 6.93147180492
ll = 6.93147180559
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
ll = 6.9314718056
(20,)
Final log likelihood = 6.9314718056



In [128]:

    
len(data
    )









    Out[128]:





357191



In [131]:

    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")


ntraining = len(X) * 0.7
# Training data
X_train = X[:ntraining]
y_train = y[:ntraining]

# Test data
X_test = X[ntraining:]
y_test = y[ntraining:]

dummy = np.zeros((X_train.shape[1]))
y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
print theta.shape
print "Final log likelihood =", -l









    



Reading data...
done






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-131-563325817cde> in <module>()
     56 
     57 dummy = np.zeros((X_train.shape[1]))
---> 58 y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
     59 
     60 theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))

KeyboardInterrupt:



In [2]:

    
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
B = np.array([[5,5,6],[1,3,3],[9,8,9]])
print np.append(A, B, axis=0)









    



[[1 2 3]
 [4 5 6]
 [7 8 9]
 [5 5 6]
 [1 3 3]
 [9 8 9]]



In [3]:

    
A = [1,2,3]
A[1] = 5
print A









    



[1, 5, 3]



In [34]:

    
A = [1,2,3]
B = [2,3,4];
print [A, B]









    



[[1, 2, 3], [2, 3, 4]]



In [121]:

    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)

print "Reading data..."
file  = open ("/home/iizhaki/oasis/CSE255/categoriesByLocation.pck")
data = pickle.load(file)
file.close();
print "done"
print data.shape









    



Reading data...
done






    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-121-bacc95db270b> in <module>()
     15 file.close();
     16 print "done"
---> 17 print data.shape

AttributeError: 'list' object has no attribute 'shape'



In [158]:

    
def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  #print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")

# Training data
X_train = X
y_train = y

# Test data
X_test = X[1000:]
y_test = y[1000:]
K = 300

thetas = [None] * K

for k in range(K):
    X_train = np.array(data[k][1])
    #print X_train.shape
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    #print X_train.shape
    y_train = np.array([y[i] for i in data[k][0]])
    
    #dummy = np.zeros((X_train.shape[1]))
    #y_spec = np.array([X_train[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

    #theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, args = (X_train, y_train, 1.0))
    theta, residuals, rank, s = numpy.linalg.lstsq(X_train, y_train)
    #print "Final log likelihood for ", k, " = ", -l
    
    thetas[k] = theta
    
print "done"









    



done



In [144]:

    
print y_spec.shape

k = 2
X_train = np.array(data[k][1])
y_train = np.array([(int)(y[i] / 1000) for i in data[k][0]])

print data[k][0][0]
print sum(data[k][1][0] == restReviews[23])



In [160]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta * numpy.matrix(d).T  for d in data]
    return prediction

def MSE(prediction, real):
    squares = [ (p - r) for p,r in zip (prediction, real) ]
    return numpy.mean(squares)

num = 0
for k in range(K):
    X_train = data[k][1]
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
    y_train = np.array([y[i] for i in data[k][0]])
    
    y_pred = predict(X_train, thetas[k])
    #print len(y_pred), len(X_train)
    
    mse = MSE(y_pred, y_train)
    num = num + mse
    #print " MSE training ", k, " is: ", mse
    
print num / K









    



-0.259466274409



In [126]:

    
import os

statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
#statinfo = os.stat("/home/iizhaki/oasis/CSE255/MatrixD.pck")
statinfo.st_size
#13240755216









    Out[126]:





5504766416

Cross Computing Users



In [28]:

    
file  = open ("/home/iizhaki/oasis/CSE255/users_GPS_US.pck")
usersGps = pickle.load(file)
file.close();
print "done"

file  = open ("/home/iizhaki/oasis/CSE255/users_id_US.pck")
userIds = pickle.load(file)
file.close();
print "done"









    



done
done



In [38]:

    
userPerRegion = {}
i = 0
for gps in usersGps:
    pred = km.predict(gps)
    userPerRegion[userIds[i]] = pred[0]
    i = i + 1



In [40]:

    
print (list(userPerRegion))[0]









    



108437601487080769932



In [41]:

    
#read places

file  = open ("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load(file)
file.close();
print "done"









    



done



In [42]:

    
print reviews[0]









    



(5000, [u'Mexican Restaurant', u'Latin American Restaurant'], "You won't be disappointed in the food.  They do business lunches and groups (6 to 10) very well.  Service always fast and helpful.  This is one of my top 4 Mexican restaurants in Akron area, the only detractor is the age of the building and the environment.  Again not back, nothing to stay away from, but their business is serving Mexican food to their customers and they do that well.  Lunch is the majority of times I have been there.", '101280967457665576418', '103173356293785774089')



In [ ]:

    
from collections import defaultdict
import re

occurs = defaultdict(int)
for review in reviews:
    string = reviews[0][2].lower()
    occs = filter(None, re.split("[,. \-!?:()01234566789]+", string))
    for occ in occs:
        occurs[occ] += 1



In [ ]:

    
print 2



In [1]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"









    



done
done



In [9]:

    
len(matrix)









    Out[9]:





1609992



In [13]:

    
a=  np.ndarray([])









    



2.03512200714e-316



In [3]:

    
matrix_n   = []
rankingR_n = []
for i in range(len(matrix)):
    if rankingR[i]!=0:
        matrix_n.append(matrix[i])
        rankingR_n.append(rankingR[i])
import random
del matrix
del rankingR
matrix   = matrix_n
rankingR = rankingR_n
indexes = range(len(matrix))
random.shuffle(indexes)



In [7]:

    
m = np.array(matrix)



In [8]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck", "w")
np.save(file,m)
file.close()
print "done"


#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck", "w")
np.save(file,rankingR)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck", "w")
np.save(file,indexes)
file.close()
print "done"









    



done
done
done



In [ ]: