All imports and JSon files loads



In [2]:

    
import pickle



In [4]:

    
from sklearn.cluster import KMeans
import json



In [2]:

    
# Links:
# /home/iizhaki/CSE255/googlelocal/places.json
# /home/iizhaki/CSE255/googlelocal/reviews.json
# /home/iizhaki/CSE255/googlelocal/users.json



In [5]:

    
import json
print "Start loading users..."
jsonUsers = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/users.json"))
print "Finished loading users"









    



Start loading users...
Finished loading users



In [ ]:

    
print "Start loading reviews..."
jsonReviews = json.load(open("/home/iizhaki/oasis/CSE255/reviews_0.pck"))
print "Finished loading reviews"



In [11]:

    
print "Start loading places..."
jsonPlaces = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/places.json"))
print "Finished loading places"









    



Start loading places...
Finished loading places

Load Files



In [15]:

    
##users
X = []
ids = []
for id in jsonUsers:
    X.append(jsonUsers[id])
    ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]

#place      = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X  ]
place      = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]

ids      = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]

GPS = [g[1] if len(g)>1  else []  for g in place ]
id=ids
#id      = [ids[i] for i in range(len(place)) if len(place[i])>1 ]

GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2  else [] for g in GPS]

#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]

#id      = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]

print len(GPS),",", len(id)









    



749941 , 749941



In [6]:

    
#print len(X), len(ids)

##users
X = []
ids = []
for id in jsonUsers:
    X.append(jsonUsers[id])
    ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]

place      = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X  ]
#place      = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]

#ids      = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]

GPS = [g[1] if len(g)>1  else []  for g in place ]
id=ids
#id      = [ids[i] for i in range(len(place)) if len(place[i])>1 ]

GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2  else [] for g in GPS]

#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]

#id      = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]

print len(GPS),",", len(id)









    



3747937 , 3747937



In [27]:

    
GPS_US = []
id_US  = [] 
for i in  range(len(GPS)):
    p = GPS[i]
    if p!= []:
        if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47:
            GPS_US.append(p)
            id_US.append(id[i])
        
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"

#save users
file = open("/home/iizhaki/oasis/CSE255/users_id_US.pck", "w")
pickle.dump(id_US, file)
file.close()
print "done"









    



done
done



In [42]:

    
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"









    



done



In [7]:

    
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck", "w")
pickle.dump(GPS, file)
file.close()
print "done"

#save users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck", "w")
pickle.dump(id, file)
file.close()
print "done"









    



done
done



In [1]:

    
print len(GPS), len(id)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-4f2baf8b7575> in <module>()
----> 1 print len(GPS), len(id)

NameError: name 'GPS' is not defined



In [1]:

    
import pickle
#load users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck")
GPS = pickle.load(file)
file.close()
print "done"

#load users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck")
id = pickle.load(file)
file.close()
print "done"

print len(GPS), len(id)









    



done
done
3747937 3747937



In [139]:

    
#dictionary of users
usersD =  dict()
_sum = [0]* len(categories)
coun =[0]* len(categories)
for i in range(len(id)):
    usersD[id[i]]=(GPS[i],list(_sum),list(coun))
print len(usersD)    
print "done"



In [138]:

    
len(GPS)









    Out[138]:





3747937



In [140]:



In [12]:

    
X = [jsonPlaces[p] for p in jsonPlaces ]

gps     = [g['gps'] if 'gps'in g else [0,0] for g in X]
gps_x   = [g[0] for g in gps]
gps_y   = [g[1] for g in gps]

names   = [n['name'].encode('ascii','ignore') if 'name'in n     else "no name" for n in X]
id      = [n['id'].encode('ascii','ignore')   if 'id' in n      else "no id"   for n in X]
address = [n['address']                       if 'address' in n else ["",""]   for n in X]
address = [n[1].encode('ascii','ignore')      if len(n)>1       else ""        for n in address]

#name, id, gps, address, categories, rating, count
places = [(names[i],id[i],gps[i],address[i],set(),0,0) for i in range(len(X))]
places = [p for p in places if p[2]!=[0,0]]
print "done"









    



done



In [14]:

    
print len(places)#3087397



In [16]:

    
#save places
#file = open("/home/iizhaki/oasis/CSE255/places.pck", "w")
#pickle.dump(places, file)
#file.close()
print "done"









    



done



In [17]:

    
#read places

file  = open ("/home/iizhaki/oasis/CSE255/places.pck")
places = pickle.load(file)
file.close();
print "done"









    



done



In [3]:

    
#read reviews0

file  = open ("/home/iizhaki/oasis/CSE255/reviews_0.pck")
reviews0 = pickle.load(file)
file.close();
print "done"









    



done



In [4]:

    
#read reviews1

file  = open ("/home/iizhaki/oasis/CSE255/reviews_1.pck")
reviews1 = pickle.load(file)
file.close();
print "done"









    



done



In [5]:

    
#read reviews2

file  = open ("/home/iizhaki/oasis/CSE255/reviews_2.pck")
reviews2 = pickle.load(file)
file.close();
print "done"









    



done



In [6]:

    
reviews = reviews0+reviews1+reviews2

print len(reviews)

11453845



In [ ]:

    
#creates a dict given the places array
placesD =  dict()

for p in places:
    placesD[p[1]]=p

Join reviews places



In [ ]:

    
#fills the dict with the categories and rating from reviews
for r in reviews:
    if r[4] in placesD:
        p = placesD[r[4]]
        placesD[r[4]] = (p[0],p[1],p[2],p[3],set(p[4]) | set(r[1]),p[5]+r[0],p[6]+1)
print "done"



In [32]:

    
#calculate Rating
todel =[]
for idx in placesD:
    p = placesD[idx]
    if (p[6]==0):
        todel += [idx]
    else:
        placesD[idx] = (p[0],p[1],p[2],p[3],list(p[4]),p[5]/p[6])
print "done"









    



done



In [35]:

    
placesReviews = [placesD[idx] for idx in placesD]
len(placesReviews)









    Out[35]:





3087397



In [39]:

    
placesReviewsUS = [p for p in placesReviews if p[2][0]>=24 and p[2][0]<=49 and p[2][1]>=-128 and p[2][1]<=-47]
len(placesReviewsUS)









    Out[39]:





1308838



In [54]:

    
placesReviewsUS = [(p[0],p[1],p[2],p[3],[w.encode('ascii','ignore') for w in p[4]   ],p[5]) for p in placesReviewsUS]
len (placesReviewsUS)









    Out[54]:





1308838



In [55]:









    Out[55]:





('China Cottage',
 '106432060150136868000',
 [39.692899, -84.136173],
 'Dayton, OH 45429',
 ['Chinese Restaurant', 'Asian Restaurant'],
 3916)



In [9]:

    
print "a"



In [40]:

    
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviews.pck", "w")
pickle.dump(placesReviews, file)
file.close()
print "done"









    



done



In [56]:

    
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck", "w")
pickle.dump(placesReviewsUS, file)
file.close()
print "done"









    



done



In [4]:

    
#read reviewsUS

file  = open ("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck")
placesReviewsUS = pickle.load(file)
file.close();
print "done"









    



done



In [13]:

    
file  = open ("/home/iizhaki/oasis/CSE255/gpsUS200.pck")
centroids200 = pickle.load(file)
file.close();
print "done"









    



done



In [63]:

    
restaurantReviewsUS = [p for p in placesReviewsUS if [y for y in p[4] if 'RESTAURANT' in y.upper() and 'RESTAURANTE' not in y.upper()   ]!=[]]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w.upper() for w in p[4]   ],p[5]) for p in restaurantReviewsUS]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w  for w in p[4] if 'RESTAURANT' in w or 'BAR' in w or 'BAKERY' in w or 'FOOD' in w or 'GRILL'  in w ],p[5]) for p in restaurantReviewsUS]

print len(restaurantReviewsUS) #357191
print restaurantReviewsUS[0]









    



357191
("T C's Referee Sports Bar", '100327153115986850675', [43.529494, -96.792244], 'Sioux Falls, SD 57106', ['SPORTS BAR', 'RESTAURANT'], 4625)



In [66]:

    
#save restaurants with reviews
file = open("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck", "w")
pickle.dump(restaurantReviewsUS, file)
file.close()
print "done"









    



done



In [24]:

    
import pickle
file  = open ("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
restaurantReviewsUS = pickle.load(file)
file.close();
print "done"
print len(restaurantReviewsUS) #357191









    



done
357191



In [29]:

    
categories[0]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-02a1b52a42a0> in <module>()
----> 1 categories[0]

NameError: name 'categories' is not defined



In [7]:

    
categories = set ()
for p in restaurantReviewsUS:
    categories = categories | set ( p[4]  ) 
print len(list(categories))



In [15]:

    
categories = list(categories)
#save categories
file = open("/home/iizhaki/oasis/CSE255/categories.pck", "w")
pickle.dump(categories, file)
file.close()
print "done"









    



done



In [30]:

    
import pickle
file  = open ("/home/iizhaki/oasis/CSE255/categories.pck")
categories = pickle.load(file)
file.close();
print "done"
print len(categories) #363









    



done
363



In [70]:

    
categories = list(categories)
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
i =0
for r in restaurantReviewsUS:
    for c in r[4]:
        matrixCategories [i,categories.index(c)] = 1
    i+=1
len(matrixCategories)









    Out[70]:





357191



In [72]:

    
#save restaurants matrix
file = open("/home/iizhaki/oasis/CSE255/reviewMatrix.pck", "w")
numpy.save(file,matrixCategories)
file.close()
print "done"









    



done



In [71]:

    
len(restaurantReviewsUS)









    Out[71]:





357191



In [74]:

    
reviewsY = [r[5] for r in restaurantReviewsUS]
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewY.pck", "w")
numpy.save(file,reviewsY)
file.close()
print "done"









    



done



In [91]:

    
print "max", max(reviewsY), "min", min(reviewsY), "avg",sum(reviewsY)/len(reviewsY)









    



 max 5000 min 1000 avg 3858

This part of code computes KMeans clusters for GPS coordinates.



In [ ]:

    
GPS = [g for g in gps if g != [0, 0]]
print len(gps), len(GPS)
K = 1000
kmeans = KMeans(n_clusters=K, n_jobs=-1)
kmeans.fit(GPS)









    



3114353 3087397



In [ ]:

    
kk = kmeans
print type (kk)
print len(kk.cluster_centers_)
print [sum(kk.labels_ == i) for i in range(K)]
print kk.cluster_centers_[3]
print gps[0]



In [140]:



In [140]:



In [140]:

Gradient Ascend Code



In [ ]:

    
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")

# Training data
X_train = X
y_train = [(int)(ys/1000) for ys in y]

# Test data
X_test = X[1000:]
y_test = y[1000:]

#dummy = np.zeros((X_train.shape[1]))
#y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

#theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
#print theta.shape
#print "Final log likelihood =", -l



In [86]:

    
print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")









    



Reading data...
done



In [87]:

    
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X, y)



In [25]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


print " MSE training", MSE(prediction_training, y )
#absolute error 658.914905107









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-da0c0bce89d0> in <module>()
      4     return prediction
      5 
----> 6 prediction_training = predict(X, thetax)
      7 
      8 def MSE(prediction, real):

NameError: name 'thetax' is not defined



In [100]:

    
print prediction_training[1],", ", y[1]









    



[[ 3802.62044907]] ,  3558



In [90]:

    
avg = [sum(y)*1.0/len(y)] * len(y)

print " MSE training", MSE(avg, y )
#MSE training 686.400936754









    



 MSE training 770393.021789



In [91]:

    
719823.238065/770393.021789









    Out[91]:





0.9343584608196901



In [128]:

    
print np.array([X[:,0]]).shape
print np.array(X[:,301:]).shape
Xx = np.concatenate((np.array([X[:,0]]).T,X[:,301:]), axis =1)









    



(1, 357191)
(357191, 363)



In [132]:

    
Xx[:,0]









    Out[132]:





array([ 1.,  1.,  1., ...,  1.,  1.,  1.])



In [133]:

    
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(Xx, y)



In [140]:



In [135]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_trainingx = predict(Xx, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


print " MSE training", MSE(prediction_trainingx, y )









    



 MSE training 661.22571362



In [136]:

    
restaurantReviewsUS[0]









    Out[136]:





("T C's Referee Sports Bar",
 '100327153115986850675',
 [43.529494, -96.792244],
 'Sioux Falls, SD 57106',
 ['SPORTS BAR', 'RESTAURANT'],
 4625)



In [49]:

    
len(reviews)









    Out[49]:





2157302



In [140]:

    
len(restaurantReviewsUS)









    Out[140]:





357191



In [146]:

    
reviews1[0]









    Out[146]:





(4000,
 [u'Shopping Mall'],
 '',
 '105753474075712443583',
 '118294895653644676616')



In [1]:

    
reviews1[0:5]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-f11bbb8e780c> in <module>()
----> 1 reviews1[0:5]

NameError: name 'reviews1' is not defined



In [17]:

    
len(categories)









    Out[17]:





363

Reviews of the US



In [4]:

    
#creates a dict given the reviews Restaurants array
placesD =  dict()

for p in restaurantReviewsUS:
    placesD[p[1]]=p
print "Done"









    



Done



In [133]:

    
print len(placesD)



In [14]:

    
#reviews of restaurants in the US

reviews = [r for r in reviews if r[4] in placesD]
print len(reviews)#2157302
print "Done"



In [15]:

    
reviews = [r for r in reviews if r[3] in usersD]
print len(reviews)#1610014
print "Done"



In [5]:

    
print len(reviews)#1610014









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-6948c29511b4> in <module>()
----> 1 print len(reviews)#1610014

NameError: name 'reviews' is not defined



In [16]:

    
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck", "w")
pickle.dump(reviews, file)
file.close()
print len(reviews)#1610014
print "done"



In [5]:

    
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load( file)
file.close()
print len(reviews)#1610014
print "done"



In [140]:

    
len(usersD)









    Out[140]:





3747937



In [134]:

    
len(reviews)









    Out[134]:





1610014



In [141]:

    
categories = list(categories)
usersDD = dict()
for r in reviews:
    if r[4] in placesD: #places
        if r[3] in usersD: #user
            p = placesD[r[4]]
            u = usersD[r[3]]
            sum   =  list(u[1])
            count =  list(u[2])
            for c in p[4]:
                if c in categories:
                    idx = categories.index(c)
                    sum[idx]   += r[0]
                    count[idx] += 1
            usersDD[r[3]] = (u[0],sum,count)
print "Done"









    



Done



In [11]:

    
len(usersDD)









    Out[11]:





692157



In [146]:

    
del usersD



In [72]:

    
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"









    



done



In [10]:

    
#load usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck")
usersDD = pickle.load( file)
file.close()
print "done"









    



done



In [126]:

    
len (list(usersDD))









    Out[126]:





153241



In [147]:

    
#calculate rank
for u in usersDD:
    us = usersDD[u]
    rank = []
    for i in range(len(us[1])):
        if us[2][i]!=0:
            rank += [us[1][i]/us[2][i]]
        else:
            rank += [0]
    usersDD[u] = (u,us[0],rank)



In [148]:

    
len(usersDD)









    Out[148]:





692157



In [149]:

    
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"









    



done



In [62]:

    
u= [usersDD[sd] for sd in usersDD[:10]]
u[0]









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-62-73399bb3b8fb> in <module>()
----> 1 u= [usersDD[sd] for sd in usersDD[:10]]
      2 u[0]

TypeError: unhashable type



In [39]:

    
ur = [r for r in reviews if  r[3] =='106887711560311804886']



In [19]:

    
len(reviews)









    Out[19]:





1610014



In [16]:

    
#len of reviews most be 1610014 
K=300
matrixC= []
rankingY = []
for r in reviews:
    if r[3] in usersDD: #user
        l = locationD[r[4]]
        locs = list(l) #location or Rest
        p = placesD[r[4]]
        positive 
        if r[4] in positive:
            pos = list(positive[r[4]])
        else:
            pos = [0] * 50
        negative
        if r[4] in negative:
            neg = list(negative[r[4]])
        else:
            neg = [0] * 50
        
        #categories
        cats = [0] * len(categories)
        for c in p[4]:
            cats [categories.index(c)] = 1
        u = usersDD[r[3]]
    
        #alpha + location + avg review + categories + user info
        matrixC.append ( [1] + locs + [p[5]] + cats + pos + neg +  u[2])
        rankingY.append (r[0])
print "Done"
print len(matrixC)



In [17]:

    
len(matrixC[0])









    Out[17]:





1128



In [35]:

    
del usersDD



In [121]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixD.pck", "w")
np.save(file,matrix)
file.close()
print "done"









    



done



In [123]:

    
#save y
file = open("/home/iizhaki/oasis/CSE255/YsD.pck", "w")
np.save(file,rankingR)
file.close()
print "done"









    



done



In [3]:

    
import numpy as np
#save  matrix
matrix = np.load("/home/iizhaki/oasis/CSE255/MatrixD.pck")
print "done"
#save y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YsD.pck")

print "done"









    



done
done



In [26]:

    
m0 = matrixC[:500000]



In [27]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck", "w")
np.save (file,m0)
#pickle.dump(matrixC, file)
file.close()
print "done"









    



done



In [28]:

    
m1 = matrixC[500000:1000000]
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck", "w")
np.save (file,m1)
#pickle.dump(matrixC, file)
file.close()
print "done"









    



done



In [29]:

    
m2 = matrixC[1000000:]
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck", "w")
np.save (file,m2)
#pickle.dump(matrixC, file)
file.close()
print "done"









    



done



In [1]:

    
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"









    



Reading data...
done
Reading data...
done
Reading data...
done



In [5]:

    
len(m0)+len(m1)+len(m2)
matrixC= numpy.vstack((m0,m1,m2))
print len(matrixC)



In [6]:

    
type (matrixC)









    Out[6]:





numpy.ndarray



In [33]:

    
#save y
file = open("/home/iizhaki/oasis/CSE255/YwWords.pck", "w")
np.save(file,rankingY)
file.close()
print "done"









    



done



In [7]:

    
#load y
rankingY = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingY)
print "done"









    



done



In [8]:

    
print len(rankingY)



In [6]:

    
print "Reading data..."
mdata = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"









    



Reading data...
done



In [25]:

    
len( matrixC)









    Out[25]:





1610014



In [7]:

    
mdata = mdata[:,1:301]

#creates a dict given the reviews Restaurants array
locationD =  dict()

for i in range(len(restaurantReviewsUS)):
    locationD[restaurantReviewsUS[i][1]]=list(mdata[i])
print "Done"









    



Done



In [9]:

    
len(locationD)









    Out[9]:





357191



In [8]:

    
del mdata



In [ ]:

    
theta,residuals,rank,s = numpy.linalg.lstsq(matrix, rankingR)



In [34]:

    
print 2



In [38]:

    
len(rankingR)









    Out[38]:





1610014



In [50]:

    
np.matrix(theta)









    Out[50]:





matrix([[  2.48550859e+10,  -2.48550859e+10,  -2.48550859e+10, ...,
           1.25411987e-01,   0.00000000e+00,   1.39045715e-01]])



In [49]:

    
len (np.matrix[0])









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-49-50ab403e15fb> in <module>()
----> 1 len (np.matrix[0])

TypeError: 'type' object has no attribute '__getitem__'



In [43]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/ThetaX.pck", "w")
np.save(file,theta)
file.close()
print "done"









    



done



In [ ]:

    
print "Reading data..."
theta = np.load("/home/iizhaki/oasis/CSE255/ThetaX.pck")
print "done"



In [83]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_trainingx = predict(matrix, theta)

def MSE(prediction, real):
    squares = [ (p - r) ** 2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

y = [e/1000.0 for e in rankingR]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.708222960173









    



 MSE training 0.832253748827



In [85]:

    
0.832253748827 /1.39914325993









    Out[85]:





0.5948309745412619



In [84]:

    
avg_p = [sum(y)/len(y)] * len(y)
print avg_p[0]
print " MSE training", MSE(avg_p, y ) # mean abs error 0.924575762201









    



3.94964888504
 MSE training 1.39914325993



In [67]:

    
sum(y)









    Out[67]:





6358990000



In [80]:

    
3949/1000.0









    Out[80]:





3.949



In [73]:

    
prediction_trainingx/1000









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-73-ce9cd07e76af> in <module>()
----> 1 prediction_trainingx/1000

TypeError: unsupported operand type(s) for /: 'list' and 'int'



In [12]:

    
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToPositiveMap.pck")
positive = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len (positive)
# positive









    



340642
done



In [94]:

    
len (positive)









    Out[94]:





340642



In [96]:

    
len (restaurantReviewsUS)









    Out[96]:





357191



In [102]:

    
az =[r for r in restaurantReviewsUS if r[1] not in positive]

len(az)









    Out[102]:





16549



In [107]:

    
restaurantReviewsUS[0]









    Out[107]:





("T C's Referee Sports Bar",
 '100327153115986850675',
 [43.529494, -96.792244],
 'Sioux Falls, SD 57106',
 ['SPORTS BAR', 'RESTAURANT'],
 4625)



In [13]:

    
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToNegativeMap.pck")
negative = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len(negative)
# negative



In [104]:

    
len(negative)









    Out[104]:





340642



In [106]:

    
az[1]









    Out[106]:





('Dixie Lee Chicken',
 '112480955169876860251',
 [46.213896, -82.076332],
 'Spanish, ON P0P 2A0',
 ['FAMILY RESTAURANT'],
 5000)



In [110]:

    
reviews[0]









    Out[110]:





(5000,
 [u'Mexican Restaurant', u'Latin American Restaurant'],
 "You won't be disappointed in the food.  They do business lunches and groups (6 to 10) very well.  Service always fast and helpful.  This is one of my top 4 Mexican restaurants in Akron area, the only detractor is the age of the building and the environment.  Again not back, nothing to stay away from, but their business is serving Mexican food to their customers and they do that well.  Lunch is the majority of times I have been there.",
 '101280967457665576418',
 '103173356293785774089')



In [112]:

    
qqq =  [q for q in reviews if q[4] ==az[1]]



In [119]:

    
len (negative[negative.keys()[1]])









    Out[119]:





50



In [115]:

    
len(qqq)









    Out[115]:





0



In [ ]:

    
theta,residuals,rank,s = numpy.linalg.lstsq(matrixC, rankingY)



In [10]:

    
len(theta)









    Out[10]:





1128



In [17]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction



def MSE(prediction, real):
    squares = [ (p - r) ** 2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)



In [12]:

    
prediction_trainingx = predict(matrixC, theta)

y = [e/1000.0 for e in rankingY]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.832253748827









    



 MSE training 0.832031361789



In [11]:

    
len(prediction_trainingx)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-3e16f324a7bc> in <module>()
----> 1 len(prediction_trainingx)

NameError: name 'prediction_trainingx' is not defined



In [14]:

    
0.832031361789/0.832253748827









    Out[14]:





0.9997327893826691



In [15]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/ThetawWords.pck", "w")
np.save(file,theta)
file.close()
print "done"









    



done



In [18]:

    
foo = ['a', 'b', 'c', 'd', 'e']
from random import randrange
random_index = randrange(0,len(foo))
print random_index



In [7]:

    
import random

indexes = range(len(matrix))
random.shuffle(indexes)
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]









    



[1079569, 944663, 1177637, 202632, 1327565, 277798, 625838, 296735, 382265, 1212205, 1038896, 1251769, 1025485, 379692, 1587813, 184734, 951334, 1234560, 952987, 209674]



In [6]:

    
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]









    



[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]



In [8]:

    
print sum(X[0]==matrix[0])



In [12]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck", "w")
np.save(file,matrix)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck", "w")
np.save(file,rankingR)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck", "w")
np.save(file,indexes)
file.close()
print "done"









    



done
done
done



In [1]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"









    



done
done
done



In [2]:

    
n = int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-8a79feb921bd> in <module>()
----> 1 n = int(len(X)*0.7)
      2 X_train = X[:n]
      3 y_train = y[:n]
      4 
      5 X_test =X[n:]

NameError: name 'X' is not defined



In [ ]:

    
theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train)



In [ ]:

    
prediction_training = predict(X_train, theta)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827


prediction_test = predict(X_test, theta)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827



In [ ]:

    
a=[y-p for (y,p) in zip (ys,ps)]



In [ ]:

    
def MAE(prediction, real):
    squares = [ abs(p - r)  for p,r in zip (prediction,real) ]
    return max(squares)#sum(squares)/len(squares)



In [ ]:

    
foos =[p for p in prediction_test if p>5000]
len(foos)



In [ ]:

    
min(theta)



In [51]:

    
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
i =0
for r in restaurantReviewsUS:
    for c in r[4]:
        matrixCategories [i,categories.index(c)] = r[5]
    i+=1
len(matrixCategories)









    Out[51]:





357191



In [52]:

    
s = sum(matrixCategories,axis=0)



In [50]:

    
l = sum(matrixCategories,axis=0)



In [53]:

    
t =s /l



In [43]:

    
print (s/l)









    



[ 4625.  3558.  3916. ...,  4250.  3750.  3750.]



In [56]:

    
index = range(len(t))
index.sort (key= l.__getitem__, reverse=True)
    
ct = [categories[i] for i in index]
t[:] = [l[i] for i in index]



In [57]:

    
print t[:10]
print ct[:10]









    



[ 92193.  62372.  51248.  50239.  43532.  41073.  39149.  32937.  29796.
  27967.]
['RESTAURANT', 'FAST FOOD RESTAURANT', 'AMERICAN RESTAURANT', 'EUROPEAN RESTAURANT', 'ASIAN RESTAURANT', 'PIZZA RESTAURANT', 'ITALIAN RESTAURANT', 'MEXICAN RESTAURANT', 'HAMBURGER RESTAURANT', 'LATIN AMERICAN RESTAURANT']



In [64]:

    
prediction_training[:10]









    Out[64]:





[matrix([[ 4045.84846337]]),
 matrix([[ 3338.69120392]]),
 matrix([[ 4189.7933387]]),
 matrix([[ 3566.76011685]]),
 matrix([[ 3868.23306624]]),
 matrix([[ 4562.30147085]]),
 matrix([[ 5003.98583937]]),
 matrix([[ 4087.29172447]]),
 matrix([[ 3534.33690685]]),
 matrix([[ 3863.80331597]])]



In [1]:

    
import numpy as np
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"

matrix = numpy.vstack((m0,m1,m2))
print len(matrix )


#load y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingR)
print "done"


#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = int(len(X)*0.1)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]









    



Reading data...
done
Reading data...
done
Reading data...
done
1610014
1610014
done
done
[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]



In [ ]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = 100 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]



In [65]:

    
X_train_b = list(X_train)



In [66]:

    
from sklearn.cluster import KMeans 

K = 1000
km =  KMeans (n_clusters=K, n_jobs=-1)
km.fit(X_train)









    Out[66]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1000,
    n_init=10, n_jobs=-1, precompute_distances=True, random_state=None,
    tol=0.0001, verbose=0)



In [1]:



In [67]:

    
X_train = list(X_train_b)

def bitIt(idx, rng):
    res = [0] * (rng + 1)
    res[0] = 1
    res[idx + 1] = 1
    return res

X_train = [bitIt(i, K) for i in km.labels_]
print X_train[0]









    



[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]



In [68]:

    
import scipy.optimize
from numpy.linalg import norm

### Gradient descent ###

# Objective
def f(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg

    # Derivative
def fprime(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    res = 2 * numpy.dot(X.T, diff) / len(X) + 2 * lam * theta
    return res



In [69]:

    
import time
import timeit

start = time.time()
thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
end = time.time()
finished = end - start
print finished









    



0.552059173584



In [70]:

    
len (thetar)









    Out[70]:





1001



In [71]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

#prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


#print " MSE training", MSE(prediction_training, y )



In [72]:

    
prediction_training = predict(X_train, thetar)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827









    



 MSE training 1.60364829241



In [240]:

    
prediction_test = predict(X_test, thetar)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-240-97808929ec05> in <module>()
----> 1 prediction_test = predict(X_test, thetar)
      2 ys = [e/1000.0 for e in y_test]
      3 ps = [e/1000.0 for e in prediction_test]
      4 print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827

<ipython-input-238-34ca80dbdc63> in predict(data, theta)
      1 def predict(data, theta):
      2     theta = numpy.matrix(theta)
----> 3     prediction = [theta*numpy.matrix(d).T  for d in data]
      4     return prediction
      5 

/oasis/scratch/iizhaki/temp_project/PV/python-virtualEnv3/lib/python2.7/site-packages/numpy/matrixlib/defmatrix.pyc in __mul__(self, other)
    339         if isinstance(other, (N.ndarray, list, tuple)) :
    340             # This promotes 1-D vectors to row vectors
--> 341             return N.dot(self, asmatrix(other))
    342         if isscalar(other) or not hasattr(other, '__rmul__') :
    343             return N.dot(self, other)

KeyboardInterrupt:



In [16]:

    
import numpy as np
np.__config__.show()









    



blas_info:
    libraries = ['blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
lapack_info:
    libraries = ['lapack']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
atlas_threads_info:
  NOT AVAILABLE
blas_opt_info:
    libraries = ['blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
    define_macros = [('NO_ATLAS_INFO', 1)]
atlas_blas_threads_info:
  NOT AVAILABLE
openblas_info:
  NOT AVAILABLE
lapack_opt_info:
    libraries = ['lapack', 'blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
    define_macros = [('NO_ATLAS_INFO', 1)]
atlas_info:
  NOT AVAILABLE
lapack_mkl_info:
  NOT AVAILABLE
blas_mkl_info:
  NOT AVAILABLE
atlas_blas_info:
  NOT AVAILABLE
mkl_info:
  NOT AVAILABLE

Test with PlayDoh



In [68]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = 100000 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

#X_test =X[n:]
#y_test =y[n:]









    



done
done
done
[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]



In [1]:

    
# NEGATIVE Log-likelihood
thetas = numpy.array([0] * len(X_train[0]))
lam = 0.1

def f(i):
    X = X_train[i]
    y = Y_train[i]
    theta = thetas[i]
    logit = np.dot(X, theta)
    loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
  
    #print "ll =", loglikelihood
    return -loglikelihood



In [ ]:

    
import time
import timeit
import playdoh



#thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
if __name__ == '__main__':
    start = time.time()
    results = playdoh.maximize(fun,
                       popsize = 10000, # size of the population
                       maxiter = 10, # maximum number of iterations
                       cpu = 2, # number of CPUs to use on the local machine
                       x_initrange = [-10.,10.]) # initial interval for the ``x`` parameter

    # Display the final results in a table
    playdoh.print_table(results)
    end = time.time()
    finished = end - start
    print finished



In [ ]:

    
import playdoh

thetas = numpy.array([0] * 100)
lam = 0.1

# The fitness function to maximize
def fun(x, y):
    return x


if __name__ == '__main__':
    # Maximize the fitness function in parallel
    results = playdoh.minimize(fun,
                       popsize=1,  # size of the population
                       maxiter=1,  # maximum number of iterations
                       cpu=1)

    # Display the final result in a table
    playdoh.print_table(results)



In [69]:

    
X_train = np.array(X_train)
y_train = np.array(y_train)



In [32]:

    
# Objective
def f(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg



In [70]:

    
import time
import timeit

theta  = numpy.array([0] * len(X_train[0])).T

start = time.time()
f(theta, X_train, y_train, 0.1)
end = time.time()
finished = end - start
print finished









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-70-54e05b225547> in <module>()
      5 
      6 start = time.time()
----> 7 f(theta, X_train, y_train, 0.1)
      8 end = time.time()
      9 finished = end - start

/oasis/scratch/iizhaki/temp_project/PV/python-virtualEnv3/lib/python2.7/site-packages/numpy/random/mtrand.so in mtrand.RandomState.f (numpy/random/mtrand/mtrand.c:11268)()

TypeError: f() takes at most 3 positional arguments (4 given)



In [84]:

    
import threading
from threading import Thread
from multiprocessing.pool import ThreadPool
import time
import timeit

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

l = X_train.shape[0]
res = dict()
theta = numpy.array([0] * len(X_train[0]))

def fprimeold(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    #print diff
    return 2 * numpy.dot(X.T, diff) / X_train.shape[0] + 2 * lam * theta

# Derivative
def fprime(theta, X, y, lam, mi, ma, ress):
    X = np.array(X, copy=True)
    y = np.array(y, copy=True)
    theta = np.array(theta, copy=True)
    X[: mi] = 0.
    X[ma :] = 0
    y[: mi] = 0
    y[ma : ] = 0
    theta[mi : ma] = 0
    diff = numpy.dot(X, theta) - y
    #print diff
    ress[mi] = 2 * (numpy.dot(diff, X) / X_train.shape[0] + lam * theta)

threads = []
N = 1
for n in range(N):
    low = n * 1.0 / N
    high = (n + 1.0) / N
    threads.append(Thread(target = fprime, args = (theta, X_train, y_train, 0.1, low, high, res)))
#t3 = Thread(target = fprime, args = (theta, X_train, y_train, 0.1, 2.0*l / 3, l - 1, res))

start = time.time()

for t in threads:
    t.start()

for t in threads:
    t.join()

end = time.time()
finished = end - start
print finished

fres = np.zeros(res[0].shape)
for r in res.values():
    fres += r
    

start = time.time()
d = fprimeold(theta, X_train, y_train, 0.1)
end = time.time()
finished = end - start
print finished

print fres
print d
print [100.0 * sum(fres == d) / len(d)]









    



0.912188053131
0.496465921402
[-0.08  0.    0.   ...,  0.    0.    0.  ]
[ -7.91206000e+03  -1.13400000e+01  -1.18400000e+01 ...,  -1.72000000e+04
   0.00000000e+00  -1.39800000e+04]
[20.233463035019454]



In [17]:



In [16]:



In [ ]: