All imports and JSon files loads


In [2]:
import pickle

In [4]:
from sklearn.cluster import KMeans
import json

In [2]:
# Links:
# /home/iizhaki/CSE255/googlelocal/places.json
# /home/iizhaki/CSE255/googlelocal/reviews.json
# /home/iizhaki/CSE255/googlelocal/users.json

In [5]:
import json
print "Start loading users..."
jsonUsers = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/users.json"))
print "Finished loading users"


Start loading users...
Finished loading users

In [ ]:
print "Start loading reviews..."
jsonReviews = json.load(open("/home/iizhaki/oasis/CSE255/reviews_0.pck"))
print "Finished loading reviews"

In [11]:
print "Start loading places..."
jsonPlaces = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/places.json"))
print "Finished loading places"


Start loading places...
Finished loading places

Load Files


In [15]:
##users
X = []
ids = []
for id in jsonUsers:
    X.append(jsonUsers[id])
    ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]

#place      = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X  ]
place      = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]

ids      = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]

GPS = [g[1] if len(g)>1  else []  for g in place ]
id=ids
#id      = [ids[i] for i in range(len(place)) if len(place[i])>1 ]

GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2  else [] for g in GPS]

#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]

#id      = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]

print len(GPS),",", len(id)


749941 , 749941

In [6]:
#print len(X), len(ids)

##users
X = []
ids = []
for id in jsonUsers:
    X.append(jsonUsers[id])
    ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]

place      = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X  ]
#place      = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]

#ids      = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]

GPS = [g[1] if len(g)>1  else []  for g in place ]
id=ids
#id      = [ids[i] for i in range(len(place)) if len(place[i])>1 ]

GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2  else [] for g in GPS]

#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]

#id      = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]

print len(GPS),",", len(id)


3747937 , 3747937

In [27]:
GPS_US = []
id_US  = [] 
for i in  range(len(GPS)):
    p = GPS[i]
    if p!= []:
        if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47:
            GPS_US.append(p)
            id_US.append(id[i])
        
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"

#save users
file = open("/home/iizhaki/oasis/CSE255/users_id_US.pck", "w")
pickle.dump(id_US, file)
file.close()
print "done"


done
done

In [42]:
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"


done

In [7]:
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck", "w")
pickle.dump(GPS, file)
file.close()
print "done"

#save users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck", "w")
pickle.dump(id, file)
file.close()
print "done"


done
done

In [1]:
print len(GPS), len(id)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-4f2baf8b7575> in <module>()
----> 1 print len(GPS), len(id)

NameError: name 'GPS' is not defined

In [1]:
import pickle
#load users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck")
GPS = pickle.load(file)
file.close()
print "done"

#load users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck")
id = pickle.load(file)
file.close()
print "done"

print len(GPS), len(id)


done
done
3747937 3747937

In [139]:
#dictionary of users
usersD =  dict()
_sum = [0]* len(categories)
coun =[0]* len(categories)
for i in range(len(id)):
    usersD[id[i]]=(GPS[i],list(_sum),list(coun))
print len(usersD)    
print "done"


3747937
done

In [138]:
len(GPS)


Out[138]:
3747937

In [140]:


In [12]:
X = [jsonPlaces[p] for p in jsonPlaces ]

gps     = [g['gps'] if 'gps'in g else [0,0] for g in X]
gps_x   = [g[0] for g in gps]
gps_y   = [g[1] for g in gps]

names   = [n['name'].encode('ascii','ignore') if 'name'in n     else "no name" for n in X]
id      = [n['id'].encode('ascii','ignore')   if 'id' in n      else "no id"   for n in X]
address = [n['address']                       if 'address' in n else ["",""]   for n in X]
address = [n[1].encode('ascii','ignore')      if len(n)>1       else ""        for n in address]

#name, id, gps, address, categories, rating, count
places = [(names[i],id[i],gps[i],address[i],set(),0,0) for i in range(len(X))]
places = [p for p in places if p[2]!=[0,0]]
print "done"


done

In [14]:
print len(places)#3087397


3087397

In [16]:
#save places
#file = open("/home/iizhaki/oasis/CSE255/places.pck", "w")
#pickle.dump(places, file)
#file.close()
print "done"


done

In [17]:
#read places

file  = open ("/home/iizhaki/oasis/CSE255/places.pck")
places = pickle.load(file)
file.close();
print "done"


done

In [3]:
#read reviews0

file  = open ("/home/iizhaki/oasis/CSE255/reviews_0.pck")
reviews0 = pickle.load(file)
file.close();
print "done"


done

In [4]:
#read reviews1

file  = open ("/home/iizhaki/oasis/CSE255/reviews_1.pck")
reviews1 = pickle.load(file)
file.close();
print "done"


done

In [5]:
#read reviews2

file  = open ("/home/iizhaki/oasis/CSE255/reviews_2.pck")
reviews2 = pickle.load(file)
file.close();
print "done"


done

In [6]:
reviews = reviews0+reviews1+reviews2

print len(reviews)


11453845

In [ ]:
#creates a dict given the places array
placesD =  dict()

for p in places:
    placesD[p[1]]=p

Join reviews places


In [ ]:
#fills the dict with the categories and rating from reviews
for r in reviews:
    if r[4] in placesD:
        p = placesD[r[4]]
        placesD[r[4]] = (p[0],p[1],p[2],p[3],set(p[4]) | set(r[1]),p[5]+r[0],p[6]+1)
print "done"

In [32]:
#calculate Rating
todel =[]
for idx in placesD:
    p = placesD[idx]
    if (p[6]==0):
        todel += [idx]
    else:
        placesD[idx] = (p[0],p[1],p[2],p[3],list(p[4]),p[5]/p[6])
print "done"


done

In [35]:
placesReviews = [placesD[idx] for idx in placesD]
len(placesReviews)


Out[35]:
3087397

In [39]:
placesReviewsUS = [p for p in placesReviews if p[2][0]>=24 and p[2][0]<=49 and p[2][1]>=-128 and p[2][1]<=-47]
len(placesReviewsUS)


Out[39]:
1308838

In [54]:
placesReviewsUS = [(p[0],p[1],p[2],p[3],[w.encode('ascii','ignore') for w in p[4]   ],p[5]) for p in placesReviewsUS]
len (placesReviewsUS)


Out[54]:
1308838

In [55]:



Out[55]:
('China Cottage',
 '106432060150136868000',
 [39.692899, -84.136173],
 'Dayton, OH 45429',
 ['Chinese Restaurant', 'Asian Restaurant'],
 3916)

In [9]:
print "a"


a

In [40]:
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviews.pck", "w")
pickle.dump(placesReviews, file)
file.close()
print "done"


done

In [56]:
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck", "w")
pickle.dump(placesReviewsUS, file)
file.close()
print "done"


done

In [4]:
#read reviewsUS

file  = open ("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck")
placesReviewsUS = pickle.load(file)
file.close();
print "done"


done

In [13]:
file  = open ("/home/iizhaki/oasis/CSE255/gpsUS200.pck")
centroids200 = pickle.load(file)
file.close();
print "done"


done

In [63]:
restaurantReviewsUS = [p for p in placesReviewsUS if [y for y in p[4] if 'RESTAURANT' in y.upper() and 'RESTAURANTE' not in y.upper()   ]!=[]]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w.upper() for w in p[4]   ],p[5]) for p in restaurantReviewsUS]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w  for w in p[4] if 'RESTAURANT' in w or 'BAR' in w or 'BAKERY' in w or 'FOOD' in w or 'GRILL'  in w ],p[5]) for p in restaurantReviewsUS]

print len(restaurantReviewsUS) #357191
print restaurantReviewsUS[0]


357191
("T C's Referee Sports Bar", '100327153115986850675', [43.529494, -96.792244], 'Sioux Falls, SD 57106', ['SPORTS BAR', 'RESTAURANT'], 4625)

In [66]:
#save restaurants with reviews
file = open("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck", "w")
pickle.dump(restaurantReviewsUS, file)
file.close()
print "done"


done

In [24]:
import pickle
file  = open ("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
restaurantReviewsUS = pickle.load(file)
file.close();
print "done"
print len(restaurantReviewsUS) #357191


done
357191

In [29]:
categories[0]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-02a1b52a42a0> in <module>()
----> 1 categories[0]

NameError: name 'categories' is not defined

In [7]:
categories = set ()
for p in restaurantReviewsUS:
    categories = categories | set ( p[4]  ) 
print len(list(categories))


363

In [15]:
categories = list(categories)
#save categories
file = open("/home/iizhaki/oasis/CSE255/categories.pck", "w")
pickle.dump(categories, file)
file.close()
print "done"


done

In [30]:
import pickle
file  = open ("/home/iizhaki/oasis/CSE255/categories.pck")
categories = pickle.load(file)
file.close();
print "done"
print len(categories) #363


done
363

In [70]:
categories = list(categories)
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
i =0
for r in restaurantReviewsUS:
    for c in r[4]:
        matrixCategories [i,categories.index(c)] = 1
    i+=1
len(matrixCategories)


Out[70]:
357191

In [72]:
#save restaurants matrix
file = open("/home/iizhaki/oasis/CSE255/reviewMatrix.pck", "w")
numpy.save(file,matrixCategories)
file.close()
print "done"


done

In [71]:
len(restaurantReviewsUS)


Out[71]:
357191

In [74]:
reviewsY = [r[5] for r in restaurantReviewsUS]
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewY.pck", "w")
numpy.save(file,reviewsY)
file.close()
print "done"


done

In [91]:
print "max", max(reviewsY), "min", min(reviewsY), "avg",sum(reviewsY)/len(reviewsY)


 max 5000 min 1000 avg 3858

This part of code computes KMeans clusters for GPS coordinates.


In [ ]:
GPS = [g for g in gps if g != [0, 0]]
print len(gps), len(GPS)
K = 1000
kmeans = KMeans(n_clusters=K, n_jobs=-1)
kmeans.fit(GPS)


3114353 3087397

In [ ]:
kk = kmeans
print type (kk)
print len(kk.cluster_centers_)
print [sum(kk.labels_ == i) for i in range(K)]
print kk.cluster_centers_[3]
print gps[0]

In [140]:


In [140]:


In [140]:

Gradient Ascend Code


In [ ]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log

print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  logit = np.dot(X, theta)
  loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
  loglikelihood -= np.dot(logit, 1 - y)
  loglikelihood -= lam * np.dot(theta, theta)
  
  print "ll =", loglikelihood
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    logit = np.dot(X, theta)
    dl = np.dot(X.T, (1 - sigmoid(logit)));
    dl -= y_spec
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return np.array([-x for x in dl])

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")

# Training data
X_train = X
y_train = [(int)(ys/1000) for ys in y]

# Test data
X_test = X[1000:]
y_test = y[1000:]

#dummy = np.zeros((X_train.shape[1]))
#y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')

#theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
#print theta.shape
#print "Final log likelihood =", -l

In [86]:
print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"

X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")


Reading data...
done

In [87]:
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X, y)

In [25]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


print " MSE training", MSE(prediction_training, y )
#absolute error 658.914905107


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-da0c0bce89d0> in <module>()
      4     return prediction
      5 
----> 6 prediction_training = predict(X, thetax)
      7 
      8 def MSE(prediction, real):

NameError: name 'thetax' is not defined

In [100]:
print prediction_training[1],", ", y[1]


[[ 3802.62044907]] ,  3558

In [90]:
avg = [sum(y)*1.0/len(y)] * len(y)

print " MSE training", MSE(avg, y )
#MSE training 686.400936754


 MSE training 770393.021789

In [91]:
719823.238065/770393.021789


Out[91]:
0.9343584608196901

In [128]:
print np.array([X[:,0]]).shape
print np.array(X[:,301:]).shape
Xx = np.concatenate((np.array([X[:,0]]).T,X[:,301:]), axis =1)


(1, 357191)
(357191, 363)

In [132]:
Xx[:,0]


Out[132]:
array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [133]:
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(Xx, y)

In [140]:


In [135]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_trainingx = predict(Xx, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


print " MSE training", MSE(prediction_trainingx, y )


 MSE training 661.22571362

In [136]:
restaurantReviewsUS[0]


Out[136]:
("T C's Referee Sports Bar",
 '100327153115986850675',
 [43.529494, -96.792244],
 'Sioux Falls, SD 57106',
 ['SPORTS BAR', 'RESTAURANT'],
 4625)

In [49]:
len(reviews)


Out[49]:
2157302

In [140]:
len(restaurantReviewsUS)


Out[140]:
357191

In [146]:
reviews1[0]


Out[146]:
(4000,
 [u'Shopping Mall'],
 '',
 '105753474075712443583',
 '118294895653644676616')

In [1]:
reviews1[0:5]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-f11bbb8e780c> in <module>()
----> 1 reviews1[0:5]

NameError: name 'reviews1' is not defined

In [17]:
len(categories)


Out[17]:
363

Reviews of the US


In [4]:
#creates a dict given the reviews Restaurants array
placesD =  dict()

for p in restaurantReviewsUS:
    placesD[p[1]]=p
print "Done"


Done

In [133]:
print len(placesD)


357191

In [14]:
#reviews of restaurants in the US

reviews = [r for r in reviews if r[4] in placesD]
print len(reviews)#2157302
print "Done"


2157302
Done

In [15]:
reviews = [r for r in reviews if r[3] in usersD]
print len(reviews)#1610014
print "Done"


1610014
Done

In [5]:
print len(reviews)#1610014


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-6948c29511b4> in <module>()
----> 1 print len(reviews)#1610014

NameError: name 'reviews' is not defined

In [16]:
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck", "w")
pickle.dump(reviews, file)
file.close()
print len(reviews)#1610014
print "done"


1610014
done

In [5]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load( file)
file.close()
print len(reviews)#1610014
print "done"


1610014
done

In [140]:
len(usersD)


Out[140]:
3747937

In [134]:
len(reviews)


Out[134]:
1610014

In [141]:
categories = list(categories)
usersDD = dict()
for r in reviews:
    if r[4] in placesD: #places
        if r[3] in usersD: #user
            p = placesD[r[4]]
            u = usersD[r[3]]
            sum   =  list(u[1])
            count =  list(u[2])
            for c in p[4]:
                if c in categories:
                    idx = categories.index(c)
                    sum[idx]   += r[0]
                    count[idx] += 1
            usersDD[r[3]] = (u[0],sum,count)
print "Done"


Done

In [11]:
len(usersDD)


Out[11]:
692157

In [146]:
del usersD

In [72]:
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"


done

In [10]:
#load usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck")
usersDD = pickle.load( file)
file.close()
print "done"


done

In [126]:
len (list(usersDD))


Out[126]:
153241

In [147]:
#calculate rank
for u in usersDD:
    us = usersDD[u]
    rank = []
    for i in range(len(us[1])):
        if us[2][i]!=0:
            rank += [us[1][i]/us[2][i]]
        else:
            rank += [0]
    usersDD[u] = (u,us[0],rank)

In [148]:
len(usersDD)


Out[148]:
692157

In [149]:
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"


done

In [62]:
u= [usersDD[sd] for sd in usersDD[:10]]
u[0]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-62-73399bb3b8fb> in <module>()
----> 1 u= [usersDD[sd] for sd in usersDD[:10]]
      2 u[0]

TypeError: unhashable type

In [39]:
ur = [r for r in reviews if  r[3] =='106887711560311804886']

In [19]:
len(reviews)


Out[19]:
1610014

In [16]:
#len of reviews most be 1610014 
K=300
matrixC= []
rankingY = []
for r in reviews:
    if r[3] in usersDD: #user
        l = locationD[r[4]]
        locs = list(l) #location or Rest
        p = placesD[r[4]]
        positive 
        if r[4] in positive:
            pos = list(positive[r[4]])
        else:
            pos = [0] * 50
        negative
        if r[4] in negative:
            neg = list(negative[r[4]])
        else:
            neg = [0] * 50
        
        #categories
        cats = [0] * len(categories)
        for c in p[4]:
            cats [categories.index(c)] = 1
        u = usersDD[r[3]]
    
        #alpha + location + avg review + categories + user info
        matrixC.append ( [1] + locs + [p[5]] + cats + pos + neg +  u[2])
        rankingY.append (r[0])
print "Done"
print len(matrixC)


Done
1610014

In [17]:
len(matrixC[0])


Out[17]:
1128

In [35]:
del usersDD

In [121]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixD.pck", "w")
np.save(file,matrix)
file.close()
print "done"


done

In [123]:
#save y
file = open("/home/iizhaki/oasis/CSE255/YsD.pck", "w")
np.save(file,rankingR)
file.close()
print "done"


done

In [3]:
import numpy as np
#save  matrix
matrix = np.load("/home/iizhaki/oasis/CSE255/MatrixD.pck")
print "done"
#save y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YsD.pck")

print "done"


done
done

In [26]:
m0 = matrixC[:500000]

In [27]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck", "w")
np.save (file,m0)
#pickle.dump(matrixC, file)
file.close()
print "done"


done

In [28]:
m1 = matrixC[500000:1000000]
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck", "w")
np.save (file,m1)
#pickle.dump(matrixC, file)
file.close()
print "done"


done

In [29]:
m2 = matrixC[1000000:]
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck", "w")
np.save (file,m2)
#pickle.dump(matrixC, file)
file.close()
print "done"


done

In [1]:
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"


Reading data...
done
Reading data...
done
Reading data...
done

In [5]:
len(m0)+len(m1)+len(m2)
matrixC= numpy.vstack((m0,m1,m2))
print len(matrixC)


1610014

In [6]:
type (matrixC)


Out[6]:
numpy.ndarray

In [33]:
#save y
file = open("/home/iizhaki/oasis/CSE255/YwWords.pck", "w")
np.save(file,rankingY)
file.close()
print "done"


done

In [7]:
#load y
rankingY = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingY)
print "done"


done

In [8]:
print len(rankingY)


1610014

In [6]:
print "Reading data..."
mdata = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"


Reading data...
done

In [25]:
len( matrixC)


Out[25]:
1610014

In [7]:
mdata = mdata[:,1:301]

#creates a dict given the reviews Restaurants array
locationD =  dict()

for i in range(len(restaurantReviewsUS)):
    locationD[restaurantReviewsUS[i][1]]=list(mdata[i])
print "Done"


Done

In [9]:
len(locationD)


Out[9]:
357191

In [8]:
del mdata

In [ ]:
theta,residuals,rank,s = numpy.linalg.lstsq(matrix, rankingR)

In [34]:
print 2


2

In [38]:
len(rankingR)


Out[38]:
1610014

In [50]:
np.matrix(theta)


Out[50]:
matrix([[  2.48550859e+10,  -2.48550859e+10,  -2.48550859e+10, ...,
           1.25411987e-01,   0.00000000e+00,   1.39045715e-01]])

In [49]:
len (np.matrix[0])


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-49-50ab403e15fb> in <module>()
----> 1 len (np.matrix[0])

TypeError: 'type' object has no attribute '__getitem__'

In [43]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/ThetaX.pck", "w")
np.save(file,theta)
file.close()
print "done"


done

In [ ]:
print "Reading data..."
theta = np.load("/home/iizhaki/oasis/CSE255/ThetaX.pck")
print "done"

In [83]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_trainingx = predict(matrix, theta)

def MSE(prediction, real):
    squares = [ (p - r) ** 2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

y = [e/1000.0 for e in rankingR]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.708222960173


 MSE training 0.832253748827

In [85]:
0.832253748827 /1.39914325993


Out[85]:
0.5948309745412619

In [84]:
avg_p = [sum(y)/len(y)] * len(y)
print avg_p[0]
print " MSE training", MSE(avg_p, y ) # mean abs error 0.924575762201


3.94964888504
 MSE training 1.39914325993

In [67]:
sum(y)


Out[67]:
6358990000

In [80]:
3949/1000.0


Out[80]:
3.949

In [73]:
prediction_trainingx/1000


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-73-ce9cd07e76af> in <module>()
----> 1 prediction_trainingx/1000

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [12]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToPositiveMap.pck")
positive = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len (positive)
# positive


340642
done

In [94]:
len (positive)


Out[94]:
340642

In [96]:
len (restaurantReviewsUS)


Out[96]:
357191

In [102]:
az =[r for r in restaurantReviewsUS if r[1] not in positive]

len(az)


Out[102]:
16549

In [107]:
restaurantReviewsUS[0]


Out[107]:
("T C's Referee Sports Bar",
 '100327153115986850675',
 [43.529494, -96.792244],
 'Sioux Falls, SD 57106',
 ['SPORTS BAR', 'RESTAURANT'],
 4625)

In [13]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToNegativeMap.pck")
negative = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len(negative)
# negative


340642
done
Out[13]:
340642

In [104]:
len(negative)


Out[104]:
340642

In [106]:
az[1]


Out[106]:
('Dixie Lee Chicken',
 '112480955169876860251',
 [46.213896, -82.076332],
 'Spanish, ON P0P 2A0',
 ['FAMILY RESTAURANT'],
 5000)

In [110]:
reviews[0]


Out[110]:
(5000,
 [u'Mexican Restaurant', u'Latin American Restaurant'],
 "You won't be disappointed in the food.  They do business lunches and groups (6 to 10) very well.  Service always fast and helpful.  This is one of my top 4 Mexican restaurants in Akron area, the only detractor is the age of the building and the environment.  Again not back, nothing to stay away from, but their business is serving Mexican food to their customers and they do that well.  Lunch is the majority of times I have been there.",
 '101280967457665576418',
 '103173356293785774089')

In [112]:
qqq =  [q for q in reviews if q[4] ==az[1]]

In [119]:
len (negative[negative.keys()[1]])


Out[119]:
50

In [115]:
len(qqq)


Out[115]:
0

In [ ]:
theta,residuals,rank,s = numpy.linalg.lstsq(matrixC, rankingY)

In [10]:
len(theta)


Out[10]:
1128

In [17]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction



def MSE(prediction, real):
    squares = [ (p - r) ** 2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

In [12]:
prediction_trainingx = predict(matrixC, theta)

y = [e/1000.0 for e in rankingY]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.832253748827


 MSE training 0.832031361789

In [11]:
len(prediction_trainingx)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-3e16f324a7bc> in <module>()
----> 1 len(prediction_trainingx)

NameError: name 'prediction_trainingx' is not defined

In [14]:
0.832031361789/0.832253748827


Out[14]:
0.9997327893826691

In [15]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/ThetawWords.pck", "w")
np.save(file,theta)
file.close()
print "done"


done

In [18]:
foo = ['a', 'b', 'c', 'd', 'e']
from random import randrange
random_index = randrange(0,len(foo))
print random_index


2

In [7]:
import random

indexes = range(len(matrix))
random.shuffle(indexes)
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


[1079569, 944663, 1177637, 202632, 1327565, 277798, 625838, 296735, 382265, 1212205, 1038896, 1251769, 1025485, 379692, 1587813, 184734, 951334, 1234560, 952987, 209674]

In [6]:
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]

In [8]:
print sum(X[0]==matrix[0])


1015

In [12]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck", "w")
np.save(file,matrix)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck", "w")
np.save(file,rankingR)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck", "w")
np.save(file,indexes)
file.close()
print "done"


done
done
done

In [1]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"


done
done
done

In [2]:
n = int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-8a79feb921bd> in <module>()
----> 1 n = int(len(X)*0.7)
      2 X_train = X[:n]
      3 y_train = y[:n]
      4 
      5 X_test =X[n:]

NameError: name 'X' is not defined

In [ ]:
theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train)

In [ ]:
prediction_training = predict(X_train, theta)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827


prediction_test = predict(X_test, theta)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827

In [ ]:
a=[y-p for (y,p) in zip (ys,ps)]

In [ ]:
def MAE(prediction, real):
    squares = [ abs(p - r)  for p,r in zip (prediction,real) ]
    return max(squares)#sum(squares)/len(squares)

In [ ]:
foos =[p for p in prediction_test if p>5000]
len(foos)

In [ ]:
min(theta)

In [51]:
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
i =0
for r in restaurantReviewsUS:
    for c in r[4]:
        matrixCategories [i,categories.index(c)] = r[5]
    i+=1
len(matrixCategories)


Out[51]:
357191

In [52]:
s = sum(matrixCategories,axis=0)

In [50]:
l = sum(matrixCategories,axis=0)

In [53]:
t =s /l

In [43]:
print (s/l)


[ 4625.  3558.  3916. ...,  4250.  3750.  3750.]

In [56]:
index = range(len(t))
index.sort (key= l.__getitem__, reverse=True)
    
ct = [categories[i] for i in index]
t[:] = [l[i] for i in index]

In [57]:
print t[:10]
print ct[:10]


[ 92193.  62372.  51248.  50239.  43532.  41073.  39149.  32937.  29796.
  27967.]
['RESTAURANT', 'FAST FOOD RESTAURANT', 'AMERICAN RESTAURANT', 'EUROPEAN RESTAURANT', 'ASIAN RESTAURANT', 'PIZZA RESTAURANT', 'ITALIAN RESTAURANT', 'MEXICAN RESTAURANT', 'HAMBURGER RESTAURANT', 'LATIN AMERICAN RESTAURANT']

In [64]:
prediction_training[:10]


Out[64]:
[matrix([[ 4045.84846337]]),
 matrix([[ 3338.69120392]]),
 matrix([[ 4189.7933387]]),
 matrix([[ 3566.76011685]]),
 matrix([[ 3868.23306624]]),
 matrix([[ 4562.30147085]]),
 matrix([[ 5003.98583937]]),
 matrix([[ 4087.29172447]]),
 matrix([[ 3534.33690685]]),
 matrix([[ 3863.80331597]])]

In [1]:
import numpy as np
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"

matrix = numpy.vstack((m0,m1,m2))
print len(matrix )


#load y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingR)
print "done"


#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = int(len(X)*0.1)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]


Reading data...
done
Reading data...
done
Reading data...
done
1610014
1610014
done
done
[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]

In [ ]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = 100 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

X_test =X[n:]
y_test =y[n:]

In [65]:
X_train_b = list(X_train)

In [66]:
from sklearn.cluster import KMeans 

K = 1000
km =  KMeans (n_clusters=K, n_jobs=-1)
km.fit(X_train)


Out[66]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=1000,
    n_init=10, n_jobs=-1, precompute_distances=True, random_state=None,
    tol=0.0001, verbose=0)

In [1]:


In [67]:
X_train = list(X_train_b)

def bitIt(idx, rng):
    res = [0] * (rng + 1)
    res[0] = 1
    res[idx + 1] = 1
    return res

X_train = [bitIt(i, K) for i in km.labels_]
print X_train[0]


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [68]:
import scipy.optimize
from numpy.linalg import norm

### Gradient descent ###

# Objective
def f(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg

    # Derivative
def fprime(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    res = 2 * numpy.dot(X.T, diff) / len(X) + 2 * lam * theta
    return res

In [69]:
import time
import timeit

start = time.time()
thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
end = time.time()
finished = end - start
print finished


0.552059173584

In [70]:
len (thetar)


Out[70]:
1001

In [71]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

#prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


#print " MSE training", MSE(prediction_training, y )

In [72]:
prediction_training = predict(X_train, thetar)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827


 MSE training 1.60364829241

In [240]:
prediction_test = predict(X_test, thetar)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-240-97808929ec05> in <module>()
----> 1 prediction_test = predict(X_test, thetar)
      2 ys = [e/1000.0 for e in y_test]
      3 ps = [e/1000.0 for e in prediction_test]
      4 print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827

<ipython-input-238-34ca80dbdc63> in predict(data, theta)
      1 def predict(data, theta):
      2     theta = numpy.matrix(theta)
----> 3     prediction = [theta*numpy.matrix(d).T  for d in data]
      4     return prediction
      5 

/oasis/scratch/iizhaki/temp_project/PV/python-virtualEnv3/lib/python2.7/site-packages/numpy/matrixlib/defmatrix.pyc in __mul__(self, other)
    339         if isinstance(other, (N.ndarray, list, tuple)) :
    340             # This promotes 1-D vectors to row vectors
--> 341             return N.dot(self, asmatrix(other))
    342         if isscalar(other) or not hasattr(other, '__rmul__') :
    343             return N.dot(self, other)

KeyboardInterrupt: 

In [16]:
import numpy as np
np.__config__.show()


blas_info:
    libraries = ['blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
lapack_info:
    libraries = ['lapack']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
atlas_threads_info:
  NOT AVAILABLE
blas_opt_info:
    libraries = ['blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
    define_macros = [('NO_ATLAS_INFO', 1)]
atlas_blas_threads_info:
  NOT AVAILABLE
openblas_info:
  NOT AVAILABLE
lapack_opt_info:
    libraries = ['lapack', 'blas']
    library_dirs = ['/opt/lapack/gnu/lib']
    language = f77
    define_macros = [('NO_ATLAS_INFO', 1)]
atlas_info:
  NOT AVAILABLE
lapack_mkl_info:
  NOT AVAILABLE
blas_mkl_info:
  NOT AVAILABLE
atlas_blas_info:
  NOT AVAILABLE
mkl_info:
  NOT AVAILABLE

Test with PlayDoh


In [68]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = 100000 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]

#X_test =X[n:]
#y_test =y[n:]


done
done
done
[1079569  944663 1177637  202632 1327565  277798  625838  296735  382265
 1212205 1038896 1251769 1025485  379692 1587813  184734  951334 1234560
  952987  209674]

In [1]:
# NEGATIVE Log-likelihood
thetas = numpy.array([0] * len(X_train[0]))
lam = 0.1

def f(i):
    X = X_train[i]
    y = Y_train[i]
    theta = thetas[i]
    logit = np.dot(X, theta)
    loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
  
    #print "ll =", loglikelihood
    return -loglikelihood

In [ ]:
import time
import timeit
import playdoh



#thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
if __name__ == '__main__':
    start = time.time()
    results = playdoh.maximize(fun,
                       popsize = 10000, # size of the population
                       maxiter = 10, # maximum number of iterations
                       cpu = 2, # number of CPUs to use on the local machine
                       x_initrange = [-10.,10.]) # initial interval for the ``x`` parameter

    # Display the final results in a table
    playdoh.print_table(results)
    end = time.time()
    finished = end - start
    print finished

In [ ]:
import playdoh

thetas = numpy.array([0] * 100)
lam = 0.1

# The fitness function to maximize
def fun(x, y):
    return x


if __name__ == '__main__':
    # Maximize the fitness function in parallel
    results = playdoh.minimize(fun,
                       popsize=1,  # size of the population
                       maxiter=1,  # maximum number of iterations
                       cpu=1)

    # Display the final result in a table
    playdoh.print_table(results)

In [69]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [32]:
# Objective
def f(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg

In [70]:
import time
import timeit

theta  = numpy.array([0] * len(X_train[0])).T

start = time.time()
f(theta, X_train, y_train, 0.1)
end = time.time()
finished = end - start
print finished


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-70-54e05b225547> in <module>()
      5 
      6 start = time.time()
----> 7 f(theta, X_train, y_train, 0.1)
      8 end = time.time()
      9 finished = end - start

/oasis/scratch/iizhaki/temp_project/PV/python-virtualEnv3/lib/python2.7/site-packages/numpy/random/mtrand.so in mtrand.RandomState.f (numpy/random/mtrand/mtrand.c:11268)()

TypeError: f() takes at most 3 positional arguments (4 given)

In [84]:
import threading
from threading import Thread
from multiprocessing.pool import ThreadPool
import time
import timeit

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + np.exp(-x))

l = X_train.shape[0]
res = dict()
theta = numpy.array([0] * len(X_train[0]))

def fprimeold(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    #print diff
    return 2 * numpy.dot(X.T, diff) / X_train.shape[0] + 2 * lam * theta

# Derivative
def fprime(theta, X, y, lam, mi, ma, ress):
    X = np.array(X, copy=True)
    y = np.array(y, copy=True)
    theta = np.array(theta, copy=True)
    X[: mi] = 0.
    X[ma :] = 0
    y[: mi] = 0
    y[ma : ] = 0
    theta[mi : ma] = 0
    diff = numpy.dot(X, theta) - y
    #print diff
    ress[mi] = 2 * (numpy.dot(diff, X) / X_train.shape[0] + lam * theta)

threads = []
N = 1
for n in range(N):
    low = n * 1.0 / N
    high = (n + 1.0) / N
    threads.append(Thread(target = fprime, args = (theta, X_train, y_train, 0.1, low, high, res)))
#t3 = Thread(target = fprime, args = (theta, X_train, y_train, 0.1, 2.0*l / 3, l - 1, res))

start = time.time()

for t in threads:
    t.start()

for t in threads:
    t.join()

end = time.time()
finished = end - start
print finished

fres = np.zeros(res[0].shape)
for r in res.values():
    fres += r
    

start = time.time()
d = fprimeold(theta, X_train, y_train, 0.1)
end = time.time()
finished = end - start
print finished

print fres
print d
print [100.0 * sum(fres == d) / len(d)]


0.912188053131
0.496465921402
[-0.08  0.    0.   ...,  0.    0.    0.  ]
[ -7.91206000e+03  -1.13400000e+01  -1.18400000e+01 ...,  -1.72000000e+04
   0.00000000e+00  -1.39800000e+04]
[20.233463035019454]

In [17]:


In [16]:


In [ ]: