In [2]:
import pickle
In [4]:
from sklearn.cluster import KMeans
import json
In [2]:
# Links:
# /home/iizhaki/CSE255/googlelocal/places.json
# /home/iizhaki/CSE255/googlelocal/reviews.json
# /home/iizhaki/CSE255/googlelocal/users.json
In [5]:
import json
print "Start loading users..."
jsonUsers = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/users.json"))
print "Finished loading users"
In [ ]:
print "Start loading reviews..."
jsonReviews = json.load(open("/home/iizhaki/oasis/CSE255/reviews_0.pck"))
print "Finished loading reviews"
In [11]:
print "Start loading places..."
jsonPlaces = json.load(open("/oasis/scratch/iizhaki/temp_project/CSE255/googlelocal/places.json"))
print "Finished loading places"
In [15]:
##users
X = []
ids = []
for id in jsonUsers:
X.append(jsonUsers[id])
ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]
#place = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X ]
place = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]
ids = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]
GPS = [g[1] if len(g)>1 else [] for g in place ]
id=ids
#id = [ids[i] for i in range(len(place)) if len(place[i])>1 ]
GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2 else [] for g in GPS]
#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]
#id = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]
print len(GPS),",", len(id)
In [6]:
#print len(X), len(ids)
##users
X = []
ids = []
for id in jsonUsers:
X.append(jsonUsers[id])
ids.append(id)
#X = [jsonUsers[id] for id in jsonUsers ]
#id = [id for id in jsonUsers]
place = [n['currentPlace'] if 'currentPlace' in n and n['currentPlace']!=[] else [] for n in X ]
#place = [n['currentPlace'] for n in X if 'currentPlace' in n and n['currentPlace']!=[] ]
#ids = [ids[i] for i in range(len(X)) if 'currentPlace' in X[i] and X[i]['currentPlace']!=[] ]
GPS = [g[1] if len(g)>1 else [] for g in place ]
id=ids
#id = [ids[i] for i in range(len(place)) if len(place[i])>1 ]
GPS = [[g[1]/1e7,g[2]/1e7] if len(g)>2 else [] for g in GPS]
#GPS_US = [p for p in GPS if p[0]>=24 and p[0]<=49 and p[1]>=-128 and p[1]<=-47]
#id = [id[i] for i in range(len(GPS)) if GPS[i][0]>=24 and GPS[i][0]<=49 and GPS[i][1]>=-128 and GPS[i][1]<=-47 ]
print len(GPS),",", len(id)
In [27]:
# Keep the users whose coordinates fall inside the bounding box
# 24..49 (first coordinate) x -128..-47 (second coordinate); users with an
# empty coordinate list are skipped.
GPS_US = []
id_US = []
for i, p in enumerate(GPS):
    if p == []:
        continue
    if 24 <= p[0] <= 49 and -128 <= p[1] <= -47:
        GPS_US.append(p)
        id_US.append(id[i])
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"
#save users
file = open("/home/iizhaki/oasis/CSE255/users_id_US.pck", "w")
pickle.dump(id_US, file)
file.close()
print "done"
In [42]:
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS_US.pck", "w")
pickle.dump(GPS_US, file)
file.close()
print "done"
In [7]:
#save users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck", "w")
pickle.dump(GPS, file)
file.close()
print "done"
#save users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck", "w")
pickle.dump(id, file)
file.close()
print "done"
In [1]:
print len(GPS), len(id)
In [1]:
import pickle
#load users
file = open("/home/iizhaki/oasis/CSE255/users_GPS.pck")
GPS = pickle.load(file)
file.close()
print "done"
#load users
file = open("/home/iizhaki/oasis/CSE255/users_id.pck")
id = pickle.load(file)
file.close()
print "done"
print len(GPS), len(id)
In [139]:
#dictionary of users
usersD = dict()
_sum = [0]* len(categories)
coun =[0]* len(categories)
for i in range(len(id)):
usersD[id[i]]=(GPS[i],list(_sum),list(coun))
print len(usersD)
print "done"
In [138]:
len(GPS)
Out[138]:
In [140]:
In [12]:
X = [jsonPlaces[p] for p in jsonPlaces ]
gps = [g['gps'] if 'gps'in g else [0,0] for g in X]
gps_x = [g[0] for g in gps]
gps_y = [g[1] for g in gps]
names = [n['name'].encode('ascii','ignore') if 'name'in n else "no name" for n in X]
id = [n['id'].encode('ascii','ignore') if 'id' in n else "no id" for n in X]
address = [n['address'] if 'address' in n else ["",""] for n in X]
address = [n[1].encode('ascii','ignore') if len(n)>1 else "" for n in address]
#name, id, gps, address, categories, rating, count
places = [(names[i],id[i],gps[i],address[i],set(),0,0) for i in range(len(X))]
places = [p for p in places if p[2]!=[0,0]]
print "done"
In [14]:
print len(places)#3087397
In [16]:
#save places
#file = open("/home/iizhaki/oasis/CSE255/places.pck", "w")
#pickle.dump(places, file)
#file.close()
print "done"
In [17]:
#read places
file = open ("/home/iizhaki/oasis/CSE255/places.pck")
places = pickle.load(file)
file.close();
print "done"
In [3]:
#read reviews0
file = open ("/home/iizhaki/oasis/CSE255/reviews_0.pck")
reviews0 = pickle.load(file)
file.close();
print "done"
In [4]:
#read reviews1
file = open ("/home/iizhaki/oasis/CSE255/reviews_1.pck")
reviews1 = pickle.load(file)
file.close();
print "done"
In [5]:
#read reviews2
file = open ("/home/iizhaki/oasis/CSE255/reviews_2.pck")
reviews2 = pickle.load(file)
file.close();
print "done"
In [6]:
reviews = reviews0+reviews1+reviews2
print len(reviews)
In [ ]:
#creates a dict given the places array
placesD = dict()
for p in places:
placesD[p[1]]=p
In [ ]:
#fills the dict with the categories and rating from reviews
for r in reviews:
if r[4] in placesD:
p = placesD[r[4]]
placesD[r[4]] = (p[0],p[1],p[2],p[3],set(p[4]) | set(r[1]),p[5]+r[0],p[6]+1)
print "done"
In [32]:
#calculate Rating
todel =[]
for idx in placesD:
p = placesD[idx]
if (p[6]==0):
todel += [idx]
else:
placesD[idx] = (p[0],p[1],p[2],p[3],list(p[4]),p[5]/p[6])
print "done"
In [35]:
placesReviews = [placesD[idx] for idx in placesD]
len(placesReviews)
Out[35]:
In [39]:
placesReviewsUS = [p for p in placesReviews if p[2][0]>=24 and p[2][0]<=49 and p[2][1]>=-128 and p[2][1]<=-47]
len(placesReviewsUS)
Out[39]:
In [54]:
placesReviewsUS = [(p[0],p[1],p[2],p[3],[w.encode('ascii','ignore') for w in p[4] ],p[5]) for p in placesReviewsUS]
len (placesReviewsUS)
Out[54]:
In [55]:
Out[55]:
In [9]:
print "a"
In [40]:
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviews.pck", "w")
pickle.dump(placesReviews, file)
file.close()
print "done"
In [56]:
#save places with reviews
file = open("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck", "w")
pickle.dump(placesReviewsUS, file)
file.close()
print "done"
In [4]:
#read reviewsUS
file = open ("/home/iizhaki/oasis/CSE255/placesReviewsUS.pck")
placesReviewsUS = pickle.load(file)
file.close();
print "done"
In [13]:
file = open ("/home/iizhaki/oasis/CSE255/gpsUS200.pck")
centroids200 = pickle.load(file)
file.close();
print "done"
In [63]:
restaurantReviewsUS = [p for p in placesReviewsUS if [y for y in p[4] if 'RESTAURANT' in y.upper() and 'RESTAURANTE' not in y.upper() ]!=[]]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w.upper() for w in p[4] ],p[5]) for p in restaurantReviewsUS]
restaurantReviewsUS = [(p[0],p[1],p[2],p[3],[w for w in p[4] if 'RESTAURANT' in w or 'BAR' in w or 'BAKERY' in w or 'FOOD' in w or 'GRILL' in w ],p[5]) for p in restaurantReviewsUS]
print len(restaurantReviewsUS) #357191
print restaurantReviewsUS[0]
In [66]:
#save restaurants with reviews
file = open("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck", "w")
pickle.dump(restaurantReviewsUS, file)
file.close()
print "done"
In [24]:
import pickle
file = open ("/home/iizhaki/oasis/CSE255/restaurantsReviewsUS.pck")
restaurantReviewsUS = pickle.load(file)
file.close();
print "done"
print len(restaurantReviewsUS) #357191
In [29]:
categories[0]
In [7]:
categories = set ()
for p in restaurantReviewsUS:
categories = categories | set ( p[4] )
print len(list(categories))
In [15]:
categories = list(categories)
#save categories
file = open("/home/iizhaki/oasis/CSE255/categories.pck", "w")
pickle.dump(categories, file)
file.close()
print "done"
In [30]:
import pickle
file = open ("/home/iizhaki/oasis/CSE255/categories.pck")
categories = pickle.load(file)
file.close();
print "done"
print len(categories) #363
In [70]:
# One-hot category matrix: row i describes restaurant i, column j is 1 when
# the restaurant carries categories[j].
categories = list(categories)
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
# Look columns up in a dict instead of calling categories.index() for every
# category of every restaurant (a linear scan each time). setdefault keeps the
# first position, matching list.index().
category_index = {}
for position, name in enumerate(categories):
    category_index.setdefault(name, position)
for i, r in enumerate(restaurantReviewsUS):
    for c in r[4]:
        matrixCategories[i, category_index[c]] = 1
len(matrixCategories)
Out[70]:
In [72]:
#save restaurants matrix
file = open("/home/iizhaki/oasis/CSE255/reviewMatrix.pck", "w")
numpy.save(file,matrixCategories)
file.close()
print "done"
In [71]:
len(restaurantReviewsUS)
Out[71]:
In [74]:
reviewsY = [r[5] for r in restaurantReviewsUS]
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewY.pck", "w")
numpy.save(file,reviewsY)
file.close()
print "done"
In [91]:
print "max", max(reviewsY), "min", min(reviewsY), "avg",sum(reviewsY)/len(reviewsY)
In [ ]:
GPS = [g for g in gps if g != [0, 0]]
print len(gps), len(GPS)
K = 1000
kmeans = KMeans(n_clusters=K, n_jobs=-1)
kmeans.fit(GPS)
In [ ]:
kk = kmeans
print type (kk)
print len(kk.cluster_centers_)
print [sum(kk.labels_ == i) for i in range(K)]
print kk.cluster_centers_[3]
print gps[0]
In [140]:
In [140]:
In [140]:
In [ ]:
import numpy as np
import urllib
import scipy.optimize
import random
from math import exp
from math import log
# Load the precomputed feature matrix with np.load (the file is presumably an
# np.save payload despite its .pck extension - TODO confirm).
print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"
def inner(x,y):
    """Return the dot product of x and y, indexing y by the positions of x."""
    total = 0
    for i in range(len(x)):
        total += x[i] * y[i]
    return total
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); works element-wise on arrays."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    """Return the negative L2-regularised logistic log-likelihood.

    theta -- parameter vector
    X     -- design matrix, one row per sample
    y     -- 0/1 labels (list or array)
    lam   -- regularisation strength applied to theta . theta

    The current log-likelihood is printed on every call.
    """
    # Labels may arrive as a plain list (y_train is built as one), for which
    # `1 - y` would raise TypeError.
    y = np.asarray(y)
    logit = np.dot(X, theta)
    # logaddexp(0, -z) equals log(1 + exp(-z)) but does not overflow for
    # strongly negative logits.
    loglikelihood = -np.logaddexp(0, -logit).sum(axis=0, dtype='float')
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
    print("ll = %s" % (loglikelihood,))
    return -loglikelihood
# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    """Return the gradient of the negative regularised log-likelihood.

    Matches the objective `f` of this cell term by term:
    X^T (1 - sigmoid(X theta)) - X^T (1 - y) - 2 * lam * theta, negated.

    The original subtracted a global `y_spec` that is only assigned in a
    commented-out line and never used the `y` argument; the same quantity
    (sum of the rows whose label is 0) is now derived from `y` directly.
    """
    theta = np.asarray(theta, dtype=float)
    X = np.asarray(X)
    y = np.asarray(y)
    logit = np.dot(X, theta)
    dl = np.dot(X.T, 1 - 1.0 / (1 + np.exp(-logit)))
    dl -= np.dot(X.T, 1 - y)
    dl -= 2 * lam * theta
    # Negate the return value since we're doing gradient *ascent*
    return -dl
# Features come from the loaded matrix, targets from the saved rating vector.
X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
# Training data
X_train = X
# Integer part of rating/1000 (ratings are presumably stored x1000 - confirm).
y_train = [(int)(ys/1000) for ys in y]
# Test data
# NOTE(review): the test slice is a subset of the training data (X_train is all of X).
X_test = X[1000:]
y_test = y[1000:]
#dummy = np.zeros((X_train.shape[1]))
#y_spec = np.array([X[i] if not y_train[i] else dummy for i in range(len(X_train))]).sum(axis=0, dtype='float')
#theta,l,info = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, args = (X_train, y_train, 1.0))
#print theta.shape
#print "Final log likelihood =", -l
In [86]:
# Reload features and targets.
print "Reading data..."
data = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"
X = data
y = np.load("/home/iizhaki/oasis/CSE255/reviewY.pck")
In [87]:
# Ordinary least-squares fit of y on X.
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X, y)
In [25]:
def predict(data, theta):
theta = numpy.matrix(theta)
prediction = [theta*numpy.matrix(d).T for d in data]
return prediction
prediction_training = predict(X, thetax)
def MSE(prediction, real):
squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
return numpy.mean(squares)
print " MSE training", MSE(prediction_training, y )
#absolute error 658.914905107
In [100]:
# Spot-check one prediction against its target.
print prediction_training[1],", ", y[1]
In [90]:
# Baseline: predict the mean target for every sample.
avg = [sum(y)*1.0/len(y)] * len(y)
print " MSE training", MSE(avg, y )
#MSE training 686.400936754
In [91]:
# Ratio of two hand-copied numbers (presumably model MSE / baseline MSE - confirm).
719823.238065/770393.021789
Out[91]:
In [128]:
# Reduced design matrix: column 0 plus the columns from 301 onwards.
print np.array([X[:,0]]).shape
print np.array(X[:,301:]).shape
Xx = np.concatenate((np.array([X[:,0]]).T,X[:,301:]), axis =1)
In [132]:
Xx[:,0]
Out[132]:
In [133]:
# Least-squares fit on the reduced matrix.
thetax,residualsx,rankx,sx = numpy.linalg.lstsq(Xx, y)
In [140]:
In [135]:
def predict(data, theta):
theta = numpy.matrix(theta)
prediction = [theta*numpy.matrix(d).T for d in data]
return prediction
prediction_trainingx = predict(Xx, thetax)
def MSE(prediction, real):
squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
return numpy.mean(squares)
print " MSE training", MSE(prediction_trainingx, y )
In [136]:
# Interactive inspection cells: sample records and collection sizes.
restaurantReviewsUS[0]
Out[136]:
In [49]:
len(reviews)
Out[49]:
In [140]:
len(restaurantReviewsUS)
Out[140]:
In [146]:
reviews1[0]
Out[146]:
In [1]:
reviews1[0:5]
In [17]:
len(categories)
Out[17]:
In [4]:
#creates a dict given the reviews Restaurants array
placesD = dict()
for p in restaurantReviewsUS:
placesD[p[1]]=p
print "Done"
In [133]:
print len(placesD)
In [14]:
#reviews of restaurants in the US
reviews = [r for r in reviews if r[4] in placesD]
print len(reviews)#2157302
print "Done"
In [15]:
reviews = [r for r in reviews if r[3] in usersD]
print len(reviews)#1610014
print "Done"
In [5]:
print len(reviews)#1610014
In [16]:
#save reviews
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck", "w")
pickle.dump(reviews, file)
file.close()
print len(reviews)#1610014
print "done"
In [5]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewsUS.pck")
reviews = pickle.load( file)
file.close()
print len(reviews)#1610014
print "done"
In [140]:
len(usersD)
Out[140]:
In [134]:
len(reviews)
Out[134]:
In [141]:
categories = list(categories)
usersDD = dict()
for r in reviews:
if r[4] in placesD: #places
if r[3] in usersD: #user
p = placesD[r[4]]
u = usersD[r[3]]
sum = list(u[1])
count = list(u[2])
for c in p[4]:
if c in categories:
idx = categories.index(c)
sum[idx] += r[0]
count[idx] += 1
usersDD[r[3]] = (u[0],sum,count)
print "Done"
In [11]:
len(usersDD)
Out[11]:
In [146]:
del usersD
In [72]:
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"
In [10]:
#load usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck")
usersDD = pickle.load( file)
file.close()
print "done"
In [126]:
len (list(usersDD))
Out[126]:
In [147]:
#calculate rank
for u in usersDD:
us = usersDD[u]
rank = []
for i in range(len(us[1])):
if us[2][i]!=0:
rank += [us[1][i]/us[2][i]]
else:
rank += [0]
usersDD[u] = (u,us[0],rank)
In [148]:
len(usersDD)
Out[148]:
In [149]:
#save usersDD
file = open("/home/iizhaki/oasis/CSE255/usersDD.pck", "w")
pickle.dump(usersDD, file)
file.close()
print "done"
In [62]:
u= [usersDD[sd] for sd in usersDD[:10]]
u[0]
In [39]:
ur = [r for r in reviews if r[3] =='106887711560311804886']
In [19]:
len(reviews)
Out[19]:
In [16]:
#len of reviews most be 1610014
K=300
matrixC= []
rankingY = []
for r in reviews:
if r[3] in usersDD: #user
l = locationD[r[4]]
locs = list(l) #location or Rest
p = placesD[r[4]]
positive
if r[4] in positive:
pos = list(positive[r[4]])
else:
pos = [0] * 50
negative
if r[4] in negative:
neg = list(negative[r[4]])
else:
neg = [0] * 50
#categories
cats = [0] * len(categories)
for c in p[4]:
cats [categories.index(c)] = 1
u = usersDD[r[3]]
#alpha + location + avg review + categories + user info
matrixC.append ( [1] + locs + [p[5]] + cats + pos + neg + u[2])
rankingY.append (r[0])
print "Done"
print len(matrixC)
In [17]:
len(matrixC[0])
Out[17]:
In [35]:
del usersDD
In [121]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixD.pck", "w")
np.save(file,matrix)
file.close()
print "done"
In [123]:
#save y
file = open("/home/iizhaki/oasis/CSE255/YsD.pck", "w")
np.save(file,rankingR)
file.close()
print "done"
In [3]:
import numpy as np
#save matrix
matrix = np.load("/home/iizhaki/oasis/CSE255/MatrixD.pck")
print "done"
#save y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YsD.pck")
print "done"
In [26]:
m0 = matrixC[:500000]
In [27]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck", "w")
np.save (file,m0)
#pickle.dump(matrixC, file)
file.close()
print "done"
In [28]:
m1 = matrixC[500000:1000000]
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck", "w")
np.save (file,m1)
#pickle.dump(matrixC, file)
file.close()
print "done"
In [29]:
m2 = matrixC[1000000:]
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck", "w")
np.save (file,m2)
#pickle.dump(matrixC, file)
file.close()
print "done"
In [1]:
# Reload the three row chunks of the feature matrix.
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"
In [5]:
# This sum is not the cell's last expression, so it is computed but not displayed.
len(m0)+len(m1)+len(m2)
# Stack the chunks back into a single matrix.
matrixC= numpy.vstack((m0,m1,m2))
print len(matrixC)
In [6]:
type (matrixC)
Out[6]:
In [33]:
#save y
file = open("/home/iizhaki/oasis/CSE255/YwWords.pck", "w")
np.save(file,rankingY)
file.close()
print "done"
In [7]:
#load y
rankingY = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingY)
print "done"
In [8]:
print len(rankingY)
In [6]:
print "Reading data..."
mdata = np.load("/home/iizhaki/oasis/CSE255/reviewFeatures.pck")
print "done"
In [25]:
len( matrixC)
Out[25]:
In [7]:
mdata = mdata[:,1:301]
#creates a dict given the reviews Restaurants array
locationD = dict()
for i in range(len(restaurantReviewsUS)):
locationD[restaurantReviewsUS[i][1]]=list(mdata[i])
print "Done"
In [9]:
len(locationD)
Out[9]:
In [8]:
del mdata
In [ ]:
# Least-squares fit of the targets rankingR on the design matrix.
theta,residuals,rank,s = numpy.linalg.lstsq(matrix, rankingR)
In [34]:
print 2
In [38]:
len(rankingR)
Out[38]:
In [50]:
np.matrix(theta)
Out[50]:
In [49]:
# NOTE(review): np.matrix is the class itself, so subscripting it fails;
# presumably np.matrix(theta)[0] was intended - confirm.
len (np.matrix[0])
In [43]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/ThetaX.pck", "w")
np.save(file,theta)
file.close()
print "done"
In [ ]:
print "Reading data..."
theta = np.load("/home/iizhaki/oasis/CSE255/ThetaX.pck")
print "done"
In [83]:
def predict(data, theta):
theta = numpy.matrix(theta)
prediction = [theta*numpy.matrix(d).T for d in data]
return prediction
prediction_trainingx = predict(matrix, theta)
def MSE(prediction, real):
squares = [ (p - r) ** 2 for p,r in zip (prediction,real) ]
return numpy.mean(squares)
y = [e/1000.0 for e in rankingR]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.708222960173
In [85]:
# Ratio of two hand-copied numbers (presumably model MSE / baseline MSE - confirm).
0.832253748827 /1.39914325993
Out[85]:
In [84]:
# Baseline: predict the mean target for every sample.
avg_p = [sum(y)/len(y)] * len(y)
print avg_p[0]
print " MSE training", MSE(avg_p, y ) # mean abs error 0.924575762201
In [67]:
sum(y)
Out[67]:
In [80]:
3949/1000.0
Out[80]:
In [73]:
# NOTE(review): prediction_trainingx is a Python list, so dividing it by an
# int raises TypeError.
prediction_trainingx/1000
In [12]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToPositiveMap.pck")
positive = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len (positive)
# positive
In [94]:
len (positive)
Out[94]:
In [96]:
len (restaurantReviewsUS)
Out[96]:
In [102]:
# Restaurants whose id (field 1) has no entry in the positive map.
az =[r for r in restaurantReviewsUS if r[1] not in positive]
len(az)
Out[102]:
In [107]:
restaurantReviewsUS[0]
Out[107]:
In [13]:
#load reviews US with users
file = open("/home/iizhaki/oasis/CSE255/reviewToNegativeMap.pck")
negative = pickle.load( file)
file.close()
print len(positive)#1610014
print "done"
len(negative)
# negative
Out[13]:
In [104]:
len(negative)
Out[104]:
In [106]:
az[1]
Out[106]:
In [110]:
reviews[0]
Out[110]:
In [112]:
# NOTE(review): az[1] is a whole restaurant tuple, while q[4] is used as a
# place id elsewhere (`r[4] in placesD`); presumably az[1][1] was intended -
# confirm.
qqq = [q for q in reviews if q[4] ==az[1]]
In [119]:
# Length of one stored negative vector.
len (negative[negative.keys()[1]])
Out[119]:
In [115]:
len(qqq)
Out[115]:
In [ ]:
# Least-squares fit on the matrix that includes the word features.
theta,residuals,rank,s = numpy.linalg.lstsq(matrixC, rankingY)
In [10]:
len(theta)
Out[10]:
In [17]:
def predict(data, theta):
    """Apply the linear model theta to every row of data.

    Returns a list with one 1x1 numpy matrix (theta * row^T) per row.
    """
    weights = numpy.matrix(theta)
    prediction = []
    for row in data:
        prediction.append(weights * numpy.matrix(row).T)
    return prediction
def MSE(prediction, real):
    """Return the mean of the squared differences of the paired inputs.

    Pairs are formed with zip, so extra items of the longer input are ignored.
    """
    squares = []
    for p, r in zip(prediction, real):
        error = p - r
        squares.append(error ** 2)
    return numpy.mean(squares)
In [12]:
# Training error of the model fitted on matrixC, with targets and predictions
# divided by 1000.
prediction_trainingx = predict(matrixC, theta)
y = [e/1000.0 for e in rankingY]
p = [e/1000.0 for e in prediction_trainingx]
print " MSE training", MSE(p, y ) # mean abs error 0.832253748827
In [11]:
len(prediction_trainingx)
In [14]:
# Ratio of two hand-copied error values.
0.832031361789/0.832253748827
Out[14]:
In [15]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/ThetawWords.pck", "w")
np.save(file,theta)
file.close()
print "done"
In [18]:
# Scratch test of randrange: pick a random valid index into a small list.
foo = ['a', 'b', 'c', 'd', 'e']
from random import randrange
random_index = randrange(0,len(foo))
print random_index
In [7]:
import random
# Random permutation of the row indices.
# NOTE(review): no seed is set, so the permutation differs between runs.
indexes = range(len(matrix))
random.shuffle(indexes)
print indexes[:20]
# Reorder features and targets with the same permutation.
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]
In [6]:
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]
In [8]:
# Number of positions in which the first reordered row equals the first original row.
print sum(X[0]==matrix[0])
In [12]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck", "w")
np.save(file,matrix)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck", "w")
np.save(file,rankingR)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck", "w")
np.save(file,indexes)
file.close()
print "done"
In [1]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"
In [2]:
# 70 / 30 train-test split of the reordered data.
n = int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]
X_test =X[n:]
y_test =y[n:]
In [ ]:
# Least-squares fit on the training part only.
theta,residuals,rank,s = numpy.linalg.lstsq(X_train, y_train)
In [ ]:
# Errors are computed after dividing targets and predictions by 1000.
prediction_training = predict(X_train, theta)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827
prediction_test = predict(X_test, theta)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827
In [ ]:
# Signed test residuals (target minus prediction).
a=[y-p for (y,p) in zip (ys,ps)]
In [ ]:
def MAE(prediction, real):
    """Return the largest absolute difference between paired values.

    Despite the name this is the maximum, not the mean; the averaging
    variant is kept below as a comment, as in the original.
    """
    return max(abs(p - r) for p, r in zip(prediction, real))#sum(squares)/len(squares)
In [ ]:
# Count the test predictions above 5000.
foos =[p for p in prediction_test if p>5000]
len(foos)
In [ ]:
# Smallest fitted coefficient.
min(theta)
In [51]:
# Category matrix holding ratings: entry (i, j) is restaurant i's average
# rating (field 5) when it carries categories[j], else 0.
matrixCategories = numpy.zeros((len(restaurantReviewsUS),len(categories)))
# Dict lookup replaces categories.index(c), a linear scan per category of
# every restaurant; setdefault keeps the first position like list.index().
category_index = {}
for position, name in enumerate(categories):
    category_index.setdefault(name, position)
for i, r in enumerate(restaurantReviewsUS):
    for c in r[4]:
        matrixCategories[i, category_index[c]] = r[5]
len(matrixCategories)
Out[51]:
In [52]:
# Column sums; the axis keyword requires numpy's sum, not the builtin.
# NOTE(review): s and l come from the same expression, so they only differ if
# matrixCategories was refilled between the two executions (presumably l was
# taken from the 0/1 matrix and s from the rating matrix - confirm).
s = sum(matrixCategories,axis=0)
In [50]:
l = sum(matrixCategories,axis=0)
In [53]:
t =s /l
In [43]:
print (s/l)
In [56]:
# Order the category positions by l, largest first.
index = range(len(t))
index.sort (key= l.__getitem__, reverse=True)
ct = [categories[i] for i in index]
# NOTE(review): this overwrites t with the sorted l values, not the sorted
# s/l ratios - confirm that is intended.
t[:] = [l[i] for i in index]
In [57]:
print t[:10]
print ct[:10]
In [64]:
prediction_training[:10]
Out[64]:
In [1]:
import numpy as np
print "Reading data..."
m0 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords0.pck")
print "done"
print "Reading data..."
m1 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords1.pck")
print "done"
print "Reading data..."
m2 = np.load("/home/iizhaki/oasis/CSE255/MatrixwWords2.pck")
print "done"
matrix = numpy.vstack((m0,m1,m2))
print len(matrix )
#load y
rankingR = np.load("/home/iizhaki/oasis/CSE255/YwWords.pck")
print len(rankingR)
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]
n = int(len(X)*0.1)
X_train = X[:n]
y_train = y[:n]
X_test =X[n:]
y_test =y[n:]
In [ ]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]
n = 100 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]
X_test =X[n:]
y_test =y[n:]
In [65]:
# Keep a copy of the training rows before they are replaced by cluster features.
X_train_b = list(X_train)
In [66]:
from sklearn.cluster import KMeans
# Cluster the training rows into K groups.
K = 1000
km = KMeans (n_clusters=K, n_jobs=-1)
km.fit(X_train)
Out[66]:
In [1]:
In [67]:
# Restore the original rows from the backup copy.
X_train = list(X_train_b)
def bitIt(idx, rng):
    """Return a 0/1 list of length rng + 1 with positions 0 and idx + 1 set.

    Position 0 is always 1; position idx + 1 marks the given index.
    """
    width = rng + 1
    res = [0] * width
    for position in (0, idx + 1):
        res[position] = 1
    return res
# Replace every training row by the bitIt encoding of its cluster label.
X_train = [bitIt(i, K) for i in km.labels_]
print X_train[0]
In [68]:
import scipy.optimize
from numpy.linalg import norm
### Gradient descent ###
# Objective
def f(theta, X, y, lam):
    """Ridge objective: ||X theta - y||^2 / len(X) + lam * ||theta||^2."""
    residual = numpy.dot(X, theta) - y
    data_term = (norm(residual) ** 2) / len(X)
    penalty = lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return data_term + penalty
# Derivative
def fprime(theta, X, y, lam):
    """Gradient of the ridge objective: 2 X^T (X theta - y) / len(X) + 2 lam theta."""
    residual = numpy.dot(X, theta) - y
    data_grad = 2 * numpy.dot(X.T, residual) / len(X)
    penalty_grad = 2 * lam * theta
    return data_grad + penalty_grad
In [69]:
import time
import timeit
# Time an L-BFGS minimisation of f (gradient fprime, lam = 0.1) starting from
# the zero vector.
start = time.time()
thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
end = time.time()
# Elapsed wall-clock seconds.
finished = end - start
print finished
In [70]:
len (thetar)
Out[70]:
In [71]:
def predict(data, theta):
    """Return, for each row of data, the 1x1 matrix theta * row^T."""
    as_row = numpy.matrix(theta)
    return [as_row * numpy.matrix(sample).T for sample in data]
#prediction_training = predict(X, thetax)
def MSE(prediction, real):
    """Mean squared difference of the zip-paired prediction and real values."""
    squares = []
    for pair in zip(prediction, real):
        squares.append((pair[0] - pair[1]) ** 2)
    return numpy.mean(squares)
#print " MSE training", MSE(prediction_training, y )
In [72]:
# Training error of the L-BFGS solution, with targets and predictions divided by 1000.
prediction_training = predict(X_train, thetar)
yt = [e/1000.0 for e in y_train]
pt = [e/1000.0 for e in prediction_training]
print " MSE training", MSE(pt, yt ) # mean abs error 0.832253748827
In [240]:
# NOTE(review): X_train was replaced by cluster encodings before fitting,
# while X_test is left unchanged in the visible cells - confirm the feature
# dimensions match thetar.
prediction_test = predict(X_test, thetar)
ys = [e/1000.0 for e in y_test]
ps = [e/1000.0 for e in prediction_test]
print " MSE test", MSE(ps, ys ) # mean abs error 0.832253748827
In [16]:
import numpy as np
# Show numpy's build configuration.
np.__config__.show()
In [68]:
import numpy as np
#save matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder.pck")
matrix =np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder.pck")
rankingR = np.load(file)
file.close()
print "done"
#save matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder.pck")
indexes= np.load(file)
file.close()
print "done"
print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]
n = 100000 #int(len(X)*0.7)
X_train = X[:n]
y_train = y[:n]
#X_test =X[n:]
#y_test =y[n:]
In [1]:
# NEGATIVE Log-likelihood
thetas = numpy.array([0] * len(X_train[0]))
lam = 0.1
def f(i):
    """Negative regularised log-likelihood for the single training sample i.

    Reads the globals X_train, y_train, thetas and lam.
    """
    X = X_train[i]
    # The original read `Y_train`, a name not defined in any visible cell;
    # the targets are stored as y_train.
    y = y_train[i]
    # NOTE(review): thetas[i] is a single element of the parameter vector,
    # not the vector itself - confirm this is intended.
    theta = thetas[i]
    logit = np.dot(X, theta)
    loglikelihood = -np.log(1 + np.exp(-logit)).sum(axis=0, dtype='float')
    loglikelihood -= np.dot(logit, 1 - y)
    loglikelihood -= lam * np.dot(theta, theta)
    #print "ll =", loglikelihood
    return -loglikelihood
In [ ]:
import time
import timeit
import playdoh
#thetar,l,info = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
#thetar = scipy.optimize.minimize(f, numpy.array([0] * len(X_train[0])).T, jac = fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.1))
if __name__ == '__main__':
start = time.time()
results = playdoh.maximize(fun,
popsize = 10000, # size of the population
maxiter = 10, # maximum number of iterations
cpu = 2, # number of CPUs to use on the local machine
x_initrange = [-10.,10.]) # initial interval for the ``x`` parameter
# Display the final results in a table
playdoh.print_table(results)
end = time.time()
finished = end - start
print finished
In [ ]:
import playdoh
thetas = numpy.array([0] * 100)
lam = 0.1
# The fitness function to maximize
def fun(x, y):
return x
if __name__ == '__main__':
# Maximize the fitness function in parallel
results = playdoh.minimize(fun,
popsize=1, # size of the population
maxiter=1, # maximum number of iterations
cpu=1)
# Display the final result in a table
playdoh.print_table(results)
In [69]:
X_train = np.array(X_train)
y_train = np.array(y_train)
In [32]:
# Objective
def f(theta, X, y, lam):
    """Mean squared residual of X theta against y plus lam times ||theta||^2."""
    squared_error = norm(numpy.dot(X, theta) - y) ** 2
    #print "f : " , diffSqReg
    return squared_error / len(X) + lam * norm(theta) ** 2
In [70]:
import time
import timeit
# Time a single evaluation of the objective at theta = 0 with lam = 0.1.
theta = numpy.array([0] * len(X_train[0])).T
start = time.time()
f(theta, X_train, y_train, 0.1)
end = time.time()
# Elapsed wall-clock seconds.
finished = end - start
print finished
In [84]:
import threading
from threading import Thread
from multiprocessing.pool import ThreadPool
import time
import timeit
def inner(x,y):
return sum([x[i]*y[i] for i in range(len(x))])
def sigmoid(x):
return 1.0 / (1 + np.exp(-x))
l = X_train.shape[0]
res = dict()
theta = numpy.array([0] * len(X_train[0]))
def fprimeold(theta, X, y, lam):
diff = numpy.dot(X, theta) - y
#print diff
return 2 * numpy.dot(X.T, diff) / X_train.shape[0] + 2 * lam * theta
# Derivative
def fprime(theta, X, y, lam, mi, ma, ress):
X = np.array(X, copy=True)
y = np.array(y, copy=True)
theta = np.array(theta, copy=True)
X[: mi] = 0.
X[ma :] = 0
y[: mi] = 0
y[ma : ] = 0
theta[mi : ma] = 0
diff = numpy.dot(X, theta) - y
#print diff
ress[mi] = 2 * (numpy.dot(diff, X) / X_train.shape[0] + lam * theta)
threads = []
N = 1
for n in range(N):
low = n * 1.0 / N
high = (n + 1.0) / N
threads.append(Thread(target = fprime, args = (theta, X_train, y_train, 0.1, low, high, res)))
#t3 = Thread(target = fprime, args = (theta, X_train, y_train, 0.1, 2.0*l / 3, l - 1, res))
start = time.time()
for t in threads:
t.start()
for t in threads:
t.join()
end = time.time()
finished = end - start
print finished
fres = np.zeros(res[0].shape)
for r in res.values():
fres += r
start = time.time()
d = fprimeold(theta, X_train, y_train, 0.1)
end = time.time()
finished = end - start
print finished
print fres
print d
print [100.0 * sum(fres == d) / len(d)]
In [17]:
In [16]:
In [ ]: