In [1]:
import pickle
from sklearn.linear_model import SGDRegressor
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize
import random
from sets import Set

import numpy as np

In [2]:
def parseData(fname):
  for l in open(fname):
    yield eval(l)
    
def parseTxt(fname):
  for l in open(fname):
    yield l.strip().split(" ")

print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))


Reading train...

In [2]:


In [2]:


In [3]:
print "done"

allXs = []
allYs = []
for l in train:
  user, item, rating = l['reviewerID'], l['itemID'], l['rating']
  allXs.append([user, item])
  allYs.append(float(rating))


done

In [3]:


In [3]:


In [4]:
alpha = 0
X = allXs
y = allYs
Ntrain = len(y)
data = zip(X, y)

numU = 0
numI = 0
allUs = {}
allUrs = {}
allIs = {}
allIrs = {}
fastData = []
for [u, i], Rui in data:
    if u not in allUs:
        allUs[u] = numU
        allUrs[numU] = u
        numU += 1
    if i not in allIs:
        allIs[i] = numI
        allIrs[numI] = i
        numI += 1
    fastData.append((allUs[u], allIs[i], Rui))


Iu = np.zeros(numU)
Ii = np.zeros(numI)
uToRI = [[]] * numU
iToRU = [[]] * numI
for (u, i, Rui) in fastData:
    Iu[u] += 1
    Ii[i] += 1
    if uToRI[u] == []:
        uToRI[u] = [(i, Rui)]
    else:
        uToRI[u].append((i, Rui))
    if iToRU[i] == []:
        iToRU[i] = [(u, Rui)]
    else:
        iToRU[i].append((u, Rui))

In [5]:
print "Done"


Done

In [6]:
# Objective
def func(theta, X, y, lam):
    diff = numpy.dot(X,theta) - y
    diffSq = numpy.dot(diff,diff) 
    diffSqReg = diffSq  + lam * numpy.dot(theta,theta)
    #print "offset =", diffSqReg
    return diffSqReg

# Derivative
def fprime(theta, X, y, lam):
    diff = numpy.dot(X,theta) - y
    res = 2 * numpy.dot(X.T,diff)    + 2 * lam * theta
    #print "gradient =", res
    return res

In [7]:
def miniFunc(Data, Alpha, BetaU, BetaI, GammaU, GammaI, Lambd):
    part1 = 0
    for (u, i, Rui) in Data:
        part1 += ((Alpha + BetaU[u] + BetaI[i] + np.dot(GammaU[u], GammaI[i]) - Rui) ** 2)
    
    part2 = sum(BetaU * BetaU) + sum(BetaI * BetaI)
    part3 = sum(GammaU * GammaU) + sum(GammaI * GammaI)
    
    print part1, part2, part3
    return part1 + Lambd * (part2 + part3)

In [8]:
oldVal = 0
betaU = np.zeros(numU)
betaI = np.zeros(numI)
gammaU = [[]] * numU
gammaI = [[]] * numI
for u in range(numU):
    gammaU[u] = [0, 0, 0]
for i in range(numI):
    gammaI[i] = [5.0, 5.0, 5.0]

In [13]:
alpha = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_a_BEST50.pck", "rb"))
betaU = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bu_BEST50.pck", "rb"))
betaI = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bi_BEST50.pck", "rb"))
gammaU = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gu_BEST50.pck", "rb"))
gammaI = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gi_BEST50.pck", "rb"))
oldVal = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_val_BEST50.pck", "rb"))

In [10]:
#alpha = 3.70361669988
#betaUD = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/Bu.pck", "r"))
#betaID = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/Bi.pck", "rb"))
#gammaUD = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/Yu.pck", "rb"))
#gammaID = pickle.load(open("/home/iizhaki/oasis/CSE255/Project2/Yi.pck", "rb"))
#oldVal = 0

In [11]:
for u in betaUD:
    betaU[allUs[u]] = betaUD[u]
    gammaU[allUs[u]] = gammaUD[u]
    
for i in betaID:
    betaI[allIs[i]] = betaID[i]
    gammaI[allIs[i]] = gammaID[i]
    
print betaU[0]
print gammaU[0]
print betaI[0]
print gammaI[0]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-da674eae297e> in <module>()
----> 1 for u in betaUD:
      2     betaU[allUs[u]] = betaUD[u]
      3     gammaU[allUs[u]] = gammaUD[u]
      4 
      5 for i in betaID:

NameError: name 'betaUD' is not defined

In [26]:
betaU = np.array(betaU)
betaI = np.array(betaI)
gammaU = list(gammaU)
gammaI = list(gammaI)

for u in range(numU):
    gammaU[u] = np.append(gammaU[u], [0] * 35)
    
for i in range(numI):
    gammaI[i] = np.append(gammaI[i], [1] * 35)

gammaU = np.array(gammaU)
gammaI = np.array(gammaI)
oldVal = 0

In [ ]:
# HERE I UNDERFIT WITH 2 & 50
lambd = 2.
oldVal = 0
while True:
    lastAlpha = alpha
    lastBetaU = betaU.copy()
    lastBetaI = betaI.copy()
    lastGammaU = gammaU.copy()
    lastGammaI = gammaI.copy()
    
    #----------------------
    start = time.time()
    #----------------------
    
    # Alpha stage
    alpha = 0
    for (u, i, Rui) in fastData:
        gu = gammaU[u]
        gi = gammaI[i]
        alpha += Rui - (betaU[u] + betaI[i] + np.dot(gu, gi))
    alpha = alpha / Ntrain
    
    #----------------------
    #end = time.time()
    #finished = end - start
    #print "Alpha time: ", finished
    #----------------------
    
    #----------------------
    #start = time.time()
    #----------------------

    # BetaU stage 
    betaU.fill(0)
    for (u, i, Rui) in fastData:
        gu = gammaU[u]
        gi = gammaI[i]
        betaU[u] += (Rui - (alpha + betaI[i] + np.dot(gu, gi)))
    betaU = betaU / (lambd + Iu)
        
    #----------------------
    #end = time.time()
    #finished = end - start
    #print "BetaU time: ", finished
    #----------------------
        
    #----------------------
    #start = time.time()
    #----------------------
        
    # BetaI stage 
    betaI.fill(0)
    for (u, i, Rui) in fastData:
        gu = gammaU[u]
        gi = gammaI[i]
        betaI[i] += (Rui - (alpha + betaU[u] + np.dot(gu, gi)))
    betaI = betaI / (lambd + Ii)
        
    #----------------------
    #end = time.time()
    #finished = end - start
    #print "BetaI time: ", finished
    #----------------------
    
    #----------------------
    #start = time.time()
    #----------------------

    # GammaU stage 
    for u in range(numU):
        gi = []
        y = []
        for (i, Rui) in uToRI[u]:
            gi.append(gammaI[i])
            y.append(Rui - (alpha + betaU[u] + betaI[i]))
                    
        #clf = SGDRegressor(n_iter = 5, alpha = 0.1)
        #clf.fit(gi, y)
        #thetas = clf.coef_

        #thetas, _, _, _ = np.linalg.lstsq(gi, y)
        thetas, _, _ = scipy.optimize.fmin_l_bfgs_b(func, np.array(gammaU[u]).T, fprime, args = (np.array(gi), np.array(y).T, lambd))
        #thetas, _, _ = scipy.optimize.fmin_cg(func, np.array(gammaU[u]).T, fprime, args = (np.array(gi), np.array(y).T, lambd))
        gammaU[u] = thetas
            
    #----------------------
    #end = time.time()
    #finished = end - start
    #print "GammaU time: ", finished
    #----------------------
        
    #----------------------
    #start = time.time()
    #----------------------
        
    # GammaI stage 
    for i in range(numI):
        gu = []
        y = []
        for (u, Rui) in iToRU[i]:
            gu.append(gammaU[u])
            y.append(Rui - (alpha + betaU[u] + betaI[i]))
            
        #clf = SGDRegressor(n_iter = 5, alpha = 0.1)
        #clf.fit(gu, y)
        #thetas = clf.coef_
            
        #thetas, _, _, _ = np.linalg.lstsq(gu, y)
        thetas, _, _ = scipy.optimize.fmin_l_bfgs_b(func, np.array(gammaI[i]).T, fprime, args = (np.array(gu), np.array(y).T, lambd))
        #thetas, _, _ = scipy.optimize.fmin_cg(func, np.array(gammaI[i]).T, fprime, args = (np.array(gu), np.array(y).T, lambd))
        gammaI[i] = thetas
    
    #----------------------
    #end = time.time()
    #finished = end - start
    #print "GammaI time: ", finished
    #----------------------
    
    #----------------------
    #start = time.time()
    #----------------------
    newVal = miniFunc(fastData, alpha, betaU, betaI, np.array(gammaU), np.array(gammaI), lambd)
    #----------------------
    end = time.time()
    finished = end - start
    print "miniFunc time: ", finished,  " --> Diff: ", (oldVal - newVal), newVal
    #----------------------
    
    if oldVal > 0 and oldVal <= newVal:
        alpha = lastAlpha
        betaU = lastBetaU
        betaI = lastBetaI
        gammaU = lastGammaU
        gammaI = lastGammaI
        break
        
    oldVal = newVal
    
print alpha

In [28]:
print len(gammaU[0])


50

In [12]:
pickle.dump(alpha, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_a_BEST50.pck", "wb"))
pickle.dump(betaU, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bu_BEST50.pck", "wb"))
pickle.dump(betaI, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bi_BEST50.pck", "wb"))
pickle.dump(gammaU, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gu_BEST50.pck", "wb"))
pickle.dump(gammaI, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gi_BEST50.pck", "wb"))
pickle.dump(oldVal, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_val_BEST50.pck", "wb"))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-4ccd5affade8> in <module>()
      2 pickle.dump(betaU, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bu_BEST50.pck", "wb"))
      3 pickle.dump(betaI, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_bi_BEST50.pck", "wb"))
----> 4 pickle.dump(gammaU, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gu_BEST50.pck", "wb"))
      5 pickle.dump(gammaI, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_gi_BEST50.pck", "wb"))
      6 pickle.dump(oldVal, open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_val_BEST50.pck", "wb"))

/opt/python/lib/python2.7/pickle.py in dump(obj, file, protocol)
   1368 
   1369 def dump(obj, file, protocol=None):
-> 1370     Pickler(file, protocol).dump(obj)
   1371 
   1372 def dumps(obj, protocol=None):

/opt/python/lib/python2.7/pickle.py in dump(self, obj)
    222         if self.proto >= 2:
    223             self.write(PROTO + chr(self.proto))
--> 224         self.save(obj)
    225         self.write(STOP)
    226 

/opt/python/lib/python2.7/pickle.py in save(self, obj)
    329 
    330         # Save the reduce() output and finally memoize the object
--> 331         self.save_reduce(obj=obj, *rv)
    332 
    333     def persistent_id(self, obj):

/opt/python/lib/python2.7/pickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    417 
    418         if state is not None:
--> 419             save(state)
    420             write(BUILD)
    421 

/opt/python/lib/python2.7/pickle.py in save(self, obj)
    284         f = self.dispatch.get(t)
    285         if f:
--> 286             f(self, obj) # Call unbound method with explicit self
    287             return
    288 

/opt/python/lib/python2.7/pickle.py in save_tuple(self, obj)
    560         write(MARK)
    561         for element in obj:
--> 562             save(element)
    563 
    564         if id(obj) in memo:

/opt/python/lib/python2.7/pickle.py in save(self, obj)
    284         f = self.dispatch.get(t)
    285         if f:
--> 286             f(self, obj) # Call unbound method with explicit self
    287             return
    288 

/opt/python/lib/python2.7/pickle.py in save_string(self, obj, pack)
    486                 self.write(BINSTRING + pack("<i", n) + obj)
    487         else:
--> 488             self.write(STRING + repr(obj) + '\n')
    489         self.memoize(obj)
    490     dispatch[StringType] = save_string

KeyboardInterrupt: 

In [30]:
print "Done"


Done

In [31]:
print (oldVal - newVal), newVal


0.0 376888.96773

In [31]:


In [11]:
testRest = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Rating.txt")))
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_" + str(lambd) + "_" + str(alpha) + "_" + str(oldVal) + "_BEST50_AVG.txt", 'w')
myPredictions.write(str(testRest[0][0]) + '\n')

avgBu = np.mean(betaU)
avgBi = np.mean(betaI)
avgGu = np.mean(gammaU, axis = 0)
avgGi = np.mean(gammaI, axis = 0)

mse = 0
for currLine in testRest[1:]:
    u, i = currLine[0].split("-")
    if u in allUs:
        bu = betaU[allUs[u]]
        gu = gammaU[allUs[u]]
    else:
        bu = avgBu
        gu = avgGu
    if i in allIs:
        bi = betaI[allIs[i]]
        gi = gammaI[allIs[i]]
    else:
        bi = avgBi
        gi = avgGi
    p = alpha + bu + bi + np.dot(gu, gi)
    if p > 5.0:
        p = 4.8
    if p < 1.0:
        p = 1.2
    myPredictions.write(u + '-' + i + ',' + str(p) + '\n')

myPredictions.flush()
myPredictions.close()

In [ ]:


In [ ]:


In [47]:


In [59]:


In [60]:


In [ ]: