In [8]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize
import random

import numpy as np


def parseData(fname):
  """Lazily yield one evaluated Python object per line of the file `fname`.

  Every line is handed to eval(), so the file is expected to contain one
  Python expression (for example a dict literal) per line.
  """
  # SECURITY: eval() executes whatever code the line contains. Only use this
  # on trusted files; for plain literals ast.literal_eval is the safe choice.
  # The with-block closes the handle once the generator finishes or is
  # discarded, instead of leaving it open until garbage collection.
  with open(fname) as f:
    for l in f:
      yield eval(l)
    
def parseTxt(fname):
  """Lazily yield each line of `fname` as a list of its space-separated fields.

  Leading/trailing whitespace (including the newline) is stripped before the
  line is split on single spaces, so an empty line yields [''].
  """
  # The with-block closes the handle once the generator finishes or is
  # discarded, instead of leaving it open until garbage collection.
  with open(fname) as f:
    for l in f:
      yield l.strip().split(" ")

print "Reading train..."
# Materialise the whole training file in memory: parseData yields one
# evaluated record per line and list() drains the generator.
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))


Reading train...

In [ ]:
# Array copy of the loaded training records.
trainBu = np.array(train, copy=True)

N = len(train)
# Python 2: range() returns a list, which random.shuffle permutes in place.
myRange = range(N)
random.shuffle(myRange)

# 70/30 split of the shuffled indices. Both slices share the same cut point so
# they are disjoint and together cover every index (slicing the last 70% for
# the test set would overlap the first 70% used for training).
splitAt = int(0.7 * N)
forTest = myRange[splitAt:]
forTrain = myRange[:splitAt]

In [17]:
# The index-based train/test split below is disabled. As written, `test` is
# just another name for the full training list, so anything measured on `test`
# is a training-set figure.
#test = [train[i] for i in forTest]
#train = [train[i] for i in forTrain]
test = train
# No-op self-assignment (stands in for the disabled subsetting above).
train = train
print "done"


done

In [18]:
# Note: "done" is printed before the loop below runs.
print "done"

# Flatten the training records into parallel lists:
#   allXs[k] = [reviewerID, itemID]   allYs[k] = rating as float
allXs = []
allYs = []
for l in train:
  # This unpacking rebinds the notebook-level names `user`, `item` and
  # `rating`; after the loop they hold the values of the last record.
  user, item, rating = l['reviewerID'], l['itemID'], l['rating']
  allXs.append([user, item])
  allYs.append(float(rating))


done

In [19]:
# Despite the message, nothing is read from disk here: testRest is simply
# bound to the same object as `test`.
print "Reading test..."
testRest = test
print "done"


Reading test...
done

In [ ]:
# Item and user IDs whose fitted biases (betaI[item], betaU[user]) are printed
# by a later cell.
item = 'I102776733'
user = 'U566105319'

In [28]:
def miniFunc(Data, Alpha, BetaU, BetaI, Lambd):
    """Regularised squared-error objective of the bias-only rating model.

    Data  -- iterable of ([user, item], rating) pairs
    Alpha -- global offset
    BetaU -- mapping user -> bias
    BetaI -- mapping item -> bias
    Lambd -- weight applied to the sum of squared biases

    Returns  sum (Alpha + BetaU[u] + BetaI[i] - rating)^2
             + Lambd * (sum BetaU[u]^2 + sum BetaI[i]^2).
    """
    # Data-fit term: squared residual of every observed rating.
    fitError = 0
    for (userKey, itemKey), rating in Data:
        residual = Alpha + BetaU[userKey] + BetaI[itemKey] - rating
        fitError += residual ** 2

    # Penalty term: user biases are accumulated first, then item biases.
    penalty = 0
    for biases in (BetaU, BetaI):
        for key in biases:
            penalty += biases[key] ** 2

    return fitError + Lambd * penalty


X = allXs
y = allYs
data = zip(X, y)

Ntrain = len(y)
bestMse = 0

storedAlpha = 0
storedBetaU = 0
storedBetaI = 0
rouni = 1

alpha = 1
oldVal = 0
betaU = defaultdict(float)
betaI = defaultdict(float)
lambd = 1.0

while True:
    lastAlpha = alpha
    lastBetaU = betaU
    lastBetaI = betaI

    # Alpha stage
    alpha = 0
    for [u, i], Rui in data:
        bu = betaU[u]
        bi = betaI[i]
        alpha += Rui - (bu + bi)
    alpha = alpha / Ntrain

    # BetaU stage 
    Iu = defaultdict(float)
    betaU = defaultdict(float)
    for [u, i], Rui in data:
        betaU[u] += (Rui - (alpha + betaI[i]))
        Iu[u] += 1
    for u in betaU:
        betaU[u] = betaU[u] / (lambd + Iu[u])

    # BetaI stage 
    Ii = defaultdict(float)
    betaI = defaultdict(float)
    for [u, i], Rui in data:
        betaI[i] += (Rui - (alpha + betaU[u]))
        Ii[i] += 1
    for i in betaI:
        betaI[i] = betaI[i] / (lambd + Ii[i])

    newVal = miniFunc(data, alpha, betaU, betaI, lambd)
    print "Old | New val: ", oldVal, newVal

    if lastAlpha != 0 and oldVal < newVal:
        alpha = lastAlpha
        betaU = lastBetaU
        betaI = lastBetaI
        break

    oldVal = newVal

mse = 0
for [u, i], Rui in data:
    p = alpha + betaU[u] + betaI[i]
    mse += (p - float(Rui)) ** 2
mse = mse / len(test)

print "Improved MSE from ", bestMse, " to ",mse, " using lambda ", lambd


'''
#print "Mse: ", mse
if mse >= bestMse:

storedAlpha = alpha
storedBetaU = betaU
storedBetaI = betaI
print "Improved MSE from ", bestMse, " to ",mse, " using lambda ", lambd

#bestMse = mse

myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_" + str(lambd) + "_" + str(alpha) + "_1.txt", 'w')
myPredictions.write(str(testRest[0][0]) + '\n')

mse = 0
for currLine in testRest[1:]:
    u, i = currLine[0].split("-")
    p = alpha + betaU[u] + betaI[i]
    myPredictions.write(u + '-' + i + ',' + str(p) + '\n')

myPredictions.flush()
myPredictions.close()
'''


Old | New val:  0 909033.552278
Improved MSE from  0  to  9.888611  using lambda  1.0
Out[28]:
'\n#print "Mse: ", mse\nif mse >= bestMse:\n\nstoredAlpha = alpha\nstoredBetaU = betaU\nstoredBetaI = betaI\nprint "Improved MSE from ", bestMse, " to ",mse, " using lambda ", lambd\n\n#bestMse = mse\n\nmyPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_" + str(lambd) + "_" + str(alpha) + "_1.txt", \'w\')\nmyPredictions.write(str(testRest[0][0]) + \'\n\')\n\nmse = 0\nfor currLine in testRest[1:]:\n    u, i = currLine[0].split("-")\n    p = alpha + betaU[u] + betaI[i]\n    myPredictions.write(u + \'-\' + i + \',\' + str(p) + \'\n\')\n\nmyPredictions.flush()\nmyPredictions.close()\n'

In [ ]:
# Load the pairs to predict: each row of testRest is the list of
# space-separated fields of one line of pairs_Rating.txt.
testRest = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Rating.txt")))

# The with-block guarantees the output file is flushed and closed even if a
# malformed line raises part-way through. The fitted `mse` is deliberately not
# reset here, so its value remains available after this cell.
with open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_" + str(lambd) + "_" + str(alpha) + "_1.txt", 'w') as myPredictions:
    # The first row is copied through unchanged (treated as the header line).
    myPredictions.write(str(testRest[0][0]) + '\n')

    for currLine in testRest[1:]:
        # Every remaining row is expected to start with "<user>-<item>".
        u, i = currLine[0].split("-")
        p = alpha + betaU[u] + betaI[i]
        myPredictions.write(u + '-' + i + ',' + str(p) + '\n')

In [4]:
# Show the fitted global offset and the biases of the chosen item and user.
# betaI / betaU are created as defaultdict(float) in the fitting cell, so an ID
# that is not present prints 0.0 (and the lookup inserts it).
print "Alpha: ", alpha
print "BetaI of ", item, ": ", betaI[item]
print "BetaU of ", user, ": ", betaU[user]


Alpha:  3.5194082852
BetaI of  I102776733 :  0.171752035867
BetaU of  U566105319 :  -1.03076735296

In [20]:



Mse:  0.35533144937

In [30]:
# Despite the message, nothing is read here: this only opens (and truncates)
# the predictions file for writing. The handle is written to and closed by the
# following cell.
print "Reading test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating.txt", 'w')
print "done"


Reading test...
done

In [31]:
# Write one "<user>-<item>,<prediction>" line per pair in testRest to the
# already-open myPredictions handle. try/finally makes sure the handle is
# closed (which also flushes it) even if a malformed row raises part-way
# through. The fitted `mse` is deliberately not reset here, so its value
# remains available after this cell.
try:
    # The first row is copied through unchanged (treated as the header line).
    myPredictions.write(str(testRest[0][0]) + '\n')

    for currLine in testRest[1:]:
        # Every remaining row is expected to start with "<user>-<item>".
        u, i = currLine[0].split("-")
        p = alpha + betaU[u] + betaI[i]
        myPredictions.write(u + '-' + i + ',' + str(p) + '\n')
finally:
    myPredictions.close()

Jaccard


In [6]:
user1 = 'U229891973'
user2 = 'U622491081'

# Collect, in one pass over the (user, item) pairs, the distinct items of each
# of the two users: A for user1, B for user2.
A = set()
B = set()
for (u, i) in allXs:
    if u == user1:
        A.add(i)
    if u == user2:
        B.add(i)

In [7]:
# Number of distinct elements in each of the two sets A and B.
print len(A), len(B)


4 3

In [8]:
Jaccard1 = len(A.intersection(B)) * 1.0 / len((A.union(B)))
print Jaccard1


0.75

In [9]:
user3 = 'U622491081'

# Map every user to the list of items paired with it in allXs (in order of
# appearance; repeated pairs are kept).
dictU = defaultdict(list)
for u, i in allXs:
    dictU[u].append(i)

In [10]:
A = set(dictU[user3])

bestJac = 0
bestU = []

for u in dictU:
    if u == user3:
        continue
        
    B = set(dictU[u])
    jacc = len(A.intersection(B)) * 1.0 / len(A.union(B))
    
    if jacc > bestJac:
        bestU = [u]
        bestJac = jacc
    elif jacc == bestJac:
        bestU.append(u)
        
print bestU, bestJac


['U359587607', 'U687939146', 'U096951499', 'U296575297', 'U387971231', 'U300899166'] 1.0

In [10]:


In [11]:
user4 = 'U639726733'

# Map every item to the list of users paired with it in allXs (in order of
# appearance; repeated pairs are kept).
dictI = defaultdict(list)
for u, i in allXs:
    dictI[i].append(u)

In [12]:
items = dictU[user4]

for item in items:
    A = set(dictI[item])

    bestJac = 0
    bestI = []

    for i in dictI:
        if i in items:
            continue
            
        B = set(dictI[i])
        jacc = len(A.intersection(B)) * 1.0 / len(A.union(B))

        if jacc > bestJac:
            bestI = [i]
            bestJac = jacc
        elif jacc == bestJac:
            #bestI += [i]
            bestI = [i]

    print item, bestI, bestJac
    print "-------------"


I827118969 ['I368057136'] 0
-------------
I988644602 ['I368057136'] 0
-------------
I958777870 ['I368057136'] 0
-------------
I616454620 ['I368057136'] 0
-------------
I970119134 ['I970165134'] 0.2
-------------

In [12]:


In [ ]: