In [2]:
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import random
import numpy as np
def parseData(fname):
    """Lazily yield one evaluated record per non-blank line of ``fname``.

    fname -- path of a text file holding one Python expression per line.

    The file is opened in a ``with`` block so the handle is released as soon
    as the generator is exhausted or closed, instead of staying open until
    the garbage collector happens to run.  Blank lines are skipped because
    ``eval`` raises SyntaxError on empty input.

    SECURITY: ``eval`` executes whatever code the file contains.  Only use
    this on trusted files.  NOTE(review): if the records are plain literals,
    ``ast.literal_eval`` would be a safer parser -- confirm against the data.
    """
    with open(fname) as handle:
        for l in handle:
            if not l.strip():
                continue
            yield eval(l)
def parseTxt(fname):
    """Lazily yield every line of ``fname`` as a list of fields.

    fname -- path of a text file.

    Each line is stripped of surrounding whitespace and split on single
    spaces, so a blank line yields ``['']``.

    The file is opened in a ``with`` block so the handle is released as soon
    as the generator is exhausted or closed, instead of being left open.
    """
    with open(fname) as handle:
        for l in handle:
            yield l.strip().split(" ")
# Load every training record into memory as a list.
# parseData evaluates each line of the file with eval(), so the file must be trusted.
# NOTE(review): the path is absolute and machine-specific.
print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
In [3]:
# Keep a copy of the full data set; `train` itself is re-bound to the training
# subset in a later cell.
trainBu = np.array(train, copy=True)
N = len(train)

# Random permutation of the record indices.
# NOTE(review): `random` is not seeded anywhere in view, so the split differs
# between runs.
myRange = list(range(N))
random.shuffle(myRange)

# 70/30 split at a single cut point so the two index lists are disjoint and
# together cover every record.  The previous test slice,
# `myRange[-int(0.7 * N):]`, took the LAST 70% of the permutation while the
# training slice took the FIRST 70%, so roughly 40% of the records ended up in
# both sets (and for int(0.7 * N) == 0 the test slice was the whole list).
splitAt = int(0.7 * N)
forTrain = myRange[:splitAt]
forTest = myRange[splitAt:]
In [4]:
test = [train[i] for i in forTest]
train = [train[i] for i in forTrain]
print "done"
In [5]:
from sets import Set
print "done"
allXs = []
allYs = []
allUsers = Set()
for l in train:
user, item, rating = l['reviewerID'], l['itemID'], l['rating']
allXs.append([user, item])
allYs.append(float(rating))
allUsers.add(user)
In [6]:
# Make `testRest` another name for `test` (no copy is made and, despite the
# message, nothing is read from disk here).
# NOTE(review): the prediction-writing code indexes `testRest[0][0]` and splits
# `currLine[0]` on "-"; confirm that the rows of `test` have that format at this point.
print "Reading test..."
testRest = test
print "done"
In [ ]:
def miniFunc(Data, Alpha, BetaU, BetaI, Lambd):
    """Regularised squared-error objective of the bias-only rating model.

    Data  -- iterable of ([user, item], rating) entries
    Alpha -- global offset
    BetaU -- mapping user -> user bias
    BetaI -- mapping item -> item bias
    Lambd -- weight of the squared-bias penalty

    Returns sum((Alpha + BetaU[u] + BetaI[i] - rating) ** 2 over Data)
    plus Lambd times the sum of all squared user and item biases.
    """
    squaredError = sum(
        (Alpha + BetaU[user] + BetaI[item] - rating) ** 2
        for (user, item), rating in Data
    )
    # The user total seeds the second sum so the biases are accumulated in one
    # running total: users first, then items.
    userPenalty = sum(BetaU[user] ** 2 for user in BetaU)
    penalty = sum((BetaI[item] ** 2 for item in BetaI), userPenalty)
    return squaredError + Lambd * penalty
X = allXs
y = allYs
data = zip(X, y)
Ntrain = len(y)
bestMse = 0
storedAlpha = 0
storedBetaU = 0
storedBetaI = 0
rouni = 1
alpha = 0
oldVal = 0
betaU = defaultdict(int)
betaI = defaultdict(int)
lambd = 1
while True:
print "Round ", rouni
rouni += 1
lastAlpha = alpha
lastBetaU = betaU
lastBetaI = betaI
# Alpha stage
alpha = 0
for [u, i], Rui in data:
bu = betaU[u]
bi = betaI[i]
alpha += Rui - (bu + bi)
alpha = alpha / Ntrain
# BetaU stage
Iu = defaultdict(int)
betaU = defaultdict(int)
for [u, i], Rui in data:
betaU[u] += (Rui - (alpha + betaI[i]))
Iu[u] += 1
for u in betaU:
betaU[u] = betaU[u] / (lambd + Iu[u])
# BetaI stage
Ii = defaultdict(int)
betaI = defaultdict(int)
for [u, i], Rui in data:
betaI[i] += (Rui - (alpha + betaU[u]))
Ii[i] += 1
for i in betaI:
betaI[i] = betaI[i] / (lambd + Ii[i])
newVal = miniFunc(data, alpha, betaU, betaI, lambd)
if lastAlpha != 0 and oldVal < newVal:
alpha = lastAlpha
betaU = lastBetaU
betaI = lastBetaI
break
oldVal = newVal
mse = 0
for currLine in test[1:]:
u, rest = currLine[0].split("-")
i, Rui = rest.split(",")
p = alpha + betaU[u] + betaI[i]
mse += (p - float(Rui)) ** 2
mse = mse / len(test)
#print "Mse: ", mse
if mse >= bestMse:
storedAlpha = alpha
storedBetaU = betaU
storedBetaI = betaI
print "Improved MSE from ", bestMse, " to ",mse, " using lambda ", lambd
bestMse = mse
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating_" + str(lambd) + "_" + str(alpha) + ".txt", 'w')
myPredictions.write(str(testRest[0][0]) + '\n')
mse = 0
for currLine in testRest[1:]:
u, i = currLine[0].split("-")
p = alpha + betaU[u] + betaI[i]
myPredictions.write(u + '-' + i + ',' + str(p) + '\n')
myPredictions.flush()
myPredictions.close()
In [ ]:
# Item and user IDs whose fitted biases are printed in the following cell.
item = 'I102776733'
user = 'U566105319'
In [4]:
# Show the fitted global offset and the biases of the selected item and user.
# betaI / betaU are looked up by key; if they are still defaultdicts an unknown
# ID prints 0 and is inserted as a side effect.
print "Alpha: ", alpha
print "BetaI of ", item, ": ", betaI[item]
print "BetaU of ", user, ": ", betaU[user]
In [20]:
In [30]:
# Open (and truncate) the rating-predictions output file.  Despite the message
# nothing is read here; the handle is written and closed by the next cell.
# NOTE(review): the path is absolute and machine-specific.
print "Reading test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Rating.txt", 'w')
print "done"
In [31]:
# Write the rating predictions to the already-open `myPredictions` file:
# the first row of `testRest` is copied as the header, every following row
# "user-item" becomes "user-item,prediction".
# try/finally guarantees the handle is closed (and its buffer flushed) even if
# a malformed row raises part-way through; previously the file stayed open in
# that case.  The unused `mse = 0` assignment was dropped.
try:
    myPredictions.write(str(testRest[0][0]) + '\n')
    for currLine in testRest[1:]:
        u, i = currLine[0].split("-")
        p = alpha + betaU[u] + betaI[i]
        myPredictions.write(u + '-' + i + ',' + str(p) + '\n')
finally:
    myPredictions.close()
In [6]:
user1 = 'U229891973'
user2 = 'U622491081'

# Collect, in one pass over the training pairs, the distinct items of each of
# the two users: A for user1, B for user2.
A, B = set(), set()
for u, i in allXs:
    if u == user1:
        A.add(i)
    if u == user2:
        B.add(i)
In [7]:
# Sizes of the two item sets A and B.
print len(A), len(B)
In [8]:
Jaccard1 = len(A.intersection(B)) * 1.0 / len((A.union(B)))
print Jaccard1
In [9]:
user3 = 'U622491081'

# Map every user to the list of items in their training pairs.
dictU = defaultdict(list)
for reviewer, product in allXs:
    dictU[reviewer] += [product]
In [10]:
A = set(dictU[user3])
bestJac = 0
bestU = []
for u in dictU:
if u == user3:
continue
B = set(dictU[u])
jacc = len(A.intersection(B)) * 1.0 / len(A.union(B))
if jacc > bestJac:
bestU = [u]
bestJac = jacc
elif jacc == bestJac:
bestU.append(u)
print bestU, bestJac
In [3]:
# Open (and truncate) the purchase-predictions output file; despite the first
# message nothing is read by that statement.
# NOTE(review): both paths are absolute and machine-specific.
print "Reading test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Purchase.txt", 'w')
print "done"
# Load the purchase pairs, each line split on single spaces by parseTxt.
# This re-binds `test`, which earlier cells used for the held-out reviews.
print "Reading test..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Purchase.txt")))
print "done"
In [6]:
# Index the training pairs both ways:
#   dictU: user -> items of that user
#   dictI: item -> users of that item
dictI, dictU = defaultdict(list), defaultdict(list)
for pair in allXs:
    u, i = pair
    dictU[u].append(i)
    dictI[i].append(u)
In [12]:
# Distinct user IDs in `test`: the text before the first "-" in the first
# field of every row.
# NOTE(review): row 0 is included too; later code treats it as a header line.
impUsers = Set(x[0].split("-")[0] for x in test)
In [ ]:
# For every user in `impUsers`, look for the items the user does not have yet
# whose user set is most Jaccard-similar to that of an item the user has.
# Result: bestUIs[user] -> set of recommended item IDs (empty set for users
# that never get an entry, because bestUIs is a defaultdict).
#
# Changes from the previous version:
#   * `Set(i)` built a set of the CHARACTERS of the item-ID string; the
#     best-match set is now started as a one-element set holding the ID.
#   * The user set of every item is built once up front instead of once per
#     comparison, and "already owned" is tested against a set, not a list.
#   * The built-in `set` is used, so this cell no longer depends on the
#     `sets` import made in another cell.

# item -> set of users, computed once.
itemUsers = dict((it, set(users)) for it, users in dictI.items())

bestUIs = defaultdict(set)
for user in impUsers:
    items = dictU[user]
    owned = set(items)
    for item in items:
        A = itemUsers.get(item, set())
        bestJac = 0
        bestI = set()
        for i in dictI:
            if i in owned:
                continue
            B = itemUsers[i]
            jacc = len(A & B) * 1.0 / len(A | B)
            if jacc > bestJac:
                # Strictly better match: restart the tie set with this item.
                bestI = set([i])
                bestJac = jacc
            elif jacc == bestJac:
                bestI.add(i)
        # NOTE(review): this overwrites the entry for every owned item, so only
        # the matches of the user's LAST item are kept.  If the matches of all
        # owned items were meant to be combined, use bestUIs[user].update(bestI).
        bestUIs[user] = bestI
In [1]:
In [ ]:
# Write the purchase predictions to the already-open `myPredictions` file:
# the first row of `test` is copied as the header, every following row
# "user-item" becomes "user-item,1" when the item is among the user's
# recommended items in bestUIs and "user-item,0" otherwise.
# try/finally guarantees the handle is closed (and its buffer flushed) even if
# a malformed row raises part-way through; previously the file stayed open in
# that case.
try:
    myPredictions.write(str(test[0][0]) + '\n')
    for currLine in test[1:]:
        u, i = currLine[0].split("-")
        if i in bestUIs[u]:
            myPredictions.write(u + '-' + i + ',1\n')
        else:
            myPredictions.write(u + '-' + i + ',0\n')
finally:
    myPredictions.close()