In [21]:
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import random
import numpy as np
def parseData(fname):
    """Lazily yield one Python object per non-blank line of the file `fname`.

    Each line is expected to hold a single Python literal expression (for
    example a dict); blank lines are skipped instead of crashing the parse.
    The file is closed as soon as the generator is exhausted or discarded.
    """
    with open(fname) as source:
        for l in source:
            if not l.strip():
                continue
            # SECURITY(review): eval() executes whatever expression the line
            # contains. Only use this on trusted files; ast.literal_eval is
            # the safe alternative if every record is a plain literal.
            yield eval(l)
def parseTxt(fname):
    """Lazily yield each line of `fname` as a list of fields.

    Surrounding whitespace is stripped from the line, which is then split on
    single spaces, so a blank line yields [''].  The file is closed as soon as
    the generator is exhausted or discarded.
    """
    with open(fname) as source:
        for l in source:
            yield l.strip().split(" ")
# Read every record of the training file into memory; N is the record count.
trainPath = "/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"
print("Reading train...")
train = list(parseData(trainPath))
N = len(train)
In [25]:
# Hold-out split: shuffle the record indices once and cut the permutation at
# TRAIN_FRACTION, so `train` and `test` are disjoint and together cover every
# record.  (Taking "the first 70%" and "the last 70%" of the same shuffle, as
# this cell used to, put roughly 40% of the records into both lists.)
#
# NOTE: this cell rebinds `train`, so it is not idempotent -- re-run the
# loading cell before running it a second time.
TRAIN_FRACTION = 0.7
SPLIT_SEED = 0  # fixed seed: the same split on every Restart & Run All

myRange = list(range(N))  # list(...) so the indices can be shuffled in place
random.Random(SPLIT_SEED).shuffle(myRange)  # private RNG, global state untouched
splitAt = int(TRAIN_FRACTION * N)
test = [train[i] for i in myRange[splitAt:]]
train = [train[i] for i in myRange[:splitAt]]
In [26]:
# The `sets` module is deprecated (and absent from Python 3); the built-in
# `set` type replaces it.  The name `Set` is kept as an alias because later
# cells still refer to it.
Set = set
print("done")

# Flatten the training records into parallel structures.
allXs = []        # one [user, item] pair per training record
allYs = []        # the matching rating, as a float
allUsers = Set()  # every distinct reviewer seen in training
rater = {}        # item -> rating of the last training record for that item
for l in train:
    user, item, rating = l['reviewerID'], l['itemID'], l['rating']
    rater[item] = float(rating)
    allXs.append([user, item])
    allYs.append(float(rating))
    allUsers.add(user)
In [3]:
# Disabled cell: everything below is wrapped in a string literal, so none of it
# executes -- `myPredictions` is not opened and `test` is not loaded from the
# pairs file here.  The cell merely evaluates to the string itself.
'''print "Reading test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Purchase_18.txt", 'w')
print "done"
print "Reading test..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Purchase.txt")))
print "done"'''
Out[3]:
In [4]:
In [30]:
# Evaluate candidate values of F on the held-out records in `test`.
#
# F is the share of all training purchases that the "popular items" fallback
# set (`return1`) must cover.  A held-out (user, item) pair is predicted as
# purchased when
#   1. some item the user bought shares at least one buyer with the item, or
#   2. the item is in the popular set, or
#   3. the item never appears in training but the user does.
# `correct` is the number of held-out pairs predicted as purchased.
#
# State left for later cells: dictU, dictI, return1 (for the last F tried),
# bestCorrect, bestF, bestUIs.
Fs = [0.8, 0.85, 0.9, 0.95]


def _popularItems(rankedItems, purchaseTotal, fraction):
    """Return the shortest prefix of `rankedItems`, as a set of items, whose
    purchase counts add up to more than `fraction` of `purchaseTotal`.

    rankedItems -- (count, item) tuples ordered by descending count.
    """
    chosen = set()
    covered = 0
    for ic, ranked in rankedItems:
        covered += ic
        chosen.add(ranked)
        if covered > purchaseTotal * fraction:
            break
    return chosen


def _sharesBuyer(who, what, userItems, itemUsers):
    """Return True when an item bought by `who` has a buyer in common with `what`.

    This is the "Jaccard similarity > 0" test: the ratio is positive exactly
    when the two buyer sets intersect, so the union is never needed.  Lookups
    go through .get() so that asking about an unknown user or item does not
    insert an empty entry into the defaultdicts.
    """
    buyers = itemUsers.get(what)
    if not buyers:
        return False
    for owned in userItems.get(who, ()):
        if not buyers.isdisjoint(itemUsers.get(owned, ())):
            return True
    return False


# item -> buyers and user -> items, built from the training pairs.  Nothing
# below depends on F until the popular set is formed, so this is done once.
dictI = defaultdict(set)
dictU = defaultdict(set)
for u, i in allXs:
    dictI[i].add(u)
    dictU[u].add(i)

# Items ranked from most to least purchased.
itemCount = defaultdict(int)
for user, item in allXs:
    itemCount[item] += 1
totalPurchases = len(allXs)
mostPopular = sorted(((itemCount[x], x) for x in itemCount), reverse=True)

#myPredictions.write(str(test[0][0]) + '\n')

# Propagation: link a training pair that is not yet in the maps whenever the
# user already owns an item sharing a buyer with it; repeat until stable.
# NOTE(review): the maps are built from allXs, which is itself built from
# `train`, so every pair visited here is already present and this loop cannot
# add anything.  The export lost its indentation, so the nesting of the
# original `break` is not recoverable -- but every possible placement leaves
# the maps unchanged, so the result is the same.  Confirm which candidate
# pairs were meant to be propagated.
hasChanged = True
loc = 1
while hasChanged:
    print("Loop  %s" % loc)
    loc += 1
    hasChanged = False
    for l in train:
        u, i = l['reviewerID'], l['itemID']
        if i in dictU.get(u, ()):
            continue
        if _sharesBuyer(u, i, dictU, dictI):
            dictU[u].add(i)
            dictI[i].add(u)
            hasChanged = True

bestUIs = defaultdict(set)  # not filled here; a later cell reads it

# Rule 1 does not depend on F: apply it once and keep only the pairs that
# still need the F-dependent fallbacks.  Every held-out record is used (the
# old `test[1:]` skipped the first one, a leftover from a file format with a
# header line).
similarHits = 0
undecided = []
for l in test:
    u, i = l['reviewerID'], l['itemID']
    if _sharesBuyer(u, i, dictU, dictI):
        similarHits += 1
    else:
        undecided.append((u, i))

# Initialised up front: the progress line below reads bestCorrect on the very
# first pass, which used to raise NameError on a fresh kernel.
bestCorrect = 0
bestF = None
for f in reversed(Fs):  # same order as repeated Fs.pop(), but Fs is left intact
    print("F is  %s" % f)
    return1 = _popularItems(mostPopular, totalPurchases, f)
    # Membership tests only -- no defaultdict indexing -- so `u in dictU`
    # really means "user seen in training" and `i not in dictI` really means
    # "item never seen in training".
    correct = similarHits + sum(
        1 for pu, pi in undecided
        if pi in return1 or (pi not in dictI and pu in dictU))
    # NOTE(review): as in the original, the best-so-far values are replaced
    # after every F, not only on improvement.  Every held-out pair is a real
    # purchase and a larger F can only enlarge return1, so `correct` cannot
    # decrease as F grows; a comparison here would therefore always select
    # the largest F.  Decide on a metric with negatives before making this
    # update conditional.
    print("Improved using  %s  from  %s  to  %s" % (f, bestCorrect, correct))
    bestCorrect = correct
    bestF = f
In [160]:
In [161]:
# Write a 0/1 purchase prediction for every "user-item" pair in the pairs file.
#
# The cell is self-contained: it reads the pairs file and opens the output
# file itself (same paths as the disabled loader cell) instead of relying on
# `test` and `myPredictions` left behind by a cell that no longer runs.  It
# still needs dictU, dictI and return1 from the F-tuning cell.
#
# A pair is predicted as purchased (1) when
#   1. some item the user bought shares at least one buyer with the item
#      (i.e. their Jaccard similarity is > 0), or
#   2. the item is in the popular set `return1`, or
#   3. the item never appears in training but the user does;
# otherwise 0.
pairsPath = "/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Purchase.txt"
predictionsPath = "/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Purchase_18.txt"

pairLines = list(parseTxt(pairsPath))
hasChanged = True
bestUIs = defaultdict(set)  # not filled here; the following cell reads it

with open(predictionsPath, 'w') as myPredictions:
    # The first line of the pairs file is its header; copy it through.
    myPredictions.write(str(pairLines[0][0]) + '\n')
    for currLine in pairLines[1:]:
        if not currLine[0]:
            continue  # blank line in the pairs file
        # Each data line is a single "user-item" token.  Unpack straight into
        # u and i: the old code unpacked into user/item and then used stale
        # u/i values left over from the previous cell.
        u, i = currLine[0].split("-")

        # .get() / `in` instead of indexing, so the lookups never insert
        # empty entries into the defaultdicts and the membership tests in
        # rule 3 keep their meaning.
        buyers = dictI.get(i)
        done = False
        if buyers:
            for it in dictU.get(u, ()):
                if not buyers.isdisjoint(dictI.get(it, ())):
                    done = True
                    break

        if done or i in return1 or (i not in dictI and u in dictU):
            myPredictions.write(u + '-' + i + ',1\n')
        else:
            myPredictions.write(u + '-' + i + ',0\n')
In [ ]:
In [ ]:
# Alternate writer: copy the header line, then score each "user-item" pair
# 0.5 when the item is in bestUIs[user] and 0 otherwise.
# NOTE(review): needs `myPredictions` to be an open, writable file and `test`
# to hold the split lines of the pairs file (header first); no active cell
# visible here sets either of them up -- confirm before running.
myPredictions.write(str(test[0][0]) + '\n')
for row in test[1:]:
    u, i = row[0].split("-")
    suffix = ',0.5\n' if i in bestUIs[u] else ',0\n'
    myPredictions.write(u + '-' + i + suffix)
myPredictions.flush()
myPredictions.close()
In [ ]: