In [21]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize
import random

import numpy as np


def parseData(fname):
  for l in open(fname):
    yield eval(l)
    
def parseTxt(fname):
  for l in open(fname):
    yield l.strip().split(" ")

print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
N = len(train)


Reading train...

In [25]:
myRange = range(N)
random.shuffle(myRange)

test = [train[i] for i in myRange[: int(0.7 * N)]]
train = [train[i] for i in myRange[-int(0.7 * N) : ]]

In [26]:
from sets import Set
print "done"

allXs = []
allYs = []
allUsers = Set()
rater = {}
for l in train:
    user, item, rating = l['reviewerID'], l['itemID'], l['rating']
    rater[item] = float(rating)
    allXs.append([user, item])
    allYs.append(float(rating))
    allUsers.add(user)


done

Jaccard


In [3]:
'''print "Reading test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Purchase_18.txt", 'w')
print "done"
print "Reading test..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Purchase.txt")))
print "done"'''


Out[3]:
'print "Reading test..."\nmyPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Purchase_18.txt", \'w\')\nprint "done"\nprint "Reading test..."\ntest = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Purchase.txt")))\nprint "done"'

In [4]:


In [30]:
Fs = [0.8, 0.85, 0.9, 0.95]

while Fs:
    dictI = defaultdict(Set)
    dictU = defaultdict(Set)
    for u, i in allXs:
        dictI[i].add(u)
        dictU[u].add(i)   
    
    f = Fs.pop()
    print "F is ", f
    itemCount = defaultdict(int)
    totalPurchases = 0

    for user, item in allXs:
        itemCount[item] += 1
        totalPurchases += 1

    mostPopular = [(itemCount[x], x) for x in itemCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalPurchases * f: break
            
    #myPredictions.write(str(test[0][0]) + '\n')

    hasChanged = True
    bestUIs = defaultdict(Set)
    loc = 1

    while hasChanged:
        print "Loop ", loc
        loc += 1
        hasChanged = False

        for l in train[1 :]:
            done = False
            u, i = l['reviewerID'], l['itemID']
            toAdd = []

            for it in dictU[u]:        
                Jacc = len(dictI[it] & dictI[i]) * 1.0 / len(dictI[it] | dictI[i])
                if Jacc > 0:
                    done = True
                    if i not in dictU[u]:
                        hasChanged = True
                        toAdd.append((u, i))
                    #else:
                        #myPredictions.write(u + '-' + i + ',1\n')
                        #break

            for (uu, ii) in toAdd:
                dictU[u].add(i)
                dictI[i].add(u)

        break

    

    hasChanged = True
    bestUIs = defaultdict(Set)


    correct = 0
    for l in test[1 :]:
        done = False
        u, i = l['reviewerID'], l['itemID']

        for it in dictU[u]:        
            Jacc = len(dictI[it] & dictI[i]) * 1.0 / len(dictI[it] | dictI[i])
            if Jacc > 0:
                correct += 1
                done = True
                break

        if not done:
            if i in return1:
                correct += 1
            elif i not in dictI and u in dictU:
                correct += 1
            else:    
                correct += 0
    
    
    print "Improved using ", f, " from ", bestCorrect, " to ", correct
    bestCorrect = correct
    bestF = f


F is  0.95
Loop  1
Improved using  0.95  from  692289  to  683832
F is  0.9
Loop  1
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-30-f3c08848165f> in <module>()
     45 
     46             for it in dictU[u]:
---> 47                 Jacc = len(dictI[it] & dictI[i]) * 1.0 / len(dictI[it] | dictI[i])
     48                 if Jacc > 0:
     49                     done = True

/opt/python/lib/python2.7/sets.py in __or__(self, other)
    176     # raises TypeError as-is is also a bit subtle).
    177 
--> 178     def __or__(self, other):
    179         """Return the union of two sets as a new set.
    180 

KeyboardInterrupt: 

In [160]:



Loop  1
Loop  2
Loop  3

In [161]:
myPredictions.write(str(test[0][0]) + '\n')

hasChanged = True
bestUIs = defaultdict(Set)


for l in test[1 :]:
    done = False
    user, item, rating = l['reviewerID'], l['itemID'], l['rating']

    for it in dictU[u]:        
        Jacc = len(dictI[it] & dictI[i]) * 1.0 / len(dictI[it] | dictI[i])
        if Jacc > 0:
            myPredictions.write(u + '-' + i + ',1\n')
            done = True
            break

    if not done:
        if i in return1:
            myPredictions.write(u + '-' + i + ',1\n')
        elif i not in dictI and u in dictU:
            myPredictions.write(u + '-' + i + ',1\n')
        else:    
            myPredictions.write(u + '-' + i + ',0\n')


myPredictions.flush()
    
myPredictions.close()

In [ ]:


In [ ]:
myPredictions.write(str(test[0][0]) + '\n')

for currLine in test[1:]:
    u, i = currLine[0].split("-")
    if i in bestUIs[u]:
        myPredictions.write(u + '-' + i + ',0.5\n')
    else:
        myPredictions.write(u + '-' + i + ',0\n')
    
myPredictions.flush()
myPredictions.close()

In [ ]: