In [1]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

def parseData(fname):
  # Lazily parse a file of Python-literal records, one record per line.
  # SECURITY NOTE(review): eval() executes arbitrary code from the file.
  # Acceptable for a trusted local dataset, but prefer json.loads or
  # ast.literal_eval for anything untrusted.
  for l in open(fname):
    yield eval(l)
    
def parseTxt(fname):
  """Yield each line of `fname` as a list of space-separated tokens."""
  with open(fname) as handle:
    for line in handle:
      yield line.strip().split(" ")

print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
print "done"


Reading train...
done

In [2]:
# Aggregate helpfulness votes and (normalized) ratings per user and per item.
allHelpful = []
userHelpful = defaultdict(list)
itemHelpful = defaultdict(list)
userRating = defaultdict(list)
itemRating = defaultdict(list)

for l in train:
    user, item, rating, helpful = l['reviewerID'], l['itemID'], l['rating'], l['helpful']
    allHelpful.append(helpful)
    userHelpful[user].append(helpful)
    itemHelpful[item].append(helpful)
    userRating[user].append(rating / 5.0)  # normalize 1-5 stars into (0, 1]
    itemRating[item].append(rating / 5.0)

# Helpfulness rate = sum(nHelpful) / sum(outOf), globally and per user/item.
# NOTE(review): assumes sum(outOf) > 0 for every user/item -- confirm on data.
averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
  userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
itemRate = {}
for i in itemHelpful:
  itemRate[i] = sum([x['nHelpful'] for x in itemHelpful[i]]) * 1.0 / sum([x['outOf'] for x in itemHelpful[i]])

# Mean normalized rating per user / item.  np.mean is used explicitly: the
# bare `mean` in the original only resolved through a %pylab star-import.
userScore = {}
for u in userRating:
    userScore[u] = np.mean(userRating[u])
itemScore = {}
for i in itemRating:
    itemScore[i] = np.mean(itemRating[i])

averageUserScore = np.mean([userScore[u] for u in userScore])
# BUG FIX: original looped `for u in itemScore` while indexing itemScore[i],
# reusing the stale loop variable `i` -- it averaged one repeated value.
averageItemScore = np.mean([itemScore[i] for i in itemScore])

In [3]:
# Spot-check: print a single (arbitrary, dict-iteration-order) user score
# and item score to sanity-check the aggregation above.
for u in userScore:
    print userScore[u]
    break
    
for i in itemScore:
    print itemScore[i]
    break


1.0
0.7

In [4]:
# Assign dense integer ids to users and items in first-appearance order.
# After the loop, cUser/cItem hold the number of distinct users/items.
cUser = 0
users = {}
cItem = 0
items = {}

for l in train:
    user, item = l['reviewerID'], l['itemID']
    if user not in users:
        users[user] = cUser
        cUser += 1
    
    if item not in items:
        items[item] = cItem
        cItem += 1

In [5]:
def FF(loc, total):
    """Return a one-hot list of length `total` with a 1 at index `loc`."""
    encoding = [0 for _ in range(total)]
    encoding[loc] = 1
    return encoding

In [6]:
def FFID(vals, bits):
    """Encode a non-negative integer as a fixed-width, big-endian bit list.

    Parameters:
        vals: non-negative integer to encode.
        bits: width of the output list.
    Returns a list of `bits` ints (0/1), most significant bit first.
    Raises ValueError if `vals` is negative or does not fit in `bits` bits.
    (The original silently wrapped to negative indices on overflow,
    corrupting the high-order positions, and crashed on negatives.)
    """
    if vals < 0:
        raise ValueError("FFID requires a non-negative integer")
    res = [0] * bits
    i = bits - 1
    for digit in reversed(bin(vals)[2:]):
        if i < 0:
            raise ValueError("value %d does not fit in %d bits" % (vals, bits))
        res[i] = int(digit)
        i -= 1
    return res

In [7]:
from sets import Set
import string

# Count word and category-token frequencies separately for clearly-helpful
# reviews (ratio >= 0.8) and clearly-unhelpful ones (ratio <= 0.2), and
# track the largest outOf seen (used later to normalize outOf).
punctuation = set(string.punctuation)
setPos = defaultdict(int)    # word -> count in helpful reviews
setNeg = defaultdict(int)    # word -> count in unhelpful reviews
allCats = Set()              # NOTE(review): never populated -- the filling code below is commented out

bestOutOf = 0
posCats = defaultdict(int)   # category token -> count in helpful reviews
negCats = defaultdict(int)   # category token -> count in unhelpful reviews

for l in train:
    review, helpful, categories = l['reviewText'], l['helpful'], l['category']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    # NOTE(review): ZeroDivisionError if a record has outOf == 0 -- the data
    # presumably contains only voted-on reviews here; confirm.
    ratio = float(nHelpful) / float(outOf)
    if ratio >= 0.8:
        for w in review.split():
            # lowercase and strip punctuation; empty strings are still counted
            w = ''.join([c for c in w.lower() if not c in punctuation])
            setPos[w] += 1
        for acat in categories:
            for cat in acat:
                for w in cat.split():
                    w = ''.join([c for c in w.lower() if not c in punctuation])
                    if w:
                        posCats[w] += 1
    elif ratio <= 0.2:
        for w in review.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            setNeg[w] += 1
        for acat in categories:
            for cat in acat:
                for w in cat.split():
                    w = ''.join([c for c in w.lower() if not c in punctuation])
                    if w:
                        negCats[w] += 1
    
        

    bestOutOf = max(bestOutOf, outOf)
    
    
# Dead code below, kept as a bare string literal (becomes the cell's output).
'''
    for cat in categories[0]:
        for w in cat.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            if w:
                allCats.add(w)
'''


Out[7]:
"\n    for cat in categories[0]:\n        for w in cat.split():\n            w = ''.join([c for c in w.lower() if not c in punctuation])\n            if w:\n                allCats.add(w)\n"

In [7]:


In [8]:
from sets import Set
import string
from nltk.tokenize import word_tokenize

'''
punctuation = set(string.punctuation)
setPos = defaultdict(int)
setNeg = defaultdict(int)

options = Set(['VBN', 'JJ', 'NNS'])

for l in train:
    review, helpful = l['reviewText'], l['helpful']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    ratio = float(nHelpful) / float(outOf)
    if ratio >= 0.8:
        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
        for w, pos in nltk.pos_tag(tokens):
            if pos in options:
                setPos[w] += 1
    elif ratio <= 0.2:
        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
        for w, pos in nltk.pos_tag(tokens):
            if pos in options:
                setNeg[w] += 1
'''


Out[8]:
"\npunctuation = set(string.punctuation)\nsetPos = defaultdict(int)\nsetNeg = defaultdict(int)\n\noptions = Set(['VBN', 'JJ', 'NNS'])\n\nfor l in train:\n    review, helpful = l['reviewText'], l['helpful']\n    outOf = float(helpful['outOf'])\n    nHelpful = float(helpful['nHelpful'])\n    ratio = float(nHelpful) / float(outOf)\n    if ratio >= 0.8:\n        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))\n        for w, pos in nltk.pos_tag(tokens):\n            if pos in options:\n                setPos[w] += 1\n    elif ratio <= 0.2:\n        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))\n        for w, pos in nltk.pos_tag(tokens):\n            if pos in options:\n                setNeg[w] += 1\n"

In [3875]:
import pickle

# Cached word-frequency dicts (the dump lines are kept for re-generation).
# SECURITY NOTE(review): pickle.load executes arbitrary code -- only safe
# because these are locally produced files.  Handles are never closed;
# prefer a `with open(...)` block.
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","wb")
#pickle.dump(setPos, filehandler)

#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","wb")
#pickle.dump(setNeg, filehandler)

filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","rb")
setPos = pickle.load(filehandler)

filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","rb")
setNeg = pickle.load(filehandler)

In [2801]:


In [3809]:
# Assign dense feature indices to the polarity-exclusive category tokens.
# NOTE(review): truePosCats / trueNegCats are defined in the NEXT cell
# (In [3810]) -- this cell only runs correctly out of order; reorder before
# a Restart-and-Run-All.
vecPositiveCats = {}
i = 0
for w in truePosCats:
    if w[0] not in vecPositiveCats:
        vecPositiveCats[w[0]] = i
        i += 1
    
vecNegativeCats = {}
i = 0
for w in trueNegCats:
    if w[0] not in vecNegativeCats:
        vecNegativeCats[w[0]] = i
        i += 1
        
print vecPositiveCats
print vecNegativeCats


{'hoodies': 4, 'totes': 1, 'sequencing': 32, 'fishing': 88, 'earth': 15, 'sega': 63, 'feeding': 57, 'hunting': 28, 'languages': 59, 'arrest': 85, 'powerdistribution': 38, 'dreamcast': 60, 'archival': 39, 'coffee': 67, 'protective': 61, 'sunglasses': 6, 'suitcases': 25, 'invitations': 68, 'band': 69, 'bracelets': 93, 'fall': 35, 'timing': 55, 'enlargers': 22, 'hoses': 70, 'belts': 40, 'customizable': 8, 'pacifiers': 86, 't': 50, 'barkingdog': 9, 'night': 18, 'optics': 62, 'tea': 74, 'platforms': 53, 'books': 54, 'ebooks': 12, 'carts': 43, 'connecting': 19, 'fossil': 10, 'guiders': 23, 'email': 77, 'math': 13, 'bass': 72, 'guitar': 78, 'protection': 64, 'engines': 83, 'shelves': 41, 'technical': 46, 'womens': 33, 'pulleys': 87, 'filtering': 79, 'binders': 36, 'communication': 76, 'matte': 3, 'belt': 81, 'environment': 45, 'arts': 82, 'handbags': 44, 'instrument': 84, 'pots': 73, 'handles': 52, 'gym': 26, 'specialty': 27, 'environmental': 16, 'tifosi': 56, 'craft': 90, 'rangefinder': 51, 'word': 0, 'f': 29, 'science': 2, 'bed': 21, 'project': 91, 'leashes': 92, 'making': 65, 'purse': 89, 'tabletop': 7, 'wiring': 47, 'eyewear': 48, 'sciences': 14, 'copying': 24, 'toddler': 71, 'blocks': 11, 'pillows': 5, 'sweatshirts': 31, 'sewing': 75, 'smartmedia': 49, 'notebook': 42, 'waist': 37, 'camp': 58, 'contact': 80, 'diagonals': 34, 'crafts': 66, 'alarms': 17, 'wedges': 20, 'rechargeable': 30}
{'boy': 8, 'center': 6, 'chest': 9, 'game': 10, 'wenger': 2, 'black': 4, 'contrast': 0, 'w': 5, 'fitness': 7, 'white': 1, 'columns': 3}

In [3810]:
# Sort category tokens by frequency, then keep only tokens exclusive to one
# polarity ("true" positives never appear among negatives, and vice versa).
listPosCats = []
for p in posCats:
    listPosCats.append((p, posCats[p]))

allPosCats = sorted(listPosCats, key=lambda x: x[1], reverse=True)

listNegCats = []
for p in negCats:
    listNegCats.append((p, negCats[p]))

allNegCats = sorted(listNegCats, key=lambda x: x[1], reverse=True)

truePosCats = [x for x in allPosCats if x[0] not in negCats]
trueNegCats = [x for x in allNegCats if x[0] not in posCats]

print truePosCats
print trueNegCats


[('word', 27), ('totes', 14), ('science', 14), ('matte', 13), ('hoodies', 11), ('pillows', 10), ('sunglasses', 8), ('tabletop', 8), ('customizable', 7), ('barkingdog', 6), ('fossil', 6), ('blocks', 6), ('ebooks', 6), ('math', 6), ('sciences', 6), ('earth', 6), ('environmental', 6), ('alarms', 6), ('night', 6), ('connecting', 6), ('wedges', 5), ('bed', 5), ('enlargers', 5), ('guiders', 5), ('copying', 5), ('suitcases', 4), ('gym', 4), ('specialty', 4), ('hunting', 3), ('f', 3), ('rechargeable', 3), ('sweatshirts', 3), ('sequencing', 3), ('womens', 3), ('diagonals', 3), ('fall', 2), ('binders', 2), ('waist', 2), ('powerdistribution', 2), ('archival', 2), ('belts', 2), ('shelves', 2), ('notebook', 2), ('carts', 2), ('handbags', 2), ('environment', 2), ('technical', 2), ('wiring', 2), ('eyewear', 2), ('smartmedia', 2), ('t', 2), ('rangefinder', 2), ('handles', 2), ('platforms', 2), ('books', 2), ('timing', 2), ('tifosi', 2), ('feeding', 1), ('camp', 1), ('languages', 1), ('dreamcast', 1), ('protective', 1), ('optics', 1), ('sega', 1), ('protection', 1), ('making', 1), ('crafts', 1), ('coffee', 1), ('invitations', 1), ('band', 1), ('hoses', 1), ('toddler', 1), ('bass', 1), ('pots', 1), ('tea', 1), ('sewing', 1), ('communication', 1), ('email', 1), ('guitar', 1), ('filtering', 1), ('contact', 1), ('belt', 1), ('arts', 1), ('engines', 1), ('instrument', 1), ('arrest', 1), ('pacifiers', 1), ('pulleys', 1), ('fishing', 1), ('purse', 1), ('craft', 1), ('project', 1), ('leashes', 1), ('bracelets', 1)]
[('contrast', 2), ('white', 2), ('wenger', 2), ('columns', 2), ('black', 2), ('w', 2), ('center', 2), ('fitness', 2), ('boy', 1), ('chest', 1), ('game', 1)]

In [3811]:
# NOTE(review): dead code kept as a bare string literal (it duplicates the
# aggregation cell near the top of the notebook); safe to delete.
'''
allHelpful = []
userHelpful = defaultdict(list)

for l in train:
    user,item = l['reviewerID'],l['itemID']
    allHelpful.append(l['helpful'])
    userHelpful[user].append(l['helpful'])

averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
'''


Out[3811]:
"\nallHelpful = []\nuserHelpful = defaultdict(list)\n\nfor l in train:\n    user,item = l['reviewerID'],l['itemID']\n    allHelpful.append(l['helpful'])\n    userHelpful[user].append(l['helpful'])\n\naverageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])\nuserRate = {}\nfor u in userHelpful:\n    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])\n"

In [13]:
# Keep the 300 most frequent non-stopword tokens from helpful reviews.
# NOTE(review): imports mid-notebook -- move to the top import cell.
import nltk
from nltk.corpus import stopwords
stopWords = Set(stopwords.words("english"))

listPos = []
for p in setPos:
    if not p or p in stopWords:   # skip empty tokens and English stopwords
        continue
    listPos.append((p, setPos[p]))

posWords = sorted(listPos, key=lambda x: x[1], reverse=True)[:300]




In [14]:
# Mirror of the positive-word cell: top-300 non-stopword tokens from
# unhelpful reviews.
listNeg = []
for n in setNeg:
    if not n or n in stopWords:
        continue
    listNeg.append((n, setNeg[n]))

negWords = sorted(listNeg, key=lambda x: x[1], reverse=True)[:300]

In [58]:
# Pick the word-feature vocabularies.  37 is an ad-hoc feature budget.
# NOTE(review): removeIt is never used -- candidate for deletion.
removeIt = Set()
negativeWords = [x[0] for x in negWords][:37]
positiveWords = [x[0] for x in posWords if x[0] not in negativeWords][:37]

In [59]:
# Map each vocabulary word to a dense feature index (first-seen order).
vecPositive = {}
i = 0
for w in positiveWords:
    if w not in vecPositive:
        vecPositive[w] = i
        i += 1
    
vecNegative = {}
i = 0
for w in negativeWords:
    if w not in vecNegative:
        vecNegative[w] = i
        i += 1

# NOTE(review): allCats is never populated (its filling code is commented
# out above), so vecCats is presumably empty -- verify before using it.
vecCats = {}
i = 0
for w in allCats:
    vecCats[w] = i
    i += 1
        
#print vecPositive
#print vecNegative
#print vecCats

In [60]:
def wordsFF(text, db, punct=None):
    """Binary bag-of-words feature vector for `text` over vocabulary `db`.

    Parameters:
        text: raw review text; tokens are lowercased and punctuation-stripped
            before lookup.
        db: dict mapping word -> feature index.
        punct: set of characters to strip.  Defaults to the notebook-level
            `punctuation` set (backward compatible with the original, which
            hard-coded that global).
    Returns a list of len(db) ints with 1 at each index whose word occurs.
    """
    if punct is None:
        punct = punctuation
    res = [0] * len(db)
    for w in text.split():
        cleaned = ''.join([c for c in w.lower() if c not in punct])
        if cleaned in db:
            res[db[cleaned]] = 1
    return res

In [60]:


In [61]:
# Build the training design matrix X and target y (rating scaled to (0,1]).
# NOTE(review): maxL is never used; `nWords` is rebound from an int to a
# 2-element indicator list mid-loop (confusing shadowing); the trailing
# semicolon on the rating line is a no-op.
X = []
y = []
nVals = []   # raw integer ratings, kept for later evaluation
maxL = 0

for l in train:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    rating = int(rating)
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    # NOTE(review): ZeroDivisionError if outOf == 0 -- confirm data has votes
    ratio = nHelpful / outOf
    nWords = len(words.split())
    
    nVals.append(rating)
    rating = rating / 5.0;
    
    # two-bucket indicator for review length (short vs long at 100 words)
    if nWords < 100:
        nWords = [1, 0]
    else:
        nWords = [0, 1]
    
    cOutOf = outOf * 1.0 / bestOutOf   # outOf normalized by global max

    #X.append([1] + FF(users[user] / 500, cUser / 500) + FFID(user) + FF(items[item] / 500, cItem / 500) + FFID(item) + FF(rating - 1, 5) + str(nWords))
    # FF(rating - 1, 5)
    X.append([1, ratio, cOutOf, userRate[user], itemRate[item], userScore[user], itemScore[item]] + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #  
    # + FFID(int(outOf) -1, 10) + FFID(int(user[1:]), 30) +
    
    y.append(rating)

In [62]:
print "Done"


Done

In [63]:
import pylab as pl

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import time
import timeit

In [64]:
# Densify features/targets to float arrays; the commented lines are
# abandoned scaling experiments kept for reference.
X_train = np.array(X).astype(float)
y_train = np.array(y).astype(float)

#Remember = [X_train[:, i].max() for i in range(len(X_train.T))]
#X_train = X_train / len(X_train[0])

#X_train = np.array([np.array([xi if y > 0 else xi for (xi, y) in zip(x, Remember)]) for x in X_train])
#X_train = X_train / len(X_train[0])

#scaler = MinMaxScaler().fit(X_train)
#X_train = scaler.transform(X_train)

In [65]:
# Report feature-vector width; the SGDRegressor alternative was abandoned.
print "Length of FFs: ", len(X_train[0])
#clf = SGDRegressor(n_iter = 1000, alpha = 0.01)
# Objective
def f(theta, X, y, lam):
    """Ridge objective: ||X.theta - y||^2 / n + lam * ||theta||^2."""
    # BUG FIX: was numpy.dot, but this notebook only imports `numpy as np`,
    # so the bare name `numpy` is undefined.
    diff = np.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg

    # Derivative of the objective f, used by L-BFGS.
def fprime(theta, X, y, lam):
    """Gradient of f: 2 X^T (X.theta - y) / n + 2 lam theta."""
    # BUG FIX: both dots were numpy.dot; only `np` is imported here.
    diff = np.dot(X, theta) - y
    res = 2 * np.dot(X.T, diff) / len(X) + 2 * lam * theta
    return res

# Fit the linear model by ordinary least squares and time the solve.
# The commented alternatives (SGD, L-BFGS) were abandoned.
start = time.time()

#clf.fit(X_train, y_train)
#thetax, residualsx, rankx, sx = numpy.linalg.lstsq(X_train, y_train)
thetax, _, _, _ = np.linalg.lstsq(X_train, y_train)
#thetax, _, _ = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.0))


end = time.time()
finished = end - start
print finished
print "Length of Thetas: ", len(thetax)
print thetax


Length of FFs:  83
13.1952581406
Length of Thetas:  83
[ -2.88786650e+06   1.18336414e-01  -6.25516078e-01  -1.13704153e-01
  -1.82017516e-02   9.52420879e-01   1.21694842e-01   2.88786645e+06
   2.88786644e+06   5.01930766e-03   3.20991388e-03   3.87032240e-04
   1.87589150e-03  -1.06714375e-03   4.75343626e-03  -2.16084583e-03
   5.17687763e-03   7.31494286e-03   1.77162694e-03  -2.47492611e-03
   1.67313589e-03   2.96732700e-03  -2.02012796e-03   8.09135817e-05
  -2.73085353e-03   3.58179832e-03  -4.16140860e-03   1.74650232e-03
   5.39482639e-03  -9.86605930e-04   3.81643112e-03   2.23524341e-03
   3.27500887e-03   1.98100659e-04   1.57571994e-03   3.13818565e-03
  -1.84487036e-03  -8.15815754e-03  -6.19416675e-04   3.29127185e-03
  -6.65032121e-03   1.18405882e-02   7.73580236e-04  -4.92398068e-04
   2.16941087e-03  -6.92932748e-03   9.43070149e-05   1.24241576e-03
  -7.12443131e-03   1.72182466e-03   1.03372636e-02   8.33892948e-04
  -5.17017370e-03  -4.88281329e-03  -6.26739559e-04  -8.25196224e-03
  -3.40298299e-03   4.20273068e-03  -2.22106478e-03  -6.93857043e-04
   4.49153611e-04  -3.28887670e-03  -1.60939846e-03   5.57715813e-03
   1.77592996e-04   7.64439551e-03  -1.74731846e-04  -1.97356374e-03
  -5.09720475e-03  -1.01702704e-02  -2.33009052e-04   1.74923517e-03
  -1.39623078e-03   3.38629180e-03   6.65081818e-03  -2.20289643e-03
   2.31991993e-03   2.75971126e-03   1.34477264e-03   1.00626489e-02
  -5.07798335e-03   1.20234987e-02  -1.86898749e-03]

In [66]:
def predict(data, theta):
    """Dot each row of `data` with `theta`; returns a list of predictions."""
    # BUG FIX: was numpy.array -- `numpy` is not imported in this notebook,
    # only `np`.
    theta = np.array(theta)
    prediction = [np.dot(theta, d) for d in data]
    return prediction

# Predict on the training set, clamp to [0.2, 1], and rescale to 1-5 stars.
ptrain = predict(X_train, thetax)
#ptrain = clf.predict(X_train)
ytrain = [e for e in y_train]

print ptrain[0], ytrain[0]

# clamp predictions: floor 0.2 (= 1 star / 5), ceiling 1.0 (= 5 stars)
ptrain = [p if p >= 0.2 else 0.2 for p in ptrain]
ptrain = [p if p <= 1. else 1. for p in ptrain]

ptrain = [round(x * 5) for x in ptrain]
# NOTE(review): ytrain built above is immediately discarded here in favor
# of the raw integer ratings.
ytrain = nVals
#ytrain = expY


1.03394875422 1.0

In [66]:


In [67]:
def MSE(prediction, real):
    """Mean squared error between two equal-length sequences."""
    squares = [(p - r) ** 2 for p, r in zip(prediction, real)]
    # BUG FIX: was numpy.mean -- only `np` is imported in this notebook.
    return np.mean(squares)

In [68]:
# Training-set fit quality: MSE, variance of targets, and FVU = MSE / Var.
# NOTE(review): bare `mean` resolves only via a pylab star-import -- should
# be np.mean for a clean Restart-and-Run-All.
mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 0.202836
Var 2.37327672907
 FVU training 0.0854666451307

In [69]:
# Fraction of training examples whose rounded prediction misses the rating.
trainScore = len([[p, y] for p, y in zip(ptrain, ytrain) if p != y]) * 1.0 / len(ytrain)
print trainScore


0.108253

In [69]:


In [69]:


In [70]:
# Load the unlabeled test records.  NOTE(review): `tetst` is a typo for
# `test`, kept because later cells reference it by this name.
print "Reading test..."
tetst = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"


Reading test...
done

In [4055]:
# Build test-set features, falling back to global averages for unseen
# users/items.
# NOTE(review): `del outOt` raises NameError unless `outOt` already exists
# (probably left over from a deleted cell), and nothing in this cell
# rebuilds it -- yet the prediction cell (In [4057]) consumes `outOt`.
# Presumably a line collecting each record's outOf was lost; restore it.
Xt = []
del outOt

for l in tetst:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    # NOTE(review): ZeroDivisionError if outOf == 0 -- confirm test data
    ratio = nHelpful / outOf
    nWords = len(words.split())
    
    if nWords < 100:
        nWords = [1, 0]
    else:
        nWords = [0, 1]
    
    cOutOf = outOf * 1.0 / bestOutOf
    
    # cold-start fallback: unseen users/items get global averages
    if user in userRate:
        urate = userRate[user]
        uscore = userScore[user]
    else:
        urate = averageRate
        uscore = averageUserScore
    if item in itemRate:
        irate = itemRate[item]
        iscore = itemScore[item]
    else:
        irate = averageRate
        iscore = averageItemScore
        
    Xt.append([1, ratio, cOutOf, urate, irate, uscore, iscore] + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #Xt.append([1, rating] + urate + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))

In [4056]:
# Inspect the first test feature vector, then densify to a float matrix.
print Xt[0]
X_test = np.array(Xt).astype(float)
#scaler = MinMaxScaler().fit(X_test)
#X_test = scaler.transform(X_test)


[1, 0.2, 7.987858455148175e-05, 0.0, 0.82622950819672136, 0.20000000000000001, 0.90328638497652625, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [4057]:
# Predict helpfulness ratios on the test set and convert to vote counts.
# NOTE(review): floor is 0. here but 0.2 on the training side -- confirm
# which clamp was intended.  `outOt` is never defined in this file's
# visible cells (see In [4055]); this line relies on stale kernel state.
ptest = predict(X_test, thetax)
#ptrain = clf.predict(X_train)


ptest = [p if p >= 0. else 0. for p in ptest]
ptest = [p if p <= 1. else 1. for p in ptest]

ptest = [round(x*o) for x, o in zip(ptest, outOt)]

In [4058]:
print ptest[:10]


[0.0, 3.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]

In [4059]:
# Load the (user-item) pair list defining the submission row order.
# NOTE(review): rebinds `test`, which earlier cells used for labeled data.
print "Reading helpful..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Helpful.txt")))
print "done"


Reading helpful...
done

In [4060]:
# Write the submission file: header row copied through, then one
# "userID-itemID,prediction" line per pair.
# NOTE(review): u, i, o from the split are never used; predictions are
# written as float strings (e.g. "1.0") -- confirm the grader accepts that.
print "Prediction test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Helpful_TS_" + str(trainScore) + ".txt", 'w')
print "done"

myPredictions.write(test[0][0] + '\n')

for l, p in zip(test[1 :], ptest):
    u, i, o = l[0].split("-")
    myPredictions.write(l[0] + ',' + str(p) + '\n')
    
myPredictions.close()


Prediction test...
done

In [4060]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [84]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
# Collect (user, item) pairs, vote tuples, and per-review regression
# features for the homework parts below.
# NOTE(review): allXs/allYs are initialized twice -- the first pair of
# assignments is dead code.
allXs = []
allYs = []









allXs = []
allYs = []
allRs = []
for l in train:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allXs.append([user, item])
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    allYs.append((outOf, nHelpful))
    
    nWords = len(words.split())
    # NOTE(review): ZeroDivisionError here if any review has outOf == 0
    allRs.append((float(rating), nWords, nHelpful * 1.0 / outOf))

In [9]:
print "Reading test..."
test = list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/labeled_Helpful.txt"))
print "done"


Reading test...
done

In [10]:
# Part A
# Global average helpfulness ratio.  NOTE(review): reviews with outOf == 0
# are skipped in the numerator but still counted in the denominator, i.e.
# they are treated as ratio 0 -- confirm that is the intended convention.
alpha = 0
for o, n in allYs:
    if o == 0.:
        continue
    alpha += n * 1.0 / o
alpha = alpha / len(allYs)

print alpha


0.533185849504

In [11]:
# Part B
dictUI = defaultdict(list)

for [u, i], [o, n] in zip(allXs, allYs):
    dictUI[(u, i)] += (o, n)

In [12]:
# Same flat-list indexing for the labeled test pairs.  NOTE(review): o and n
# come from parseTxt, so they are strings here, not numbers.
yUI = defaultdict(list)
for u, i, o, n in test:
    yUI[(u, i)] += (o, n)

In [13]:
# Evaluate the constant-alpha predictor (nHelpful ~= alpha * outOf) on the
# labeled test set.  AE is a raw sum of absolute errors, not a mean.
mse = 0.
AE = 0.
for u, i, o, n in test:
    predN = float(o) * alpha
    mse += (float(n) - predN) ** 2
    AE += abs(float(n) - predN)   
        
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE


MSE:  74.4869061806
AE:  18362.4004226

In [13]:


In [3]:
# Part C
X = []
y = []
for rating, count, ratio in allRs:
    X.append([1, count, rating])
    y.append(ratio)

In [4]:
# NOTE(review): `_collections` is the CPython-internal C module -- use the
# public `collections` package.  These imports duplicate the top cell and
# should be consolidated there.
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

# Least-squares fit of the Part C model, timed.
start = time.time()
thetar, _, _, _ = np.linalg.lstsq(X, y)
end = time.time()
finished = end - start
print finished


0.868791103363

In [ ]:


In [6]:
print "Fitted parmaters: ", thetar


Fitted parmaters:  [  4.00493735e-01   3.31855284e-04   6.27329666e-02]

In [5]:
# Part D
print "Reading train..."
test2 = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"

allTestRefXs = []
allTestRefYs = []
allTestRefRs = []
for l in test2:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allTestRefXs.append([user, item])
    outOf = float(helpful['outOf'])
    allTestRefYs.append(outOf)
    
    nWords = len(words.split())
    allTestRefRs.append((float(rating), nWords))


Reading train...
done

In [6]:
# NOTE(review): `rating` here is the loop variable leaked from the previous
# cell's `for l in test2` loop -- this prints only the LAST record's rating.
print rating


5.0

In [7]:
def predict(data, theta):
    """Project every row of `data` onto `theta` (linear-model scores).

    NOTE(review): shadows the earlier `predict` defined in In [66], which
    additionally coerced theta with np.array -- consolidate to one helper.
    """
    return [np.dot(theta, row) for row in data]

In [8]:
# Evaluate the Part C regression on the labeled test pairs.
# NOTE(review): the recorded traceback shows zip() failing with TypeError --
# at run time `test` was not the expected list of 4-field rows (stale kernel
# state after the In [4059] cell rebound `test` / a restart).  Also rebinds
# `X`, clobbering the Part C feature matrix.
mse = 0.
AE = 0.

for [_, _, oReal, nReal], refR  in zip(test, allTestRefRs):
    (rating, count) = refR
    X = np.array([1, count, rating])
    predRatio = np.dot(np.array(thetar, dtype='float'), X)
    
    mse += (float(nReal) / float(oReal) - float(predRatio)) ** 2
    AE += abs(float(nReal) / float(oReal) - float(predRatio))
    
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-763fba31734f> in <module>()
      2 AE = 0.
      3 
----> 4 for [_, _, oReal, nReal], refR  in zip(test, allTestRefRs):
      5     (rating, count) = refR
      6     X = np.array([1, count, rating])

TypeError: zip argument #1 must support iteration

Q2

The idea:

  • Take the maximum of "outOf"s
  • Divide each outOf by the maximum (normalized outOf)
  • Take the square root of this value, and multiply it by the original ratio

  • 1/1, 4/5, 48/50, 20/40

  • => Maximum is 50 => New ratios are: 1/50, 5/50, 50/50, 40/50 => Take square root: sq(1/50) 1/1, sq(5/50) 4/5, sq(50/50) 48/50, sq(40/50) 20/40 ===============> 0.14, 0.25, 0.96, 0.45

In [ ]: