notebook.community

Edit and run



In [8]:

    
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

def parseData(fname):
  """Lazily yield one parsed record per non-blank line of `fname`.

  Each line is expected to hold a single Python literal (for the data
  files used here, a dict describing one review).

  The file is opened in a `with` block so the handle is released as soon
  as the generator is exhausted or garbage-collected, and blank lines
  (for example a trailing newline at the end of the file) are skipped
  rather than crashing the parser.
  """
  with open(fname) as f:
    for l in f:
      if not l.strip():
        continue
      # SECURITY: eval() executes arbitrary code found in the file. Only
      # use this on trusted data; ast.literal_eval is the safe
      # alternative if every line is a plain literal.
      yield eval(l)
    
def parseTxt(fname):
  """Lazily yield each line of `fname` as a list of space-separated fields.

  Leading and trailing whitespace is stripped before splitting on a single
  space character. The file is opened in a `with` block so the handle is
  closed once the generator finishes instead of being leaked.
  """
  with open(fname) as f:
    for l in f:
      yield l.strip().split(" ")

print "Reading train..."
train = list(parseData("homework6_7/train.json"))
print "done"

allXs = []
allYs = []
allRs = []
for l in train:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allXs.append([user, item])
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    allYs.append((outOf, nHelpful))
    
    nWords = len(words.split())
    allRs.append((float(rating), nWords, nHelpful * 1.0 / outOf))









    



Reading train...
done



In [9]:

    
print "Reading test..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"









    



Reading test...
done



In [10]:

    
# Part A
alpha = 0
for o, n in allYs:
    if o == 0.:
        continue
    alpha += n * 1.0 / o
alpha = alpha / len(allYs)

print alpha









    



0.533185849504



In [11]:

    
# Part B
# Map each (user, item) pair to a flat list of its training vote counts.
# The values are extended, not appended, so a pair that occurs k times ends
# up with 2*k numbers: [outOf, nHelpful, outOf, nHelpful, ...].
dictUI = defaultdict(list)

for pair, votes in zip(allXs, allYs):
    reviewer, product = pair
    total, useful = votes
    dictUI[(reviewer, product)].extend((total, useful))



In [12]:

    
# Same flat layout as the training lookup, built from the labelled test rows.
# Every row must unpack into exactly four fields; the two vote fields are
# stored as read from the rows (no numeric conversion happens here).
yUI = defaultdict(list)
for fields in test:
    reviewer, product, totalField, usefulField = fields
    yUI[(reviewer, product)].extend((totalField, usefulField))



In [13]:

    
mse = 0.
AE = 0.
for u, i, o, n in test:
    predN = float(o) * alpha
    mse += (float(n) - predN) ** 2
    AE += abs(float(n) - predN)   
        
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE









    



MSE:  74.4869061806
AE:  18362.4004226



In [13]:



In [14]:

    
# Part C
# Design matrix rows are [bias term, word count, rating]; the regression
# target is the helpfulness ratio stored as the third field of allRs.
X = [[1, count, rating] for rating, count, _ratio in allRs]
y = [ratio for _rating, _count, ratio in allRs]



In [15]:

    
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

start = time.time()
thetar, _, _, _ = np.linalg.lstsq(X, y)
end = time.time()
finished = end - start
print finished









    



0.0817348957062



In [16]:

    
# thetar is the least-squares solution vector; when fitted on the Part C rows
# [1, count, rating] its entries are [intercept, word-count weight, rating weight].
print "Fitted parmaters: ", thetar









    



Fitted parmaters:  [  3.45431358e-01   2.71683612e-04   4.39997995e-02]



In [28]:

    
# Part D

print "Reading train..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"

print "Reading train..."
test2 = list(parseData("homework6_7/helpful.json"))
print "done"

allTestRefXs = []
allTestRefYs = []
allTestRefRs = []
for l in test2:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allTestRefXs.append([user, item])
    outOf = float(helpful['outOf'])
    allTestRefYs.append(outOf)
    
    nWords = len(words.split())
    allTestRefRs.append((float(rating), nWords))









    



Reading train...
done
Reading train...
done



In [28]:



In [29]:

    
def predict(data, theta):
    """Return a list with the dot product of `theta` and each row of `data`.

    `data` is any iterable of feature vectors; the result has one entry per
    row, in the same order.
    """
    scores = []
    for features in data:
        scores.append(np.dot(theta, features))
    return scores



In [48]:

    
mse = 0.
AE = 0.

for [_, _, oReal, nReal], refR  in zip(test, allTestRefRs):
    (rating, count) = refR
    X = np.array([1, count, rating])
    predRatio = np.dot(np.array(thetar, dtype='float'), X)
    
    mse += (float(nReal) / float(oReal) - float(predRatio)) ** 2
    AE += abs(float(nReal) / float(oReal) - float(predRatio))
    
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE









    



MSE:  0.162314627719
AE:  3594.22471383

Q2

The idea:

Take the maximum of all "outOf" values
Divide each outOf by that maximum (normalized outOf)
Take the square root of this value, and multiply it by the original ratio (nHelpful / outOf)
Example: original ratios are 1/1, 4/5, 48/50, 20/40
=> Maximum outOf is 50 => Normalized outOfs are: 1/50, 5/50, 50/50, 40/50 => Take the square root and multiply by the original ratio: sqrt(1/50) * 1/1, sqrt(5/50) * 4/5, sqrt(50/50) * 48/50, sqrt(40/50) * 20/40 => 0.14, 0.25, 0.96, 0.45



In [ ]: