In [8]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

def parseData(fname):
  """Lazily yield one parsed record per non-blank line of `fname`.

  Each line is expected to hold a single Python expression (the callers in
  this notebook index the results like dicts).

  fname -- path of the file to read.
  """
  # `with` closes the handle when the generator is exhausted or closed,
  # instead of leaving it open until garbage collection.
  with open(fname) as f:
    for l in f:
      # A blank or trailing empty line would make eval() raise SyntaxError.
      if not l.strip():
        continue
      # NOTE(review): eval() executes arbitrary code found in the file. Only
      # use this on trusted data; ast.literal_eval is the safe alternative
      # if every line is a plain literal -- confirm against the data set.
      yield eval(l)
    
def parseTxt(fname):
  """Lazily yield the single-space-separated fields of each non-blank line.

  fname -- path of the text file to read.

  Every yielded item is a list of strings. Blank lines are skipped so that
  callers unpacking a fixed number of fields never receive [''].
  """
  # `with` closes the handle when the generator is exhausted or closed.
  with open(fname) as f:
    for l in f:
      l = l.strip()
      if not l:
        continue
      yield l.split(" ")

print "Reading train..."
train = list(parseData("homework6_7/train.json"))
print "done"

allXs = []
allYs = []
allRs = []
for l in train:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allXs.append([user, item])
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    allYs.append((outOf, nHelpful))
    
    nWords = len(words.split())
    allRs.append((float(rating), nWords, nHelpful * 1.0 / outOf))


Reading train...
done

In [9]:
# Load the labeled test rows. parseTxt yields each line as a list of string
# fields; later cells unpack every row as (u, i, o, n).
print "Reading test..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"


Reading test...
done

In [10]:
# Part A
alpha = 0
for o, n in allYs:
    if o == 0.:
        continue
    alpha += n * 1.0 / o
alpha = alpha / len(allYs)

print alpha


0.533185849504

In [11]:
# Part B
# Index the training votes by (user, item). The outOf and nHelpful values
# are added as two consecutive list items, so each list stays flat.
dictUI = defaultdict(list)

for (u, i), (o, n) in zip(allXs, allYs):
    dictUI[(u, i)].extend((o, n))

In [12]:
# Same flat (user, item) -> [o, n, ...] index as dictUI, built from the
# labeled test rows. The values stay as the strings read from the file.
yUI = defaultdict(list)
for u, i, o, n in test:
    yUI[(u, i)].extend((o, n))

In [13]:
mse = 0.
AE = 0.
for u, i, o, n in test:
    predN = float(o) * alpha
    mse += (float(n) - predN) ** 2
    AE += abs(float(n) - predN)   
        
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE


MSE:  74.4869061806
AE:  18362.4004226

In [13]:


In [14]:
# Part C
# Design matrix for the linear model ratio ~ theta . [1, word count, rating].
# The leading 1 is the intercept column.
X = [[1, count, rating] for rating, count, ratio in allRs]
# Regression target: the observed helpful ratio of each sample.
y = [ratio for rating, count, ratio in allRs]

In [15]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

start = time.time()
thetar, _, _, _ = np.linalg.lstsq(X, y)
end = time.time()
finished = end - start
print finished


0.0817348957062

In [16]:
# thetar is the least-squares solution fitted on X, so its entries follow the
# X column order: intercept, word count, rating.
print "Fitted parmaters: ", thetar


Fitted parmaters:  [  3.45431358e-01   2.71683612e-04   4.39997995e-02]

In [28]:
# Part D

print "Reading train..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"

print "Reading train..."
test2 = list(parseData("homework6_7/helpful.json"))
print "done"

allTestRefXs = []
allTestRefYs = []
allTestRefRs = []
for l in test2:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allTestRefXs.append([user, item])
    outOf = float(helpful['outOf'])
    allTestRefYs.append(outOf)
    
    nWords = len(words.split())
    allTestRefRs.append((float(rating), nWords))


Reading train...
done
Reading train...
done

In [28]:


In [29]:
def predict(data, theta):
    """Return the dot product of `theta` with every feature vector in `data`.

    data  -- iterable of feature vectors.
    theta -- coefficient vector passed as the first argument of np.dot.

    The result is a list with one value per vector, in input order.
    """
    prediction = []
    for d in data:
        prediction.append(np.dot(theta, d))
    return prediction

In [48]:
mse = 0.
AE = 0.

for [_, _, oReal, nReal], refR  in zip(test, allTestRefRs):
    (rating, count) = refR
    X = np.array([1, count, rating])
    predRatio = np.dot(np.array(thetar, dtype='float'), X)
    
    mse += (float(nReal) / float(oReal) - float(predRatio)) ** 2
    AE += abs(float(nReal) / float(oReal) - float(predRatio))
    
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE


MSE:  0.162314627719
AE:  3594.22471383

Q2

The idea:

  • Take the maximum of all "outOf" values
  • Divide each outOf by that maximum (the normalized outOf)
  • Take the square root of the normalized outOf and multiply it by the original ratio (nHelpful/outOf)

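  • Example: four reviews with helpfulness ratios (nHelpful/outOf) of 1/1, 4/5, 48/50 and 20/40.

  • The maximum outOf is 50, so the normalized outOf values are 1/50, 5/50, 50/50 and 40/50. Multiplying each original ratio by the square root of its normalized outOf gives $\sqrt{1/50}\cdot\tfrac{1}{1} \approx 0.14$, $\sqrt{5/50}\cdot\tfrac{4}{5} \approx 0.25$, $\sqrt{50/50}\cdot\tfrac{48}{50} = 0.96$ and $\sqrt{40/50}\cdot\tfrac{20}{40} \approx 0.45$.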

In [ ]: