In [8]:
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import numpy as np
def parseData(fname):
    """Lazily parse a file that stores one Python literal per line.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Yields
    ------
    object
        The result of evaluating each non-blank line. The callers in this
        notebook index every record by string key, i.e. treat it as a dict.
    """
    # The context manager closes the handle when the generator is exhausted
    # or discarded; a bare open() in the loop header never closes it.
    with open(fname) as handle:
        for l in handle:
            if not l.strip():
                # A blank (e.g. trailing) line is not a record, and
                # eval('') would raise SyntaxError.
                continue
            # SECURITY: eval() executes whatever code the line contains.
            # Only use this on trusted files; ast.literal_eval is the safer
            # choice when every line is a plain literal.
            yield eval(l)
def parseTxt(fname):
    """Lazily split each line of a space-delimited text file into fields.

    Parameters
    ----------
    fname : str
        Path of the file to read.

    Yields
    ------
    list of str
        The line with surrounding whitespace stripped, split on single
        spaces. Fields are returned as strings; no conversion is done.
    """
    # The context manager closes the handle when the generator is exhausted
    # or discarded; a bare open() in the loop header never closes it.
    with open(fname) as handle:
        for l in handle:
            yield l.strip().split(" ")
print "Reading train..."
train = list(parseData("homework6_7/train.json"))
print "done"
allXs = []
allYs = []
allRs = []
for l in train:
user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
rating, words = l['rating'], l['reviewText']
allXs.append([user, item])
outOf = float(helpful['outOf'])
nHelpful = float(helpful['nHelpful'])
allYs.append((outOf, nHelpful))
nWords = len(words.split())
allRs.append((float(rating), nWords, nHelpful * 1.0 / outOf))
In [9]:
# Load the labelled test rows. parseTxt yields each line as a list of
# space-separated string fields, so every entry of `test` is a list of str.
print "Reading test..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"
In [10]:
# Part A
alpha = 0
for o, n in allYs:
if o == 0.:
continue
alpha += n * 1.0 / o
alpha = alpha / len(allYs)
print alpha
In [11]:
# Part B
# Index the training vote counts by (user, item). The two counts are
# appended to the key's list, so a key seen more than once accumulates
# several (outOf, nHelpful) pairs back to back.
dictUI = defaultdict(list)
for (u, i), (o, n) in zip(allXs, allYs):
    dictUI[u, i].extend((o, n))
In [12]:
# Index the labelled test rows by (user, item). The last two fields of each
# row are appended to the key's list as they appear in the file (strings).
yUI = defaultdict(list)
for u, i, o, n in test:
    yUI[u, i].extend([o, n])
In [13]:
mse = 0.
AE = 0.
for u, i, o, n in test:
predN = float(o) * alpha
mse += (float(n) - predN) ** 2
AE += abs(float(n) - predN)
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE
In [13]:
In [14]:
# Part C
# Build the regression inputs from allRs: each row of X is
# [bias term, word count, rating] and y holds the matching helpful ratio.
X = [[1, count, rating] for rating, count, ratio in allRs]
y = [ratio for rating, count, ratio in allRs]
In [15]:
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import numpy as np
start = time.time()
thetar, _, _, _ = np.linalg.lstsq(X, y)
end = time.time()
finished = end - start
print finished
In [16]:
print "Fitted parmaters: ", thetar
In [28]:
# Part D
print "Reading train..."
test = list(parseTxt("homework6_7/labeled_Helpful.txt"))
print "done"
print "Reading train..."
test2 = list(parseData("homework6_7/helpful.json"))
print "done"
allTestRefXs = []
allTestRefYs = []
allTestRefRs = []
for l in test2:
user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
rating, words = l['rating'], l['reviewText']
allTestRefXs.append([user, item])
outOf = float(helpful['outOf'])
allTestRefYs.append(outOf)
nWords = len(words.split())
allTestRefRs.append((float(rating), nWords))
In [28]:
In [29]:
def predict(data, theta):
    """Return the dot product of ``theta`` with every feature vector.

    Parameters
    ----------
    data : iterable
        Feature vectors, each compatible with ``theta`` under ``np.dot``.
    theta : sequence or array
        Coefficient vector.

    Returns
    -------
    list
        One ``np.dot(theta, d)`` value per element of ``data``, in order.
    """
    prediction = []
    for d in data:
        prediction.append(np.dot(theta, d))
    return prediction
In [48]:
mse = 0.
AE = 0.
for [_, _, oReal, nReal], refR in zip(test, allTestRefRs):
(rating, count) = refR
X = np.array([1, count, rating])
predRatio = np.dot(np.array(thetar, dtype='float'), X)
mse += (float(nReal) / float(oReal) - float(predRatio)) ** 2
AE += abs(float(nReal) / float(oReal) - float(predRatio))
mse = mse / len(test)
print "MSE: ", mse
print "AE: ", AE
The idea:
Take the square root of this value, and multiply it by the original ratio
1/1, 4/5, 48/50, 20/40
In [ ]: