In [1]:
from _collections import defaultdict
import ast
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import numpy as np
def parseData(fname):
    """Yield one parsed record per line of `fname`.

    Each line is expected to be a Python literal (dict of review fields).
    SECURITY FIX: the original called eval() on every line, which executes
    arbitrary code from the data file. ast.literal_eval parses only
    literals (dicts/lists/strings/numbers), which is all these records
    need, and is safe on untrusted input.
    """
    for l in open(fname):
        yield ast.literal_eval(l)
def parseTxt(fname):
    """Yield each line of `fname`, stripped and split on single spaces."""
    handle = open(fname)
    for raw in handle:
        fields = raw.strip().split(" ")
        yield fields
# Load the training reviews (one Python-literal record per line).
# NOTE(review): hard-coded absolute path — breaks on any other machine;
# consider a configurable DATA_DIR.
print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
print "done"
In [2]:
# Aggregate per-user and per-item helpfulness and rating statistics.
allHelpful = []
userHelpful = defaultdict(list)
itemHelpful = defaultdict(list)
userRating = defaultdict(list)
itemRating = defaultdict(list)
for l in train:
    user, item, rating, helpful = l['reviewerID'], l['itemID'], l['rating'], l['helpful']
    allHelpful.append(helpful)
    userHelpful[user].append(helpful)
    itemHelpful[item].append(helpful)
    userRating[user].append(rating / 5.0)   # ratings rescaled to [0, 1]
    itemRating[item].append(rating / 5.0)
# Global helpfulness rate: total helpful votes over total votes cast.
averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
# Per-user / per-item helpfulness rate. BUG FIX: the originals divided by
# zero for users/items whose reviews never received any votes; fall back to
# the global rate in that case.
userRate = {}
for u in userHelpful:
    denom = sum([x['outOf'] for x in userHelpful[u]])
    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / denom if denom else averageRate
itemRate = {}
for i in itemHelpful:
    denom = sum([x['outOf'] for x in itemHelpful[i]])
    itemRate[i] = sum([x['nHelpful'] for x in itemHelpful[i]]) * 1.0 / denom if denom else averageRate
# BUG FIX: bare mean() was undefined — only numpy is imported, as np.
userScore = {}
for u in userRating:
    userScore[u] = np.mean(userRating[u])
itemScore = {}
for i in itemRating:
    itemScore[i] = np.mean(itemRating[i])
averageUserScore = np.mean([userScore[u] for u in userScore])
# BUG FIX: the original comprehension was `[itemScore[i] for u in itemScore]`,
# indexing with the leaked loop variable i — it averaged a constant list of
# the last item's score instead of all item scores.
averageItemScore = np.mean([itemScore[i] for i in itemScore])
In [3]:
# Sanity check: print one arbitrary user score and one item score.
for u in userScore:
    print userScore[u]
    break
for i in itemScore:
    print itemScore[i]
    break
In [4]:
# Assign dense integer ids (0, 1, 2, ...) to users and items in order of
# first appearance; cUser/cItem end up as the vocabulary sizes.
# NOTE(review): indentation reconstructed — the counter increments are
# assumed to sit inside the `if`, which is the only reading that yields
# dense ids.
cUser = 0
users = {}
cItem = 0
items = {}
for l in train:
    user, item = l['reviewerID'], l['itemID']
    if user not in users:
        users[user] = cUser
        cUser += 1
    if item not in items:
        items[item] = cItem
        cItem += 1
In [5]:
def FF(loc, total):
    """One-hot encode: a length-`total` list of zeros with a 1 at `loc`."""
    encoding = [0 for _ in range(total)]
    encoding[loc] = 1
    return encoding
In [6]:
def FFID(vals, bits):
    """Fixed-width binary encoding of `vals` into a `bits`-long 0/1 list,
    most-significant bit first (right-aligned, zero-padded)."""
    res = [0] * bits
    # Walk the binary digits least-significant first, filling from the end.
    for offset, digit in enumerate(reversed(bin(vals)[2:])):
        res[bits - 1 - offset] = int(digit)
    return res
In [7]:
from sets import Set   # py2 `sets` module, kept for compatibility with this notebook
import string
punctuation = set(string.punctuation)

def _cleanWord(w):
    # Lowercase a token and strip all punctuation characters.
    return ''.join([c for c in w.lower() if not c in punctuation])

setPos = defaultdict(int)   # word counts from clearly-helpful reviews
setNeg = defaultdict(int)   # word counts from clearly-unhelpful reviews
allCats = Set()
bestOutOf = 0               # largest vote count seen (used later to normalize)
posCats = defaultdict(int)  # category-word counts (helpful side)
negCats = defaultdict(int)  # category-word counts (unhelpful side)
for l in train:
    review, helpful, categories = l['reviewText'], l['helpful'], l['category']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    bestOutOf = max(bestOutOf, outOf)
    # BUG FIX: reviews with zero votes made the ratio a ZeroDivisionError.
    if outOf == 0:
        continue
    ratio = nHelpful / outOf
    # Only strongly polarized reviews contribute to the word statistics;
    # the two branches were duplicated in the original — factored here.
    if ratio >= 0.8:
        wordCounts, catCounts = setPos, posCats
    elif ratio <= 0.2:
        wordCounts, catCounts = setNeg, negCats
    else:
        continue
    for w in review.split():
        wordCounts[_cleanWord(w)] += 1   # empty strings counted too, as before
    for acat in categories:
        for cat in acat:
            for w in cat.split():
                w = _cleanWord(w)
                if w:
                    catCounts[w] += 1
# Disabled experiment kept for reference: collect the first category path's words.
#for cat in categories[0]:
#    for w in cat.split():
#        w = ''.join([c for c in w.lower() if not c in punctuation])
#        if w:
#            allCats.add(w)
Out[7]:
In [7]:
In [8]:
from sets import Set
import string
from nltk.tokenize import word_tokenize
'''
punctuation = set(string.punctuation)
setPos = defaultdict(int)
setNeg = defaultdict(int)
options = Set(['VBN', 'JJ', 'NNS'])
for l in train:
review, helpful = l['reviewText'], l['helpful']
outOf = float(helpful['outOf'])
nHelpful = float(helpful['nHelpful'])
ratio = float(nHelpful) / float(outOf)
if ratio >= 0.8:
tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
for w, pos in nltk.pos_tag(tokens):
if pos in options:
setPos[w] += 1
elif ratio <= 0.2:
tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
for w, pos in nltk.pos_tag(tokens):
if pos in options:
setNeg[w] += 1
'''
Out[8]:
In [3875]:
import pickle
# One-time dump of the word-count dicts, kept for reference:
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","wb")
#pickle.dump(setPos, filehandler)
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","wb")
#pickle.dump(setNeg, filehandler)
# BUG FIX: the original left both file handles open; use context managers.
# NOTE: pickle.load executes arbitrary code from the file — only load files
# you produced yourself.
with open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck", "rb") as filehandler:
    setPos = pickle.load(filehandler)
with open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck", "rb") as filehandler:
    setNeg = pickle.load(filehandler)
In [2801]:
In [3809]:
# Map each positive/negative category word to a dense feature index.
# NOTE(review): truePosCats / trueNegCats are defined in the NEXT cell —
# the notebook was executed out of order; a fresh Restart-&-Run-All raises
# NameError here. Indentation reconstructed: `i += 1` is assumed to be
# inside the `if`, the only reading that yields dense indices.
vecPositiveCats = {}
i = 0
for w in truePosCats:          # w is a (word, count) pair
    if w[0] not in vecPositiveCats:
        vecPositiveCats[w[0]] = i
        i += 1
vecNegativeCats = {}
i = 0
for w in trueNegCats:
    if w[0] not in vecNegativeCats:
        vecNegativeCats[w[0]] = i
        i += 1
print vecPositiveCats
print vecNegativeCats
In [3810]:
listPosCats = []
for p in posCats:
listPosCats.append((p, posCats[p]))
allPosCats = sorted(listPosCats, key=lambda x: x[1], reverse=True)
listNegCats = []
for p in negCats:
listNegCats.append((p, negCats[p]))
allNegCats = sorted(listNegCats, key=lambda x: x[1], reverse=True)
truePosCats = [x for x in allPosCats if x[0] not in negCats]
trueNegCats = [x for x in allNegCats if x[0] not in posCats]
print truePosCats
print trueNegCats
In [3811]:
# Dead code: an earlier version of the helpfulness aggregation, disabled by
# wrapping it in a triple-quoted string (its value shows up as Out[3811]).
'''
allHelpful = []
userHelpful = defaultdict(list)
for l in train:
    user,item = l['reviewerID'],l['itemID']
    allHelpful.append(l['helpful'])
    userHelpful[user].append(l['helpful'])
averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
'''
Out[3811]:
In [13]:
import nltk
from nltk.corpus import stopwords
stopWords = Set(stopwords.words("english"))
# Keep the 300 most frequent non-empty, non-stopword "helpful" words.
listPos = [(w, setPos[w]) for w in setPos if w and w not in stopWords]
posWords = sorted(listPos, key=lambda kv: kv[1], reverse=True)[:300]
In [14]:
# Same filtering for the "unhelpful" vocabulary: drop empties and stopwords,
# keep the 300 most frequent.
listNeg = [(w, setNeg[w]) for w in setNeg if w and w not in stopWords]
negWords = sorted(listNeg, key=lambda kv: kv[1], reverse=True)[:300]
In [58]:
removeIt = Set()
# Top 37 negative words; positive words exclude anything also in that list.
negativeWords = [pair[0] for pair in negWords][:37]
positiveWords = [pair[0] for pair in posWords if pair[0] not in negativeWords][:37]
In [59]:
# Word -> feature-index maps for the positive/negative vocabularies and the
# category vocabulary. NOTE(review): allCats is only populated by a disabled
# block above, so vecCats is normally empty. Indentation reconstructed:
# `i += 1` assumed inside the `if` (dense indices).
vecPositive = {}
i = 0
for w in positiveWords:
    if w not in vecPositive:
        vecPositive[w] = i
        i += 1
vecNegative = {}
i = 0
for w in negativeWords:
    if w not in vecNegative:
        vecNegative[w] = i
        i += 1
vecCats = {}
i = 0
for w in allCats:
    vecCats[w] = i
    i += 1
#print vecPositive
#print vecNegative
#print vecCats
In [60]:
def wordsFF(text, db):
    """Binary bag-of-words: a len(db) list with 1 at db[w] for every cleaned
    token of `text` found in the vocabulary map `db`."""
    strip = set(string.punctuation)   # same set as the module-level `punctuation`
    res = [0] * len(db)
    for token in text.split():
        cleaned = ''.join(c for c in token.lower() if c not in strip)
        if cleaned in db:
            res[db[cleaned]] = 1
    return res
In [60]:
In [61]:
# Build the training design matrix X and target y (rating scaled to [0,1]).
# NOTE(review): the features include `ratio` = nHelpful/outOf — fine here
# only because the target is the rating, not helpfulness. This divides by
# outOf without a guard; assumes no zero-vote rows in train — confirm.
X = []
y = []
nVals = []   # raw 1-5 star ratings, kept for later rounding comparison
maxL = 0
for l in train:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    rating = int(rating)
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    ratio = nHelpful / outOf
    nWords = len(words.split())
    nVals.append(rating)
    rating = rating / 5.0;
    # Two-bucket indicator for review length: short (<100 words) vs long.
    if nWords < 100:
        nWords = [1, 0]
    else:
        nWords = [0, 1]
    cOutOf = outOf * 1.0 / bestOutOf   # vote count normalized by the max seen
    #X.append([1] + FF(users[user] / 500, cUser / 500) + FFID(user) + FF(items[item] / 500, cItem / 500) + FFID(item) + FF(rating - 1, 5) + str(nWords))
    # FF(rating - 1, 5)
    X.append([1, ratio, cOutOf, userRate[user], itemRate[item], userScore[user], itemScore[item]] + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #
    # + FFID(int(outOf) -1, 10) + FFID(int(user[1:]), 30) +
    y.append(rating)
In [62]:
print "Done"  # visible marker that the long feature-building cell finished
In [63]:
import pylab as pl
from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import time
import timeit
In [64]:
# Convert the feature lists to dense float64 arrays for the solver.
X_train = np.asarray(X, dtype=float)
y_train = np.asarray(y, dtype=float)
# Disabled scaling experiments, kept for reference:
#Remember = [X_train[:, i].max() for i in range(len(X_train.T))]
#X_train = X_train / len(X_train[0])
#X_train = np.array([np.array([xi if y > 0 else xi for (xi, y) in zip(x, Remember)]) for x in X_train])
#X_train = X_train / len(X_train[0])
#scaler = MinMaxScaler().fit(X_train)
#X_train = scaler.transform(X_train)
In [65]:
print "Length of FFs: ", len(X_train[0])  # feature-vector dimensionality
# Disabled SGD alternative to the closed-form solver below:
#clf = SGDRegressor(n_iter = 1000, alpha = 0.01)
# Objective
def f(theta, X, y, lam):
    """Ridge-regression objective: ||X.theta - y||^2 / n + lam * ||theta||^2.

    BUG FIX: the original referenced `numpy.dot`, but this file only imports
    numpy as np, so calling f raised NameError.
    """
    diff = np.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg
# Derivative
def fprime(theta, X, y, lam):
    """Gradient of f: 2 X^T (X.theta - y) / n + 2 lam theta.

    BUG FIX: same `numpy` -> `np` NameError as in f above.
    """
    diff = np.dot(X, theta) - y
    res = 2 * np.dot(X.T, diff) / len(X) + 2 * lam * theta
    return res
# Closed-form (unregularized) least-squares fit, timed.
# NOTE(review): newer numpy warns unless rcond is passed to lstsq explicitly.
start = time.time()
#clf.fit(X_train, y_train)
#thetax, residualsx, rankx, sx = numpy.linalg.lstsq(X_train, y_train)
thetax, _, _, _ = np.linalg.lstsq(X_train, y_train)
#thetax, _, _ = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.0))
end = time.time()
finished = end - start
print finished
print "Length of Thetas: ", len(thetax)
print thetax
In [66]:
def predict(data, theta):
    """Linear predictions: dot(theta, row) for each feature row in `data`.

    BUG FIX: the original called `numpy.array`, but this file imports numpy
    only as np — predict raised NameError when called.
    """
    theta = np.array(theta)
    prediction = [np.dot(theta, d) for d in data]
    return prediction
ptrain = predict(X_train, thetax)
#ptrain = clf.predict(X_train)
ytrain = [e for e in y_train]
print ptrain[0], ytrain[0]
# Clamp predictions into [0.2, 1.0] (the scaled 1..5 star range), then
# round back onto the 1-5 star scale so they compare against nVals.
ptrain = [p if p >= 0.2 else 0.2 for p in ptrain]
ptrain = [p if p <= 1. else 1. for p in ptrain]
ptrain = [round(x * 5) for x in ptrain]
ytrain = nVals
#ytrain = expY
#ytrain = expY
In [66]:
In [67]:
def MSE(prediction, real):
    """Mean squared error between two equal-length sequences.

    BUG FIX: the original called `numpy.mean`, but only np is imported.
    """
    squares = [(p - r) ** 2 for p, r in zip(prediction, real)]
    return np.mean(squares)
In [68]:
mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)
In [69]:
trainScore = len([[p, y] for p, y in zip(ptrain, ytrain) if p != y]) * 1.0 / len(ytrain)
print trainScore
In [69]:
In [69]:
In [70]:
# Load the evaluation records. NOTE(review): `tetst` is a typo for `test`,
# but later cells reference the misspelled name, so it must stay.
print "Reading test..."
tetst = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"
In [4055]:
# Build the test design matrix, falling back to global averages for users
# and items never seen in training.
Xt = []
# BUG FIX: the original line here was `del outOt`, which referenced a
# variable created by a since-deleted cell (NameError on a fresh run) and
# left nothing for the prediction cell below, which scales each prediction
# by its row's outOf. Collect the per-row outOf values here instead.
outOt = []
for l in tetst:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    outOf = float(helpful['outOf'])
    # NOTE(review): assumes the test records carry 'nHelpful' and outOf > 0
    # (otherwise this raises KeyError / ZeroDivisionError) — confirm data.
    nHelpful = float(helpful['nHelpful'])
    ratio = nHelpful / outOf
    nWords = len(words.split())
    # Short/long review indicator, matching the training features.
    if nWords < 100:
        nWords = [1, 0]
    else:
        nWords = [0, 1]
    cOutOf = outOf * 1.0 / bestOutOf
    if user in userRate:
        urate = userRate[user]
        uscore = userScore[user]
    else:
        urate = averageRate
        uscore = averageUserScore
    if item in itemRate:
        irate = itemRate[item]
        iscore = itemScore[item]
    else:
        irate = averageRate
        iscore = averageItemScore
    outOt.append(outOf)
    Xt.append([1, ratio, cOutOf, urate, irate, uscore, iscore] + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #Xt.append([1, rating] + urate + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
In [4056]:
print Xt[0]  # spot-check the first test feature vector
X_test = np.array(Xt).astype(float)
# Disabled scaling experiment, kept for reference:
#scaler = MinMaxScaler().fit(X_test)
#X_test = scaler.transform(X_test)
In [4057]:
# Predict the helpfulness ratio, clamp to [0, 1], then scale by each row's
# outOf to get a predicted helpful-vote count.
# NOTE(review): `outOt` is expected to hold the per-row outOf values, but no
# visible cell builds it (the cell above only deletes it) — confirm.
ptest = predict(X_test, thetax)
#ptrain = clf.predict(X_train)
ptest = [p if p >= 0. else 0. for p in ptest]
ptest = [p if p <= 1. else 1. for p in ptest]
ptest = [round(x*o) for x, o in zip(ptest, outOt)]
In [4058]:
print ptest[:10]  # spot-check the first few predicted vote counts
In [4059]:
# Load the submission pairs file (header line, then one "user-item-outOf"
# identifier per line).
print "Reading helpful..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Helpful.txt")))
print "done"
In [4060]:
print "Prediction test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Helpful_TS_" + str(trainScore) + ".txt", 'w')
print "done"
myPredictions.write(test[0][0] + '\n')
for l, p in zip(test[1 :], ptest):
u, i, o = l[0].split("-")
myPredictions.write(l[0] + ',' + str(p) + '\n')
myPredictions.close()
In [4060]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [84]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Collect (user, item) pairs, (outOf, nHelpful) vote pairs, and per-review
# (rating, wordcount, helpfulness-ratio) triples from the training set.
# (The original initialized allXs/allYs twice; once is enough.)
allXs = []
allYs = []
allRs = []
for l in train:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allXs.append([user, item])
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    allYs.append((outOf, nHelpful))
    nWords = len(words.split())
    # BUG FIX: Part A below explicitly skips outOf == 0 rows, so zero-vote
    # reviews exist — the original ratio divided by zero for them.
    helpRatio = nHelpful * 1.0 / outOf if outOf else 0.0
    allRs.append((float(rating), nWords, helpRatio))
In [9]:
# Load the labeled helpfulness rows; each line splits into four fields
# (unpacked later as u, i, outOf, nHelpful).
print "Reading test..."
test = list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/labeled_Helpful.txt"))
print "done"
In [10]:
# Part A
alpha = 0
for o, n in allYs:
if o == 0.:
continue
alpha += n * 1.0 / o
alpha = alpha / len(allYs)
print alpha
In [11]:
# Part B
# Per (user, item) pair, store the flat sequence [o1, n1, o2, n2, ...].
dictUI = defaultdict(list)
for (u, i), (o, n) in zip(allXs, allYs):
    dictUI[(u, i)].extend((o, n))
In [12]:
# Same flat [o, n, ...] layout for the labeled test rows.
yUI = defaultdict(list)
for u, i, o, n in test:
    yUI[(u, i)].extend((o, n))
In [13]:
# Baseline evaluation: predict nHelpful = alpha * outOf for every row.
mse = 0.
AE = 0.
for u, i, o, n in test:
    predN = float(o) * alpha
    mse += (float(n) - predN) ** 2
    AE += abs(float(n) - predN)
mse = mse / len(test)
# NOTE(review): AE is a running total and is never divided by len(test);
# if mean absolute error was intended, divide before reporting.
print "MSE: ", mse
print "AE: ", AE
In [13]:
In [3]:
# Part C
# Design matrix for ratio ~ [1, wordcount, rating].
X = [[1, count, rating] for rating, count, ratio in allRs]
y = [ratio for rating, count, ratio in allRs]
In [4]:
# Fit ratio ~ [1, wordcount, rating] by least squares, timed.
# NOTE(review): these imports duplicate the top-of-notebook imports —
# harmless but unnecessary. `_collections` is a CPython-private module;
# `from collections import defaultdict` is the public spelling.
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import numpy as np
start = time.time()
thetar, _, _, _ = np.linalg.lstsq(X, y)
end = time.time()
finished = end - start
print finished
In [ ]:
In [6]:
# NOTE(review): "parmaters" is a typo, but the string is runtime output and
# is left byte-identical here.
print "Fitted parmaters: ", thetar
In [5]:
# Part D — load the unlabeled helpful.json and extract per-row features:
# (user, item) pairs, each row's outOf, and (rating, wordcount) pairs.
print "Reading train..."
test2 = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"
allTestRefXs = []
allTestRefYs = []
allTestRefRs = []
for l in test2:
    user, item, helpful = l['reviewerID'], l['itemID'], l['helpful']
    rating, words = l['rating'], l['reviewText']
    allTestRefXs.append([user, item])
    outOf = float(helpful['outOf'])
    allTestRefYs.append(outOf)
    nWords = len(words.split())
    allTestRefRs.append((float(rating), nWords))
In [6]:
# NOTE(review): prints the loop variable leaked from the previous cell —
# i.e. only the LAST record's rating, not anything aggregate.
print rating
In [7]:
def predict(data, theta):
    """Dot each feature row of `data` with `theta`.

    NOTE(review): this silently shadows the earlier predict() definition
    (which also cast theta to an array); duplicate defs are a re-run hazard.
    """
    return [np.dot(theta, row) for row in data]
In [8]:
# Evaluate the regression: predict the ratio from [1, count, rating] and
# compare against the labeled nHelpful/outOf.
# NOTE(review): zips `test` (labeled rows) with features built from the
# *unlabeled* helpful.json (test2) — the two must be row-aligned; confirm.
# Also note this rebinds X (previously the Part C design matrix).
mse = 0.
AE = 0.
for [_, _, oReal, nReal], refR in zip(test, allTestRefRs):
    (rating, count) = refR
    X = np.array([1, count, rating])
    predRatio = np.dot(np.array(thetar, dtype='float'), X)
    mse += (float(nReal) / float(oReal) - float(predRatio)) ** 2
    AE += abs(float(nReal) / float(oReal) - float(predRatio))
mse = mse / len(test)
# NOTE(review): as above, AE is a total, not a mean.
print "MSE: ", mse
print "AE: ", AE
The idea:
Take the square root of this value, and multiply it by the original ratio
1/1, 4/5, 48/50, 20/40
In [ ]: