In [1]:
from _collections import defaultdict
import time
import timeit
from numpy.linalg import norm
import scipy.optimize
import numpy as np
def parseData(fname):
    """Lazily parse a file holding one Python literal per line.

    Parameters:
        fname: path of the file to read.

    Yields:
        The object obtained by evaluating each non-blank line.

    The file is opened in a ``with`` block so the handle is released as soon
    as the generator is exhausted or closed. Blank lines are skipped instead
    of raising a SyntaxError from ``eval``.
    """
    with open(fname) as handle:
        for line in handle:
            if not line.strip():
                continue
            # SECURITY NOTE(review): eval() executes arbitrary code found in
            # the file. Only use this on trusted data; consider
            # ast.literal_eval if the records are plain literals.
            yield eval(line)
def parseTxt(fname):
    """Lazily read a text file and split every line on single spaces.

    Parameters:
        fname: path of the file to read.

    Yields:
        A list of the space-separated fields of each line, after stripping
        leading/trailing whitespace (a blank line yields ``['']``).

    The file is opened in a ``with`` block so the handle is always closed.
    """
    with open(fname) as handle:
        for line in handle:
            yield line.strip().split(" ")
# Load every training record into memory as a list (parseData is a generator).
# NOTE(review): the path is an absolute, machine-specific location; it must be
# adjusted before running this notebook anywhere else.
print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
print "done"
In [1]:
In [2]:
# Accumulators that the pass over `train` fills in. Every per-user table has
# a per-item twin; each maps a key to the list of raw values seen for it.
import math

allHelpful = []

userHelpful, itemHelpful = defaultdict(list), defaultdict(list)   # helpful dicts
userRating, itemRating = defaultdict(list), defaultdict(list)     # star ratings
userMulRO, itemMulRO = defaultdict(list), defaultdict(list)       # 'outOf' counts
userMulRN, itemMulRN = defaultdict(list), defaultdict(list)       # 'nHelpful' counts
userTime, itemTime = defaultdict(list), defaultdict(list)         # times per (id, rating)
userTimeOO, itemTimeOO = defaultdict(list), defaultdict(list)     # times per (id, outOf)
userComb, itemComb = defaultdict(list), defaultdict(list)         # nHelpful + outOf

# Running extrema, updated while scanning the training data.
multiNO = 0
multiM = 0
maxC = 0
maxTime = 0
minTime = 3000000000
def nCr(n,r):
    """Return the binomial coefficient "n choose r" as a float.

    Parameters:
        n: total number of items (integral value).
        r: number of items chosen (integral value).

    Returns:
        float: n! / (r! * (n - r)!), or 0.0 when r is outside [0, n].
        If the exact value is too large for a float, ``inf`` is returned.

    Raises:
        ValueError: if n or r is not an integral value.

    The previous version called ``scipy.misc.factorial``. ``scipy.misc`` is
    never imported by this notebook and that function no longer exists in
    current SciPy; it also worked with floating-point factorials, which
    overflow for moderately large n. Exact integer arithmetic from the
    standard library avoids all three problems.
    """
    import math

    nInt, rInt = int(n), int(r)
    if nInt != n or rInt != r:
        raise ValueError("nCr requires integral n and r")
    if rInt < 0 or rInt > nInt:
        return 0.0
    # The product of two factorials always divides n! exactly here.
    exact = math.factorial(nInt) // (math.factorial(rInt) * math.factorial(nInt - rInt))
    try:
        return float(exact)
    except OverflowError:
        return float('inf')
# Single pass over the training records: append each review's raw values to
# the per-user / per-item tables and keep the running extrema up to date.
for l in train:
    user = l['reviewerID']
    item = l['itemID']
    rating = l['rating']
    helpful = l['helpful']
    utime = l['unixReviewTime']

    # Raw helpfulness dictionaries.
    allHelpful.append(helpful)
    userHelpful[user].append(helpful)
    itemHelpful[item].append(helpful)

    # Star ratings, stored unscaled.
    userRating[user].append(rating)
    itemRating[item].append(rating)

    # Number of votes cast ('outOf').
    userMulRO[user].append(helpful['outOf'])
    itemMulRO[item].append(helpful['outOf'])

    # Number of helpful votes, plus the largest value seen so far.
    multi = helpful['nHelpful']
    multiM = np.max((multiM, multi))
    userMulRN[user].append(multi)
    itemMulRN[item].append(multi)

    # Earliest / latest review timestamps.
    maxTime = np.max((maxTime, utime))
    minTime = np.min((minTime, utime))

    # Timestamps bucketed by (id, rating) and by (id, outOf).
    userTime[(user, int(rating))].append(utime)
    itemTime[(item, int(rating))].append(utime)
    userTimeOO[(user, int(helpful['outOf']))].append(utime)
    itemTimeOO[(item, int(helpful['outOf']))].append(utime)

    # Despite its name this is the SUM nHelpful + outOf, not a ratio.
    combRatio = 1.0 * (helpful['nHelpful'] + helpful['outOf'])
    maxC = np.max((maxC, combRatio))
    userComb[user].append(combRatio)
    itemComb[item].append(combRatio)
# Collapse the raw per-key lists into one summary number per key.
#
# Changes from the previous version of this cell:
#   * uses np.mean explicitly; the bare name `mean` is not imported anywhere
#     in this notebook and only exists when the kernel runs in pylab mode;
#   * the helpfulness rates no longer raise ZeroDivisionError when the total
#     number of votes is zero - they fall back to the global rate instead.

def _helpfulRate(votes, fallback):
    """Return sum(nHelpful) / sum(outOf) over `votes`, or `fallback` if no votes were cast."""
    outOfTotal = sum([v['outOf'] for v in votes])
    if outOfTotal == 0:
        return fallback
    return sum([v['nHelpful'] for v in votes]) * 1.0 / outOfTotal

def _meanByKey(table, scale=1.0):
    """Map every key of `table` to the mean of its list, divided by `scale`."""
    return dict((key, np.mean(values) * 1.0 / scale) for key, values in table.items())

def _timeScoreByKey(table, earliest, span):
    """Map every key to (mean timestamp - earliest) / span."""
    return dict((key, (np.mean(stamps) - earliest) * 1.0 / span) for key, stamps in table.items())

# Global helpfulness rate; 0.0 when the training data holds no votes at all.
averageRate = _helpfulRate(allHelpful, 0.0)

# Helpfulness rate per user / per item.
userRate = dict((u, _helpfulRate(userHelpful[u], averageRate)) for u in userHelpful)
itemRate = dict((i, _helpfulRate(itemHelpful[i], averageRate)) for i in itemHelpful)

# Mean star rating.
userScore = _meanByKey(userRating)
itemScore = _meanByKey(itemRating)

# Mean number of votes cast ('outOf').
userMulScore = _meanByKey(userMulRO)
itemMulScore = _meanByKey(itemMulRO)

# Mean number of helpful votes, scaled by the largest nHelpful seen.
userMulNScore = _meanByKey(userMulRN, multiM)
itemMulNScore = _meanByKey(itemMulRN, multiM)

# Mean review time rescaled by the observed time span. The keys are
# (id, rating) and (id, outOf) tuples, exactly as they were accumulated.
diffTime = maxTime - minTime
userTimeScore = _timeScoreByKey(userTime, minTime, diffTime)
itemTimeScore = _timeScoreByKey(itemTime, minTime, diffTime)
userTimeOOScore = _timeScoreByKey(userTimeOO, minTime, diffTime)
itemTimeOOScore = _timeScoreByKey(itemTimeOO, minTime, diffTime)

# Mean of (nHelpful + outOf), scaled by its maximum.
userCombScore = _meanByKey(userComb, maxC)
itemCombScore = _meanByKey(itemComb, maxC)
In [3]:
# Progress marker only; this cell computes nothing.
print "Done"
In [4]:
# Global fallbacks: the mean of each per-key score over all of its keys.
# These are what the test-feature cell uses for users/items that never
# appeared in the training data.
#
# np.mean is used explicitly because the bare name `mean` is not imported in
# this notebook (it only exists when the kernel runs in pylab mode).
def _meanOfValues(table):
    """Return the mean of all values stored in the dict `table`."""
    return np.mean(list(table.values()))

averageUserScore = _meanOfValues(userScore)
averageItemScore = _meanOfValues(itemScore)
averageUserRO = _meanOfValues(userMulScore)
averageItemRO = _meanOfValues(itemMulScore)
averageUserRN = _meanOfValues(userMulNScore)
averageItemRN = _meanOfValues(itemMulNScore)
averageUserTime = _meanOfValues(userTimeScore)
averageItemTime = _meanOfValues(itemTimeScore)
averageUserTimeOO = _meanOfValues(userTimeOOScore)
averageItemTimeOO = _meanOfValues(itemTimeOOScore)
averageUserComb = _meanOfValues(userCombScore)
averageItemComb = _meanOfValues(itemCombScore)
In [4]:
In [5]:
for u in userScore:
print userScore[u]
break
for i in itemScore:
print itemScore[i]
break
In [6]:
# Give every distinct reviewer and item a dense integer id, numbered in order
# of first appearance in the training data.
users = {}
items = {}
for l in train:
    user, item = l['reviewerID'], l['itemID']
    # len(...) is evaluated before the insert, so it is the next free id.
    users.setdefault(user, len(users))
    items.setdefault(item, len(items))

# Totals of distinct users / items seen.
cUser = len(users)
cItem = len(items)
In [7]:
def FF(loc, total):
    """Build a one-hot list of length `total` with a 1 at index `loc`.

    Standard list indexing applies: a negative `loc` counts from the end and
    an out-of-range `loc` raises IndexError.
    """
    onehot = [0 for _ in range(total)]
    onehot[loc] = 1
    return onehot
In [8]:
def FFID(vals, bits):
    """Encode a non-negative integer as a fixed-width list of binary digits.

    Parameters:
        vals: non-negative integer to encode.
        bits: width of the result; the most significant bit comes first.

    Returns:
        list of `bits` ints (0 or 1), left-padded with zeros.

    Raises:
        ValueError: if `vals` is negative or does not fit in `bits` bits.

    The previous implementation walked the digits with a decreasing index.
    When `vals` needed more than `bits` digits the index went negative and
    silently overwrote positions at the end of the list, producing a wrong
    encoding (e.g. FFID(8, 3) gave [0, 0, 1]).
    """
    if vals < 0:
        raise ValueError("FFID cannot encode a negative value: %r" % (vals,))
    if vals >> bits:
        raise ValueError("%r does not fit in %r bits" % (vals, bits))
    # Shift each bit position down to the lowest bit, highest position first.
    return [(vals >> shift) & 1 for shift in range(bits - 1, -1, -1)]
In [9]:
from sets import Set
import string
# Count which review words and category words occur in clearly helpful
# reviews (nHelpful / outOf >= 0.8) and in clearly unhelpful ones (<= 0.2).
punctuation = set(string.punctuation)
setPos = defaultdict(int)    # review-word counts, helpful reviews
setNeg = defaultdict(int)    # review-word counts, unhelpful reviews
posCats = defaultdict(int)   # category-word counts, helpful reviews
negCats = defaultdict(int)   # category-word counts, unhelpful reviews
bestOutOf = 0                # largest 'outOf' seen in the training data

# NOTE: nothing below adds to allCats (the code that did is disabled, see the
# comment at the end of this cell), so it stays empty.
allCats = Set()

def _cleanWord(word):
    """Lower-case `word` and drop every punctuation character."""
    return ''.join([c for c in word.lower() if not c in punctuation])

def _countReview(review, categories, wordCounts, catCounts):
    """Add one review's words and category words to the given counters."""
    for w in review.split():
        # Words that clean to '' are still counted here; the later ranking
        # cells skip empty keys.
        wordCounts[_cleanWord(w)] += 1
    for acat in categories:
        for cat in acat:
            for w in cat.split():
                w = _cleanWord(w)
                if w:
                    catCounts[w] += 1

for l in train:
    review, helpful, categories = l['reviewText'], l['helpful'], l['category']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    bestOutOf = max(bestOutOf, outOf)
    if outOf == 0:
        # No votes: the ratio is undefined (this used to raise
        # ZeroDivisionError), so the review is neither helpful nor unhelpful.
        continue
    ratio = nHelpful / outOf
    if ratio >= 0.8:
        _countReview(review, categories, setPos, posCats)
    elif ratio <= 0.2:
        _countReview(review, categories, setNeg, negCats)

# Disabled experiment, kept for reference: collect every word of the first
# category path of a review into allCats.
#   for cat in categories[0]:
#       for w in cat.split():
#           w = ''.join([c for c in w.lower() if not c in punctuation])
#           if w:
#               allCats.add(w)
Out[9]:
In [9]:
In [10]:
from sets import Set
import string
from nltk.tokenize import word_tokenize
# Disabled experiment: everything between the triple quotes is a bare string
# literal, so none of it is executed. It is a variant of the word counting
# that keeps only tokens whose part-of-speech tag is VBN, JJ or NNS.
'''
punctuation = set(string.punctuation)
setPos = defaultdict(int)
setNeg = defaultdict(int)
options = Set(['VBN', 'JJ', 'NNS'])
for l in train:
review, helpful = l['reviewText'], l['helpful']
outOf = float(helpful['outOf'])
nHelpful = float(helpful['nHelpful'])
ratio = float(nHelpful) / float(outOf)
if ratio >= 0.8:
tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
for w, pos in nltk.pos_tag(tokens):
if pos in options:
setPos[w] += 1
elif ratio <= 0.2:
tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
for w, pos in nltk.pos_tag(tokens):
if pos in options:
setNeg[w] += 1
'''
Out[10]:
In [11]:
import pickle
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","wb")
#pickle.dump(setPos, filehandler)
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","wb")
#pickle.dump(setNeg, filehandler)
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","rb")
#setPos = pickle.load(filehandler)
#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","rb")
#setNeg = pickle.load(filehandler)
In [11]:
In [12]:
# Disabled: the whole cell is one string literal and does not run. It would
# assign consecutive indices to the category words in truePosCats and
# trueNegCats.
# NOTE(review): no live code visible in this notebook assigns truePosCats or
# trueNegCats, so they would have to be defined before re-enabling this.
'''vecPositiveCats = {}
i = 0
for w in truePosCats:
if w[0] not in vecPositiveCats:
vecPositiveCats[w[0]] = i
i += 1
vecNegativeCats = {}
i = 0
for w in trueNegCats:
if w[0] not in vecNegativeCats:
vecNegativeCats[w[0]] = i
i += 1
print vecPositiveCats
print vecNegativeCats'''
Out[12]:
In [13]:
# Disabled: the whole cell is one string literal and does not run. It would
# rank the category words of posCats / negCats by descending count and keep
# only the words that appear on one side but not the other.
'''listPosCats = []
for p in posCats:
listPosCats.append((p, posCats[p]))
allPosCats = sorted(listPosCats, key=lambda x: x[1], reverse=True)
listNegCats = []
for p in negCats:
listNegCats.append((p, negCats[p]))
allNegCats = sorted(listNegCats, key=lambda x: x[1], reverse=True)
truePosCats = [x for x in allPosCats if x[0] not in negCats]
trueNegCats = [x for x in allNegCats if x[0] not in posCats]
print truePosCats
print trueNegCats'''
Out[13]:
In [14]:
# Disabled: the whole cell is one string literal and does not run. It is a
# smaller version of the helpfulness-rate aggregation (global rate and
# per-user rate only).
'''
allHelpful = []
userHelpful = defaultdict(list)
for l in train:
user,item = l['reviewerID'],l['itemID']
allHelpful.append(l['helpful'])
userHelpful[user].append(l['helpful'])
averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
'''
Out[14]:
In [15]:
import nltk
from nltk.corpus import stopwords
# English stop words, used to filter both word rankings.
stopWords = Set(stopwords.words("english"))

# (word, count) pairs from helpful reviews, without empty strings and stop
# words, ordered from most to least frequent.
listPos = [(p, setPos[p]) for p in setPos if p and p not in stopWords]
posWords = sorted(listPos, key=lambda pair: pair[1], reverse=True)
In [16]:
# (word, count) pairs from unhelpful reviews, without empty strings and stop
# words, ordered from most to least frequent.
listNeg = [(n, setNeg[n]) for n in setNeg if n and n not in stopWords]
negWords = sorted(listNeg, key=lambda pair: pair[1], reverse=True)
In [17]:
removeIt = Set()
# Vocabulary: the 120 most frequent "unhelpful" words, then the 120 most
# frequent "helpful" words that are not among those 120.
negativeWords = [word for word, count in negWords[:120]]
positiveWords = [word for word, count in posWords if word not in negativeWords][:120]
In [18]:
print len(sorted(positiveWords))
print sorted(negativeWords)
In [19]:
# Map each vocabulary word to its column index inside the bag-of-words
# feature block. Indices are handed out in list order and a repeated word
# keeps its first index.
vecPositive = {}
for w in positiveWords:
    vecPositive.setdefault(w, len(vecPositive))

vecNegative = {}
for w in negativeWords:
    vecNegative.setdefault(w, len(vecNegative))

# Same for the category words in allCats.
vecCats = dict((w, idx) for idx, w in enumerate(allCats))
i = len(vecCats)
In [20]:
def wordsFF(text, db):
    """Return a binary bag-of-words vector for `text` over the vocabulary `db`.

    Parameters:
        text: string to encode; it is split on whitespace.
        db: dict mapping a lower-case, punctuation-free word to its index;
            indices are expected to lie in range(len(db)).

    Returns:
        list of len(db) ints: 1 at the index of every vocabulary word that
        occurs in `text`, 0 elsewhere.

    Words are lower-cased and stripped of ASCII punctuation before lookup.
    The punctuation set is built here from ``string.punctuation`` so the
    function no longer depends on a `punctuation` global created by a
    different cell.
    """
    import string

    punct = frozenset(string.punctuation)
    res = [0] * len(db)
    for token in text.split():
        word = ''.join([c for c in token.lower() if c not in punct])
        if word in db:
            res[db[word]] = 1
    return res
In [21]:
#print vecPositive
In [22]:
# Build the training design matrix.
#
# One row per review:
#   [1, rating, mean outOf of the user, mean rating of the user,
#    helpfulness rate of the user, outOf, word count]
#   + indicators for the positive vocabulary
#   + indicators for the negative vocabulary
# The regression target is the helpfulness ratio nHelpful / outOf.
X = []       # feature rows
y = []       # target ratios
outO = []    # 'outOf' per row, used to turn a predicted ratio into a count
nVals = []   # true 'nHelpful' per row
for l in train:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    outOf = float(helpful['outOf'])
    if outOf == 0:
        # The target ratio is undefined without votes (this used to raise
        # ZeroDivisionError), so the review cannot be a training example.
        continue
    nHelpful = float(helpful['nHelpful'])
    ratio = nHelpful / outOf
    rating = int(rating)
    n = len(words.split())
    cOutOf = outOf * 1.0

    X.append([1, rating, userMulScore[user], userScore[user], userRate[user], cOutOf] +
             [n] + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    y.append(ratio)
    outO.append(outOf)
    nVals.append(nHelpful)
In [23]:
# Progress marker, then the width of a feature row next to the size of the
# positive vocabulary.
print "Done"
print len(X[0]), len(vecPositive)
In [24]:
import pylab as pl
from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import time
import timeit
In [25]:
# Convert the Python lists to float arrays.
X_train = np.array(X, dtype=float)
y_train = np.array(y, dtype=float)

# Rescale every feature column with a MinMaxScaler fitted on the training
# matrix; `scaler` is kept so it can be reused later.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
In [26]:
print "Length of FFs: ", len(X_train[0])
# Objective
'''def f(theta, X, y, lam):
diff = numpy.dot(X, theta) - y
diffSq = (norm(diff) ** 2) / len(X)
diffSqReg = diffSq + lam * norm(theta) ** 2
#print "f : " , diffSqReg
return diffSqReg
# Derivative
def fprime(theta, X, y, lam):
diff = numpy.dot(X, theta) - y
res = 2 * numpy.dot(X.T, diff) / len(X) + 2 * lam * theta
return res
'''
start = time.time()
clf = SGDRegressor(n_iter = 1000, alpha = 0.0)
clf.fit(X_train, y_train)
#thetax, residualsx, rankx, sx = numpy.linalg.lstsq(X_train, y_train)
#thetax, _, _, _ = np.linalg.lstsq(X_train, y_train)
#thetax, _, _ = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.01))
end = time.time()
finished = end - start
print finished
print "Length of Thetas: ", len(thetax)
print thetax
In [27]:
#def predict(data, theta):
# theta = numpy.array(theta)
# prediction = [np.dot(theta, d) for d in data]
# return prediction
#ptrain = predict(X_train, thetax)
# Predict the helpfulness ratio on the training set, clamp it to [0, 1]
# (anything that is not >= 0 becomes 0), and convert it to a vote count by
# multiplying with the review's 'outOf' and rounding.
rawTrain = clf.predict(X_train)
ptrain = []
for p, o in zip(rawTrain, outO):
    if not (p >= 0.):
        p = 0.
    elif not (p <= 1.):
        p = 1.
    ptrain.append(round(p * o))

# The reference values are the true helpful-vote counts, not the ratios.
ytrain = nVals
#ytrain = expY
In [28]:
def MSE(prediction, real):
    """Return the mean squared error between two equally long sequences.

    Parameters:
        prediction: sequence of predicted values.
        real: sequence of true values.

    Returns:
        The mean of (p - r) ** 2 over the paired elements. If the lengths
        differ, the extra elements of the longer sequence are ignored
        (``zip`` semantics).

    Uses ``np.mean``: this notebook imports NumPy only under the name `np`,
    so the previous ``numpy.mean`` raised NameError on a plain kernel.
    """
    squares = [(p - r) ** 2 for p, r in zip(prediction, real)]
    return np.mean(squares)
In [29]:
mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)
In [30]:
trainScore = len([[p, y] for p, y in zip(ptrain, ytrain) if p != y]) * 1.0 / len(ytrain)
print trainScore
In [30]:
In [30]:
In [31]:
# Load the records to predict on into memory as a list.
# NOTE(review): the variable really is spelled `tetst`; the feature-building
# loop further down iterates over that name.
# NOTE(review): absolute, machine-specific path.
print "Reading test..."
tetst = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"
In [32]:
# Build the design matrix for the records to predict on. The column layout
# must match the training matrix:
#   [1, rating, umul, uscore, urate, outOf, word count]
#   + positive-vocabulary indicators + negative-vocabulary indicators
#
# Fixes in this version:
#   * the unknown-item branch assigned averageItemRN to `imul` (overwriting
#     averageItemRO) and never set `imuln`; it now sets `imuln`;
#   * a large disabled code block, written as a string literal inside the
#     loop and evaluated on every iteration, was removed together with the
#     commented-out experiments.
Xt = []      # feature rows
outOt = []   # 'outOf' per row, used to turn a predicted ratio into a count
for l in tetst:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    rating = float(int(rating))
    outOf = float(helpful['outOf'])
    n = len(words.split())

    # User statistics. For a user that was not in the training data, fall
    # back to the statistics of the reviewed item and, if the item is unknown
    # too, to the global averages.
    if user in userRate:
        urate = userRate[user]
        uscore = userScore[user]
        umul = userMulScore[user]
        umuln = userMulNScore[user]
    else:
        urate = itemRate[item] if item in itemRate else averageRate
        uscore = itemScore[item] if item in itemScore else averageUserScore
        umul = itemMulScore[item] if item in itemMulScore else averageUserRO
        umuln = averageUserRN

    # Item statistics, with global fallbacks. They are currently not part of
    # the feature vector below.
    if item in itemRate:
        irate = itemRate[item]
        iscore = itemScore[item]
        imul = itemMulScore[item]
        imuln = itemMulNScore[item]
    else:
        irate = averageRate
        iscore = averageItemScore
        imul = averageItemRO
        imuln = averageItemRN

    cOutOf = outOf * 1.0

    Xt.append([1, rating, umul, uscore, urate, cOutOf] +
              [n] + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    outOt.append(outOf)
In [33]:
print len(Xt[0]), Xt[0]
X_test = np.array(Xt).astype(float)
scaler = MinMaxScaler().fit(X_test)
X_test = scaler.transform(X_test)
In [35]:
# Predict the helpfulness ratio for the TEST matrix. The previous version
# called clf.predict(X_train), so the submitted values were training-set
# predictions paired with unrelated test 'outOf' counts.
ptest = clf.predict(X_test)
# Clamp the ratio to [0, 1] ...
ptest = [p if p >= 0. else 0. for p in ptest]
ptest = [p if p <= 1. else 1. for p in ptest]
# ... and convert it to a vote count using each review's 'outOf'.
ptest = [round(x*o) for x, o in zip(ptest, outOt)]
In [36]:
# Show the first ten predicted vote counts as a quick sanity check.
print ptest[:10]
In [37]:
# Read the pairs file: each line is stripped and split on spaces by parseTxt,
# and the resulting lists are wrapped in a NumPy array. The writer cell below
# treats test[0] as the header row.
# NOTE(review): absolute, machine-specific path.
print "Reading helpful..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Helpful.txt")))
print "done"
In [ ]:
print "Prediction test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Helpful_MMF_" + str(trainScore) + ".txt", 'w')
print "done"
myPredictions.write(test[0][0] + '\n')
for l, p in zip(test[1 :], ptest):
u, i, o = l[0].split("-")
myPredictions.write(l[0] + ',' + str(p) + '\n')
myPredictions.close()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: