In [1]:
from _collections import defaultdict
import time
import timeit

from numpy.linalg import norm
import scipy.optimize

import numpy as np

def parseData(fname):
  for l in open(fname):
    yield eval(l)
    
def parseTxt(fname):
  for l in open(fname):
    yield l.strip().split(" ")

print "Reading train..."
train = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/train.json"))
print "done"


Reading train...
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-1-7359a1cfaaae> in <module>()
     17 
     18 print "Reading train..."
---> 19 train = list(parseData("/home/iizhaki/oasis/train.json"))
     20 print "done"

<ipython-input-1-7359a1cfaaae> in parseData(fname)
     10 def parseData(fname):
     11   for l in open(fname):
---> 12     yield eval(l)
     13 
     14 def parseTxt(fname):

<string> in <module>()

KeyboardInterrupt: 

In [1]:


In [2]:
allHelpful = []
userHelpful = defaultdict(list)
itemHelpful = defaultdict(list)
userRating = defaultdict(list)
itemRating = defaultdict(list)
userMulRO = defaultdict(list)
itemMulRO = defaultdict(list)
userMulRN = defaultdict(list)
itemMulRN = defaultdict(list)
userTime  = defaultdict(list)
itemTime = defaultdict(list)
userTimeOO  = defaultdict(list)
itemTimeOO = defaultdict(list)
multiNO = 0
maxTime = 0
minTime = 3000000000
userComb  = defaultdict(list)
itemComb = defaultdict(list)
maxC = 0
import math
multiM = 0


def nCr(n,r):
    f = scipy.misc.factorial
    return float(f(n)) / float(f(r)) / float(f(n-r))

for l in train:
    user,item, rating, helpful, utime = l['reviewerID'],l['itemID'], l['rating'], l['helpful'], l['unixReviewTime']
    allHelpful.append(helpful)
    userHelpful[user].append(helpful)
    itemHelpful[item].append(helpful)
    #userRating[user].append(rating / 5.0)
    #itemRating[item].append(rating / 5.0)
    userRating[user].append(rating)
    itemRating[item].append(rating)
    #userMulRO[user].append(rating * helpful['outOf'] / 5.0)
    #itemMulRO[item].append(rating * helpful['outOf'] / 5.0)
    userMulRO[user].append(helpful['outOf'])
    itemMulRO[item].append(helpful['outOf'])
    
    multi = helpful['nHelpful']
    multiM = np.max((multiM, multi))
    userMulRN[user].append(multi)
    itemMulRN[item].append(multi)
    
    maxTime = np.max((maxTime, utime))
    minTime = np.min((minTime, utime))
    userTime[(user, int(rating))].append(utime)
    itemTime[(item, int(rating))].append(utime)
    
    userTimeOO[(user, int(helpful['outOf']))].append(utime)
    itemTimeOO[(item, int(helpful['outOf']))].append(utime)
    
    combRatio = 1.0 * (helpful['nHelpful'] + helpful['outOf'])
    maxC = np.max((maxC, combRatio))
    userComb[user].append(combRatio)
    itemComb[item].append(combRatio)
    
    
averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
  userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
itemRate = {}
for i in itemHelpful:
  itemRate[i] = sum([x['nHelpful'] for x in itemHelpful[i]]) * 1.0 / sum([x['outOf'] for x in itemHelpful[i]])

userScore = {}
for u in userRating:
    userScore[u] = mean(userRating[u])
itemScore = {}
for i in itemRating:
    itemScore[i] = mean(itemRating[i])

userMulScore = {}
for u in userMulRO:
    userMulScore[u] = mean(userMulRO[u])
itemMulScore = {}
for i in itemMulRO:
    itemMulScore[i] = mean(itemMulRO[i])

userMulNScore = {}
for u in userMulRN:
    userMulNScore[u] = mean(userMulRN[u]) * 1.0 / multiM
itemMulNScore = {}
for i in itemMulRN:
    itemMulNScore[i] = mean(itemMulRN[i]) * 1.0 / multiM
    
diffTime = maxTime - minTime
userTimeScore = {}
for u in userTime:
    userTimeScore[u] = ((mean(userTime[u]) - minTime) * 1.0 / diffTime)
itemTimeScore = {}
for i in itemTime:
    itemTimeScore[i] = ((mean(itemTime[i]) - minTime) * 1.0 / diffTime)
  
userTimeOOScore = {}
for u in userTimeOO:
    userTimeOOScore[u] = ((mean(userTimeOO[u]) - minTime) * 1.0 / diffTime)
itemTimeOOScore = {}
for i in itemTimeOO:
    itemTimeOOScore[i] = ((mean(itemTimeOO[i]) - minTime) * 1.0 / diffTime)
    
userCombScore = {}
for u in userComb:
    userCombScore[u] = mean(userComb[u]) / maxC
itemCombScore = {}
for i in itemComb:
    itemCombScore[i] = mean(itemComb[i]) / maxC

In [3]:
print "Done"


Done

In [4]:
averageUserScore = mean([userScore[u] for u in userScore])
averageItemScore = mean([itemScore[i] for i in itemScore])
averageUserRO = mean([userMulScore[u] for u in userMulScore])
averageItemRO = mean([itemMulScore[i] for i in itemMulScore])
averageUserRN = mean([userMulNScore[u] for u in userMulNScore])
averageItemRN = mean([itemMulNScore[i] for i in itemMulNScore])
averageUserTime = mean([userTimeScore[u] for u in userTimeScore])
averageItemTime = mean([itemTimeScore[i] for i in itemTimeScore])
averageUserTimeOO = mean([userTimeOOScore[u] for u in userTimeOOScore])
averageItemTimeOO = mean([itemTimeOOScore[i] for i in itemTimeOOScore])
averageUserComb = mean([userCombScore[u] for u in userCombScore])
averageItemComb = mean([itemCombScore[u] for u in itemCombScore])

In [4]:


In [5]:
for u in userScore:
    print userScore[u]
    break
    
for i in itemScore:
    print itemScore[i]
    break


5.0
3.5

In [6]:
cUser = 0
users = {}
cItem = 0
items = {}

for l in train:
    user, item = l['reviewerID'], l['itemID']
    if user not in users:
        users[user] = cUser
        cUser += 1
    
    if item not in items:
        items[item] = cItem
        cItem += 1

In [7]:
def FF(loc, total):
    res = [0] * total    
    res[loc] = 1
    
    return res

In [8]:
def FFID(vals, bits):
    n = bin(vals)
    res = [0] * bits
    i = bits - 1
    for l in reversed(n[2:]):
        res[i] = int(l)
        i -= 1
    return res

In [9]:
from sets import Set
import string

punctuation = set(string.punctuation)
setPos = defaultdict(int)
setNeg = defaultdict(int)
allCats = Set()

bestOutOf = 0
posCats = defaultdict(int)
negCats = defaultdict(int)

for l in train:
    review, helpful, categories = l['reviewText'], l['helpful'], l['category']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    ratio = float(nHelpful) / float(outOf)
    if ratio >= 0.8:
        for w in review.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            setPos[w] += 1
        for acat in categories:
            for cat in acat:
                for w in cat.split():
                    w = ''.join([c for c in w.lower() if not c in punctuation])
                    if w:
                        posCats[w] += 1
    elif ratio <= 0.2:
        for w in review.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            setNeg[w] += 1
        for acat in categories:
            for cat in acat:
                for w in cat.split():
                    w = ''.join([c for c in w.lower() if not c in punctuation])
                    if w:
                        negCats[w] += 1
    
        

    bestOutOf = max(bestOutOf, outOf)
    
    
'''
    for cat in categories[0]:
        for w in cat.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            if w:
                allCats.add(w)
'''


Out[9]:
"\n    for cat in categories[0]:\n        for w in cat.split():\n            w = ''.join([c for c in w.lower() if not c in punctuation])\n            if w:\n                allCats.add(w)\n"

In [9]:


In [10]:
from sets import Set
import string
from nltk.tokenize import word_tokenize

'''
punctuation = set(string.punctuation)
setPos = defaultdict(int)
setNeg = defaultdict(int)

options = Set(['VBN', 'JJ', 'NNS'])

for l in train:
    review, helpful = l['reviewText'], l['helpful']
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    ratio = float(nHelpful) / float(outOf)
    if ratio >= 0.8:
        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
        for w, pos in nltk.pos_tag(tokens):
            if pos in options:
                setPos[w] += 1
    elif ratio <= 0.2:
        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))
        for w, pos in nltk.pos_tag(tokens):
            if pos in options:
                setNeg[w] += 1
'''


Out[10]:
"\npunctuation = set(string.punctuation)\nsetPos = defaultdict(int)\nsetNeg = defaultdict(int)\n\noptions = Set(['VBN', 'JJ', 'NNS'])\n\nfor l in train:\n    review, helpful = l['reviewText'], l['helpful']\n    outOf = float(helpful['outOf'])\n    nHelpful = float(helpful['nHelpful'])\n    ratio = float(nHelpful) / float(outOf)\n    if ratio >= 0.8:\n        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))\n        for w, pos in nltk.pos_tag(tokens):\n            if pos in options:\n                setPos[w] += 1\n    elif ratio <= 0.2:\n        tokens = word_tokenize(''.join([c for c in review.lower() if not c in punctuation]))\n        for w, pos in nltk.pos_tag(tokens):\n            if pos in options:\n                setNeg[w] += 1\n"

In [11]:
import pickle

#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","wb")
#pickle.dump(setPos, filehandler)

#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","wb")
#pickle.dump(setNeg, filehandler)

#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/posWords.pck","rb")
#setPos = pickle.load(filehandler)

#filehandler = open(b"/home/iizhaki/oasis/CSE255/Project2/assignment2/negWords.pck","rb")
#setNeg = pickle.load(filehandler)

In [11]:


In [12]:
'''vecPositiveCats = {}
i = 0
for w in truePosCats:
    if w[0] not in vecPositiveCats:
        vecPositiveCats[w[0]] = i
        i += 1
    
vecNegativeCats = {}
i = 0
for w in trueNegCats:
    if w[0] not in vecNegativeCats:
        vecNegativeCats[w[0]] = i
        i += 1
        
print vecPositiveCats
print vecNegativeCats'''


Out[12]:
'vecPositiveCats = {}\ni = 0\nfor w in truePosCats:\n    if w[0] not in vecPositiveCats:\n        vecPositiveCats[w[0]] = i\n        i += 1\n    \nvecNegativeCats = {}\ni = 0\nfor w in trueNegCats:\n    if w[0] not in vecNegativeCats:\n        vecNegativeCats[w[0]] = i\n        i += 1\n        \nprint vecPositiveCats\nprint vecNegativeCats'

In [13]:
'''listPosCats = []
for p in posCats:
    listPosCats.append((p, posCats[p]))

allPosCats = sorted(listPosCats, key=lambda x: x[1], reverse=True)

listNegCats = []
for p in negCats:
    listNegCats.append((p, negCats[p]))

allNegCats = sorted(listNegCats, key=lambda x: x[1], reverse=True)

truePosCats = [x for x in allPosCats if x[0] not in negCats]
trueNegCats = [x for x in allNegCats if x[0] not in posCats]

print truePosCats
print trueNegCats'''


Out[13]:
'listPosCats = []\nfor p in posCats:\n    listPosCats.append((p, posCats[p]))\n\nallPosCats = sorted(listPosCats, key=lambda x: x[1], reverse=True)\n\nlistNegCats = []\nfor p in negCats:\n    listNegCats.append((p, negCats[p]))\n\nallNegCats = sorted(listNegCats, key=lambda x: x[1], reverse=True)\n\ntruePosCats = [x for x in allPosCats if x[0] not in negCats]\ntrueNegCats = [x for x in allNegCats if x[0] not in posCats]\n\nprint truePosCats\nprint trueNegCats'

In [14]:
'''
allHelpful = []
userHelpful = defaultdict(list)

for l in train:
    user,item = l['reviewerID'],l['itemID']
    allHelpful.append(l['helpful'])
    userHelpful[user].append(l['helpful'])

averageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])
userRate = {}
for u in userHelpful:
    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])
'''


Out[14]:
"\nallHelpful = []\nuserHelpful = defaultdict(list)\n\nfor l in train:\n    user,item = l['reviewerID'],l['itemID']\n    allHelpful.append(l['helpful'])\n    userHelpful[user].append(l['helpful'])\n\naverageRate = sum([x['nHelpful'] for x in allHelpful]) * 1.0 / sum([x['outOf'] for x in allHelpful])\nuserRate = {}\nfor u in userHelpful:\n    userRate[u] = sum([x['nHelpful'] for x in userHelpful[u]]) * 1.0 / sum([x['outOf'] for x in userHelpful[u]])\n"

In [15]:
import nltk
from nltk.corpus import stopwords
stopWords = Set(stopwords.words("english"))

listPos = []
for p in setPos:
    if not p or p in stopWords:
        continue
    listPos.append((p, setPos[p]))

posWords = sorted(listPos, key=lambda x: x[1], reverse=True)

In [16]:
listNeg = []
for n in setNeg:
    if not n or n in stopWords:
        continue
    listNeg.append((n, setNeg[n]))

negWords = sorted(listNeg, key=lambda x: x[1], reverse=True)

In [17]:
removeIt = Set()
negativeWords = [x[0] for x in negWords][:120]
positiveWords = [x[0] for x in posWords if x[0] not in negativeWords][:120]

In [18]:
print len(sorted(positiveWords))
print sorted(negativeWords)


120
['2', '3', '5', 'also', 'amazon', 'another', 'around', 'back', 'bad', 'battery', 'best', 'better', 'bought', 'box', 'buy', 'cable', 'came', 'camera', 'cant', 'card', 'case', 'computer', 'could', 'cover', 'device', 'didnt', 'doesnt', 'dont', 'drive', 'easy', 'even', 'far', 'find', 'fine', 'first', 'fit', 'get', 'go', 'going', 'good', 'got', 'great', 'happy', 'hard', 'headphones', 'however', 'im', 'ipad', 'item', 'ive', 'keyboard', 'kindle', 'know', 'laptop', 'like', 'little', 'long', 'looking', 'lot', 'love', 'made', 'make', 'money', 'much', 'need', 'never', 'new', 'nice', 'old', 'one', 'ordered', 'phone', 'plan', 'power', 'price', 'problem', 'product', 'protection', 'purchase', 'purchased', 'put', 'quality', 'really', 'recommend', 'return', 'right', 'say', 'screen', 'see', 'set', 'since', 'small', 'something', 'sound', 'still', 'sure', 'tablet', 'take', 'thing', 'think', 'time', 'tried', 'tv', 'two', 'unit', 'usb', 'use', 'used', 'using', 'want', 'way', 'well', 'without', 'work', 'worked', 'working', 'works', 'worth', 'would', 'yet']

In [19]:
vecPositive = {}
i = 0
for w in positiveWords:
    if w not in vecPositive:
        vecPositive[w] = i
        i += 1
    
vecNegative = {}
i = 0
for w in negativeWords:
    if w not in vecNegative:
        vecNegative[w] = i
        i += 1

vecCats = {}
i = 0
for w in allCats:
    vecCats[w] = i
    i += 1
        
#print vecPositive
#print vecNegative
#print vecCats

In [20]:
def wordsFF(text, db):
    res = [0] * len(db)
    for w in text.split():
            w = ''.join([c for c in w.lower() if not c in punctuation])
            if w in db:
                res[db[w]] = 1
    return res

In [21]:
#print vecPositive

In [22]:
X = []
y = []
outO = []
nVals = []
maxL = 0

for l in train:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    rating = int(rating)
    rr = rating
    outOf = float(helpful['outOf'])
    nHelpful = float(helpful['nHelpful'])
    ratio = nHelpful / outOf
    n = len(words.split())
    #rating = rating / 5.0;
    
    #nWords = [0] * 5
    #if n < 25:
    #    nWords[0] = n
    #elif n < 39:
    #    nWords[1] = n
    #elif n < 66:
    #    nWords[2] = n
    #elif n < 125:
    #    nWords[3] = n
    #else:
    #    nWords[4] = n
    #if nWords < 100:
    #    nWords = [1, 0]
    #else:
    #    nWords = [0, 1]
    
    #cOutOf = outOf * 1.0 / bestOutOf
    cOutOf = outOf * 1.0
    #cOutOf = [0.0] * 3
    #if outOf <= 5:
    #    cOutOf[0] = 1
    #elif outOf <= 10:
    #    cOutOf[1] = 1
    #else:
    #    cOutOf[2] = outOf / 10.0
    
    #X.append([1] + FF(users[user] / 500, cUser / 500) + FFID(user) + FF(items[item] / 500, cItem / 500) + FFID(item) + FF(rating - 1, 5) + str(nWords))
    # FF(rating - 1, 5)
    X.append([1, rating, userMulScore[user], userScore[user], userRate[user], cOutOf] +
             [n] + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #X.append([1, rating, cOutOf, userRate[user], itemRate[item], userScore[user], itemScore[item], userMulScore[user], itemMulScore[item], 
    #          userMulNScore[user], itemMulNScore[item]]  +
    #         nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #  + userTimeScore[(user, rr)] + itemTimeScore[(item, rr)]
    # + FFID(int(outOf) -1, 10) + FFID(int(user[1:]), 30) +
    
    y.append(ratio)
    outO.append(outOf)
    nVals.append(nHelpful)

In [23]:
print "Done"
print len(X[0]), len(vecPositive)


Done
247 120

In [24]:
import pylab as pl

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import time
import timeit

In [25]:
X_train = np.array(X).astype(float)
y_train = np.array(y).astype(float)

#Remember = [X_train[:, i].max() for i in range(len(X_train.T))]
#X_train = X_train / len(X_train[0])

#X_train = np.array([np.array([xi if y > 0 else xi for (xi, y) in zip(x, Remember)]) for x in X_train])
#X_train = X_train / len(X_train[0])

scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [26]:
print "Length of FFs: ", len(X_train[0])

# Objective
'''def f(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    diffSq = (norm(diff) ** 2) / len(X)
    diffSqReg = diffSq + lam * norm(theta) ** 2
    #print "f : " , diffSqReg
    return diffSqReg

    # Derivative
def fprime(theta, X, y, lam):
    diff = numpy.dot(X, theta) - y
    res = 2 * numpy.dot(X.T, diff) / len(X) + 2 * lam * theta
    return res
'''
start = time.time()

clf = SGDRegressor(n_iter = 1000, alpha = 0.0)
clf.fit(X_train, y_train)
#thetax, residualsx, rankx, sx = numpy.linalg.lstsq(X_train, y_train)
#thetax, _, _, _ = np.linalg.lstsq(X_train, y_train)
#thetax, _, _ = scipy.optimize.fmin_l_bfgs_b(f, numpy.array([0] * len(X_train[0])).T, fprime, args = (numpy.array(X_train), numpy.array(y_train).T, 0.01))


end = time.time()
finished = end - start
print finished
print "Length of Thetas: ", len(thetax)
print thetax


Length of FFs:  247
911.478042126
Length of Thetas: 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-26-72285af3433d> in <module>()
     27 finished = end - start
     28 print finished
---> 29 print "Length of Thetas: ", len(thetax)
     30 print thetax

NameError: name 'thetax' is not defined

In [27]:
#def predict(data, theta):
#    theta = numpy.array(theta)
#    prediction = [np.dot(theta, d)  for d in data]
#    return prediction

#ptrain = predict(X_train, thetax)
ptrain = clf.predict(X_train)
ytrain = [e for e in y_train]

ptrain = [p if p >= 0. else 0. for p in ptrain]
ptrain = [p if p <= 1. else 1. for p in ptrain]

ptrain = [round(x*o) for x, o in zip(ptrain, outO)]
ytrain = nVals
#ytrain = expY




In [28]:
def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

In [29]:
mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 2.185431
Var 439.112243553
 FVU training 0.00497693023159

In [30]:
trainScore = len([[p, y] for p, y in zip(ptrain, ytrain) if p != y]) * 1.0 / len(ytrain)
print trainScore


0.079796

In [30]:


In [30]:


In [31]:
print "Reading test..."
tetst = list(parseData("/home/iizhaki/oasis/CSE255/Project2/assignment2/helpful.json"))
print "done"


Reading test...
done

In [32]:
Xt = []
outOt = []

for l in tetst:
    user, item, helpful, rating, words = l['reviewerID'], l['itemID'], l['helpful'], l['rating'], l['reviewText']
    rating = int(rating)
    rr = rating
    outOf = float(helpful['outOf'])
    n = len(words.split())
    #rating = rating / 5.0;
    rating = float(rating);
    
    if user in userRate:
        urate = userRate[user]
        uscore = userScore[user]
        umul = userMulScore[user]
        umuln = userMulNScore[user]
    else:
        if item in itemRate:
            urate = itemRate[item]
        else:
            urate = averageRate
        if item in itemScore:
            uscore = itemScore[item]
        else:
            uscore = averageUserScore
        if item in itemMulScore:
            umul = itemMulScore[item]
        else:
            umul = averageUserRO
        #uscore = averageUserScore
        #umul = averageUserRO
        umuln = averageUserRN
    if item in itemRate:
        irate = itemRate[item]
        iscore = itemScore[item]
        imul = itemMulScore[item]
        imuln = itemMulNScore[item]
    else:
        irate = averageRate
        iscore = averageItemScore
        imul = averageItemRO
        imul = averageItemRN
        
    #nWords = [0] * 5
    #if n < 25:
    #    nWords[0] = n
    #elif n < 39:
    #    nWords[1] = n
    #elif n < 66:
    #    nWords[2] = n
    #elif n < 125:
    #    nWords[3] = n
    #else:
    #    nWords[4] = n
    #if nWords < 100:
    #    nWords = [1, 0]
    #else:
    #    nWords = [0, 1]
    
    #cOutOf = outOf * 1.0 / bestOutOf
    cOutOf = outOf * 1.0
    #cOutOf = [0.0] * 3
    #if outOf <= 5:
    #    cOutOf[0] = 1
    #elif outOf <= 10:
    #    cOutOf[1] = 1
    #else:
    #    cOutOf[2] = outOf / 10.0
    
    #X.append([1] + FF(users[user] / 500, cUser / 500) + FFID(user) + FF(items[item] / 500, cItem / 500) + FFID(item) + FF(rating - 1, 5) + str(nWords))
    # FF(rating - 1, 5)
    Xt.append([1, rating, umul, uscore, urate, cOutOf] +
             [n] + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    '''rating = int(rating)
    outOf = float(helpful['outOf'])
    n = len(words.split())
    rr = rating
    rating = rating / 5.0;
    
    nWords = [0] * 5
    if n < 25:
        nWords[0] = n
    elif n < 39:
        nWords[1] = n
    elif n < 66:
        nWords[2] = n
    elif n < 125:
        nWords[3] = n
    else:
        nWords[4] = n
    
    cOutOf = outOf * 1.0 / bestOutOf
    
    if user in userRate:
        urate = userRate[user]
        uscore = userScore[user]
        umul = userMulScore[user]
        umuln = userMulNScore[user]
    else:
        urate = averageRate
        uscore = averageUserScore
        umul = averageUserRO
        umuln = averageUserRN
    if item in itemRate:
        irate = itemRate[item]
        iscore = itemScore[item]
        imul = itemMulScore[item]
        imuln = itemMulNScore[item]
    else:
        irate = averageRate
        iscore = averageItemScore
        imul = averageItemRO
        imul = averageItemRN
        
    if (rr, user) in userTimeScore:
        uts = userTimeScore[(rr, user)]
    else:
        uts = averageUserTime
        
    if (rr, item) in itemTimeScore:
        its = itemTimeScore[(rr, user)]
    else:
        its = averageItemTime
        
    if user in userCombScore:
        usc = userCombScore[user]
    else:
        usc = averageUserComb
        
    if item in itemCombScore:
        isc = itemCombScore[item]
    else:
        isc = averageItemComb        
        
    Xt.append([1, rating, cOutOf, urate, irate, uscore, iscore, umul, imul, umuln, imuln] + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    #Xt.append([1, rating] + urate + nWords + wordsFF(words, vecPositive) + wordsFF(words, vecNegative))
    '''
    
    outOt.append(outOf)

In [33]:
print len(Xt[0]), Xt[0]
X_test = np.array(Xt).astype(float)
scaler = MinMaxScaler().fit(X_test)
X_test = scaler.transform(X_test)


247 [1, 1.0, 1.0, 1.0, 0.0, 1.0, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [35]:
#ptest = predict(X_test, thetax)
ptest = clf.predict(X_train)


ptest = [p if p >= 0. else 0. for p in ptest]
ptest = [p if p <= 1. else 1. for p in ptest]

ptest = [round(x*o) for x, o in zip(ptest, outOt)]

In [36]:
print ptest[:10]


[1.0, 3.0, 0.0, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0]

In [37]:
print "Reading helpful..."
test = np.array(list(parseTxt("/home/iizhaki/oasis/CSE255/Project2/assignment2/pairs_Helpful.txt")))
print "done"


Reading helpful...
done

In [ ]:
print "Prediction test..."
myPredictions = open("/home/iizhaki/oasis/CSE255/Project2/assignment2/idan_predictions_Helpful_MMF_" + str(trainScore) + ".txt", 'w')
print "done"

myPredictions.write(test[0][0] + '\n')

for l, p in zip(test[1 :], ptest):
    u, i, o = l[0].split("-")
    myPredictions.write(l[0] + ',' + str(p) + '\n')
    
myPredictions.close()

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: