In [82]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder502.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder502.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder502.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = int(len(X) * 0.7)
X_train_org = np.array(X[:n])
y_train_org = np.array(y[:n])

X_test_org = np.array(X[n:])
y_test_org = np.array(y[n:])


done
done
done
[ 573812 1055387   93280 1232988  917762 1230238 1010895  520368 1351840
   38040  294156   77051  166163  328957  316261 1301367  485036  728258
  560926  814129]

In [83]:
n = int(len(X) * 0.7)
X_train = X_train_org #[:n]
y_train = y_train_org #[:n]
X_test = X_test_org #[n:]
y_test = y_test_org #[n:]

In [84]:
import pylab as pl

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import time
import timeit

In [85]:
print X_train.shape, y_train.shape


(1126994, 502) (1126994,)

In [86]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)


X_test = X_test.astype(float)
y_test = y_test.astype(float)

print y_train[0], y[0]
#X_test = preprocessing.scale(X_test.astype(float))
#y_test = preprocessing.scale(y_test.astype(float))


4.0 4.0

In [87]:
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)

In [88]:
scaler = MinMaxScaler().fit(X_test)

X_test = scaler.transform(X_test)

In [9]:
# run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
clf = SGDRegressor(n_iter = 5)

start = time.time()

clf.fit(X_train, y_train)

end = time.time()
finished = end - start
print finished

# and plot the result
#pl.scatter(X, Y, color='black')
#pl.plot(X, clf.predict(X), color='blue', linewidth=3)
#pl.show()


11.4606080055

In [10]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta * d.T  for d in data]
    return prediction

#prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


#print " MSE training", MSE(prediction_training, y )

In [11]:
import scipy
ptrain = clf.predict(X_train)
ytrain = [e for e in y_train]

ptrain = [e for e in ptrain]
ptrain = [e if e <= 5 else 5. for e in ptrain]
ptrain = [e if e >= 1 else 1. for e in ptrain ]

mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 0.828062087924
Var 1.39814050275
 FVU training 0.592259566401

In [12]:
ptest = clf.predict(X_test)
ytest = [e for e in y_test]
ptest = [e for e in ptest]
ptest = [e if e <= 5 else 5. for e in ptest]
ptest = [e if e >= 1 else 1. for e in ptest ]

mse = MSE(ptest, ytest)
var = MSE(ytest, [mean(ytest)] * len(ytest))
print "MSE testing", mse
print "Var testing", var
print " FVU testing", (mse / var)


MSE testing 0.829286981288
Var testing 1.40083603823
 FVU testing 0.591994322432

In [27]:
print sum([p for p in ptrain if p > 5 or p < 1]) / len(ptrain)
print sum([p for p in ptest if p > 5 or p < 1]) / len(ptrain)


0.0
0.0

In [ ]:
# class sklearn.linear_model.SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False)
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html

In [ ]:


In [89]:
start = time.time()

thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X_train, y_train)

end = time.time()
finished = end - start
print finished


423.524701834

In [90]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta*numpy.matrix(d).T  for d in data]
    return prediction

prediction_training = predict(X_train, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

y = [e for e in y_train]
p = [e for e in prediction_training]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 0.83264986246
Var 1.39814050275
 FVU training 0.595540906529

In [93]:
prediction_testing = predict(X_test, thetax)

y = [e for e in y_test]
p = [e for e in prediction_testing]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE testing", mse
print "Var", var
print " FVU testing", (mse / var)


MSE testing 0.834613568009
Var 1.40083603823
 FVU testing 0.595796756532

Graphs of lambda


In [77]:
res = {}

In [102]:
for l in range(0, 21):
    a = l * 0.01
    # run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    clf = SGDRegressor(n_iter = 5, alpha = a, penalty = 'l1')

    start = time.time()

    clf.fit(X_train, y_train)

    end = time.time()
    finished = end - start
    print finished

    ptest = clf.predict(X_test)
    ytest = [e for e in y_test]
    ptest = [e for e in ptest]
    ptest = [e if e <= 5 else 5. for e in ptest]
    ptest = [e if e >= 1 else 1. for e in ptest ]

    mse = MSE(ptest, ytest)
    var = MSE(ytest, [mean(ytest)] * len(ytest))
    print "MSE testing", mse
    print "Var testing", var
    print " FVU testing", (mse / var)
    
    res[a] =(a, finished, mse, var, (mse / var))


24.2345089912
MSE testing 0.828631456879
Var testing 1.40083603823
 FVU testing 0.591526370158
20.2774541378
MSE testing 0.927514540196
Var testing 1.40083603823
 FVU testing 0.662114990538
20.1288979053
MSE testing 0.971858688356
Var testing 1.40083603823
 FVU testing 0.693770478368
20.3958899975
MSE testing 0.994954338814
Var testing 1.40083603823
 FVU testing 0.71025752598
20.7129850388
MSE testing 1.0207817107
Var testing 1.40083603823
 FVU testing 0.728694638659
20.0346138477
MSE testing 1.05397173218
Var testing 1.40083603823
 FVU testing 0.7523876481
19.965089798
MSE testing 1.09452440325
Var testing 1.40083603823
 FVU testing 0.781336554302
20.3972451687
MSE testing 1.14243972392
Var testing 1.40083603823
 FVU testing 0.815541357265
20.4180140495
MSE testing 1.19771769419
Var testing 1.40083603823
 FVU testing 0.855002056989
20.3330769539
MSE testing 1.26035831405
Var testing 1.40083603823
 FVU testing 0.899718653474
20.4877541065
MSE testing 1.33036158351
Var testing 1.40083603823
 FVU testing 0.949691146722
20.4414789677
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.0542750359
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.1588051319
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
19.9682350159
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.6555678844
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.0508759022
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.4067659378
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.4263460636
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.4940600395
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973
20.5017299652
MSE testing 1.40176229865
Var testing 1.40083603823
 FVU testing 1.00066121973

In [136]:
from matplotlib import pyplot as PLT
from matplotlib import cm as CM
from matplotlib import mlab as ML
import numpy as NP 
from IPython.display import FileLink, FileLinks

RY = []
XS = []
for i in range (len(R)):
    if R[i] < 0.85 and i > 2:
        RY.append(R[i])
        XS.append(i)
        
ll = len(RY)
RX = [i * 0.01 for i in XS]

fig, axes = PLT.subplots(figsize=(10,8))
#eft, height, width=0.8, bottom=None, **kwargs)
axes.bar (range(ll), RY, width=0.5, alpha=0.5, align="center")
#(bottom, width, height=0.8, left=None, **kwargs)
axes.set_xlabel('Rating')

axes.set_title('title');
axes.set_xticks(range(ll))
axes.set_xticklabels(RX, fontsize=10, rotation='vertical');
axes.set_title("FVU per L1 Regularization lambda factor");
PLT.axis([-1, ll, 0.5, 0.9])

PLT.show()  

fig.savefig("FVU_L1.png", dpi=200)
FileLink("FVU_L1.png")


Out[136]:

In [108]:
res2 = {}

for l in range(0, 21):
    a = l * 0.01
    # run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    clf = SGDRegressor(n_iter = 5, alpha = a, penalty = 'l2')

    start = time.time()

    clf.fit(X_train, y_train)

    end = time.time()
    finished = end - start
    print finished

    ptest = clf.predict(X_test)
    ytest = [e for e in y_test]
    ptest = [e for e in ptest]
    ptest = [e if e <= 5 else 5. for e in ptest]
    ptest = [e if e >= 1 else 1. for e in ptest ]

    mse = MSE(ptest, ytest)
    var = MSE(ytest, [mean(ytest)] * len(ytest))
    print "MSE testing", mse
    print "Var testing", var
    print " FVU testing", (mse / var)
    
    res2[a] =(a, finished, mse, var, (mse / var))


10.685063839
MSE testing 0.828631456879
Var testing 1.40083603823
 FVU testing 0.591526370158
10.6929478645
MSE testing 0.891173827359
Var testing 1.40083603823
 FVU testing 0.636172830396
11.1689119339
MSE testing 0.946133464832
Var testing 1.40083603823
 FVU testing 0.675406285256
10.7066700459
MSE testing 0.993038201079
Var testing 1.40083603823
 FVU testing 0.70888967301
11.1291618347
MSE testing 1.03211022118
Var testing 1.40083603823
 FVU testing 0.736781602571
10.7130358219
MSE testing 1.06468030239
Var testing 1.40083603823
 FVU testing 0.760032061806
11.161716938
MSE testing 1.09206840364
Var testing 1.40083603823
 FVU testing 0.77958331585
10.7221870422
MSE testing 1.11534705165
Var testing 1.40083603823
 FVU testing 0.796200997986
11.0700058937
MSE testing 1.13534505273
Var testing 1.40083603823
 FVU testing 0.810476759412
10.7216658592
MSE testing 1.15269635019
Var testing 1.40083603823
 FVU testing 0.822863146532
10.7221090794
MSE testing 1.16788780897
Var testing 1.40083603823
 FVU testing 0.833707712465
11.1613440514
MSE testing 1.18129684238
Var testing 1.40083603823
 FVU testing 0.843279877259
11.1632518768
MSE testing 1.19321929053
Var testing 1.40083603823
 FVU testing 0.851790829168
10.7118799686
MSE testing 1.20388972258
Var testing 1.40083603823
 FVU testing 0.859408017588
11.1570730209
MSE testing 1.21349620968
Var testing 1.40083603823
 FVU testing 0.866265698889
10.7131700516
MSE testing 1.22219114143
Var testing 1.40083603823
 FVU testing 0.872472657813
10.7126019001
MSE testing 1.23009921906
Var testing 1.40083603823
 FVU testing 0.878117913509
11.0700390339
MSE testing 1.23732342469
Var testing 1.40083603823
 FVU testing 0.883274980746
10.7104821205
MSE testing 1.24394952777
Var testing 1.40083603823
 FVU testing 0.888005086837
10.7069168091
MSE testing 1.25004952368
Var testing 1.40083603823
 FVU testing 0.892359626372
11.1317820549
MSE testing 1.25568428506
Var testing 1.40083603823
 FVU testing 0.896382053854

In [137]:
from matplotlib import pyplot as PLT
from matplotlib import cm as CM
from matplotlib import mlab as ML
import numpy as NP 
from IPython.display import FileLink, FileLinks

RY = []
XS = []
for i in range (len(R)):
    if R[i] < 0.85 and i > 2:
        RY.append(R[i])
        XS.append(i)
        
ll = len(RY)
RX = [i * 0.01 for i in XS]

fig, axes = PLT.subplots(figsize=(10,8))
#eft, height, width=0.8, bottom=None, **kwargs)
axes.bar (range(ll), RY, width=0.5, alpha=0.5, align="center")
#(bottom, width, height=0.8, left=None, **kwargs)
axes.set_xlabel('Rating')

axes.set_title('title');
axes.set_xticks(range(ll))
axes.set_xticklabels(RX, fontsize=10, rotation='vertical');
axes.set_title("FVU per L2 Regularization lambda factor");
PLT.axis([-1, ll, 0.5, 0.9])

PLT.show()  

fig.savefig("FVU_L2.png", dpi=200)
FileLink("FVU_L2.png")


Out[137]:

In [110]:
res3 = {}

for l in range(0, 21):
    a = l * 0.01
    # run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    clf = SGDRegressor(n_iter = 5, alpha = a, penalty = 'elasticnet')

    start = time.time()

    clf.fit(X_train, y_train)

    end = time.time()
    finished = end - start
    print finished

    ptest = clf.predict(X_test)
    ytest = [e for e in y_test]
    ptest = [e for e in ptest]
    ptest = [e if e <= 5 else 5. for e in ptest]
    ptest = [e if e >= 1 else 1. for e in ptest ]

    mse = MSE(ptest, ytest)
    var = MSE(ytest, [mean(ytest)] * len(ytest))
    print "MSE testing", mse
    print "Var testing", var
    print " FVU testing", (mse / var)
    
    res3[a] =(a, finished, mse, var, (mse / var))


23.5385699272
MSE testing 0.828631456879
Var testing 1.40083603823
 FVU testing 0.591526370158
20.8136219978
MSE testing 0.902271443074
Var testing 1.40083603823
 FVU testing 0.644094967899
20.9953169823
MSE testing 0.961605654609
Var testing 1.40083603823
 FVU testing 0.686451253657
20.7167339325
MSE testing 1.01245646097
Var testing 1.40083603823
 FVU testing 0.722751580728
20.6516780853
MSE testing 1.0558361898
Var testing 1.40083603823
 FVU testing 0.753718608735
21.0626988411
MSE testing 1.09316793043
Var testing 1.40083603823
 FVU testing 0.780368223402
21.0325140953
MSE testing 1.12480126703
Var testing 1.40083603823
 FVU testing 0.802949978678
20.4984838963
MSE testing 1.15238219812
Var testing 1.40083603823
 FVU testing 0.822638886114
20.4190158844
MSE testing 1.17635641652
Var testing 1.40083603823
 FVU testing 0.839753107729
20.857847929
MSE testing 1.19697480236
Var testing 1.40083603823
 FVU testing 0.854471736659
20.3892409801
MSE testing 1.21484272015
Var testing 1.40083603823
 FVU testing 0.867226918071
20.3825340271
MSE testing 1.23096631355
Var testing 1.40083603823
 FVU testing 0.878736897079
20.734815836
MSE testing 1.24523904395
Var testing 1.40083603823
 FVU testing 0.88892562011
20.7441270351
MSE testing 1.25792562638
Var testing 1.40083603823
 FVU testing 0.897982056468
20.2967419624
MSE testing 1.26904382526
Var testing 1.40083603823
 FVU testing 0.905918887453
20.2354199886
MSE testing 1.27923548857
Var testing 1.40083603823
 FVU testing 0.9131943023
20.2263391018
MSE testing 1.28840311074
Var testing 1.40083603823
 FVU testing 0.919738695739
20.2129888535
MSE testing 1.29703491687
Var testing 1.40083603823
 FVU testing 0.925900591842
20.6135029793
MSE testing 1.3040506602
Var testing 1.40083603823
 FVU testing 0.930908846296
20.6072919369
MSE testing 1.31018465004
Var testing 1.40083603823
 FVU testing 0.935287652721
20.9061069489
MSE testing 1.31593926647
Var testing 1.40083603823
 FVU testing 0.939395639858

In [139]:
from matplotlib import pyplot as PLT
from matplotlib import cm as CM
from matplotlib import mlab as ML
import numpy as NP 

from IPython.display import FileLink, FileLinks

R = list([r for (_, _, _, _, r) in res.values()])
RY = []
XS = []
for i in range (len(R)):
    if R[i] < 0.85 and i > 2:
        RY.append(R[i])
        XS.append(i)
        
ll = len(RY)
RX = [i * 0.01 for i in XS]

fig, axes = PLT.subplots(figsize=(10,8))
#eft, height, width=0.8, bottom=None, **kwargs)
axes.bar (range(ll), RY, width=0.5, alpha=0.5, align="center")
#(bottom, width, height=0.8, left=None, **kwargs)
axes.set_xlabel('Accuracy (FLV value) of prediction')

axes.set_title('title');
axes.set_xticks(range(ll))
axes.set_xticklabels(RX, fontsize=10, rotation='vertical');
axes.set_title("FVU per Elasticnet Regularization lambda factor");
PLT.axis([-1, ll, 0.5, 0.9])

PLT.show()  

fig.savefig("FVU_Elasticent.png", dpi=200)
FileLink("FVU_Elasticent.png")


Out[139]:

In [ ]: