In [1]:
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = int(len(X) * 0.7)
X_train_org = np.array(X[:n])
y_train_org = np.array(y[:n])

X_test_org = np.array(X[n:])
y_test_org = np.array(y[n:])


done
done
done
[1326168 1268572 1384023 1563614  135893  247612  480020 1359532  743048
  288473 1476688  165460  211059 1269851  837046  957459  372289  836921
  666038  747082]

In [2]:
n = int(len(X) * 0.7)
X_train = X_train_org #[:n]
y_train = y_train_org #[:n]
X_test = X_test_org #[n:]
y_test = y_test_org #[n:]

In [3]:
import pylab as pl

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import time
import timeit

In [4]:
print X_train.shape, y_train.shape


(1126994, 1028) (1126994,)

In [5]:
X_train = X_train.astype(float)
y_train = y_train.astype(float)


X_test = X_test.astype(float)
y_test = y_test.astype(float)

print y_train[0], y[0]
#X_test = preprocessing.scale(X_test.astype(float))
#y_test = preprocessing.scale(y_test.astype(float))


5000.0 5000

In [6]:
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)

In [7]:
scaler = MinMaxScaler().fit(X_test)

X_test = scaler.transform(X_test)

In [8]:
# run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
clf = SGDRegressor(n_iter = 500)

start = time.time()

clf.fit(X_train, y_train)

end = time.time()
finished = end - start
print finished

# and plot the result
#pl.scatter(X, Y, color='black')
#pl.plot(X, clf.predict(X), color='blue', linewidth=3)
#pl.show()


2077.26218796

In [9]:
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta * d.T  for d in data]
    return prediction

#prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


#print " MSE training", MSE(prediction_training, y )

In [10]:
import scipy
ptrain = clf.predict(X_train)
ytrain = [e / 1000.0 for e in y_train]

ptrain = [e / 1000.0 for e in ptrain]
ptrain = [e if e <= 5 else 5. for e in ptrain]
ptrain = [e if e >= 1 else 1. for e in ptrain ]

mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 0.825505078352
Var 1.39925657753
 FVU training 0.589959762642

In [11]:
ptest = clf.predict(X_test)
ytest = [e / 1000.0 for e in y_test]
ptest = [e / 1000.0 for e in ptest]
ptest = [e if e <= 5 else 5. for e in ptest]
ptest = [e if e >= 1 else 1. for e in ptest ]

mse = MSE(ptest, ytest)
var = MSE(ytest, [mean(ytest)] * len(ytest))
print "MSE testing", mse
print "Var testing", var
print " FVU testing", (mse / var)


MSE testing 0.825707651065
Var testing 1.39823186516
 FVU testing 0.590537000079

In [12]:
print sum([p for p in ptrain if p > 5 or p < 1]) / len(ptrain)
print sum([p for p in ptest if p > 5 or p < 1]) / len(ptrain)


0.0
0.0

In [13]:
# class sklearn.linear_model.SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False)
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html

In [13]:


In [14]:
start = time.time()

thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X_train, y_train)

end = time.time()
finished = end - start
print finished


1603.94804001

In [62]:
def predict(data, theta):
    prediction = [np.dot(theta, d)  for d in data]
    return prediction

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)

In [64]:
prediction_training = predict(X_train, thetax)

y = [e/1000.0 for e in y_train]
p = [e/1000.0 for e in prediction_training]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)


MSE training 0.831968452555
Var 1.39925657753
 FVU training 0.594578911344

In [49]:


In [66]:
prediction_testing = predict(X_test, thetax)

y = [e/1000.0 for e in y_test]
p = [e/1000.0 for e in prediction_testing]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE testing", mse
print "Var", var
print " FVU testing", (mse / var)


MSE testing 2.41279725531e+19
Var 1.39823186516
 FVU testing 1.72560597096e+19

In [51]:
print finished


1603.94804001

In [47]:


In [43]:


In [ ]: