notebook.community

Edit and run



In [1]:

    
import numpy as np
#save  matrix
file = open("/home/iizhaki/oasis/CSE255/MatrixReorder_0.pck")
matrix =np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/YsReorder_1.pck")
rankingR = np.load(file)
file.close()
print "done"

#save  matrix
file = open("/home/iizhaki/oasis/CSE255/IndexReorder_1.pck")
indexes= np.load(file)
file.close()
print "done"

print indexes[:20]
X = [matrix[i] for i in indexes]
y = [rankingR[i] for i in indexes]


n = int(len(X) * 0.7)
X_train_org = np.array(X[:n])
y_train_org = np.array(y[:n])

X_test_org = np.array(X[n:])
y_test_org = np.array(y[n:])









    



done
done
done
[1326168 1268572 1384023 1563614  135893  247612  480020 1359532  743048
  288473 1476688  165460  211059 1269851  837046  957459  372289  836921
  666038  747082]



In [2]:

    
n = int(len(X) * 0.7)
X_train = X_train_org #[:n]
y_train = y_train_org #[:n]
X_test = X_test_org #[n:]
y_test = y_test_org #[n:]



In [3]:

    
import pylab as pl

from sklearn.linear_model import SGDRegressor
from sklearn.datasets.samples_generator import make_regression
from sklearn.preprocessing import *
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import time
import timeit



In [4]:

    
print X_train.shape, y_train.shape









    



(1126994, 1028) (1126994,)



In [5]:

    
X_train = X_train.astype(float)
y_train = y_train.astype(float)


X_test = X_test.astype(float)
y_test = y_test.astype(float)

print y_train[0], y[0]
#X_test = preprocessing.scale(X_test.astype(float))
#y_test = preprocessing.scale(y_test.astype(float))









    



5000.0 5000



In [6]:

    
scaler = MinMaxScaler().fit(X_train)

X_train = scaler.transform(X_train)



In [7]:

    
scaler = MinMaxScaler().fit(X_test)

X_test = scaler.transform(X_test)



In [8]:

    
# run the classifier - there are parameters for L1/L2 regularization - see link below!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
clf = SGDRegressor(n_iter = 500)

start = time.time()

clf.fit(X_train, y_train)

end = time.time()
finished = end - start
print finished

# and plot the result
#pl.scatter(X, Y, color='black')
#pl.plot(X, clf.predict(X), color='blue', linewidth=3)
#pl.show()









    



2077.26218796



In [9]:

    
def predict(data, theta):
    theta = numpy.matrix(theta)
    prediction = [theta * d.T  for d in data]
    return prediction

#prediction_training = predict(X, thetax)

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)


#print " MSE training", MSE(prediction_training, y )



In [10]:

    
import scipy
ptrain = clf.predict(X_train)
ytrain = [e / 1000.0 for e in y_train]

ptrain = [e / 1000.0 for e in ptrain]
ptrain = [e if e <= 5 else 5. for e in ptrain]
ptrain = [e if e >= 1 else 1. for e in ptrain ]

mse = MSE(ptrain, ytrain)
var = MSE(ytrain, [mean(ytrain)] * len(ytrain))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)









    



MSE training 0.825505078352
Var 1.39925657753
 FVU training 0.589959762642



In [11]:

    
ptest = clf.predict(X_test)
ytest = [e / 1000.0 for e in y_test]
ptest = [e / 1000.0 for e in ptest]
ptest = [e if e <= 5 else 5. for e in ptest]
ptest = [e if e >= 1 else 1. for e in ptest ]

mse = MSE(ptest, ytest)
var = MSE(ytest, [mean(ytest)] * len(ytest))
print "MSE testing", mse
print "Var testing", var
print " FVU testing", (mse / var)









    



MSE testing 0.825707651065
Var testing 1.39823186516
 FVU testing 0.590537000079



In [12]:

    
print sum([p for p in ptrain if p > 5 or p < 1]) / len(ptrain)
print sum([p for p in ptest if p > 5 or p < 1]) / len(ptrain)



In [13]:

    
# class sklearn.linear_model.SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=False, verbose=0, epsilon=0.1, random_state=None, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False)
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html



In [13]:



In [14]:

    
start = time.time()

thetax,residualsx,rankx,sx = numpy.linalg.lstsq(X_train, y_train)

end = time.time()
finished = end - start
print finished









    



1603.94804001



In [62]:

    
def predict(data, theta):
    prediction = [np.dot(theta, d)  for d in data]
    return prediction

def MSE(prediction, real):
    squares = [ (p - r)**2 for p,r in zip (prediction,real) ]
    return numpy.mean(squares)



In [64]:

    
prediction_training = predict(X_train, thetax)

y = [e/1000.0 for e in y_train]
p = [e/1000.0 for e in prediction_training]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE training", mse
print "Var", var
print " FVU training", (mse / var)









    



MSE training 0.831968452555
Var 1.39925657753
 FVU training 0.594578911344



In [49]:



In [66]:

    
prediction_testing = predict(X_test, thetax)

y = [e/1000.0 for e in y_test]
p = [e/1000.0 for e in prediction_testing]

mse = MSE(p, y)
var = MSE(y, [mean(y)] * len(y))
print "MSE testing", mse
print "Var", var
print " FVU testing", (mse / var)









    



MSE testing 2.41279725531e+19
Var 1.39823186516
 FVU testing 1.72560597096e+19



In [51]:

    
print finished









    



1603.94804001



In [47]:



In [43]:



In [ ]: