In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
%matplotlib inline

Exercise 1 - Regularization and Regression


In [4]:
df = pd.read_csv('ozone.csv')

In [5]:
df.head()


Out[5]:
ozone vh wind humidity temp ibh dpg ibt vis doy
0 3 5710 4 28 40 2693 -25 87 250 3
1 5 5700 3 37 45 590 -24 128 100 4
2 5 5760 3 51 54 1450 25 139 60 5
3 6 5720 4 69 35 1568 15 121 60 6
4 4 5790 6 19 45 2631 -33 123 100 7

In [55]:
data = df.to_numpy().astype(np.float32)  # as_matrix() no longer exists in modern pandas
X, y = data[:, 1:], data[:, 0]  # first column (ozone) is the target
# keep only 10% of the rows for training, so overfitting is easy to provoke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90, random_state=798)

In [56]:
linear = LinearRegression()
linear.fit(X_train, y_train)

y_train_p = linear.predict(X_train)
y_test_p = linear.predict(X_test)

In [57]:
SSE = lambda y, y_p: ((y - y_p)**2).sum()  # sum of squared errors

In [58]:
SSE(y_train, y_train_p)


Out[58]:
510.09951233932924

In [59]:
SSE(y_test, y_test_p)


Out[59]:
11131.426653604514
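
The raw SSE numbers above are not directly comparable: with test_size=0.90 the test set holds nine times as many rows as the training set, so per-sample error is the fairer comparison (the gap persists, so the model is clearly overfit). A minimal check; the MSE helper below is not in the original notebook:

In [ ]:
MSE = lambda y, y_p: ((y - y_p)**2).mean()
print('train MSE:', MSE(y_train, y_train_p))
print('test MSE: ', MSE(y_test, y_test_p))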

In [60]:
# Lasso/Ridge dropped the normalize= keyword, so standardize the features once up front
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lambdas = np.linspace(1e-5, 2, 100)  # start just above 0: Lasso is ill-posed at alpha=0
loss_l1, coef_l1 = [], []
loss_l2, coef_l2 = [], []

for lam in lambdas:
    lasso = Lasso(alpha=lam)
    lasso.fit(X_train_s, y_train)
    coef_l1.append(lasso.coef_)

    ridge = Ridge(alpha=lam)
    ridge.fit(X_train_s, y_train)
    coef_l2.append(ridge.coef_)

    loss_l1.append((
        SSE(y_train, lasso.predict(X_train_s)),
        SSE(y_test, lasso.predict(X_test_s)),
    ))
    loss_l2.append((
        SSE(y_train, ridge.predict(X_train_s)),
        SSE(y_test, ridge.predict(X_test_s)),
    ))

loss_l1 = np.array(loss_l1)
loss_l2 = np.array(loss_l2)
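
The plots below only eyeball a good lambda; scikit-learn can also pick it by cross-validation. A minimal sketch reusing the standardized features from the cell above (LassoCV/RidgeCV are not part of the original exercise):

In [ ]:
from sklearn.linear_model import LassoCV, RidgeCV

lasso_cv = LassoCV(alphas=lambdas, cv=5).fit(X_train_s, y_train)
ridge_cv = RidgeCV(alphas=lambdas).fit(X_train_s, y_train)
print('CV-chosen alphas - lasso:', lasso_cv.alpha_, 'ridge:', ridge_cv.alpha_)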

In [61]:
plt.plot(lambdas, loss_l1[:, 0], label='train')
plt.plot(lambdas, loss_l1[:, 1], label='test')
plt.xlabel(r'$\lambda$')
plt.ylabel('SSE')
plt.legend();

[figure: train and test SSE as a function of lambda (Lasso)]
In [62]:
plt.plot(lambdas, loss_l2[:, 0], label='train')
plt.plot(lambdas, loss_l2[:, 1], label='test')
plt.xlabel(r'$\lambda$')
plt.ylabel('SSE')
plt.legend();

[figure: train and test SSE as a function of lambda (Ridge)]
In [63]:
plt.plot(lambdas, coef_l1)
plt.xlabel(r'$\lambda$')
plt.ylabel('coefficient');

[figure: Lasso coefficient paths as a function of lambda]
In [64]:
plt.plot(lambdas, coef_l2)
plt.xlabel(r'$\lambda$')
plt.ylabel('coefficient');

[figure: Ridge coefficient paths as a function of lambda]
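
The Lasso paths show coefficients driven exactly to zero as lambda grows (the feature-selection effect of the L1 penalty), while Ridge only shrinks them smoothly. A quick way to see which features survive; the value alpha=0.5 is an arbitrary illustration, not part of the original exercise:

In [ ]:
lasso = Lasso(alpha=0.5).fit(X_train_s, y_train)
features = df.columns[1:]  # all columns except the ozone target
print('surviving features:', [f for f, c in zip(features, lasso.coef_) if c != 0])
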
Exercise 2 - Matrix Factorizations


In [ ]:
# permutation that reveals the structure
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [ ]:
M = np.genfromtxt('nmf.txt', delimiter=' ')  # the data matrix, referred to as M below
M
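
Before running NMF it is worth checking the ordinary matrix rank, since it lower-bounds the number of nonnegative components needed. A one-line check (not in the original):

In [ ]:
np.linalg.matrix_rank(M)  # linear rank lower-bounds the nonnegative rank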

In [ ]:
for n_components in range(1, 6):
    nmf = NMF(n_components)
    W = nmf.fit_transform(M)  # W: samples x components
    H = nmf.components_       # H: components x features

    print(n_components, nmf.reconstruction_err_)
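
If the block pattern sketched above is right, the reconstruction error should drop to essentially zero at rank 4. A hedged check of the rank-4 reconstruction (assuming nmf.txt holds the 0/1 matrix from the comment):

In [ ]:
nmf = NMF(n_components=4)
W = nmf.fit_transform(M)
H = nmf.components_
print('max absolute reconstruction error:', np.abs(M - W @ H).max())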

In [ ]:
w = np.array([ [1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0], [0, 1, 0], [0, 1, 1], [0, 0, 1],
              [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]])
w

In [ ]:
h = np.array([
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
    ])
h

In [ ]:
w.dot(h)
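
The plain matrix product puts a 2 wherever two blocks overlap (e.g. the third column of the fourth row), which is why rank 3 is not enough for NMF. Under Boolean semantics the product takes an OR instead of a sum, and those entries clip back to 1; a minimal check:

In [ ]:
(w.dot(h) > 0).astype(int)  # Boolean matrix product: OR in place of +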

With NMF we need a rank-4 approximation, while with Boolean matrix factorization (BMF) rank 3 suffices, because the Boolean product lets overlapping blocks share entries instead of summing them.