In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
%matplotlib inline

Exercise 1 - Regularization and Regression


In [4]:
df = pd.read_csv('ozone.csv')

In [5]:
df.head()


Out[5]:
ozone vh wind humidity temp ibh dpg ibt vis doy
0 3 5710 4 28 40 2693 -25 87 250 3
1 5 5700 3 37 45 590 -24 128 100 4
2 5 5760 3 51 54 1450 25 139 60 5
3 6 5720 4 69 35 1568 15 121 60 6
4 4 5790 6 19 45 2631 -33 123 100 7

In [55]:
data = df.to_numpy().astype(np.float32)  # as_matrix() no longer exists in modern pandas
X, y = data[:, 1:], data[:, 0]  # first column (ozone) is the target
# keep only 10% of the rows for training, so overfitting is easy to provoke
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90, random_state=798)

In [56]:
linear = LinearRegression()
linear.fit(X_train, y_train)

y_train_p = linear.predict(X_train)
y_test_p = linear.predict(X_test)

In [57]:
SSE = lambda y, y_p: ((y - y_p)**2).sum()  # sum of squared errors

In [58]:
SSE(y_train, y_train_p)


Out[58]:
510.09951233932924

In [59]:
SSE(y_test, y_test_p)


Out[59]:
11131.426653604514
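
The raw SSE numbers above are not directly comparable: with test_size=0.90 the test set holds nine times as many rows as the training set, so per-sample error is the fairer comparison (the gap persists, so the model is clearly overfit). A minimal check; the MSE helper below is not in the original notebook:

In [ ]:
MSE = lambda y, y_p: ((y - y_p)**2).mean()
print('train MSE:', MSE(y_train, y_train_p))
print('test MSE: ', MSE(y_test, y_test_p))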

In [60]:
# Lasso/Ridge dropped the normalize= keyword, so standardize the features once up front
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lambdas = np.linspace(1e-5, 2, 100)  # start just above 0: Lasso is ill-posed at alpha=0
loss_l1, coef_l1 = [], []
loss_l2, coef_l2 = [], []

for lam in lambdas:
    lasso = Lasso(alpha=lam)
    lasso.fit(X_train_s, y_train)
    coef_l1.append(lasso.coef_)

    ridge = Ridge(alpha=lam)
    ridge.fit(X_train_s, y_train)
    coef_l2.append(ridge.coef_)

    loss_l1.append((
        SSE(y_train, lasso.predict(X_train_s)),
        SSE(y_test, lasso.predict(X_test_s)),
    ))
    loss_l2.append((
        SSE(y_train, ridge.predict(X_train_s)),
        SSE(y_test, ridge.predict(X_test_s)),
    ))

loss_l1 = np.array(loss_l1)
loss_l2 = np.array(loss_l2)
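
The plots below only eyeball a good lambda; scikit-learn can also pick it by cross-validation. A minimal sketch reusing the standardized features from the cell above (LassoCV/RidgeCV are not part of the original exercise):

In [ ]:
from sklearn.linear_model import LassoCV, RidgeCV

lasso_cv = LassoCV(alphas=lambdas, cv=5).fit(X_train_s, y_train)
ridge_cv = RidgeCV(alphas=lambdas).fit(X_train_s, y_train)
print('CV-chosen alphas - lasso:', lasso_cv.alpha_, 'ridge:', ridge_cv.alpha_)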

In [61]:
plt.plot(lambdas, loss_l1[:, 0], label='train')
plt.plot(lambdas, loss_l1[:, 1], label='test')
plt.xlabel(r'$\lambda$')
plt.ylabel('SSE')
plt.legend();

[figure: train and test SSE as a function of lambda (Lasso)]
In [62]:
plt.plot(lambdas, loss_l2[:, 0], label='train')
plt.plot(lambdas, loss_l2[:, 1], label='test')
plt.xlabel(r'$\lambda$')
plt.ylabel('SSE')
plt.legend();

[figure: train and test SSE as a function of lambda (Ridge)]
In [63]:
plt.plot(lambdas, coef_l1)
plt.xlabel(r'$\lambda$')
plt.ylabel('coefficient');

[figure: Lasso coefficient paths as a function of lambda]
In [64]:
plt.plot(lambdas, coef_l2)
plt.xlabel(r'$\lambda$')
plt.ylabel('coefficient');

[figure: Ridge coefficient paths as a function of lambda]
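
The Lasso paths show coefficients driven exactly to zero as lambda grows (the feature-selection effect of the L1 penalty), while Ridge only shrinks them smoothly. A quick way to see which features survive; the value alpha=0.5 is an arbitrary illustration, not part of the original exercise:

In [ ]:
lasso = Lasso(alpha=0.5).fit(X_train_s, y_train)
features = df.columns[1:]  # all columns except the ozone target
print('surviving features:', [f for f, c in zip(features, lasso.coef_) if c != 0])
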
Exercise 2 - Matrix Factorizations


In [ ]:
# permutation that reveals the structure
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [ ]:
M = np.genfromtxt('nmf.txt', delimiter=' ')  # the data matrix, referred to as M below
M
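
Before running NMF it is worth checking the ordinary matrix rank, since it lower-bounds the number of nonnegative components needed. A one-line check (not in the original):

In [ ]:
np.linalg.matrix_rank(M)  # linear rank lower-bounds the nonnegative rank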

In [ ]:
for n_components in range(1, 6):
    nmf = NMF(n_components)
    W = nmf.fit_transform(M)  # W: samples x components
    H = nmf.components_       # H: components x features

    print(n_components, nmf.reconstruction_err_)
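
If the block pattern sketched above is right, the reconstruction error should drop to essentially zero at rank 4. A hedged check of the rank-4 reconstruction (assuming nmf.txt holds the 0/1 matrix from the comment):

In [ ]:
nmf = NMF(n_components=4)
W = nmf.fit_transform(M)
H = nmf.components_
print('max absolute reconstruction error:', np.abs(M - W @ H).max())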

In [ ]:
w = np.array([ [1, 0, 0], [1, 0, 1], [1, 0, 0], [1, 1, 0], [0, 1, 0], [0, 1, 1], [0, 0, 1],
              [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]])
w

In [ ]:
h = np.array([
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
    ])
h

In [ ]:
w.dot(h)
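
The plain matrix product puts a 2 wherever two blocks overlap (e.g. the third column of the fourth row), which is why rank 3 is not enough for NMF. Under Boolean semantics the product takes an OR instead of a sum, and those entries clip back to 1; a minimal check:

In [ ]:
(w.dot(h) > 0).astype(int)  # Boolean matrix product: OR in place of +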

With NMF we need a rank-4 approximation, while with Boolean matrix factorization (BMF) rank 3 suffices, because the Boolean product lets overlapping blocks share entries instead of summing them.