In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
# Load the ozone dataset from a local CSV into a DataFrame.
df = pd.read_csv('ozone.csv')
In [5]:
# Quick look at the first rows to check the column layout.
df.head()
Out[5]:
In [55]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
data = df.to_numpy().astype(np.float32)
# First column is treated as the target, the rest as predictors
# (see the indexing below).
X, y = data[:, 1:], data[:, 0]
# NOTE(review): test_size=0.90 keeps only 10% of the rows for training —
# presumably deliberate so the regularization effect is visible; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90, random_state=798)
In [56]:
# Unregularized least-squares baseline: fit on the training split,
# then predict on both splits for the error comparison below.
linear = LinearRegression().fit(X_train, y_train)
y_train_p = linear.predict(X_train)
y_test_p = linear.predict(X_test)
In [57]:
def SSE(y, y_p):
    """Sum of squared errors between targets ``y`` and predictions ``y_p``.

    Both arguments are array-likes of the same shape; returns a scalar.
    (PEP 8: use ``def`` rather than binding a lambda to a name.)
    """
    return ((y - y_p) ** 2).sum()
In [58]:
# Training-set sum of squared errors for the OLS baseline.
SSE(y_train, y_train_p)
Out[58]:
In [59]:
# Held-out sum of squared errors for the OLS baseline.
SSE(y_test, y_test_p)
Out[59]:
In [60]:
# Regularization path: sweep the penalty strength for Lasso (L1) and Ridge (L2),
# recording train/test SSE and the fitted coefficients at each lambda.
#
# Fix: the ``normalize=True`` estimator argument used here was deprecated in
# scikit-learn 1.0 and removed in 1.2, so this cell crashes on current sklearn.
# Standardize the features explicitly instead, fitting the scaler on the
# training split only (avoids test-set leakage). NOTE(review): this is the
# recommended replacement but is not numerically identical to the old
# ``normalize=True`` (which scaled by the l2 norm), so the stored coefficients
# are on the standardized-feature scale.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Start just above 0: alpha=0 is ill-posed for coordinate-descent Lasso.
lambdas = np.linspace(0 + 1e-5, 2, 100)
loss_l1 = []
loss_l2 = []
coef_l1 = []
coef_l2 = []
for lam in lambdas:
    lasso = Lasso(alpha=lam, fit_intercept=True)
    lasso.fit(X_train_s, y_train)
    coef_l1.append(lasso.coef_)
    ridge = Ridge(alpha=lam, fit_intercept=True)
    ridge.fit(X_train_s, y_train)
    coef_l2.append(ridge.coef_)
    loss_l1.append((
        SSE(y_train, lasso.predict(X_train_s)),
        SSE(y_test, lasso.predict(X_test_s)),
    ))
    loss_l2.append((
        SSE(y_train, ridge.predict(X_train_s)),
        SSE(y_test, ridge.predict(X_test_s)),
    ))
loss_l1 = np.array(loss_l1)
loss_l2 = np.array(loss_l2)
In [61]:
# Lasso path: training vs. held-out SSE as a function of lambda.
fig, ax = plt.subplots()
ax.plot(lambdas, loss_l1[:, 0], label='train')
ax.plot(lambdas, loss_l1[:, 1], label='test')
ax.legend();
In [62]:
# Ridge path: training vs. held-out SSE as a function of lambda.
fig, ax = plt.subplots()
ax.plot(lambdas, loss_l2[:, 0], label='train')
ax.plot(lambdas, loss_l2[:, 1], label='test')
ax.legend();
In [63]:
# Lasso (L1) coefficient paths versus lambda, one curve per feature.
plt.plot(lambdas, coef_l1);
In [64]:
# Ridge (L2) coefficient paths versus lambda, one curve per feature.
plt.plot(lambdas, coef_l2);
In [ ]:
# permutation that reveals the structure
# NOTE(review): the grid below is 15x15, but the factors w/h defined later
# are 14x3 and 3x14 — confirm the intended matrix size.
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 0 0 0 0 0 1 1 1 0 0 0 0
# 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
# 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
# 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
In [ ]:
# Load the toy matrix for the NMF demo from a space-delimited text file.
# NOTE(review): ``df`` now holds a plain ndarray after earlier holding a
# DataFrame — consider a distinct name (e.g. ``M``) to avoid confusion.
df = np.genfromtxt('nmf.txt', delimiter=' ')
df
In [ ]:
# Rank sweep: NMF reconstruction error for ranks 1..5.
# Fixes: the original called ``nmf.fit(M)`` but ``M`` is never defined
# (the matrix was loaded into ``df`` above), raising a NameError; it also
# fitted twice (``fit`` followed by ``fit_transform``). A single
# ``fit_transform`` fits the model and returns W in one step.
for n_components in range(1, 6):
    nmf = NMF(n_components)
    W = nmf.fit_transform(df)  # df ≈ W @ H
    H = nmf.components_
    print(n_components, nmf.reconstruction_err_)
In [ ]:
# Candidate left factor for the Boolean factorization: 14 rows, 3 latent
# factors. Built by marking which rows each factor covers.
w = np.zeros((14, 3), dtype=int)
w[[0, 1, 2, 3], 0] = 1  # factor 0 covers rows 0-3
w[[3, 4, 5], 1] = 1     # factor 1 covers rows 3-5
w[[1, 5, 6], 2] = 1     # factor 2 covers rows 1, 5, 6
w
In [ ]:
# Candidate right factor: 3 latent factors over 14 columns, each marking a
# contiguous block of columns.
h = np.zeros((3, 14), dtype=int)
h[0, 0:3] = 1    # factor 0: columns 0-2
h[1, 2:5] = 1    # factor 1: columns 2-4
h[2, 8:11] = 1   # factor 2: columns 8-10
h
In [ ]:
# Ordinary (real-valued) matrix product of the candidate Boolean factors.
w.dot(h)
With NMF we need a rank-4 approximation, whereas with Boolean matrix factorization (BMF) rank 3 suffices.