In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.linear_model import LinearRegression
In [2]:
from string import ascii_lowercase as letters
In [143]:
# Experiment 1: target is an EXACT linear function of the features.
# NOTE(review): no random seed is set anywhere in this notebook, and the
# execution counts are non-sequential — a Restart-and-Run-All will not
# reproduce these outputs exactly.
n = 1000
p = 10
In [144]:
# n samples of p independent standard-normal features.
X = np.random.standard_normal((n,p))
In [145]:
X.shape
Out[145]:
In [146]:
# True coefficient vector, entries uniform on [0, 1), shape (p, 1).
A = np.random.random((p,1))
A
Out[146]:
In [147]:
# Noiseless target: y is exactly X @ A.
y = X @ A
In [148]:
y.shape
Out[148]:
In [149]:
# Ordinary least squares on the noiseless data — should recover A.
model = LinearRegression().fit(X, y)
In [150]:
model
Out[150]:
In [151]:
# Independent test set drawn from the same distribution.
X_test = np.random.standard_normal((n,p))
y_test = X_test @ A
In [152]:
from sklearn.metrics import r2_score
In [153]:
y_pred = model.predict(X_test)
In [154]:
# With no noise the relationship is exact, so r^2 should be ~1.
r2_score(y_test, y_pred)
Out[154]:
In [155]:
# 8-digit hex color: '30' suffix makes the points translucent.
plt.scatter(y_test, y_pred, color='#3033ff30')
plt.show()
In [184]:
# Experiment 2: same linear model, but both features and target are
# observed with additive Gaussian noise of std `noise`.
noise = 1/2
In [185]:
# Noisy observations of the training features and targets.
X_train = X + np.random.normal(loc=0, scale=noise, size=(n,p))
y_train = y + np.random.normal(loc=0, scale=noise, size=(n,1))
In [186]:
model = LinearRegression().fit(X_train, y_train)
In [187]:
# Independently noised observations of the test set.
X_test_noisy = X_test + np.random.normal(loc=0, scale=noise, size=(n,p))
y_test_noisy = y_test + np.random.normal(loc=0, scale=noise, size=(n,1))
In [188]:
y_pred = model.predict(X_test_noisy)
In [189]:
# NOTE(review): scored here against the CLEAN y_test, while the plot
# below uses the noisy targets — confirm which comparison is intended.
r2_score(y_test, y_pred)
Out[189]:
In [190]:
plt.scatter(y_test_noisy, y_pred, color='#3033ff30')
plt.show()
In [191]:
import itertools
In [192]:
def learning_curve(A, noise=1/3, n_test=1000, train_sizes=range(20, 500, 20), repeats=10):
    """Measure out-of-sample r^2 of OLS as a function of training-set size.

    For each candidate training size (each size repeated `repeats` times so
    the score spread at fixed n is visible): draw fresh standard-normal
    features with p = A.shape[0] columns, form targets y = X @ A, add i.i.d.
    Gaussian noise (std `noise`) to both features and targets, fit a
    LinearRegression on the noisy training data, and score it on an
    independently drawn noisy test set of `n_test` samples.

    Parameters
    ----------
    A : ndarray of shape (p, 1)
        True linear coefficients relating features to target.
    noise : float, default 1/3
        Standard deviation of the additive noise on features and targets.
    n_test : int, default 1000
        Number of test samples drawn per evaluation.
    train_sizes : iterable of int, default range(20, 500, 20)
        Candidate training-set sizes.
    repeats : int, default 10
        Independent draws per training size.

    Returns
    -------
    ndarray of shape (len(train_sizes) * repeats, 2)
        Rows of (n_train, r2 score on the noisy test set).

    Notes
    -----
    Uses the global numpy RNG; results vary run-to-run unless a seed is set.
    """
    p = A.shape[0]
    results = []
    # Repeat each size `repeats` times: 20, 20, ..., 40, 40, ...
    n_train_seq = itertools.chain.from_iterable(
        itertools.repeat(size, repeats) for size in train_sizes)
    for n in n_train_seq:
        # Latent (clean) training data.
        X = np.random.standard_normal((n, p))
        y = X @ A
        # Noisy observations of both features and targets.
        X_train = X + np.random.normal(loc=0, scale=noise, size=(n, p))
        y_train = y + np.random.normal(loc=0, scale=noise, size=(n, 1))
        model = LinearRegression().fit(X_train, y_train)
        # Independent noisy test set of fixed size, regardless of n.
        X_test = np.random.standard_normal((n_test, p))
        y_test = X_test @ A
        X_test_noisy = X_test + np.random.normal(loc=0, scale=noise, size=(n_test, p))
        y_test_noisy = y_test + np.random.normal(loc=0, scale=noise, size=(n_test, 1))
        y_pred = model.predict(X_test_noisy)
        results.append((n, r2_score(y_test_noisy, y_pred)))
    return np.array(results)
In [193]:
lc = learning_curve(A, noise=1/3)
In [194]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
In [195]:
# Column 0 is the training size (kept 2-D for sklearn); column 1 is r^2.
X_lc = lc[:,0:1]
y_lc = lc[:,1]
In [196]:
# Smooth the learning curve with a degree-3 polynomial ridge regression.
degree = 3
lc_model = make_pipeline(PolynomialFeatures(degree), Ridge())
lc_model.fit(X_lc, y=y_lc)
lc_y_plot = lc_model.predict(X_lc)
In [197]:
# Raw scores as translucent points, smoothed curve on top.
# NOTE(review): a label is set below but plt.legend() is never called,
# so the label will not actually render.
plt.scatter(lc[:,0], lc[:,1], color='#3033ff30')
plt.plot(lc[:,0], lc_y_plot, color='teal', linewidth=2,
label="degree %d" % degree)
plt.title('r-squared as a function of n')
plt.show()
Let's make the problem harder. Let's say there are 10 true features that are linearly related to our target variable. We don't necessarily get to observe those, but we can measure 10 other features. These might be combinations of the original features, with more or less noise added. Some variables are totally hidden.
In [198]:
def tr(v, extra_noise):
    """Map ten latent feature values to ten observed feature values.

    The observed columns mix the latent values in various ways: sums, a
    product, an average, small cross-contamination terms, one column with
    heavy extra Gaussian noise (std `extra_noise`), and one column that is
    identically zero (a completely hidden latent variable).
    """
    # Single extra-noise draw, shared only by the "very noisy" column.
    jitter = np.random.normal(loc=0, scale=extra_noise, size=None)
    a, b, c, d, e, f, g, h, i, j = v
    return (
        a + b,            # sum of two latents
        b * c,            # product of two latents
        (c + d + e) / 3,  # average of three latents
        d + i / 10,       # mostly d, slightly contaminated by i
        e,                # passed through unchanged
        f,                # passed through unchanged
        g + jitter,       # g observed with heavy extra noise
        h + i / 5,        # h contaminated by i
        h + c / 3,        # h contaminated by c
        0,                # latent j is never observed
    )
In [204]:
noise = 1/5
In [205]:
# Observed features: the latent X pushed row-wise through tr, plus
# per-entry measurement noise.
X_tr_train = np.apply_along_axis(tr, axis=1, arr=X, extra_noise=2) + np.random.normal(loc=0, scale=noise, size=(n,p))
In [206]:
# NOTE(review): y_train still carries the scale=1/2 noise from the earlier
# experiment, not the new noise=1/5 — confirm that is intended.
model = LinearRegression().fit(X_tr_train, y_train)
In [207]:
# NOTE(review): test uses extra_noise=1 while training used 2 — confirm
# the asymmetry is intended.
X_tr_test = np.apply_along_axis(tr, axis=1, arr=X_test, extra_noise=1) + np.random.normal(loc=0, scale=noise, size=(n,p))
In [208]:
y_pred = model.predict(X_tr_test)
In [208]:
# Scored against the clean latent targets.
r2_score(y_test, y_pred)
Out[208]:
In [209]:
plt.scatter(y_test, y_pred, color='#3033ff30')
plt.show()
In [210]:
# Fitted coefficients on the transformed features...
model.coef_
Out[210]:
In [211]:
# ...compared with the true coefficients on the latent features.
A
Out[211]:
In [212]:
from pandas.plotting import scatter_matrix
In [213]:
# Pairwise structure of the observed (transformed) features vs the target.
# Columns named 'a'..'j' from ascii_lowercase.
df = pd.DataFrame(X_tr_train, columns=list(letters[:10]))
df['y'] = y_train
df.shape
Out[213]:
In [214]:
df.head()
Out[214]:
In [215]:
# KDE on the diagonal instead of histograms; assign to x to suppress the
# axes-array repr.
x = scatter_matrix(df, alpha = 0.2, figsize = (6, 6), diagonal = 'kde')
In [216]:
# Same picture for the original latent features, same noisy target —
# NOTE(review): `df` is reused/overwritten here; the earlier displayed
# frame is no longer available after this cell.
df = pd.DataFrame(X, columns=list(letters[:10]))
df['y'] = y_train
df.shape
Out[216]:
In [217]:
x = scatter_matrix(df, alpha = 0.2, figsize = (6, 6), diagonal = 'kde')
In [ ]: