I ran this notebook in a Docker container started with the following command:
docker run -it -p 8888:8888 -p 6006:6006 -v `pwd`:/space/ -w /space/ --rm --name md waleedka/modern-deep-learning jupyter notebook --ip=0.0.0.0 --allow-root
The following code is adapted from http://scikit-learn.org/stable/user_guide.html
Least-angle regression (LARS) is a regression algorithm for high-dimensional data. LARS is similar to forward stepwise regression. At each step, it finds the predictor most correlated with the response. When there are multiple predictors having equal correlation, instead of continuing along the same predictor, it proceeds in a direction equiangular between the predictors.
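As a quick, self-contained sketch (not part of the original notebook; the toy data below is made up for illustration), `linear_model.Lars` lets you cap how many predictors enter the model:
import numpy as np
from sklearn import linear_model
# Hypothetical toy data: only the first of three features drives the target
rng = np.random.RandomState(0)
X_toy = rng.normal(size=(50, 3))
y_toy = 3.0 * X_toy[:, 0] + 0.1 * rng.normal(size=50)
lars = linear_model.Lars(n_nonzero_coefs=1)  # let a single predictor enter
lars.fit(X_toy, y_toy)
print(lars.coef_)  # the first coefficient dominates; the other two stay at zero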
In [1]:
from sklearn import linear_model
In [2]:
reg = linear_model.LinearRegression()
In [3]:
reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
Out[3]:
In [4]:
reg.coef_
Out[4]:
In [5]:
# based on http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html#sphx-glr-auto-examples-linear-model-plot-ols-py
# but added Ridge and Lasso
import matplotlib.pyplot as plt
# From http://matplotlib.org/users/usetex.html
# Requires a working TexLive installation in PATH
from matplotlib import rc
# rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
# rc('text', usetex=True)
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math
def fit_and_plot(data_X, data_y, test_ratio=0.0):
    if test_ratio == 0:
        data_X_train = data_X
        data_X_test = data_X
        data_y_train = data_y
        data_y_test = data_y
    else:
        print(data_X.shape)
        sample_size, _ = data_X.shape
        test_set_size = math.floor(sample_size * test_ratio)
        # Split the data into training/testing sets
        data_X_train = data_X[:-test_set_size]
        data_X_test = data_X[-test_set_size:]
        # Split the targets into training/testing sets
        data_y_train = data_y[:-test_set_size]
        data_y_test = data_y[-test_set_size:]
    # Create the regression models
    lr = linear_model.LinearRegression()
    rg = linear_model.RidgeCV(alphas=[.1, .3, .5, .7, .9])
    ls = linear_model.LassoCV(alphas=[.1, .3, .5, .7, .9])
    en = linear_model.ElasticNetCV(alphas=[.1, .3, .5, .7, .9], l1_ratio=[.1, .3, .5, .7, .9, .99, .997])
    la = linear_model.LarsCV()
    # Train the models using the training sets
    lr.fit(data_X_train, data_y_train)
    rg.fit(data_X_train, data_y_train)
    ls.fit(data_X_train, data_y_train)
    en.fit(data_X_train, data_y_train)
    la.fit(data_X_train, data_y_train)
    # Make predictions using the testing set
    data_y_pred_lr = lr.predict(data_X_test)
    data_y_pred_rg = rg.predict(data_X_test)
    data_y_pred_ls = ls.predict(data_X_test)
    data_y_pred_en = en.predict(data_X_test)
    data_y_pred_la = la.predict(data_X_test)
    # The coefficients
    print('Coefficients: \n', lr.coef_, rg.coef_, ls.coef_, en.coef_, la.coef_)
    # The hyperparameters chosen by cross-validation
    print('Hyperparameters: \n', (), (rg.alpha_,), (ls.alpha_,), (en.alpha_, en.l1_ratio_), (la.alpha_,))
    # The mean squared error
    print("Mean squared error: \n %.2f %.2f %.2f %.2f %.2f"
          % (mean_squared_error(data_y_test, data_y_pred_lr),
             mean_squared_error(data_y_test, data_y_pred_rg),
             mean_squared_error(data_y_test, data_y_pred_ls),
             mean_squared_error(data_y_test, data_y_pred_en),
             mean_squared_error(data_y_test, data_y_pred_la)))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: \n %.2f %.2f %.2f %.2f %.2f'
          % (r2_score(data_y_test, data_y_pred_lr),
             r2_score(data_y_test, data_y_pred_rg),
             r2_score(data_y_test, data_y_pred_ls),
             r2_score(data_y_test, data_y_pred_en),
             r2_score(data_y_test, data_y_pred_la)))
    # Plot outputs
    plt.rcParams["figure.figsize"] = [15.0, 10.0]
    plt.scatter(data_X_test, data_y_test, color='black')
    # blue
    plot_lr, = plt.plot(data_X_test, data_y_pred_lr, color='#4572a7', label='LinearRegression', linewidth=3, linestyle='solid')
    # green
    plot_rg, = plt.plot(data_X_test, data_y_pred_rg, color='#1a9850', label='RidgeCV', linewidth=1, linestyle='solid')
    # orange
    plot_ls, = plt.plot(data_X_test, data_y_pred_ls, color='#ff7f0e', label='LassoCV', linewidth=1, linestyle='solid')
    # red
    plot_en, = plt.plot(data_X_test, data_y_pred_en, color='#aa4643', label='ElasticNetCV', linewidth=1, linestyle='solid')
    # purple
    plot_la, = plt.plot(data_X_test, data_y_pred_la, color='#886fa8', label='LarsCV', linewidth=1, linestyle='solid')
    plt.legend(handles=[plot_lr, plot_rg, plot_ls, plot_en, plot_la])
    plt.xticks(())
    plt.yticks(())
    plt.show()
In [6]:
from sklearn import datasets
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
print("All features are %s" % diabetes.feature_names)
FEATURE_TO_USE = 2
print("Using %s" % diabetes.feature_names[FEATURE_TO_USE])
print(diabetes.data.shape)
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, FEATURE_TO_USE]
diabetes_y = diabetes.target
print(diabetes_X.shape)
print(diabetes_y.shape)
fit_and_plot(diabetes_X, diabetes_y, test_ratio=0.3)
In [7]:
SAMPLE_SIZE = 500
# Draw from a Student's t distribution with 100 degrees of freedom,
# then overwrite it with a standard normal draw (only the second draw is used below)
t_dis = np.random.standard_t(100, size=[2, SAMPLE_SIZE])
t_dis = np.random.normal(0, 1, size=[2, SAMPLE_SIZE])
t_dis
# Make the second row a noisy linear function of the first
t_dis[1] = t_dis[0] * 5 + t_dis[1] * 10
data_X = t_dis[0].reshape(-1, 1)
data_y = t_dis[1]
fit_and_plot(data_X, data_y, test_ratio=0.3)
In [8]:
SAMPLE_SIZE = 500
data_X, data_y = datasets.make_regression(n_samples=SAMPLE_SIZE, n_features=1, n_informative=1,
n_targets=1, bias=15.0, effective_rank=None,
tail_strength=0.5, noise=30.0, shuffle=True,
coef=False, random_state=None)
fit_and_plot(data_X, data_y, test_ratio=0.3)
In [9]:
# The data is from https://gist.github.com/endolith/3299951 , might change to use
# the dataset in sns instead: https://seaborn.pydata.org/examples/anscombes_quartet.html
x1 = np.array([10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]).reshape(-1, 1)
y1 = np.array([8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68])
x2 = np.array([10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]).reshape(-1, 1)
y2 = np.array([9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74])
x3 = np.array([10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]).reshape(-1, 1)
y3 = np.array([7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73])
x4 = np.array([8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0]).reshape(-1, 1)
y4 = np.array([6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89])
fit_and_plot(x1, y1)
fit_and_plot(x2, y2)
fit_and_plot(x3, y3)
fit_and_plot(x4, y4)
L1-norm:
$$ \left\| \boldsymbol{x} \right\| _1 := \sum_{i=1}^{n} \left| x_i \right| $$
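As a small check of the definition above (a numpy sketch; the vector is arbitrary), the L1 norm can be computed directly or with `np.linalg.norm`:
import numpy as np
v = np.array([1.0, -2.0, 3.0])
print(np.sum(np.abs(v)))         # 6.0, summing absolute values as in the definition
print(np.linalg.norm(v, ord=1))  # the same value via numpy's built-in 1-norm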
In [10]:
# Adapted from http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_lars.html#sphx-glr-auto-examples-linear-model-plot-lasso-lars-py
# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn import datasets
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
df = pd.DataFrame(X, index=y, columns=diabetes.feature_names)
display(df.head())
display(df.describe())
In [11]:
print("Computing regularization path using the LARS ...")
alphas, _, coefs = linear_model.lars_path(X, y, method='lasso', verbose=True)
df = pd.DataFrame(coefs)
display(df)
display(df.describe())
# xx are the L1-norms
xx = np.sum(np.abs(coefs.T), axis=1)
# the last of xx is the maximum of L1-norms
normalized_xx = xx / xx[-1]
df = pd.DataFrame(np.array([xx, normalized_xx]), index=["|coef|", "|coef|/max(|coef|)"])
display(df)
df = pd.DataFrame(alphas.reshape(1, -1), index=["Alphas"])
display(df)
In [12]:
plt.plot(xx, coefs.T)
ymin, ymax = plt.ylim()
plt.vlines(xx, ymin, ymax, linestyle='dashed')
plt.xlabel('|coef|')
plt.ylabel('coefs')
plt.title('LASSO Path')
plt.axis('tight')
plt.show()
From https://stats.stackexchange.com/a/4938:
In scikit-learn the implementation of Lasso with coordinate descent tends to be faster than our implementation of LARS although for small p (such as in your case) they are roughly equivalent (LARS might even be a bit faster with the latest optimizations available in the master repo). Furthermore coordinate descent allows for efficient implementation of elastic net regularized problems. This is not the case for LARS (that solves only Lasso, aka L1 penalized problems).
Elastic Net penalization tends to yield a better generalization than Lasso (closer to the solution of ridge regression) while keeping the nice sparsity inducing features of Lasso (supervised feature selection).
For large N (and large p, sparse or not) you might also give a stochastic gradient descent (with L1 or elastic net penalty) a try (also implemented in scikit-learn).
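As a hedged sketch of that last suggestion (not from the original notebook; the penalty strength and l1_ratio below are arbitrary), scikit-learn's `SGDRegressor` accepts an L1 or elastic net penalty; standardizing the features first is generally advisable for SGD:
from sklearn import datasets, linear_model
from sklearn.preprocessing import StandardScaler
diabetes = datasets.load_diabetes()
# SGD is sensitive to feature scale, so standardize first
X_scaled = StandardScaler().fit_transform(diabetes.data)
sgd = linear_model.SGDRegressor(penalty='elasticnet', alpha=0.01, l1_ratio=0.5)
sgd.fit(X_scaled, diabetes.target)
print(sgd.coef_)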