Big Data Architect
https://www.linkedin.com/in/prasannajoshi
Meetup
email: gotokermit@gmail.com
code: github.com/piscataway
In [ ]:
# Agenda
In [2]:
#
In [1]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# Load the dataset object; later cells read its .data, .target, .DESCR and
# .feature_names attributes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook is pinned to.
boston = load_boston()
In [2]:
# Print the description text that ships with the dataset object (its DESCR attribute).
print(boston.DESCR)
In [3]:
# Hold out 25% of the rows for testing and train on the remaining 75%.
# random_state is fixed so the same rows land in each split on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    boston.data,
    boston.target,
    train_size=0.75,
    test_size=0.25,
    random_state=2,
)
In [4]:
# Sanity-check the split: full feature matrix first, then the train and test
# pieces, one shape per output line.
print(
    boston.data.shape,
    xtrain.shape,
    ytrain.shape,
    xtest.shape,
    ytest.shape,
    sep="\n",
)
In [5]:
# Histogram of the training targets, split into 100 bins.
plt.hist(ytrain, bins=100)
Out[5]:
In [6]:
from ipywidgets import interact
from scipy import stats
# https://ipywidgets.readthedocs.io/en/latest/
In [7]:
@interact(index=(0, xtrain.shape[1]-1))
def plot_scatter(index):
    """Scatter one training feature against the training target.

    Parameters
    ----------
    index : int
        Column of ``xtrain`` to plot. The ``interact`` decorator supplies it
        from a widget ranging over the valid column indices.

    Also prints the Pearson correlation between the chosen column and
    ``ytrain`` together with its p-value.
    """
    feature_column = xtrain[:, index]
    fig, ax = plt.subplots()
    ax.scatter(feature_column, ytrain)
    ax.set_xlabel(boston.feature_names[index])
    ax.set_ylabel("House Price")
    # pearsonr's result is indexable: [0] is the coefficient, [1] the p-value.
    pearson = stats.pearsonr(feature_column, ytrain)
    print("Correlation: {0[0]:.1} (p-value: {0[1]:.1})".format(pearson))
In [8]:
from sklearn import linear_model
from sklearn import metrics
In [9]:
# Ordinary least squares baseline: fit on the training split, then predict
# the held-out rows. `lm` is whatever fit() hands back and is what the
# prediction is made with.
lr = linear_model.LinearRegression()
lm = lr.fit(X=xtrain, y=ytrain)
yhat = lm.predict(X=xtest)
In [10]:
# Predicted-vs-actual scatter for the test split.
# The observed targets go on the x-axis and the model's predictions on the
# y-axis so that the data matches the axis labels below (the two arrays were
# previously passed in the opposite order, contradicting the labels).
plt.plot(ytest, yhat, '.')
# Green diagonal = perfect prediction (predicted == actual) over the 0..50 range.
plt.plot([0,50], [0,50],'g-')
plt.xlabel('actual')
plt.ylabel('predicted')
Out[10]:
In [11]:
# Score the baseline on the held-out rows: mean squared error, its square
# root, and the coefficient of determination. Compute everything first,
# then report in the same order.
mse = metrics.mean_squared_error(y_true=ytest, y_pred=yhat)
rmse = np.sqrt(mse)
r2score = metrics.r2_score(y_true=ytest, y_pred=yhat)

print("MSE of Test Set: ", mse)
print("RMSE of Test Set:", rmse)
print("R2 - Coef. of Determination (1=Perfect Fit; 0=No explanatory Power)", r2score)
In [12]:
import numpy as np
from sklearn import linear_model as linear_model
from sklearn import metrics
In [13]:
# 1) Lasso constructed with no arguments, i.e. the library defaults.
model = linear_model.Lasso()
lasso = model.fit(xtrain, ytrain)

# Predict both splits, then compute MSE and R^2 for each (train first).
yhat_train, yhat_test = lasso.predict(xtrain), lasso.predict(xtest)
mse_train, mse_test = (
    metrics.mean_squared_error(ytrain, yhat_train),
    metrics.mean_squared_error(ytest, yhat_test),
)
r2score_train, r2score_test = (
    metrics.r2_score(ytrain, yhat_train),
    metrics.r2_score(ytest, yhat_test),
)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [14]:
# 2) Same Lasso experiment with the regularisation strength set to alpha=0.1.
model = linear_model.Lasso(alpha=0.1)
lasso = model.fit(X=xtrain, y=ytrain)

yhat_train = lasso.predict(xtrain)
yhat_test = lasso.predict(xtest)

# Error and goodness-of-fit on each split.
mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)
r2score_train = metrics.r2_score(y_true=ytrain, y_pred=yhat_train)
r2score_test = metrics.r2_score(y_true=ytest, y_pred=yhat_test)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [15]:
# 3) Lasso with alpha=0.01 and the estimator's own `normalize` option on.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook targets.
model = linear_model.Lasso(alpha=0.01, normalize=True)
lasso = model.fit(xtrain, ytrain)

yhat_train, yhat_test = lasso.predict(xtrain), lasso.predict(xtest)

mse_train, mse_test = (
    metrics.mean_squared_error(ytrain, yhat_train),
    metrics.mean_squared_error(ytest, yhat_test),
)
r2score_train, r2score_test = (
    metrics.r2_score(ytrain, yhat_train),
    metrics.r2_score(ytest, yhat_test),
)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [16]:
# Ridge regression with alpha=.06 and the estimator's `normalize` option on.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook targets.
model = linear_model.Ridge(alpha=.06, normalize=True)
ridge = model.fit(X=xtrain, y=ytrain)

yhat_train = ridge.predict(xtrain)
yhat_test = ridge.predict(xtest)

# Same four metrics as the Lasso cells, so the runs can be compared directly.
mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)
r2score_train = metrics.r2_score(y_true=ytrain, y_pred=yhat_train)
r2score_test = metrics.r2_score(y_true=ytest, y_pred=yhat_test)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [17]:
# ARDRegression constructed with no arguments, evaluated the same way as the
# previous estimators.
model = linear_model.ARDRegression()
ardr = model.fit(xtrain, ytrain)

yhat_train, yhat_test = ardr.predict(xtrain), ardr.predict(xtest)

mse_train, mse_test = (
    metrics.mean_squared_error(ytrain, yhat_train),
    metrics.mean_squared_error(ytest, yhat_test),
)
r2score_train, r2score_test = (
    metrics.r2_score(ytrain, yhat_train),
    metrics.r2_score(ytest, yhat_test),
)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [18]:
# LassoCV with cv=20 and the estimator's `normalize` option on; fitted on the
# training split only.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook targets.
model = linear_model.LassoCV(cv=20, normalize=True)
lassocv = model.fit(X=xtrain, y=ytrain)

yhat_train = lassocv.predict(xtrain)
yhat_test = lassocv.predict(xtest)

mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)
r2score_train = metrics.r2_score(y_true=ytrain, y_pred=yhat_train)
r2score_test = metrics.r2_score(y_true=ytest, y_pred=yhat_test)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print("R2 Score (Train):", r2score_train)
print("R2 Score (Test) :", r2score_test)
In [19]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 trees, each limited to depth 3.
# random_state is pinned (same seed as the train/test split) so the fitted
# trees — and therefore the metrics below and the feature-importance ranking
# computed from `forest` in the following cells — are identical on every run.
model = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=2)
forest = model.fit(xtrain, ytrain)
# Show the estimator's configuration alongside its scores.
print(model)

yhat_train = forest.predict(xtrain)
yhat_test = forest.predict(xtest)

# Same four metrics as the linear-model cells, so the runs can be compared.
mse_train = metrics.mean_squared_error(ytrain, yhat_train)
mse_test = metrics.mean_squared_error(ytest, yhat_test)
r2score_train = metrics.r2_score(ytrain, yhat_train)
r2score_test = metrics.r2_score(ytest, yhat_test)

print("MSE (Train) :", mse_train)
print("MSE (Test) :", mse_test)
print ("R2 Score (Train):", r2score_train)
print ("R2 Score (Test) :", r2score_test)
In [20]:
# Rank the features by the fitted forest's importance scores.
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (one row
# per tree, standard deviation taken down the columns).
per_tree_importances = np.asarray([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)
# Feature indices ordered from most to least important.
indices = importances.argsort()[::-1]

print("Feature ranking:")
for f, feature_idx in enumerate(indices[:xtrain.shape[1]]):
    print("%d. feature %d (%f)" % (f + 1, feature_idx, importances[feature_idx]))
In [21]:
# Bar chart of the importances in ranked order; the error bars are the
# per-tree standard deviation and the tick labels are the feature indices.
plt.figure(figsize=(8, 5))
plt.title("Feature importances")
plt.bar(
    range(xtrain.shape[1]),
    importances[indices],
    yerr=std[indices],
    color="r",
    align="center",
)
plt.xticks(range(xtrain.shape[1]), indices)
# Leave half a bar of margin on either side of the first and last bar.
plt.xlim([-1, xtrain.shape[1]])
plt.show()
In [22]:
import numpy as np
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# NOTE(review): none of the cells visible after this point read `boston`;
# this reload looks like a leftover. It is kept so the name stays bound.
boston = load_boston()

# Synthetic regression problem with as many features (50) as samples (50),
# of which only 10 are informative. From here on xtrain/xtest/ytrain/ytest
# refer to this synthetic data, replacing the earlier split.
# Both the generator and the split are seeded so that every number and plot
# in the cells that follow is reproducible from a fresh kernel.
xdata, ydata = datasets.make_regression(
    n_samples=50,
    n_features=50,
    n_informative=10,
    random_state=2,
)
xtrain, xtest, ytrain, ytest = train_test_split(
    xdata,
    ydata,
    train_size=0.75,
    test_size=0.25,
    random_state=2,
)
In [23]:
# Unregularised least squares on the synthetic data, as the reference point
# for the regularised fits.
model = linear_model.LinearRegression()
ols = model.fit(X=xtrain, y=ytrain)

yhat_train, yhat_test = ols.predict(xtrain), ols.predict(xtest)
mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)

print("MSE (Train): ", mse_train)
print("MSE (Test) : ", mse_test)
# The estimator's own score() on each split.
print("Model Score (Train):", model.score(xtrain, ytrain))
print("Model Score (Test) :", model.score(xtest, ytest))
In [24]:
# Ridge regression (alpha=2.5) on the synthetic data.
model = linear_model.Ridge(alpha=2.5)
ridge = model.fit(X=xtrain, y=ytrain)

yhat_train, yhat_test = ridge.predict(xtrain), ridge.predict(xtest)
mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)

print("MSE (Train): ", mse_train)
print("MSE (Test) : ", mse_test)
# The estimator's own score() on each split.
print("Model Score (Train):", model.score(xtrain, ytrain))
print("Model Score (Test) :", model.score(xtest, ytest))
In [25]:
# Lasso (alpha=1.0) on the synthetic data.
model = linear_model.Lasso(alpha=1.0)
lasso = model.fit(X=xtrain, y=ytrain)

yhat_train, yhat_test = lasso.predict(xtrain), lasso.predict(xtest)
mse_train = metrics.mean_squared_error(y_true=ytrain, y_pred=yhat_train)
mse_test = metrics.mean_squared_error(y_true=ytest, y_pred=yhat_test)

print("MSE (Train): ", mse_train)
print("MSE (Test) : ", mse_test)
# The estimator's own score() on each split.
print("Model Score (Train):", model.score(xtrain, ytrain))
print("Model Score (Test) :", model.score(xtest, ytest))
In [ ]:
In [26]:
# Sweep 200 regularisation strengths, log-spaced from 1e-4 to 1e3. For each
# one, fit a Lasso and record its coefficient vector plus the train and
# test MSE; the next cell plots these arrays.
alphas = np.logspace(-4, 3, 200)
coeffs = np.zeros((alphas.size, xtrain.shape[1]))  # one row per alpha
mse_train = np.zeros(alphas.shape)
mse_test = np.zeros(alphas.shape)

for n, alpha in enumerate(alphas):
    # To trace the ridge path instead, use linear_model.Ridge(alpha=alpha) here.
    model = linear_model.Lasso(alpha=alpha, tol=0.001)
    model.fit(xtrain, ytrain)
    coeffs[n] = model.coef_
    mse_train[n] = metrics.mean_squared_error(y_true=ytrain, y_pred=model.predict(xtrain))
    mse_test[n] = metrics.mean_squared_error(y_true=ytest, y_pred=model.predict(xtest))
In [27]:
# Two panels sharing a log10(alpha) x-axis:
#   left  - one line per feature showing its coefficient across the sweep
#   right - train and test MSE across the sweep, on a log y-axis
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharex=True)
log_alphas = np.log10(alphas)

for n in range(coeffs.shape[1]):
    axes[0].plot(log_alphas, coeffs[:, n], color='k', lw=0.5)

axes[1].semilogy(log_alphas, mse_train, label="train")
axes[1].semilogy(log_alphas, mse_test, label="test")
axes[1].legend(loc=0)

axes[0].set_xlabel("log-alpha", fontsize=15)
axes[1].set_xlabel("log-alpha", fontsize=15)
axes[0].set_ylabel("coefficients", fontsize=15)
axes[1].set_ylabel(r"mse", fontsize=15)
Out[27]:
In [ ]:
In [28]:
# LASSO-CV: let cross-validation pick alpha, then report error and score on
# the train split, the test split and the full data set.
# NOTE(review): the model is fitted on xdata/ydata, which xtrain and xtest
# were both split from, so the "Test" figures here are not a held-out
# estimate — confirm this is intended.
model = linear_model.LassoCV()
lassocv = model.fit(X=xdata, y=ydata)

yhat_train, yhat_test, yhat_data = (
    lassocv.predict(xtrain),
    lassocv.predict(xtest),
    lassocv.predict(xdata),
)
mse_train, mse_test, mse_data = (
    metrics.mean_squared_error(ytrain, yhat_train),
    metrics.mean_squared_error(ytest, yhat_test),
    metrics.mean_squared_error(ydata, yhat_data),
)

print("MSE (Train): ", mse_train)
print("MSE (Test) : ", mse_test)
print("MSE (Data) : ", mse_data)
print("Model Score (Train):", model.score(xtrain, ytrain))
print("Model Score (Test) :", model.score(xtest, ytest))
print("Model Score (Data) :", model.score(xdata, ydata))
In [29]:
# Report the alpha stored on the fitted model, both raw and as log10 so it
# can be located on the log-alpha axis used in the sweep plot.
selected_alpha = model.alpha_
print("Alpha = ", selected_alpha)
print("Log-Alpha = ", np.log10(selected_alpha))
In [30]:
# ELASTICNET-CV: cross-validated elastic net, fitted and scored on the full
# synthetic data set.
model = linear_model.ElasticNetCV()
elasticnetcv = model.fit(X=xdata, y=ydata)

yhat_data = elasticnetcv.predict(xdata)
mse_data = metrics.mean_squared_error(y_true=ydata, y_pred=yhat_data)

print("MSE (Data) : ", mse_data)
print("Model Score (Data) :", model.score(xdata, ydata))
In [31]:
# Report the hyper-parameters chosen by cross-validation.
# Both values are read from the fitted attributes (trailing underscore).
# `l1_ratio` without the underscore is only the constructor argument, which
# would echo back the candidate value(s) passed in rather than the one
# selected during fitting.
print(model.alpha_)
print(model.l1_ratio_)
In [32]:
# .. keep exploring!
In [ ]: