In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [3]:
# EXAMPLE: Predict house price (Gijón)
# Generate N data instances
N = 100
x = np.random.uniform(50, 200, size=N)
y = [1500*i + np.random.normal(0,i*200) for i in x] # Gijón
y = [np.max([i,8]) for i in y] # Clip prices from below
# Transform the data to numpy format for sklearn
x = np.array(x).reshape(x.shape + (1,)) # x is one-dimensional and sklearn requires matrix data, so we add one dimension
y = np.array(y)
In [4]:
# EXAMPLE: Predict house price (Gijón): plot
# Preplot
fig = plt.figure()
ax = fig.gca()
# Plot the data points
ax.scatter(x, y, label='data')
plt.xlabel('m^2')
plt.ylabel('Price')
ax.set_xlim(0,220)
ax.set_ylim(0,700000)
#ax.legend()
plt.show()
In [5]:
# EXAMPLE: Predict house price (Gijón): estimate model
# Train linear regression model
from sklearn.linear_model import LinearRegression
# This works the same way for all supervised learning algorithms in scikit-learn: create an object and call fit
lr = LinearRegression(fit_intercept=False)
lr.fit(x, y)
w = lr.coef_
print 'Estimated coefficient: {}'.format(w)
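Once the model is fitted, the same object also exposes `predict`, which is usually preferable to multiplying by the coefficients ourselves. A minimal sketch (the 100 m^2 query is just an illustrative value):
In [ ]:
# Sketch: predict with the fitted estimator instead of computing w*x by hand
print(lr.predict(np.array([[100.0]])))  # predicted price for a hypothetical 100 m^2 flat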
In [7]:
# EXAMPLE: Predict house price (Gijón)
# Plot
fig = plt.figure()
ax = fig.gca()
# Plot the data points
ax.scatter(x, y, label='data')
# Compute the estimated function to plot it as a line
xb = [np.min(x)-10, np.max(x)+10]
z = [w[0]*xb[0], w[0]*xb[1]]
ax.plot(xb, z, label='pred', color='red', linewidth=2)
plt.xlabel('m^2')
plt.ylabel('Price')
ax.set_xlim(0,220)
ax.set_ylim(0,700000)
#ax.legend()
plt.show()
#plt.savefig('../pres/images/lr_weight_1v_pred.pdf')
In [8]:
# EXAMPLE: Predict house price (Gijón)
# Predict random samples
x = np.random.rand(1)*200
print 'm^2: {}. Predicted price: {}'.format(x, w*x)
In [9]:
# EXAMPLE: predict house price (Madrid)
# Generate N data instances
N = 100
x = np.random.uniform(50, 200, size=N)
y = [2000*i + 100000 + np.random.normal(0,i*300) for i in x]
# Transform the data to numpy format for sklearn
x = np.array(x).reshape(x.shape + (1,)) # x is one-dimensional and sklearn requires matrix data, so we add one dimension
y = np.array(y)
In [10]:
# EXAMPLE: predict house price (Madrid)
# Train linear regression model
lr = LinearRegression(fit_intercept=True)
lr.fit(x, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
In [11]:
# EXAMPLE: predict house price (Madrid)
# Plot
fig = plt.figure()
ax = fig.gca()
ax.scatter(x, y, label='data')
# Compute the predicted function to plot it as a line
xb = [np.min(x), np.max(x)]
z = [w[0]*xb[0]+b, w[0]*xb[1]+b]
ax.plot(xb, z, label='pred', color='red', linewidth=2)
plt.xlabel('m^2')
plt.ylabel('Price')
ax.set_xlim(0,220)
ax.set_ylim(0,700000)
#ax.legend()
plt.show()
#plt.savefig('../pres/images/lr_madrid_1v_pred.pdf')
In [12]:
# EXAMPLE: Predict house price with 2 variables
# Generate data
N = 100
x1 = np.random.uniform(50, 200, size=N)
x2 = np.random.uniform(0, 75, size=N)
y = [2000*i - 1000*j + np.random.normal(0,i*j*10) for i,j in zip(x1,x2)]
# Transform the data to numpy format for sklearn
x = np.column_stack([x1, x2])
y = np.array(y)
In [13]:
# EXAMPLE: Predict house price with 2 variables
# Train linear regression model
lr = LinearRegression(fit_intercept=True)
lr.fit(x, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
In [15]:
# EXAMPLE: Predict house price with 2 variables
# Plot
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
# Build the surface
X = np.arange(1, 200, 1)
Y = np.arange(0, 75, 1)
X, Y = np.meshgrid(X, Y)
z_pred = w[0]*X + w[1]*Y + b
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Plot the surface.
surf = ax.plot_surface(X, Y, z_pred, cmap=cm.coolwarm,
linewidth=0, antialiased=False, alpha=0.1, edgecolor=(0,0,0,0)) # Set edgecolor to avoid black edge bug when saving surface to PDF
plt.xlabel('m^2')
plt.ylabel('Distance')
ax.set_zlabel('Price')
ax.scatter(x1, x2, y)
ax.view_init(elev=10, azim=50) #axim=20,80
plt.show()
#plt.savefig('../pres/images/lr_2v.pdf')
In [22]:
# Predict random samples
x1 = np.random.rand(1)*200+20
x2 = np.random.rand(1)*100
print 'm^2: {}, distance to centre: {}. Predicted price: {}'.format(x1, x2, w[0]*x1+w[1]*x2+b)
In [23]:
# Estimate coefficients manually.
# This is the formula we got after deriving the error function with respect to the coefficients
# First, add a column of ones to X to estimate the bias
xe = np.column_stack([x,np.ones(x.shape[0])])
# then compute the estimate with the normal equations
w = np.linalg.inv(xe.T.dot(xe)).dot(xe.T.dot(y))
w
Out[23]:
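The cell above evaluates the normal equations: once a column of ones is appended to $X$ so the bias is absorbed into the weight vector, the least-squares solution is $w = (X^\top X)^{-1} X^\top y$, which should match the coefficients scikit-learn estimated above (the bias appears as the last component).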
In [24]:
# EXAMPLE: predict house price only from distance
# Generate data
N = 100
x = np.random.uniform(0, 150, size=N)
y = [50*(i-60)**2 + 50000 + np.random.normal(0,50000) for i in x]
y = [np.max([i,40000+np.random.normal(0,1000)]) for i in y]
# Transform the data to numpy format for sklearn
x = np.array(x).reshape(x.shape + (1,)) # x is one-dimensional and sklearn requires matrix data, so we add one dimension
# Add new variable (feature engineering)
x2 = x**2
x_aug = np.column_stack([x,x2])
y = np.array(y)
In [25]:
# EXAMPLE: predict house price only from distance
# Train linear regression model
lr = LinearRegression(fit_intercept=True)
lr.fit(x_aug, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
fig = plt.figure()
ax = fig.gca()
ax.scatter(x, y, label='data')
# Compute the predicted function to plot it as a line
xb = np.arange(int(np.min(x)), int(np.max(x)))
z = [w[0]*i + w[1]*i**2 + b for i in xb]
ax.plot(xb, z, label='pred', color='red', linewidth=2)
plt.xlabel('distance')
plt.ylabel('Price')
ax.set_xlim(-20,220)
ax.set_ylim(-20,500000)
#ax.legend()
plt.show()
#plt.savefig('../pres/images/lr_quad_pred.pdf')
In [57]:
# Predict random samples
x = np.random.rand(1)*200
print 'Distance: {}. Predicted price: {}'.format(x, w[0]*x+w[1]*x**2+b)
In [88]:
def extend_vector(X, degree=1):
    """
    This function extends the given data set with entries corresponding to powers of the original entries.
    High-order interactions are omitted to avoid combinatorial explosion. Remember, we could also add new features
    of the form x_1*x_2, x_1*x_3 (where x_i is the value of feature i), but this would result in way too many
    features. You can try adding a random number of such interactions and try various models to see what works best.
    """
    X_aug = X
    for i in range(2, degree+1):
        X_aug = np.column_stack([X_aug, X**i])
    return X_aug
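If you do want interaction terms, scikit-learn can generate them for you with `PolynomialFeatures`. A minimal sketch on a toy matrix, not used by the rest of the notebook (the degree and `include_bias` settings are just one reasonable choice):
In [ ]:
# Sketch: polynomial and interaction features with scikit-learn
from sklearn.preprocessing import PolynomialFeatures
demo = np.column_stack([np.arange(1, 4), np.arange(10, 40, 10)])  # 3 samples, 2 features
poly = PolynomialFeatures(degree=2, include_bias=False)  # columns: x1, x2, x1^2, x1*x2, x2^2
print(poly.fit_transform(demo))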
In [89]:
def standardize(x, mean=None, std=None):
    """
    This function transforms the input data to make sure variables are mean-centered and have unit variance.
    Remember: variables can come in very different scales, which can negatively affect learning.
    """
    if mean is None:
        mean = np.mean(x, axis=0)
    if std is None:
        std = np.std(x, axis=0)
    x = (x - mean)/std
    return x, mean, std
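scikit-learn offers the same transformation as `StandardScaler`; a minimal sketch showing the usual pattern of fitting on training data and reusing the learned statistics on test data (mirroring the `mean`/`std` arguments above):
In [ ]:
# Sketch: standardization with scikit-learn (toy data, for illustration only)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
demo_train = np.random.uniform(50, 200, size=(10, 2))
demo_test = np.random.uniform(50, 200, size=(5, 2))
demo_train_s = scaler.fit_transform(demo_train)  # learns mean and std from the training data
demo_test_s = scaler.transform(demo_test)        # reuses them on the test data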
Here we introduce overfitting. When a model is complex enough, it can represent the idiosyncrasies of the data, such as noise and outliers, which will most likely hurt predictions on new data.
To combat this undesirable phenomenon, we can introduce regularizing penalties. Ridge regression and the LASSO are two ways of achieving this with linear regression.
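As a reminder, in scikit-learn's parameterization (where alpha controls the strength of the penalty), ridge regression minimizes $\lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$, while the LASSO minimizes $\frac{1}{2n} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1$. The $\ell_2$ penalty shrinks the coefficients towards zero, whereas the $\ell_1$ penalty can drive some of them exactly to zero.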
In [106]:
def get_cv_split(X, current_split, count=10):
    """
    This function generates train and test indices for one CV split
    """
    if current_split >= count:
        raise ValueError('current_split must be smaller than count')
    N = X.shape[0]
    split_size = int(N/float(count))
    test = np.arange(current_split*split_size, np.min([(current_split+1)*split_size, N]))
    train = np.delete(np.arange(N), test)
    return train, test
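scikit-learn provides an equivalent helper, `KFold` (in `sklearn.model_selection`; older versions expose it in `sklearn.cross_validation`). A minimal sketch of how it would replace `get_cv_split`:
In [ ]:
# Sketch: the same kind of contiguous, unshuffled splits with scikit-learn's KFold
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
for train_idx, test_idx in kf.split(np.arange(40)):
    pass  # train_idx/test_idx play the same role as train/test above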
In [107]:
def rmse(y, pred):
    """
    This function computes the root mean squared error between the given samples
    """
    return np.sqrt(np.sum((pred - y)**2)/len(y))
In [112]:
########################################
# Generate data
# Examples: Overfitting, ridge, lasso
N = 100
X = np.random.uniform(50, 200, size=N)
y = [1000*i + np.random.normal(0,i*200) for i in X]
# Create a new feature vector with powers up to DEGREE
DEGREE=18
x_aug = extend_vector(X, DEGREE)
# Transform the data to numpy format for sklearn
#X = np.array(X).reshape(X.shape + (1,)) # x is one-dimensional and sklearn requires matrix data, so we add one dimension
y = np.array(y)
# Standardize
#x = (x-np.mean(x))/np.std(x)
y_mean = np.mean(y)
y_std = np.std(y)
y = (y-y_mean)/y_std
x_aug, x_aug_mean, x_aug_std = standardize(x_aug)
In [166]:
# Train linear regression model
# Note how the resulting coefficients are very high if we use high dimensional polynomial feature vectors
lr = LinearRegression(fit_intercept=True)
lr.fit(x_aug, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
In [167]:
# Predict random data
# The predictions of an overfitted model are ok within the range of the training data
# but very bad for anything else
x_test = np.random.uniform(50, 250)
xe = extend_vector(x_test, DEGREE)
xe = (xe-x_aug_mean)/x_aug_std
print x_test, lr.predict(xe)*y_std+y_mean
In [288]:
# Generate test data to demonstrate the effect of overfitting
x_test = np.random.uniform(50, 250, size=N)
y_test = [1000*i + np.random.normal(0,i*200) for i in x_test]
# Standardize the test targets with the statistics of the training targets
y_test = (np.array(y_test)-y_mean)/y_std
x_test_aug = extend_vector(x_test, DEGREE)
x_test_aug,_,_ = standardize(x_test_aug, x_aug_mean, x_aug_std)
In [289]:
# Compute RMSE
# With an overfitted model, the error on the training set is fine, but not in the test set
preds = lr.predict(x_aug)*y_std+y_mean
print rmse(y*y_std+y_mean, preds)
preds = lr.predict(x_test_aug)*y_std+y_mean
print rmse(y_test*y_std+y_mean, preds)
In [290]:
########################################
# Cross validation
########################################
# A basic example of CV. Remember: in order to know how well our model will do in the wild, we need to set aside
# a test split, which is not used for training. In k-fold CV, we make k such splits. For each split, we train on the
# remaining data and test on the held-out part. We can then take the mean and standard deviation of the errors and
# tweak the parameters if necessary.
k=4
for i in range(k):
    t = np.arange(N)
    train, test = get_cv_split(t, i, k)
    x_train = x_aug[train,:]
    x_test = x_aug[test,:]
    y_train = y[train]
    y_test = y[test]
    # Fit the model to the train data, then evaluate on both splits
    lr.fit(x_train, y_train)
    preds = lr.predict(x_train)*y_std+y_mean
    print 'Train RMSE: {}'.format(rmse(y_train*y_std+y_mean, preds))
    preds = lr.predict(x_test)*y_std+y_mean
    print 'Test RMSE: {}'.format(rmse(y_test*y_std+y_mean, preds))
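The same loop can be written with scikit-learn's `cross_val_score`; a minimal sketch (scikit-learn reports negated MSE, so we flip the sign and take the square root to recover RMSE on the standardized targets):
In [ ]:
# Sketch: k-fold cross validation with scikit-learn
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LinearRegression(fit_intercept=True), x_aug, y,
                         scoring='neg_mean_squared_error', cv=4)
print(np.sqrt(-scores))  # per-fold test RMSE on the standardized targets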
In [291]:
# EXAMPLE: Ridge regression
from sklearn.linear_model import Ridge
# Train ridge regression model
# Try going up and generating a test set again, then compute the RMSE
# Try different values of alpha (strength of the regularization). It might take a few tries to get a good model
lr = Ridge(alpha=1, fit_intercept=True)
lr.fit(x_aug, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
In [292]:
#EXAMPLE: Lasso
from sklearn.linear_model import Lasso
# Train Lasso model
# Try going up and generating a test set again, then compute the RMSE
# Try different values of alpha (strength of the regularization). It might take a few tries to get a good model
lr = Lasso(alpha=0.01, fit_intercept=True)
lr.fit(x_aug, y)
w = lr.coef_
b = lr.intercept_
print 'Estimated coefficients: {}, {}'.format(w,b)
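Rather than tuning alpha by hand, `RidgeCV` and `LassoCV` select it by cross-validation; a minimal sketch (the alpha grids below are just examples):
In [ ]:
# Sketch: choosing the regularization strength by cross-validation
from sklearn.linear_model import RidgeCV, LassoCV
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0]).fit(x_aug, y)
lasso_cv = LassoCV(alphas=[0.001, 0.01, 0.1, 1.0], cv=5).fit(x_aug, y)
print(ridge_cv.alpha_, lasso_cv.alpha_)  # selected alphas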
In [293]:
############################################
# PLOTS
# Examples: overfitting, ridge, lasso
fig = plt.figure()
ax = fig.gca()
ax.scatter(X, y, label='data')
# Compute the predicted function to plot it as a line
xb = np.arange(-10,210,0.1)
xb_aug = extend_vector(xb, DEGREE)
xb_aug = (xb_aug-x_aug_mean)/x_aug_std
z = [w.T.dot(i) + b for i in xb_aug]
ax.plot(xb, z, label='pred', color='red', linewidth=2)
plt.xlabel('m^2')
plt.ylabel('Price')
ax.set_xlim(0,220)
ax.set_ylim(-3,5)
#ax.legend()
plt.show()
#plt.savefig(('../pres/images/ridge_p.pdf'))
In [296]:
# Predict random samples
x = np.random.rand()*300
x_aug_t = np.array([x**i for i in range(1, DEGREE+1)], dtype=float)
x_aug_t = (x_aug_t - x_aug_mean)/x_aug_std
print 'm^2: {}. Predicted price: {}'.format(x, (w.T.dot(x_aug_t)+b) * y_std + y_mean)