The sklearn.linear_model.LinearRegression class is an estimator: an object that learns from observed data and predicts values for new data. In scikit-learn, every estimator implements the fit() and predict() methods. fit() learns the parameters of a model from training data, and predict() uses those learned parameters to predict the response variable for new explanatory variables. Because this interface is uniform across estimators, it is easy to swap one model for another and experiment.
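In its simplest form the pattern looks like this (a minimal sketch; the cells below walk through it with real data):

In [ ]:
model = LinearRegression()       #Instantiate the estimator
model.fit(X_train, y_train)      #Learn the model parameters
y_pred = model.predict(X_test)   #Predict responses with the learned parameters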
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
X = [[6], [8], [10], [14], [18]]
Y = [[7], [9], [13], [17.5], [18]]
In [4]:
plt.figure()
plt.title("Pizza price plotted against diameter")
plt.xlabel("Diameter in inches")
plt.ylabel("Price in dollars")
plt.plot(X,Y,"k.")
plt.axis([0,25,0,25])
plt.grid(True)
plt.show()
In [23]:
#Fit a simple linear regression
from sklearn.linear_model import LinearRegression
#Create and fit the model
model = LinearRegression()
model.fit(X,Y)
#Make a prediction (predict expects a 2-D array: one row per sample)
print('A 12" pizza should cost: %0.2f' % model.predict([[12]])[0][0])
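The fitted parameters themselves are exposed as attributes on every fitted scikit-learn linear model, which makes it easy to inspect what fit() learned:

In [ ]:
print("Intercept (alpha):", model.intercept_)
print("Slope (beta):", model.coef_)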
In [29]:
import numpy as np
print("Residual sum of squares: %.2f" %np.mean((model.predict(X)-Y)**2))
In [51]:
from __future__ import division #Python 2 compatibility; a no-op in Python 3
xbar = np.mean(X)
ybar = np.mean(Y)
print("Mean of X is:", xbar)
print("Mean of Y is:", ybar)
In [55]:
#Define variance and covariance by hand to see how they work
def variance(X):
    return np.sum((X - np.mean(X)) ** 2) / (len(X) - 1)

def covariance(X, Y):
    return np.sum((X - np.mean(X)) * (Y - np.mean(Y))) / (len(X) - 1)
In [56]:
print("Variance of X: ", variance(X))
print("Covariance of X, Y is: ", covariance(X,Y))
In [57]:
#For simple linear regression, beta = cov(X, Y) / var(X).
#With beta in hand, the intercept is alpha = ybar - beta * xbar
beta = covariance(X, Y) / variance(X)
beta
Out[57]:
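Completing the calculation from the comment above, the intercept follows from beta and the two means, and should match what LinearRegression learned earlier:

In [ ]:
alpha = ybar - beta * xbar
print("alpha:", alpha)
print("sklearn intercept:", model.intercept_)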
In [70]:
#Load another set
X_test = np.array([8,9,11,16,12])
Y_test = np.array([11,8.5,15,18,11])
In [92]:
model.fit(X_test.reshape(-1,1),Y_test)
model.predict(X_test.reshape(-1,1))
Out[92]:
In [96]:
#Total sum of squares
def total_sum_squares(Y):
    return np.sum((Y - np.mean(Y)) ** 2)
In [97]:
#Residual sum of squares
def residual_sum_squares(Y):
    return np.sum((Y - model.predict(X_test.reshape(-1, 1))) ** 2)
In [98]:
#Get R square
1 - residual_sum_squares(Y_test)/total_sum_squares(Y_test)
Out[98]:
In [100]:
#From sklearn
model.score(X_test.reshape(-1,1),Y_test)
Out[100]:
In [101]:
from numpy.linalg import inv
from numpy import dot, transpose
In [103]:
#Design matrix with a leading column of ones for the intercept term
X = [[1, 6, 2], [1, 8, 1], [1, 10, 0], [1, 14, 2], [1, 18, 0]]
X
Out[103]:
In [104]:
y = [[7], [9], [13], [17.5], [18]]
In [107]:
#Solve using linear algebra
dot(inv(dot(transpose(X),X)), dot(transpose(X),y))
Out[107]:
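This is the normal equation, beta = (X^T X)^-1 X^T y. Explicitly inverting X^T X is fine at this size, but solving the linear system directly is the numerically safer habit; a sketch of the same calculation:

In [ ]:
from numpy.linalg import solve
solve(dot(transpose(X), X), dot(transpose(X), y))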
In [109]:
#Solve using numpy least squares procedure
from numpy.linalg import lstsq
lstsq(X, y, rcond=None)[0]
Out[109]:
In [112]:
#Compare simple vs multiple linear regression
X = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
Y = [[7], [9], [13], [17.5], [18]]
In [114]:
model = LinearRegression()
model.fit(X,Y)
Out[114]:
In [115]:
X_test = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
Y_test = [[11], [8.5], [15], [18], [11]]
In [118]:
predictions = model.predict(X_test)
for i, prediction in enumerate(predictions):
    print("Predicted: %s, Target: %s" % (prediction, Y_test[i]))
In [119]:
print("R square:", model.score(X_test, Y_test))
In [120]:
from sklearn.preprocessing import PolynomialFeatures
In [122]:
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
In [123]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Out[123]:
In [125]:
xx = np.linspace(0, 26, 100)
xx
Out[125]:
In [127]:
#Plot the simple linear fit over the 0-26 inch range
yy = regressor.predict(xx.reshape(xx.shape[0], 1))
plt.plot(xx, yy)
Out[127]:
In [129]:
quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_train_quadratic
Out[129]:
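The transformed matrix has columns [1, x, x^2]: a bias term, the original feature, and its square. In scikit-learn 1.0+ the transformer can report the generated column names:

In [ ]:
print(quadratic_featurizer.get_feature_names_out())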
In [130]:
X_test_quadratic = quadratic_featurizer.transform(X_test)
In [131]:
regressor_quadratic = LinearRegression()
In [132]:
regressor_quadratic.fit(X_train_quadratic,y_train)
Out[132]:
In [133]:
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
In [136]:
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r',linestyle = '--')
plt.title("Pizza price regressed on diameter")
plt.xlabel("Diameter in inches")
plt.ylabel("Price in dollars")
plt.axis([0,25,0,25])
plt.grid(True)
plt.scatter(X_train, y_train)
plt.show()
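To quantify what the plot suggests, compare both fitted models' R-squared on the held-out pizzas:

In [ ]:
print("Simple linear R-squared:", regressor.score(X_test, y_test))
print("Quadratic R-squared:", regressor_quadratic.score(X_test_quadratic, y_test))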
In [139]:
import pandas as pd
In [155]:
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv")
df = pd.read_csv(target_url,header=0, sep=";")
In [156]:
df.head()
Out[156]:
In [157]:
df.describe()
Out[157]:
In [158]:
plt.scatter(df['alcohol'], df['quality'])
plt.xlabel("Alcohol")
plt.ylabel("Quality")
plt.title("Alcohol against Quality")
plt.show()
In [159]:
from sklearn.model_selection import train_test_split #sklearn.cross_validation in scikit-learn < 0.20
In [160]:
#Split into feature and target, train and test
X = df[list(df.columns)[:-1]]
y = df['quality']
In [162]:
X.head()
Out[162]:
In [163]:
y.tail()
Out[163]:
In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [169]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
In [170]:
#Check R squared
print("R squared is: ", regressor.score(X_test, y_test))
In [171]:
#Cross-validate the model
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regressor, X, y, cv = 5)
In [172]:
print(scores.mean(), scores)
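For a regressor, cross_val_score uses the estimator's score method by default, so these five numbers are R-squared values, one per fold; the mean summarizes how well the model generalizes across splits.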
In [193]:
from sklearn.datasets import load_boston #Removed in scikit-learn 1.2; requires an older version
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
data = load_boston()
In [194]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
In [195]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test.reshape(-1, 1)).ravel()
In [196]:
regressor = SGDRegressor(loss='squared_error') #Called 'squared_loss' in scikit-learn < 1.0
In [197]:
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print('Cross validation r-squared scores:', scores)
print('Average cross validation r-squared score:', np.mean(scores))
regressor.fit(X_train, y_train)
print('Test set r-squared score', regressor.score(X_test, y_test))
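The manual scaling above is easy to get wrong (for example, fitting a scaler on the test set), so a Pipeline is the more idiomatic approach: it bundles the scaler with the regressor and learns the scaling only from the training data. A minimal sketch on a fresh, unscaled split (the target is left on its original scale here; scikit-learn's TransformedTargetRegressor could scale it as well):

In [ ]:
from sklearn.pipeline import make_pipeline
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(data.data, data.target)
pipeline = make_pipeline(StandardScaler(), SGDRegressor(loss='squared_error'))
pipeline.fit(X_train_raw, y_train_raw)
print('Pipeline test set r-squared:', pipeline.score(X_test_raw, y_test_raw))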