In [1]:
# import
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline
In [2]:
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]
plt.figure()
plt.title('Pizza price plotted against diameter')
plt.xlabel('Diameter in inches')
plt.ylabel('Price in dollars')
plt.plot(X, y, 'k.')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.show()
Simple Linear Regression
In [3]:
lReg = LinearRegression()
# fit the model on the training data
lReg.fit(X, y)
# predict the price of a 12" pizza
print("Predicted price for a 12\" pizza:", lReg.predict([[12]]))
y_pred = lReg.predict(X)
In [4]:
# plot the data, the fitted regression line, and the predicted point for a 12" pizza
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]
plt.figure()
plt.title('Pizza price plotted against diameter')
plt.xlabel('Diameter in inches')
plt.ylabel('Price in dollars')
plt.plot(X, y, 'k.')
plt.plot([12], lReg.predict([[12]]), c='r', marker='.')
plt.axis([0, 25, 0, 25])
plt.grid(True)
x1 = [[0], [25]]
y1 = lReg.predict(x1)
plt.plot(x1, y1, 'g')
plt.show()
In [5]:
# printing the model coefficient and intercept
print("Coefficient : ", lReg.coef_)
print("Intercept : ", lReg.intercept_)
A cost function, also called a loss function, is used to define and measure the error of a model.
The differences between the predicted and observed values in the test data are called prediction errors or test errors.
We are going to use the Residual Sum of Squares (RSS) cost function to evaluate our model:
$$ RSS = \sum_{i=1}^{n} (y_i - f(x_i))^2 $$
In [6]:
# Residual Sum of Squares (RSS) of the fitted model on the training data
print("RSS : ", np.sum(np.square(y - lReg.predict(X))))
Variance
Variance is a measure of how far a set of values is spread out. If all of the numbers
in the set are equal, the variance of the set is zero. A small variance indicates that the
numbers are near the mean of the set, while a set containing numbers that are far
from the mean and each other will have a large variance. Variance can be calculated
using the following equation:
$$ var(x) = \frac{\sum_{i=1}^{n} (x_i - \bar{x})^2}{n - 1} $$
In [7]:
print("X: ", X)
print ("mean X: ", np.mean(X))
var = np.sum(np.square(X - np.mean(X)))/(len(X)-1)
print("Variance: ", var)
print("Variance: ",np.var(X, ddof=1))
Covariance
Covariance is a measure of how much two variables change together. If the value of
the variables increase together, their covariance is positive. If one variable tends to
increase while the other decreases, their covariance is negative. If there is no linear
relationship between the two variables, their covariance will be equal to zero; the
variables are linearly uncorrelated but not necessarily independent. Covariance can
be calculated using the following formula:
$$ cov(x, y) = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{n - 1} $$
In [8]:
X1 = [6, 8, 10, 14, 18]
y1 = [7, 9, 13, 17.5, 18]
xbar = np.mean(X1)
ybar = np.mean(y1)
covar = np.sum((X1 - xbar)*(y1 - ybar))/(len(X1)-1)
print("CoVariance: ", covar)
print("CoVariance: ",np.cov(X1, y1)[0,1])
We know that simple linear regression follows the equation: $$ y = \alpha + \beta{x} $$
where $$ \beta = \frac{cov(x, y)}{var(x)} $$ and $$ \alpha = \bar{y} - \beta\bar{x} $$
In [9]:
# estimate beta and alpha from the covariance and variance computed above
beta = covar/var
alpha = np.mean(y) - beta*np.mean(X)
print("beta (coefficient) = ", beta)
print("alpha (intercept) = ", alpha)
In [10]:
x2 = [[11], [18]]
print(lReg.predict(x2))
R-squared measures how well the observed values of the response variables are predicted by the model. More concretely, r-squared is the proportion of the variance in the response variable that is explained by the model. An r-squared score of one indicates that the response variable can be predicted without any error using the model. An r-squared score of one half indicates that half of the variance in the response variable can be predicted using the model. There are several methods to calculate r-squared. In the case of simple linear regression, r-squared is equal to the square of the Pearson product moment correlation coefficient, or Pearson's r.
$$ R^2 = 1 - \frac{SS_{res}}{SS_{tot}} $$
where
$$ SS_{res} = \sum_{i=1}^{n} (y_i - f(x_i))^2 $$
$$ SS_{tot} = \sum_{i=1}^{n} (y_i - \overline{y})^2 $$
Using this method, r-squared must be a positive number between zero and one. This method is intuitive; if r-squared describes the proportion of variance in the response variable explained by the model, it cannot be greater than one or less than zero. Other methods, including the method used by scikit-learn, do not calculate r-squared as the square of Pearson's r, and can return a negative r-squared if the model performs extremely poorly.
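Because this is simple linear regression, the claim above can be verified directly: the model's r-squared should equal the square of Pearson's r. A minimal sketch using np.corrcoef (assumes the single-feature X, y and the fitted lReg from the earlier cells):
In [ ]:
# for simple linear regression, R^2 equals the square of Pearson's r
r = np.corrcoef(np.ravel(X), np.ravel(y))[0, 1]
print("Pearson's r squared:", r ** 2)
print("Model score        :", lReg.score(X, y))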
In [11]:
# SS_res and SS_tot as defined above
SSres = np.sum(np.square(y - lReg.predict(X)))
print(SSres)
SStot = np.sum(np.square(y - np.mean(y)))
print(SStot)
Rsqr = 1 - (SSres/SStot)
print("R squared score:", Rsqr)
print("R squared score:", lReg.score(X, y))
In [12]:
# now each pizza has two features - diameter and number of toppings
X = [[6,2],[8,1],[10, 0],[14,2],[18, 0]]
y = [[7],[9],[13],[17.5],[18]]
X_test = [[8,2],[9,0],[11,2],[16,2],[12, 0]]
y_test = [[11],[8.5],[15],[18],[11]]
In [13]:
lReg2 = LinearRegression()
lReg2.fit(X, y)
y_pred = lReg2.predict(X_test)
print("Training Score:", lReg2.score(X_test, y_test))
print("Test Score:", lReg2.score(X_test, y_test))
for i, yp in enumerate(y_pred):
print("predicted : ",yp, " Actual : ", y_test[i])
In [29]:
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]
X_test = [[6],[8],[11],[16]]
y_test = [[8],[12],[15],[18]]
In [79]:
lReg3 = LinearRegression()
lReg3.fit(X, y)
print(lReg3.coef_, lReg3.intercept_)
xx = np.linspace(0, 20, 100)
yy = lReg3.predict(xx.reshape(xx.shape[0],1))
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(xx, yy)
ax.legend(['Linear 1 degree'])
Out[79]:
In [83]:
# feature preprocessing: add quadratic polynomial features
quad = PolynomialFeatures(degree=2)
X_quad_train = quad.fit_transform(X)
X_quad_test = quad.transform(X_test)
lReg4 = LinearRegression()
lReg4.fit(X_quad_train, y)
xx = np.linspace(0, 20, 100)
xx_quad = quad.transform(xx.reshape(xx.shape[0], 1))
yy = lReg4.predict(xx_quad)
print("R squared for polynomial fit : ", lReg4.score(X_quad_test, y_test))
ax.plot(xx, yy, linestyle='--')
ax.plot(X, y, 'r.')
ax.legend(['Linear', 'Poly 2 degree'])
ax.grid(True)
fig
Out[83]:
Let's try a 9-degree polynomial
In [87]:
lReg9 = LinearRegression()
# feature preprocessing: add degree-9 polynomial features
nine = PolynomialFeatures(degree=9)
X_nine_train = nine.fit_transform(X)
X_nine_test = nine.transform(X_test)
# fitting the model
lReg9.fit(X_nine_train, y)
xx_nine = nine.transform(xx.reshape(xx.shape[0], 1))
yy = lReg9.predict(xx_nine)
#print(xx_nine)
print("R squared for 9 degree polynomial fit : ", lReg9.score(X_nine_test, y_test))
ax.plot(xx, yy, linestyle='-', c='r')
ax.legend(['Linear', 'Poly 2 degree', 'Poly 9 degree'], loc='upper center')
fig
# the negative R squared value indicates that the 9-degree model performs very poorly on the test data (it overfits the five training points)
Out[87]:
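To make the overfitting explicit, a minimal sketch (reusing X, y, X_test, y_test from the cells above) compares training and test r-squared across a few degrees; the training score improves as the degree grows while the test score eventually collapses:
In [ ]:
# minimal sketch: training vs. test R^2 for several polynomial degrees
for degree in [1, 2, 9]:
    poly = PolynomialFeatures(degree=degree)
    Xtr = poly.fit_transform(X)
    Xte = poly.transform(X_test)
    model = LinearRegression().fit(Xtr, y)
    print("degree", degree,
          " train R^2:", round(model.score(Xtr, y), 3),
          " test R^2:", round(model.score(Xte, y_test), 3))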