In [6]:
import matplotlib.pyplot as plt
%matplotlib inline
In [29]:
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]
In [9]:
plt.figure()
plt.title("Pizza price vs diameter")
plt.xlabel("dia in inches")
plt.ylabel("price in USD")
plt.plot(X, y, 'k.')
plt.axis([0,25,0,25])
plt.grid(True)
plt.show()
In [10]:
from sklearn.linear_model import LinearRegression
In [11]:
model = LinearRegression()
model.fit(X,y)
Out[11]:
In [13]:
print 'A 12" pizza would cost $%.2f' % model.predict([[12]])[0]
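In [ ]:
# Not in the original notebook: a quick look at the fitted parameters, showing that
# the prediction above is just intercept + coefficient * diameter.
print 'Intercept: %.4f, Coefficient: %.4f' % (model.intercept_[0], model.coef_[0][0])
print 'By hand for a 12" pizza: $%.2f' % (model.intercept_[0] + model.coef_[0][0] * 12)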
In [19]:
model.predict([[8],[12],[15],[20]])
Out[19]:
In [20]:
# cost function: the mean of the squared residuals (RSS / n), i.e. the mean squared error
import numpy as np
In [22]:
print 'Mean squared error (RSS / n): %.2f' % np.mean((model.predict(X) - y) ** 2)
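In [ ]:
# Not in the original notebook: the residual sum of squares proper is the sum
# (not the mean) of the squared residuals; dividing it by the number of samples
# gives the mean squared error printed above.
residuals = model.predict(X) - y
print 'RSS: %.2f' % np.sum(residuals ** 2)
print 'MSE: %.2f' % np.mean(residuals ** 2)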
In [23]:
model.predict(X)
Out[23]:
In [24]:
y
Out[24]:
In [28]:
np.mean((model.predict(X) - y) **2)
Out[28]:
In [34]:
#variance
from __future__ import division
xbar = (6 + 8 + 10 + 14 + 18) / 5
variance = ((6 - xbar)**2 + (8 - xbar)**2 + (10 - xbar)**2 + (14 - xbar)**2 + (18 - xbar)**2) / 4
print variance
# OR
print np.var(X, ddof=1)
In [32]:
X
Out[32]:
In [35]:
#covariance
xbar = (6 + 8 + 10 + 14 + 18) / 5
ybar = (7 + 9 + 13 + 17.5 + 18) / 5
cov = ((6 - xbar) * (7 - ybar) + (8 - xbar) * (9 - ybar) + (10 - xbar) * (13 - ybar) + (14 - xbar) * (17.5 - ybar) + (18 - xbar) * (18 - ybar)) / 4
print cov
print np.cov([6, 8, 10, 14, 18], [7, 9, 13, 17.5, 18])[0][1]
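In [ ]:
# Not in the original notebook: for simple linear regression the slope is
# cov(x, y) / var(x) and the intercept is ybar - slope * xbar. Reusing cov and
# variance from the cells above, this should match the coefficients found by
# LinearRegression earlier.
beta = cov / variance
alpha = ybar - beta * xbar
print 'slope = %.4f, intercept = %.4f' % (beta, alpha)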
In [37]:
# np.cov treats each row as a variable by default, so flatten the column vectors first
print np.cov(np.ravel(X), np.ravel(y))[0][1]
In [39]:
y
Out[39]:
In [41]:
# R-squared:
# In the case of simple linear regression, r-squared is equal to the square of
# the Pearson product-moment correlation coefficient, or Pearson's r.
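In [ ]:
# Not in the original notebook: a quick check of the claim above on the training
# data, using np.corrcoef for Pearson's r; its square should match model.score(X, y).
r = np.corrcoef(np.ravel(X), np.ravel(y))[0][1]
print "Pearson's r squared: %.4f" % (r ** 2)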
In [42]:
# The score method of LinearRegression returns the model's
# r-squared value, as seen in the following example:
In [43]:
from sklearn.linear_model import LinearRegression
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]
X_test = [[8], [9], [11], [16], [12]]
y_test = [[11], [8.5], [15], [18], [11]]
model = LinearRegression()
model.fit(X, y)
print 'R-squared: %.4f' % model.score(X_test, y_test)
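In [ ]:
# Not in the original notebook: R-squared computed by hand on the test set as
# 1 - SS_res / SS_tot, for comparison with the score above; y_test_arr is just
# a helper introduced here.
y_test_arr = np.array(y_test)
ss_res = np.sum((model.predict(X_test) - y_test_arr) ** 2)
ss_tot = np.sum((y_test_arr - y_test_arr.mean()) ** 2)
print 'R-squared (by hand): %.4f' % (1 - ss_res / ss_tot)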
In [47]:
model.score(X,y)
Out[47]:
In [48]:
# We can solve β using NumPy, as follows:
from numpy.linalg import inv
from numpy import dot, transpose
X = [[1, 6, 2], [1, 8, 1], [1, 10, 0], [1, 14, 2], [1, 18, 0]]
y = [[7], [9], [13], [17.5], [18]]
# β = (Xᵀ X)⁻¹ Xᵀ y
print dot(inv(dot(transpose(X), X)), dot(transpose(X), y))
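In [ ]:
# Not in the original notebook: a cross-check of the normal-equation solution above.
# Fitting scikit-learn's LinearRegression on the same design matrix (which already
# contains the column of ones) with fit_intercept=False should recover the same β,
# printed as a row vector.
check = LinearRegression(fit_intercept=False)
check.fit(X, y)
print check.coef_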
In [49]:
# NumPy also provides a least squares function that can solve the values of the parameters more compactly:
from numpy.linalg import lstsq
X = [[1, 6, 2], [1, 8, 1], [1, 10, 0], [1, 14, 2], [1, 18, 0]]
y = [[7], [9], [13], [17.5], [18]]
print lstsq(X, y)[0]
In [50]:
# Let's update our pizza-price predictor program to use the second explanatory
# variable, and compare its performance on the test set to that of the simple
# linear regression model.
In [51]:
from sklearn.linear_model import LinearRegression
X = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(X, y)
X_test = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
y_test = [[11], [8.5], [15], [18], [11]]
In [55]:
predictions = model.predict(X_test)
In [56]:
predictions
Out[56]:
In [62]:
for i, predic in enumerate(predictions):
    print 'Predicted: %s, Target: %s' % (predic, y_test[i])
In [68]:
# R-square
print 'R-squared value: %.2f' % model.score(X_test, y_test)
In [1]:
#Polynomial regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
In [2]:
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]
In [4]:
reg = LinearRegression()
reg.fit(X_train, y_train)
Out[4]:
In [7]:
xx = np.linspace(0,26,100)
yy = reg.predict(xx.reshape(xx.shape[0], 1))
plt.plot(xx, yy)
Out[7]:
In [89]:
#xx.reshape(xx.shape[0],1)
In [92]:
xx.shape
Out[92]:
In [91]:
xx.reshape(xx.shape[0], 1).shape
Out[91]:
In [93]:
quadratic_featurizer = PolynomialFeatures(degree=2)
In [94]:
X_train_quad = quadratic_featurizer.fit_transform(X_train)
X_test_quad = quadratic_featurizer.transform(X_test)
In [96]:
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quad, y_train)
Out[96]:
In [99]:
X_train_quad
Out[99]:
In [107]:
xx = np.linspace(0,26,100)
yy = reg.predict(xx.reshape(xx.shape[0], 1))
plt.plot(xx, yy)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r', linestyle='--')
plt.title('Pizza price regressed on diameter')
plt.xlabel('Diameter in inches')
plt.ylabel('Price in dollars')
plt.axis([0, 25, 0, 25])
plt.grid(True)
In [104]:
print X_train
print X_train_quad
print X_test
print X_test_quad
print 'Simple linear regression r-squared', reg.score(X_test, y_test)
In [106]:
print 'Quadratic regression r-squared', regressor_quadratic.score(X_test_quad, y_test)
In [38]:
xx = np.linspace(0,26,100)
yy = reg.predict(xx.reshape(xx.shape[0], 1))
# change the degree value and test various settings;
# from degree 9 onwards the test R-squared goes negative
for i in xrange(1, 15):
    cubic_featurizer = PolynomialFeatures(degree=i)
    X_train_cubic = cubic_featurizer.fit_transform(X_train)
    X_test_cubic = cubic_featurizer.transform(X_test)
    cubic_regressor = LinearRegression()
    cubic_regressor.fit(X_train_cubic, y_train)
    print 'degree=%d regression R-squared value=' % i, cubic_regressor.score(X_test_cubic, y_test)
    # plot for this degree
    plt.plot(X_train, y_train, 'k.')
    xx_degree = cubic_featurizer.transform(xx.reshape(xx.shape[0], 1))
    plt.plot(xx, cubic_regressor.predict(xx_degree), c='r', linestyle='--')
    plt.title('Pizza price regressed on diameter')
    plt.xlabel('Diameter in inches')
    plt.ylabel('Price in dollars')
    plt.axis([0, 25, 0, 25])
    plt.grid(True)
    plt.show()
In [15]:
# after the loop above, cubic_regressor holds the last model fitted (degree=14)
print 'Degree-14 regression r-squared', cubic_regressor.score(X_test_cubic, y_test)
In [36]:
[x for x in xrange(1,10)]
Out[36]:
In [39]:
# Regularization --
# The LASSO produces sparse parameters; most of the coefficients will become zero,
# and the model will depend on a small subset of the features. In contrast, ridge
# regression produces models in which most parameters are small but nonzero.
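In [ ]:
# Not in the original notebook: a minimal sketch of the contrast described above,
# fitted on degree-9 polynomial features of the pizza data. The degree, the alpha
# values, and the scaling step are arbitrary choices for illustration, not from the
# book. The lasso should zero out most coefficients, while ridge keeps them small
# but nonzero.
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
X_pizza = [[6], [8], [10], [14], [18]]
y_pizza = [7, 9, 13, 17.5, 18]
X_poly = PolynomialFeatures(degree=9, include_bias=False).fit_transform(X_pizza)
X_poly = StandardScaler().fit_transform(X_poly)
ridge = Ridge(alpha=1.0).fit(X_poly, y_pizza)
lasso = Lasso(alpha=1.0).fit(X_poly, y_pizza)
print 'Ridge coefficients:', ridge.coef_
print 'Lasso coefficients:', lasso.coef_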
In [40]:
#UCI wine data regression
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
# the book uses this one: http://ieor.berkeley.edu/~ieor265/homeworks/winequality-red.csv
import pandas as pd
In [49]:
df = pd.read_csv("http://ieor.berkeley.edu/~ieor265/homeworks/winequality-red.csv", sep=';')
In [50]:
df.describe()
Out[50]:
In [51]:
df.head()
Out[51]:
In [52]:
import matplotlib.pyplot as plt
In [53]:
plt.scatter(df['alcohol'], df['quality'])
plt.xlabel('alcohol')
plt.ylabel('quality')
plt.title('alcohol vs quality')
plt.show()
In [54]:
plt.scatter(df['volatile acidity'], df['quality'])
plt.xlabel('volatile acidity')
plt.ylabel('quality')
plt.title('volatile_acidity vs quality')
plt.show()
In [55]:
# correlation(quality, alcohol) is roughly 0.47
# correlation(quality, volatile acidity) is roughly -0.39
# etc...
df.corr()
Out[55]:
In [56]:
#modeling
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
In [58]:
list(df.columns)[:-1]
Out[58]:
In [60]:
X = df[list(df.columns)[:-1]]
y = df['quality']
In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
In [62]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
Out[62]:
In [63]:
y_pred = lin_reg.predict(X_test)
In [64]:
print "R squared value=%.2f" % lin_reg.score(X_test, y_test)
In [65]:
# using cross validation
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
In [67]:
reg_cv = LinearRegression()
cv_score = cross_val_score(reg_cv, X, y, cv=5)
print cv_score
print cv_score.mean()
In [73]:
print y_pred[:10]
In [74]:
print y_test[:10]
In [85]:
for pred, test in zip(y_pred[:10], y_test[:10]):
    print "Predicted: %.3f and Real: %d" % (pred, test)
    #print pred, test
In [90]:
plt.scatter(y_test,y_pred)
plt.xlabel("Real value")
plt.ylabel("Predicted value")
plt.title("Predicted vs Real value")
plt.show()
In [91]:
#Gradient Descent - BGD and SGD
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
In [94]:
data = load_boston()
In [103]:
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
In [102]:
#data.data
#data.target
In [105]:
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)
# scale the test set with the parameters learned from the training set (transform, not fit_transform)
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)
In [112]:
sgd_reg = SGDRegressor()
cv_scores = cross_val_score(sgd_reg, X_train, y_train, cv=5)
print "CV R-squared: ", cv_scores
print "Average CV R-squared : %.3f" % np.mean(cv_scores)
In [119]:
print sgd_reg.fit(X_train, y_train)
print "Test set R-squared: ", sgd_reg.score(X_test, y_test)
In [ ]:
#--end--