In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; this cell requires scikit-learn < 1.2
boston = load_boston()
california = fetch_california_housing()
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
In [3]:
%matplotlib inline
# If you are using IPython/Jupyter, this magic renders the plots inline in the notebook
In [4]:
import numpy as np
vector = np.array([1,2,3,4,5])
row_vector = vector.reshape((1,5))     # 1 row, 5 columns
column_vector = vector.reshape((5,1))  # 5 rows, 1 column
single_feature_matrix = vector.reshape((5,1))  # 5 observations of a single feature
# Equivalently, built explicitly:
single_feature_matrix = np.array([[1],[2],[3],[4],[5]])
multiple_feature_matrix = np.array([[1,2,3,4,5],[6,7,8,9,10],[11,12,13,14,15]])
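A quick shape check (purely illustrative) confirms the distinction between the arrangements:
print(row_vector.shape)              # (1, 5)
print(column_vector.shape)           # (5, 1)
print(single_feature_matrix.shape)   # (5, 1)
print(multiple_feature_matrix.shape) # (3, 5)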
In [5]:
all_zeros = np.zeros((5,3))
all_ones = np.ones((5,3))
In [6]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import norm  # matplotlib.mlab.normpdf was removed in matplotlib 3.1
import math
x = np.linspace(-4,4,100)
for mean, variance in [(0,0.7),(0,1),(1,1.5),(-2,0.5)]:
    sigma = math.sqrt(variance)
    plt.plot(x, norm.pdf(x, mean, sigma))  # pdf takes the standard deviation, not the variance
plt.show()
In [7]:
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
In [8]:
mean_expected_value = dataset['target'].mean()
In [9]:
Squared_errors = pd.Series(mean_expected_value -\
dataset['target'])**2
SSE = np.sum(Squared_errors)
print ('Sum of Squared Errors (SSE): %01.f' % SSE)
In [10]:
density_plot = Squared_errors.plot(kind='hist')
In [11]:
def standard_deviation(variable, bias=0):
    observations = float(len(variable))
    return np.sqrt(np.sum((variable - np.mean(variable))**2) / (observations - min(bias,1)))

print ('Our function\'s result: %0.5f against Numpy\'s: %0.5f' % (standard_deviation(dataset['RM']), np.std(dataset['RM'])))
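Note that pandas' own Series.std uses the unbiased estimator (ddof=1), which corresponds to bias=1 in our function; a quick comparison (purely illustrative):
print('Biased (population) estimate: %0.5f' % standard_deviation(dataset['RM'], bias=0))
print('Unbiased (sample) estimate:   %0.5f' % standard_deviation(dataset['RM'], bias=1))
print('pandas Series.std():          %0.5f' % dataset['RM'].std())  # ddof=1 by default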
In [12]:
def covariance(variable_1, variable_2, bias=0):
    observations = float(len(variable_1))
    return np.sum((variable_1 - np.mean(variable_1)) * (variable_2 - np.mean(variable_2))) / (observations - min(bias,1))

def standardize(variable):
    return (variable - np.mean(variable)) / np.std(variable)

def correlation(var1, var2, bias=0):
    return covariance(standardize(var1), standardize(var2), bias)

from scipy.stats import pearsonr  # scipy.stats.stats is a deprecated import path
print ('Our correlation estimation: %0.5f' % (correlation(dataset['RM'], dataset['target'])))
print ('Correlation from Scipy pearsonr estimation: %0.5f' % pearsonr(dataset['RM'], dataset['target'])[0])
#print (help(np.corrcoef))
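NumPy provides the same estimate directly through np.corrcoef, hinted at in the commented line above; for example:
print('NumPy corrcoef estimation: %0.5f' % np.corrcoef(dataset['RM'], dataset['target'])[0, 1])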
In [13]:
x_range = [dataset['RM'].min(),dataset['RM'].max()]
y_range = [dataset['target'].min(),dataset['target'].max()]
scatter_plot = dataset.plot(kind='scatter', x='RM', y='target', xlim=x_range, ylim=y_range)
meanY = scatter_plot.plot(x_range, [dataset['target'].mean(),dataset['target'].mean()], '--', color='red', linewidth=1)
meanX = scatter_plot.plot([dataset['RM'].mean(),dataset['RM'].mean()], y_range, '--', color='red', linewidth=1)
In [14]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [15]:
y = dataset['target']
X = dataset['RM']
X = sm.add_constant(X)
In [16]:
X.head()
Out[16]:
In [17]:
# As an alternative, this example uses the statsmodels.formula.api module.
# Equivalent to the R syntax for linear models, it requires you to specify
# the form of the regression as 'response ~ predictor1 (+ predictor2 + ...)'
In [18]:
linear_regression = smf.ols(formula='target ~ RM', data=dataset)
fitted_model = linear_regression.fit()
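The same formula interface extends to multiple predictors; as a purely illustrative sketch using two other Boston features (LSTAT and PTRATIO):
multiple_regression = smf.ols(formula='target ~ RM + LSTAT + PTRATIO', data=dataset)
print(multiple_regression.fit().params)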
In [19]:
linear_regression = sm.OLS(y,X)
fitted_model = linear_regression.fit()
fitted_model.summary()
Out[19]:
In [20]:
print (fitted_model.params)
betas = np.array(fitted_model.params)
fitted_values = fitted_model.predict(X)
In [21]:
RM = 5
Xp = np.array([1,RM])
print ("Our model predicts if RM = %01.f the target value is %0.1f" % (RM, fitted_model.predict(Xp)[0]))
In [22]:
x_range = [dataset['RM'].min(),dataset['RM'].max()]
y_range = [dataset['target'].min(),dataset['target'].max()]
scatter_plot = dataset.plot(kind='scatter', x='RM', y='target', xlim=x_range, ylim=y_range)
meanY = scatter_plot.plot(x_range, [dataset['target'].mean(),dataset['target'].mean()], '--', color='red', linewidth=1)
meanX = scatter_plot.plot([dataset['RM'].mean(),dataset['RM'].mean()], y_range, '--', color='red', linewidth=1)
regression_line = scatter_plot.plot(dataset['RM'], fitted_values, '-', color='orange', linewidth=1)
In [23]:
predictions_by_dot_product = np.dot(X,betas)
print ("Using the prediction method: %s" % fitted_values[:10])
print ("Using betas and a dot product: %s" % predictions_by_dot_product[:10])
In [24]:
residuals = dataset['target']-fitted_values
normalized_residuals = standardize(residuals)
In [25]:
residual_scatter_plot = plt.plot(dataset['RM'], normalized_residuals,'bp')
plt.xlabel('RM')
plt.ylabel('Normalized residuals')
mean_residual = plt.plot([int(x_range[0]),round(x_range[1],0)], [0,0], '-', color='red', linewidth=2)
upper_bound = plt.plot([int(x_range[0]),round(x_range[1],0)], [3,3], '--', color='red', linewidth=1)
lower_bound = plt.plot([int(x_range[0]),round(x_range[1],0)], [-3,-3], '--', color='red', linewidth=1)
plt.grid()
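Points falling outside the ±3 bands are candidate outliers; a quick count (purely illustrative):
outliers = np.abs(normalized_residuals) > 3
print('Standardized residuals beyond +/-3: %i' % np.sum(outliers))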
In [26]:
from sklearn import linear_model
linear_regression = linear_model.LinearRegression(fit_intercept=True)  # the normalize parameter was removed in scikit-learn 1.2
In [27]:
observations = len(dataset)
X = dataset['RM'].values.reshape((observations,1)) # X should always be a matrix, never a vector
y = dataset['target'].values # y can be a vector
In [28]:
linear_regression.fit(X,y)
Out[28]:
In [29]:
print (linear_regression.coef_)
print (linear_regression.intercept_)
In [30]:
print (linear_regression.predict(X)[:10])
In [31]:
Xp = np.column_stack((X,np.ones(observations)))
v_coef = list(linear_regression.coef_) + [linear_regression.intercept_]
In [32]:
np.dot(Xp,v_coef)[:10]
Out[32]:
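As a sanity check (illustrative), the manual dot product should reproduce the predict method exactly:
print(np.allclose(np.dot(Xp, v_coef), linear_regression.predict(X)))  # expected: True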
In [33]:
from sklearn.datasets import make_regression
HX, Hy = make_regression(n_samples=10000000, n_features=1, n_targets=1, random_state=101)
In [34]:
%%time
sk_linear_regression = linear_model.LinearRegression(fit_intercept=True)  # normalize removed in scikit-learn 1.2
sk_linear_regression.fit(HX,Hy)
In [35]:
%%time
sm_linear_regression = sm.OLS(Hy,sm.add_constant(HX))
sm_linear_regression.fit()
In [36]:
import numpy as np
x = np.array([9.5, 8.5, 8.0, 7.0, 6.0])
y = np.array([85, 95, 70, 65, 70])
In [37]:
def squared_cost(v, e):
    # v is the candidate value being optimized; e holds the observations (passed via args)
    return np.sum((v-e)**2)
In [38]:
from scipy.optimize import fmin
xopt = fmin(squared_cost, x0=0, xtol=1e-8, args=(x,))
In [39]:
print ('The result of optimization is %0.1f' % (xopt[0]))
print ('The mean is %0.1f' % (np.mean(x)))
In [40]:
np.mean(x)
Out[40]:
In [41]:
def absolute_cost(v, e):
    return np.sum(np.abs(v-e))
In [42]:
xopt = fmin(absolute_cost, x0=0, xtol=1e-8, args=(x,))
In [43]:
print ('The result of optimization is %0.1f' % (xopt[0]))
print ('The median is %0.1f' % (np.median(x)))
In [44]:
observations = len(dataset)
X = dataset['RM'].values.reshape((observations,1)) # X should always be a matrix, never a vector
Xb = np.column_stack((X,np.ones(observations))) # append the bias (intercept) column
y = dataset['target'].values # y can be a vector
def matrix_inverse(X, y, pseudo=False):
    if pseudo:
        return np.dot(np.linalg.pinv(np.dot(X.T, X)), np.dot(X.T, y))
    else:
        return np.dot(np.linalg.inv(np.dot(X.T, X)), np.dot(X.T, y))

def normal_equations(X, y):
    return np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))

print (matrix_inverse(Xb, y))
print (matrix_inverse(Xb, y, pseudo=True))
print (normal_equations(Xb, y))
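All three solvers should agree to numerical precision; a quick check (purely illustrative):
print(np.allclose(matrix_inverse(Xb, y), normal_equations(Xb, y)))  # expected: True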
In [45]:
observations = len(dataset)
X = dataset['RM'].values.reshape((observations,1)) # X should always be a matrix, never a vector
X = np.column_stack((X,np.ones(observations))) # append the bias (intercept) column
y = dataset['target'].values # y can be a vector
In [46]:
def random_w(p):
    return np.array([np.random.normal() for j in range(p)])

def hypothesis(X, w):
    return np.dot(X, w)

def loss(X, w, y):
    return hypothesis(X, w) - y

def squared_loss(X, w, y):
    return loss(X, w, y)**2

def gradient(X, w, y):
    gradients = list()
    n = float(len(y))
    for j in range(len(w)):
        gradients.append(np.sum(loss(X, w, y) * X[:,j]) / n)
    return gradients

def update(X, w, y, alpha=0.01):
    return [t - alpha*g for t, g in zip(w, gradient(X, w, y))]

def optimize(X, y, alpha=0.01, eta=10**-12, iterations=1000):
    w = random_w(X.shape[1])
    path = list()
    for k in range(iterations):
        SSL = np.sum(squared_loss(X, w, y))
        new_w = update(X, w, y, alpha=alpha)
        new_SSL = np.sum(squared_loss(X, new_w, y))
        w = new_w
        # Stop early once the change in squared loss falls within eta
        if k >= 5 and abs(new_SSL - SSL) <= eta:
            path.append(new_SSL)
            return w, path
        # Record the loss about 20 times along the way
        if k % (iterations // 20) == 0:
            path.append(new_SSL)
    return w, path
alpha = 0.048
w, path = optimize(X,y,alpha, eta = 10**-12, iterations = 25000)
print ("These are our final coefficients: %s" % w)
print ("Obtained walking on this path of squared loss %s" % path)
# Just as a reminder, we previously estimated by other methods that
# the best solution for our coefficients is w = [9.10210898, -34.67062078]
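As a closing check (purely illustrative), the normal_equations solver defined earlier should land on those same coefficients when given this X with its bias column:
print('Normal equations solution: %s' % normal_equations(X, y))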