In [35]:
"""First, some basic stats from chapter 5"""
from math import sqrt
def mean(x):
'''geometric average'''
return sum(x) / len(x)
def de_mean(x):
'''translate x by subtracting its mean (so the result has mean 0)'''
x_bar = mean(x)
return [x_i - x_bar for x_i in x]
def sum_of_squares(x):
return sum([f * f for f in x])
def variance(x):
'''assumes x has at least two elements'''
n = len(x)
deviations = de_mean(x)
return sum_of_squares(deviations) / (n - 1)
def standard_deviation(x):
return sqrt(variance(x))
def dot(v, w):
'''the sum of the product of the matching elements
of the input vectors'''
return sum(v_i * w_i for v_i,w_i in zip(v, w))
# a large covariance means that x tends to be large when y is small
def covariance(x, y):
n = len(x)
return dot(de_mean(x), de_mean(y)) / (n - 1)
'''Correlation lies between -1 (perfect anti-correlation) and 1 (perfect correlation)'''
def correlation(x, y):
stdev_x = standard_deviation(x)
stdev_y = standard_deviation(y)
if stdev_x > 0 and stdev_y > 0:
return covariance(x, y) / stdev_x / stdev_y
else:
return 0
"""Now we'll get into the fitting logic."""
def predict(alpha, beta, x_i):
return beta * x_i + alpha
def error(alpha, beta, x_i, y_i):
return y_i - predict(alpha, beta, x_i)
def sum_of_squared_errors(alpha, beta, x, y):
return sum(error(alpha, beta, x_i, y_i) ** 2 for x_i, y_i in zip(x, y))
def least_squares_fit(x, y):
beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
alpha = mean(y) - beta * mean(x)
return alpha, beta
In [28]:
"""Now let's mock up some data to fit"""
%matplotlib inline
from matplotlib import pyplot as plt
from numpy.random import poisson
from random import random
from collections import Counter
# our fake data
x = list(poisson(7, 53)) + list(poisson(17, 47)) + list(poisson(42, 17)) + list(poisson(72, 7)) + [99]
y = [1.1415926535 * v + 42.0 + (-17.0 + 34.0 * random()) for v in x]
# and now let's plot it
plt.scatter(x, y, edgecolor='none', color='r')
plt.axis([0, max(x), min(y), max(y)])
plt.show()
In [30]:
alpha, beta = least_squares_fit(x, y)
print (alpha, beta)
In [32]:
# and now let's the data and the fit
plt.scatter(x, y, edgecolor='none', color='r')
plt.plot(x, [beta * v + alpha for v in x], color='b')
plt.axis([0, max(x), min(y), max(y)])
plt.show()
In [38]:
def sum_of_squared_deviations(y):
"""the total variation squared of y from the mean"""
return sum(v**2 for v in de_mean(y))
def r_squared(alpha, beta, x, y):
"""The fraction of the variation in y captured by the model,
which equals 1 - the fraction of variation in y not captured
by the model."""
return 1.0 - (sum_of_squared_errors(alpha, beta, x, y) /
sum_of_squared_deviations(y))
print(r_squared(alpha, beta, x, y))
In [ ]: