Here, we implement a linear regression via least squares fitting (with vertical offsets) by solving to fit n points $(x_i, y_i)$ with $i=1,2,...n,$ via linear equation of the form
$f(x) = a\cdot x + b$.
I have described the approach in more detail in this IPython notebook.
In the more "classic" approach that is often found in statistics textbooks, we calculate the following parameters as follows:
$a = \frac{S_{x,y}}{\sigma_{x}^{2}}\quad$ (slope)
$b = \bar{y} - a\bar{x}\quad$ (y-axis intercept)
where
$S_{xy} = \sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})\quad$ (covariance)
$\sigma{_x}^{2} = \sum_{i=1}^{n} (x_i - \bar{x})^2\quad$ (variance)
We will implement this "classic" approach in
py_lstsqr()
numba_lstsqrs()
cy_lstsqrs()
In [1]:
import numpy as np
import scipy.stats
from numba import jit
In [2]:
%load_ext cythonmagic
In [3]:
def py_lstsqr(x, y):
""" Computes the least-squares solution to a linear matrix equation. """
len_x = len(x)
x_avg = sum(x)/len_x
y_avg = sum(y)/len(y)
var_x = 0
cov_xy = 0
for i in range(len_x):
temp = (x[i] - x_avg)
var_x += temp**2
cov_xy += temp*(y[i] - y_avg)
slope = cov_xy / var_x
y_interc = y_avg - slope*x_avg
return (slope, y_interc)
In [4]:
@jit
def numba_lstsqr(x, y):
""" Computes the least-squares solution to a linear matrix equation. """
len_x = len(x)
x_avg = sum(x)/len_x
y_avg = sum(y)/len(y)
var_x = 0
cov_xy = 0
for i in range(len_x):
temp = (x[i] - x_avg)
var_x += temp**2
cov_xy += temp*(y[i] - y_avg)
slope = cov_xy / var_x
y_interc = y_avg - slope*x_avg
return (slope, y_interc)
In [5]:
%%cython
def cy_lstsqr(x, y):
""" Computes the least-squares solution to a linear matrix equation. """
cdef double x_avg, y_avg, var_x, cov_xy,\
slope, y_interc, x_i, y_i
cdef int len_x
len_x = len(x)
x_avg = sum(x)/len_x
y_avg = sum(y)/len(y)
var_x = 0
cov_xy = 0
for i in range(len_x):
temp = (x[i] - x_avg)
var_x += temp**2
cov_xy += temp*(y[i] - y_avg)
slope = cov_xy / var_x
y_interc = y_avg - slope*x_avg
return (slope, y_interc)
In [6]:
%matplotlib inline
In [ ]:
In [30]:
from matplotlib import pyplot as plt
import scipy.stats
slope, intercept = py_lstsqr(x, y)
line_x = [round(min(x)) - 1, round(max(x)) + 1]
line_y = [slope*x_i + intercept for x_i in line_x]
plt.figure(figsize=(7,6))
plt.scatter(x,y)
plt.plot(line_x, line_y, color='red', lw='2')
plt.ylabel('y')
plt.xlabel('x')
plt.title('Linear regression via least squares fit')
ftext = 'y = ax + b = {:.3f} + {:.3f}x'\
.format(slope, intercept)
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')
assert(
scipy.stats.linregress(x, y)[0:2] == py_lstsqr(x, y))
plt.show()
In [8]:
import matplotlib.pyplot as plt
def plot_benchmarks(times_n):
labels = [ ('py_lstsqr', '"classic" least squares in reg. (C)Python'),
('numba_lstsqr', '"classic" least squares in Numba'),
('cy_lstsqr', '"classic" least squares in Cython')]
plt.rcParams.update({'font.size': 12})
fig = plt.figure(figsize=(10,8))
for lb in labels:
plt.plot(orders_n, times_n[lb[0]], alpha=0.5, label=lb[1], marker='o', lw=3)
plt.xlabel('sample size n')
plt.ylabel('time per computation in milliseconds [ms]')
plt.xlim([1,max(orders_n) + max(orders_n) * 10])
plt.legend(loc=2)
plt.grid()
plt.xscale('log')
plt.yscale('log')
max_perf = max( py/cy for py,cy in zip(times_n['py_lstsqr'],
times_n['cy_lstsqr']) )
min_perf = min( py/cy for py,cy in zip(times_n['py_lstsqr'],
times_n['cy_lstsqr']) )
ftext = 'Using Cython is {:.2f}x to '\
'{:.2f}x faster than regular (C)Python'\
.format(min_perf, max_perf)
plt.figtext(.14,.15, ftext, fontsize=11, ha='left')
plt.title('Performance of least square fit implementations')
plt.show()
In [6]:
import timeit
import random
random.seed(12345)
funcs = ['py_lstsqr', 'numba_lstsqr', 'cy_lstsqr']
orders_n = [10**n for n in range(1, 8)]
times_lists = {f:[] for f in funcs}
for n in orders_n:
x = [x_i*random.randint(8,12)/10 for x_i in range(n)]
y = [y_i*random.randint(10,14)/10 for y_i in range(n)]
for f in funcs:
times_lists[f].append(min(timeit.Timer('%s(x,y)' %f,
'from __main__ import %s, x, y' %f)
.repeat(repeat=3, number=100)))
In [9]:
plot_benchmarks(times_lists)
In [41]:
import timeit
import numpy as np
random.seed(12345)
funcs = ['py_lstsqr', 'numba_lstsqr', 'cy_lstsqr']
orders_n = [10**n for n in range(1, 5)]
times_np_arys = {f:[] for f in funcs}
for n in orders_n:
x = np.asarray([x_i*np.random.randint(8,12)/10 for x_i in range(n)])
y = np.asarray([y_i*np.random.randint(10,14)/10 for y_i in range(n)])
for f in funcs:
times_np_arys[f].append(min(timeit.Timer('%s(x,y)' %f,
'from __main__ import %s, x, y' %f)
.repeat(repeat=3, number=100)))
In [42]:
plot_benchmarks(times_np_arys)