In [1]:
from scipy.optimize import curve_fit
from scipy.ndimage import gaussian_filter1d
from random import random
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
def func(x, c, z):
    return c * x**z

def func_lin(x, a, b):
    return a * x + b

def get_r2(fun, X, Y, *args):
    """Coefficient of determination (R^2) of `fun` evaluated on X against Y."""
    sstot = sum((Y[i] - np.mean(Y))**2 for i in range(len(Y)))
    sserr = sum((Y[i] - fun(X[i], *args))**2 for i in range(len(Y)))
    return 1 - sserr / sstot
plt.clf()
fig = plt.figure(figsize=(14,16))
lines = []
labels = []
x = [i + (2 * random() + 1)**3 for i in range(100)]
y = [np.log(float(1 + i)) + np.log(1 + i**(float(i + 1) / (random() * 20 + i + 1))) for i in range(100)]
lines.extend(plt.plot(x, y, 'ro', linewidth=2))
labels.append('almost random dots')
# define x-range of values where to estimate y in function of x
xp = np.arange(min(x), max(x), (max(x) - min(x)) / 200)
# linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lines.extend(plt.plot(x, intercept + slope * np.array(x), 'r:', linewidth=2))
labels.append('linregress $y = %.3fx + %.3f$ $R^2 = %.4f, P=%.4f, Err= %.4f$'
              % (slope, intercept, r_value**2, p_value, std_err))
# curve_fit, using least-square to estimate best parameter in linear model
a, b = curve_fit(func_lin, np.array(x), np.array(y))[0]
R2 = get_r2(func_lin, x, y, a, b)
lines.extend(plt.plot(xp, func_lin(xp, a, b), ':', color='b', linewidth=2))
labels.append('Linear fit $y = %.3f\cdot x + %.3f$ $R^2=%.4f$'
              % (a, b, R2))
# polynomial fit first order
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
R2 = get_r2(p, x, y)
lines.extend(plt.plot(xp, p(xp), ":", color='g', linewidth=2))
labels.append('Polynomial fit ($1^{st}$ order) $y = %.3f\cdot x + %.3f$ $R^2=%.4f$'
              % (p[1], p[0], R2))
# polynomial fit second order
z = np.polyfit(x, y, 2)
p = np.poly1d(z)
R2 = get_r2(p, x, y)
lines.extend(plt.plot(xp, p(xp), "-", color='purple', linewidth=2))
labels.append('Polynomial fit ($2^{nd}$ order) $y=%.3f\cdot x^2+%.3f\cdot x+%.3f$ $R^2=%.4f$'
              % (p[2], p[1], p[0], R2))
# curve_fit, using least-square to estimate best parameter in power model
c, z = curve_fit(func, np.array(x), np.array(y))[0]
R2 = get_r2(func, x, y, c, z)
lines.extend(plt.plot(xp, func(xp, c, z), '-', color='darkgreen', linewidth=2))
labels.append('Power fit least-square $y = %.3f\cdot x^{%.3f}$ $R^2=%.4f$'
              % (c, z, R2))
# Gaussian smoothing (not a parametric fit): resample onto a regular grid,
# then smooth both coordinates with a Gaussian kernel
sigma = 5
t = np.linspace(0, 1, len(x))
t2 = np.linspace(0, 1, 100)
x2 = np.interp(t2, t, x)
y2 = np.interp(t2, t, y)
x3 = gaussian_filter1d(x2, sigma)
y3 = gaussian_filter1d(y2, sigma)
lines.extend(plt.plot(x3, y3, '-', color='orange', linewidth=2))
labels.append('Gaussian smoothing (%s$\sigma$)' % sigma)
# legend
fig.subplots_adjust(top=0.7)
plt.legend(lines, labels, loc=0, shadow=True, fancybox=True, ncol=1,
           numpoints=1, title='what is here:', bbox_to_anchor=[.1, 1, .5, .3])
plt.show()
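A quick aside: the get_r2 helper above loops in pure Python. A fully vectorized version (a minimal sketch with the same semantics, reusing the numpy import above; the name get_r2_vec is ours, not from the original code):

def get_r2_vec(fun, X, Y, *args):
    # R^2 = 1 - SS_err / SS_tot, computed on whole arrays at once
    Y = np.asarray(Y, dtype=float)
    residuals = Y - fun(np.asarray(X, dtype=float), *args)
    return 1 - np.sum(residuals**2) / np.sum((Y - Y.mean())**2)

# e.g. get_r2_vec(func_lin, x, y, a, b) matches get_r2(func_lin, x, y, a, b)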
In [340]:
from scipy.optimize import curve_fit
from random import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
def func(x, a, b, c):
    return a * x**-b - x**(1. / c)
x = [i + (2 * random() + 1)**3 for i in range(100)]
y = [np.log(float(1 + i)) + np.log(1 + i**(float(i + 1) / (random() * 20 + i + 1))) for i in range(100)]
x, y = zip(*sorted([(x[i], y[i]) for i in range(100)]))
x = np.array(x)
y = np.array(y)
dots = plt.plot(x, y,'ro', linewidth=2)
# curve_fit
params, covariance = curve_fit(func, x, y)
fit_line = plt.plot(x, func(x, *params), '-', color='darkgreen', linewidth=2)
# now plot +/- 1/2/3 sigma curves around the best fit; the parameter
# standard deviations are the square roots of the covariance diagonal
sigma = np.sqrt(np.diag(covariance))
plt.fill_between(x,
                 func(x, *[params[i] + sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - sigma[i] for i in range(len(params))]),
                 color='darkgreen', alpha=.1)
plt.fill_between(x,
                 func(x, *[params[i] + 2 * sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - 2 * sigma[i] for i in range(len(params))]),
                 color='darkgreen', alpha=.1)
plt.fill_between(x,
                 func(x, *[params[i] + 3 * sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - 3 * sigma[i] for i in range(len(params))]),
                 color='darkgreen', alpha=.1)
p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.2)
p3 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.1)
plt.legend(dots + fit_line + [p1, p2, p3],
           ['almost random dots',
            'Fit $y = %.3f\cdot x^{-%.3f} - x^{1/%.3f}$' % tuple(params),
            '1 $\sigma$ fit error', '2 $\sigma$ fit error', '3 $\sigma$ fit error'], loc=4)
plt.show()
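Note that shifting every parameter by +/- k sigma at once, as above, ignores the correlations between the estimates; the covariance diagonal only gives each parameter's individual standard error. A minimal self-contained sketch of reporting those standard errors (the exponential model and the synthetic data here are illustrative, not from the cell above):

import numpy as np
from scipy.optimize import curve_fit

def model(x, a, b):
    return a * np.exp(b * x)

rng = np.random.RandomState(0)
xdata = np.linspace(0, 1, 50)
ydata = model(xdata, 2.0, 1.5) + rng.normal(scale=0.1, size=xdata.size)

params, cov = curve_fit(model, xdata, ydata)
perr = np.sqrt(np.diag(cov))  # 1-sigma standard errors of the estimates
for name, value, err in zip(('a', 'b'), params, perr):
    print('%s = %.3f +/- %.3f' % (name, value, err))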
In [138]:
# References:
# - Statistics in Geography by David Ebdon (ISBN: 978-0631136880)
# - Reliability Engineering Resource Website:
# - http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm
# - University of Glasgow, Department of Statistics:
# - http://www.stats.gla.ac.uk/steps/glossary/confidence_intervals.html#conflim
import numpy as np
import matplotlib.pyplot as plt
from random import random
from scipy.stats import t as t_distr
from matplotlib.patches import Rectangle
# example data
size = 50
x = [i + (2 * random() + .5)**4 for i in range(size)]
y = [np.log(1. + i) + np.log(1 + i**(float(i + 1) / (random() * 20 + i))) for i in range(size)]
#x, y = zip(*sorted([(x[i], y[i]) for i in range(size)]))
x = np.array(x)
y = np.array(y)
def uncertainty_band(x, y, df=1, conf=0.95):
    """
    Calculates the confidence and prediction bands of the polynomial
    regression model at the desired confidence level.
    The 2 sigma confidence interval is 95% sure to contain the best-fit
    regression line.
    The 2 sigma prediction interval contains 95% of the data points.
    :params .95 conf: desired confidence level, by default 0.95 (2 sigma)
    :params x: data array or list
    :params y: data array or list
    :params 1 df: polynomial degree for the fit
    :returns: an array with the confidence values to add/subtract, another
       with the prediction values, an array with the fitted x values, an
       array with the fitted y values, the numpy.poly1d object and the
       R-square value of the fit
    """
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    # fit a least-squares polynomial of degree "df" to the data
    z = np.polyfit(x, y, df)
    p = np.poly1d(z)
    # predict y values of the original data using the fit
    p_y = p(x)
    # create a series of new x values on which to evaluate the bands
    p_x = np.linspace(min(x), max(x), 100)
    # number of samples in the original fit
    n = x.size
    # alpha: 1 minus the wanted probability
    alpha = 1. - conf
    # two-tailed t value with n - df - 1 degrees of freedom
    t = t_distr.ppf(1. - alpha / 2., n - df - 1)
    # mean of x
    mean_x = np.mean(x)
    # error sum of squares
    sse = sum((y - p_y)**2)
    # error mean square (estimate of the residual variance)
    mse = sse / (n - df - 1)
    # squared deviation of each new x value from the mean
    sdi = (p_x - mean_x)**2
    # sum of squared deviations of x (Sxx)
    sd = sum((x - mean_x)**2)
    # relative individual deviation
    sdi_sd = sdi / sd
    confs = t * np.sqrt(mse * (1.0 / n + sdi_sd))
    preds = t * np.sqrt(mse * (1.0 + 1.0 / n + sdi_sd))
    # calculate R-square
    sstot = sum((y - np.mean(y))**2)
    sserr = sum((y - p(x))**2)
    r2 = 1 - sserr / sstot
    # now predict y based on the new x values
    p_y = p(p_x)
    return confs, preds, p_x, p_y, p, r2
confs, preds, p_x, p_y, p, r2 = uncertainty_band(x, y, df=4)
# plot sample data
dots = plt.plot(x, y, 'o', mec='none', color='red', label='Sample observations', alpha=.8)
# plot line of best fit
fit_line = plt.plot(p_x, p_y, color='darkgreen', lw=2, label='Regression line')
# plot confidence limits
plt.fill_between(p_x, p_y + confs, p_y - confs, color='darkgreen', alpha=0.3)
plt.fill_between(p_x, p_y - preds, p_y + preds, color='orangered', alpha=0.2)
p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="orangered", alpha=.2)
# build a readable "y = ..." string from the poly1d coefficients
# (iterating p yields coefficients from highest to lowest degree)
plt.legend(dots + fit_line + [p1, p2],
           ['almost random dots',
            'Polynomial fit ($R^2=%.3f$):\n$y = %s$' % (r2,
                ''.join([(('+' if i and v > 0 else '') + '%.2g' % v) +
                         (('x^%s' % (len(p) - i)) if (len(p) - i) > 1
                          else 'x' if len(p) - i else '') for i, v in enumerate(p)])),
            '95% Confidence band',
            '95% Prediction band'], loc='upper left', frameon=False, bbox_to_anchor=[1, 1])
plt.xlim((min(x), max(x)))
Out[138]:
In [29]:
# References:
# - Statistics in Geography by David Ebdon (ISBN: 978-0631136880)
# - Reliability Engineering Resource Website:
# - http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm
# - University of Glasgow, Department of Statistics:
# - http://www.stats.gla.ac.uk/steps/glossary/confidence_intervals.html#conflim
import numpy as np
import matplotlib.pyplot as plt
from random import random
from scipy.stats import t as t_distr
from scipy.optimize import curve_fit
from matplotlib.patches import Rectangle
import re
def uncertainty_band(x, y, func_string, df=None, conf=0.95):
    """
    Calculates the confidence and prediction bands of the regression model
    at the desired confidence level.
    The 2 sigma confidence interval is 95% sure to contain the best-fit
    regression line.
    The 2 sigma prediction interval contains 95% of the data points.
    :params .95 conf: desired confidence level, by default 0.95 (2 sigma)
    :params x: data array or list
    :params y: data array or list
    :params func_string: a string representing the function to optimize,
       e.g. "A*x^B+C*x", with the constants in upper-case
    :params None df: number of fitted constants; inferred from func_string
       by default
    :returns: an array with the confidence values to add/subtract, another
       with the prediction values, an array with the fitted x values, an
       array with the fitted y values, the optimized parameters and the
       R-square value of the fit
    """
    recomp = re.compile("[A-Z]")
    # turn e.g. "A*x^B+C" into the template "%s*x^%s+%s"
    func_restring = re.sub(recomp, "%s", func_string)
    # one degree of freedom per upper-case constant, unless given explicitly
    df = df or len(re.findall(recomp, func_string))
    def func(x, *args):
        # substitute the current parameter values into the template and
        # evaluate it ('^' is translated to the Python power operator)
        expr = func_restring.replace('^', '**') % args
        try:
            # reject NaNs and infinities produced by invalid powers or logs
            return np.lib.asarray_chkfinite(eval(expr, {'np': np, 'x': x}))
        except ValueError:
            return x
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
    # fit the curve by least squares, starting from all constants at 1
    z, covariance = curve_fit(func, x, y, [1. for _ in range(df)])
    # predict y values of the original data using the fit
    p_y = func(x, *z)
    # create a series of new x values on which to evaluate the bands
    p_x = np.linspace(min(x), max(x), 100)
    # number of samples in the original fit
    n = x.size
    # alpha: 1 minus the wanted probability
    alpha = 1. - conf
    # two-tailed t value with n - df - 1 degrees of freedom
    t = t_distr.ppf(1. - alpha / 2., n - df - 1)
    # mean of x
    mean_x = np.mean(x)
    # error sum of squares
    sse = sum((y - p_y)**2)
    # error mean square (estimate of the residual variance)
    mse = sse / (n - df - 1)
    # squared deviation of each new x value from the mean
    sdi = (p_x - mean_x)**2
    # sum of squared deviations of x (Sxx)
    sd = sum((x - mean_x)**2)
    # relative individual deviation
    sdi_sd = sdi / sd
    confs = t * np.sqrt(mse * (1.0 / n + sdi_sd))
    preds = t * np.sqrt(mse * (1.0 + 1.0 / n + sdi_sd))
    # calculate R-square
    sstot = sum((y - np.mean(y))**2)
    sserr = sum((y - func(x, *z))**2)
    r2 = 1 - sserr / sstot
    # now predict y based on the new x values
    p_y = func(p_x, *z)
    return confs, preds, p_x, p_y, z, r2
# example data
size = 100
x = [i + (4 * random() - 2) for i in range(size)]
# alternative toy datasets; only the last assignment is used
y = [3.4 * i**3 + (random() * 10 - 5) * i - 3 * i**2 + 43 for i in range(size)]
y = [2.4**(.2 * i) + (random() * 1000 - 500) + 7 for i in range(size)]
y = [np.log(1 + i**(float(i + 1) / (random() * 20 + i + 1))) for i in range(size)]
x = np.array(x)
y = np.array(y)
func_string = "A*x^3+ B*x^2+C*x+D"
func_string = "A^(x*B)+C"
func_string = "np.log(A*x^B)"
func_restring = re.sub("[A-Z]", "%s", func_string)
confs, preds, p_x, p_y, z, r2 = uncertainty_band(x, y, func_string)
# plot sample data
dots = plt.plot(x, y, 'o', mec='none', color='red', label='Sample observations', alpha=.8)
# plot line of best fit
fit_line = plt.plot(p_x, p_y, color='darkgreen', lw=2, label='Regression line')
# plot confidence limits
plt.fill_between(p_x, p_y + confs, p_y - confs, color='darkgreen', alpha=0.3)
plt.fill_between(p_x, p_y - preds, p_y + preds, color='orangered', alpha=0.2)
p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="orangered", alpha=.2)
# render the fitted equation: substitute the parameter values into the
# template, wrap exponents in braces and use \times for products in LaTeX
plt.legend(dots + fit_line + [p1, p2],
           ['almost random dots',
            'Fit ($R^2=%.3f$):\ny = $%s$' % (r2,
                re.sub('\^(\([^)(]+\))', '^{\\1}',
                       re.sub('\^([0-9.]+)', '^{\\1}',
                              func_restring.replace('np.', '') % tuple(
                                  ["%.2g" % i for i in z]))).replace('*', '\\times ')),
            '95% Confidence band',
            '95% Prediction band'], loc='upper left', frameon=False, bbox_to_anchor=[1, 1])
plt.xlim((min(x), max(x)))
Out[29]:
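The same helper accepts any template with upper-case constants; for instance, reusing the x and y arrays above (a usage sketch, output not shown):

confs, preds, p_x, p_y, z, r2 = uncertainty_band(x, y, "A*x^2+B*x+C")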
In [ ]: