Fitting curve to data
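This notebook compares several ways of fitting a curve to noisy data: a linear regression with scipy.stats.linregress, least-squares fits of linear and power-law models with scipy.optimize.curve_fit, first and second order polynomial fits with numpy.polyfit, and a smoothed curve from scipy.ndimage.gaussian_filter1d. Each result is drawn over the same scatter of points, with its equation and $R^2$ in the legend.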


In [1]:
from scipy.optimize import curve_fit
from scipy.ndimage import gaussian_filter1d
from random import random
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
 
 
def func(x, c, z):
    # power-law model y = c * x**z
    return c * x**z


def func_lin(x, a, b):
    # linear model y = a*x + b
    return a * x + b


def get_r2(fun, X, Y, *args):
    # coefficient of determination R^2 of the fit `fun` with parameters `args`
    X, Y = np.asarray(X), np.asarray(Y)
    sstot = np.sum((Y - np.mean(Y))**2)
    sserr = np.sum((Y - fun(X, *args))**2)
    return 1 - sserr / sstot
 
fig = plt.figure(figsize=(14, 16))

lines = []
labels = []
 
x = [i + (2*random() + 1)**3 for i in range(100)]
y = [np.log(1. + i) + np.log(1 + i**((i + 1.) / (random()*20 + i + 1))) for i in range(100)]
lines.extend(plt.plot(x, y, 'ro', linewidth=2))
labels.append('almost random dots')
 
# x values at which to evaluate the fitted curves
xp = np.linspace(min(x), max(x), 200)

# linear regression
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
lines.extend(plt.plot(x, intercept + slope*np.array(x), 'r:', linewidth=2))
labels.append(r'linear regression $y = %.3fx + %.3f$ $R^2 = %.4f, P=%.4f, Err= %.4f$'
              % (slope, intercept, r_value**2, p_value, std_err))
 
# curve_fit: least-squares estimate of the linear model parameters
a, b = curve_fit(func_lin, np.array(x), np.array(y))[0]
R2 = get_r2(func_lin, x, y, a, b)
lines.extend(plt.plot(xp, func_lin(xp, a, b), ':', color='b', linewidth=2))
labels.append(r'Linear fit $y = %.3f\cdot x + %.3f$ $R^2=%.4f$'
              % (a, b, R2))

# polynomial fit, first order
z = np.polyfit(x, y, 1)
p = np.poly1d(z)
R2 = get_r2(p, x, y)
lines.extend(plt.plot(xp, p(xp), ":", color='g', linewidth=2))
labels.append(r'Polynomial fit ($1^{st}$ order) $y = %.3f\cdot x + %.3f$ $R^2=%.4f$'
              % (p[1], p[0], R2))
 
# polynomial fit, second order
z = np.polyfit(x, y, 2)
p = np.poly1d(z)
R2 = get_r2(p, x, y)
lines.extend(plt.plot(xp, p(xp), "-", color='purple', linewidth=2))
labels.append(r'Polynomial fit ($2^{nd}$ order) $y=%.3f\cdot x^2+%.3f\cdot x+%.3f$ $R^2=%.4f$'
              % (p[2], p[1], p[0], R2))

 
# curve_fit: least-squares estimate of the power-law model parameters
c, z = curve_fit(func, np.array(x), np.array(y))[0]
R2 = get_r2(func, x, y, c, z)
lines.extend(plt.plot(xp, func(xp, c, z), '-', color='darkgreen', linewidth=2))
labels.append(r'Power fit least-square $y = %.3f\cdot x^{%.3f}$ $R^2=%.4f$'
              % (c, z, R2))

# gaussian smoothing (not a parametric fit): resample the data on a regular
# grid, then smooth x and y with a gaussian kernel
sigma = 5
t = np.linspace(0, 1, len(x))
t2 = np.linspace(0, 1, 100)
x2 = np.interp(t2, t, x)
y2 = np.interp(t2, t, y)
x3 = gaussian_filter1d(x2, sigma)
y3 = gaussian_filter1d(y2, sigma)
lines.extend(plt.plot(x3, y3, '-', color='orange', linewidth=2))
labels.append(r'Gaussian smoothing (%s$\sigma$)' % sigma)

# legend
fig.subplots_adjust(top=0.7)
plt.legend(lines, labels, loc=0, shadow=True, fancybox=True, ncol=1, 
           numpoints=1, title='what is here:', bbox_to_anchor=[.1,1, .5,.3])


plt.show()


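If curve_fit struggles to converge on the power-law model (by default it starts the search from all-ones parameters), passing an explicit starting point through its p0 argument usually helps. A minimal sketch; the values in p0 are arbitrary illustrative guesses:

In [ ]:
# hypothetical rough starting point for (c, z) to help the optimizer
c, z = curve_fit(func, np.array(x), np.array(y), p0=(1.0, 0.5))[0]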

Fit uncertainty
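curve_fit also returns the covariance matrix of the fitted parameters; the square root of its diagonal gives the one-sigma error on each parameter, which can be pushed through the model to draw uncertainty bands around the fit.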


In [340]:
from scipy.optimize import curve_fit
from random import random
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

def func(x, a, b, c):
    # model: y = a*x**-b - x**(1/c)
    return a*x**-b - x**(1./c)

x = [i + (2*random() + 1)**3 for i in range(100)]
y = [np.log(1. + i) + np.log(1 + i**((i + 1.) / (random()*20 + i + 1))) for i in range(100)]
x, y = zip(*sorted(zip(x, y)))
x = np.array(x)
y = np.array(y)
dots = plt.plot(x, y,'ro', linewidth=2)

# curve_fit
params, covariance = curve_fit(func, x, y)
fit_line = plt.plot(x, func(x, *params), '-', color='darkgreen', linewidth=2)

# now plot the best-fit curve along with the +-1/2/3 sigma bands

# the diagonal of the covariance matrix holds the parameter variances, so the
# 1-sigma error on each parameter is the square root of the diagonal
sigma = np.sqrt(np.diag(covariance))

plt.fill_between(x,
                 func(x, *[params[i] + sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - sigma[i] for i in range(len(params))]), color='darkgreen', alpha=.1)
plt.fill_between(x,
                 func(x, *[params[i] + 2*sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - 2*sigma[i] for i in range(len(params))]), color='darkgreen', alpha=.1)
plt.fill_between(x,
                 func(x, *[params[i] + 3*sigma[i] for i in range(len(params))]),
                 func(x, *[params[i] - 3*sigma[i] for i in range(len(params))]), color='darkgreen', alpha=.1)

p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.2)
p3 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.1)
plt.legend(dots + fit_line + [p1, p2, p3],
           ['almost random dots',
            r'Fit $y = %.3f\cdot x^{-%.3f} - x^{1/%.3f}$' % tuple(params),
            r'1 $\sigma$ fit error', r'2 $\sigma$ fit error', r'3 $\sigma$ fit error'], loc=4)


plt.show()
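Shifting every parameter by ±kσ at once ignores the off-diagonal terms of the covariance matrix, so the bands above are only a rough visual guide. A sketch of an alternative that respects parameter correlations, sampling parameter sets from the fitted multivariate normal (the 200 draws and the percentiles are arbitrary choices):

In [ ]:
# sample plausible parameter sets and take percentiles of the resulting curves
draws = np.random.multivariate_normal(params, covariance, 200)
curves = np.array([func(x, *draw) for draw in draws])
lo, hi = np.percentile(curves, [2.5, 97.5], axis=0)  # empirical ~95% band
plt.fill_between(x, lo, hi, color='darkgreen', alpha=.2)
plt.show()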


Confidence and prediction bands
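The bands drawn below follow the standard formulas: for a degree-$d$ polynomial fitted to $n$ points, with $\mathrm{MSE} = \sum_i (y_i - \hat y_i)^2 / (n - d - 1)$ and $t$ the two-sided Student-$t$ quantile with $n - d - 1$ degrees of freedom,

$$\mathrm{conf}(x^*) = t\,\sqrt{\mathrm{MSE}\left(\frac{1}{n} + \frac{(x^* - \bar x)^2}{\sum_i (x_i - \bar x)^2}\right)}
\qquad
\mathrm{pred}(x^*) = t\,\sqrt{\mathrm{MSE}\left(1 + \frac{1}{n} + \frac{(x^* - \bar x)^2}{\sum_i (x_i - \bar x)^2}\right)}$$

The prediction band differs only by the extra 1 under the square root, which accounts for the scatter of individual observations around the fitted line.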


In [138]:
# References:
# - Statistics in Geography by David Ebdon (ISBN: 978-0631136880)
# - Reliability Engineering Resource Website:
# - http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm
# - University of Glasgow, Department of Statistics:
# - http://www.stats.gla.ac.uk/steps/glossary/confidence_intervals.html#conflim
 
import numpy as np
import matplotlib.pyplot as plt
from random import random
from scipy.stats import t as t_distr
from matplotlib.patches import Rectangle

# example data
size = 50
x = [i + (2*random() + .5)**4 for i in range(size)]
y = [np.log(1. + i) + np.log(1 + i**((i + 1.) / (random()*20 + i))) for i in range(size)]
#x, y = zip(*sorted([(x[i], y[i]) for i in range(size)]))
x = np.array(x)
y = np.array(y) 

def uncertainty_band(x, y, df=1, conf=0.95):
    """
    Calculates the confidence or prediction band of the polynomial regression 
    model at the desired confidence level. 
    The 2 sigma confidence interval is 95% sure to contain the best-fit 
    regression line.
    The 2 sigma prediction interval contains 95% of the data points.
    
    :params .95 conf: desired confidence level, by default 0.95 (2 sigma)
    :params x: data array or list
    :params y: data array or list
    :params 1 df: polynomial degree for the fit
    
    :returns: an array with the confidence values to add/substract, another 
       with prediction values, an array with the fitted x values, an array with
       the fitted y values and the scipy.poly1d object, the R-square value of
       the fit
       
    """
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)
        
    # fit a polynomial of degree "df" to the data by least squares
    z = np.polyfit(x, y, df)
    p = np.poly1d(z)
    # predict y values of the original data using the fit
    p_y = p(x)

    # create series of new test x values to predict for
    p_x = np.linspace(min(x), max(x), 100)

    # number of samples in the original fit
    n = x.size
    # alpha: 1 minus the wanted probability
    alpha = 1. - conf
    # two-sided t quantile with n - df - 1 degrees of freedom
    t = t_distr.ppf(1. - alpha / 2., n - df - 1)

    # mean of x
    mean_x = np.mean(x)
    # Error sum of squares
    sse = sum((y - p_y)**2)
    # Error mean square (estimate of the variance)
    mse = sse / (n - df - 1)
    # squared deviation of each test point from the mean
    sdi = (p_x - mean_x)**2
    # sum of squared deviations of x
    sd = sum((x - mean_x)**2)
    # relative individual deviation
    sdi_sd = sdi / sd

    confs = t * np.sqrt(mse * (      1.0 / n + sdi_sd))
    preds = t * np.sqrt(mse * (1.0 + 1.0 / n + sdi_sd))

    # calculate R-square
    sstot = sum((y - np.mean(y))**2)
    sserr = sum((y - p(x))**2)
    r2 = 1 - sserr/sstot

    # now predict y based on test x-values
    p_y = p(p_x)

    return confs, preds, p_x, p_y, p, r2

confs, preds, p_x, p_y, p, r2 = uncertainty_band(x, y, df=4)

# plot sample data
dots = plt.plot(x, y, 'o', mec='none', color='red', label='Sample observations', alpha=.8)

# plot line of best fit
fit_line = plt.plot(p_x, p_y, color='darkgreen', lw=2, label='Regression line')

# plot confidence limits
plt.fill_between(p_x, p_y + confs, p_y - confs, color='darkgreen', alpha=0.3)
plt.fill_between(p_x, p_y - preds, p_y + preds, color='orangered', alpha=0.2)

p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="orangered", alpha=.2)
plt.legend(dots + fit_line + [p1, p2], 
           ['almost random dots',
            'Polynomial fit ($R^2=%.3f$):\n$y = %s$' % (r2,
                                ''.join([(('+' if i and v > 0 else '') + '%.2g' % v) + 
                               (('x^%s' % (len(p) - i)) if (len(p) - i) > 1
                                else 'x' if len(p) - i else '') for i, v in enumerate(p)])),
                '95% Confidence band', 
                '95% Prediction band'], loc='upper left', frameon=False, bbox_to_anchor=[1,1])


plt.xlim((min(x), max(x)))


Out[138]:
(3.836920029058672, 77.582024217741576)
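The same helper covers other settings, for instance a plain linear fit with a wider confidence level; a minimal sketch:

In [ ]:
# linear bands (df=1) at the 99% confidence level
confs, preds, p_x, p_y, p, r2 = uncertainty_band(x, y, df=1, conf=0.99)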

Same with any function
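The same band construction generalizes to an arbitrary model: the function is passed as a template string whose upper-case letters stand for the free parameters (e.g. "A*x^B+C"). The string is converted to a Python expression, filled in with candidate parameter values, and evaluated inside the objective function handed to curve_fit.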


In [29]:
# References:
# - Statistics in Geography by David Ebdon (ISBN: 978-0631136880)
# - Reliability Engineering Resource Website:
# - http://www.weibull.com/DOEWeb/confidence_intervals_in_simple_linear_regression.htm
# - University of Glasgow, Department of Statistics:
# - http://www.stats.gla.ac.uk/steps/glossary/confidence_intervals.html#conflim
 
import numpy as np
import matplotlib.pyplot as plt
from random import random
from scipy.stats import t as t_distr
from scipy.optimize import curve_fit
from matplotlib.patches import Rectangle
import re

def uncertainty_band(x, y, func_string, df=None, conf=0.95):
    """
    Calculates the confidence or prediction band of the polynomial regression 
    model at the desired confidence level. 
    The 2 sigma confidence interval is 95% sure to contain the best-fit 
    regression line.
    The 2 sigma prediction interval contains 95% of the data points.
    
    :params .95 conf: desired confidence level, by default 0.95 (2 sigma)
    :params x: data array or list
    :params y: data array or list
    :params func_string: a string representing the function to optimize.
      E.g.: "A*x^B+C*x" constants in upper-case
    :params None df: polynomial degree for the fit
    
    :returns: an array with the confidence values to add/substract, another 
       with prediction values, an array with the fitted x values, an array with
       the fitted y values and the scipy.poly1d object, the R-square value of
       the fit
       
    """
    recomp = re.compile("[A-Z]")
    func_restring = re.sub(recomp, "%s", func_string)
    
    df = df or len(re.findall(recomp, func_string))
    
    def func(x, *args):
        # substitute the current parameter values into the template and
        # evaluate the resulting expression in a dedicated namespace
        cmd = "zzz = " + func_restring.replace('^', '**') % args
        ns = {'x': x, 'np': np}
        exec(cmd, ns)
        try:
            return np.asarray_chkfinite(ns['zzz'])
        except ValueError:
            # avoid NaNs/infs produced by invalid power or log arguments
            return x
    
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    if not isinstance(y, np.ndarray):
        y = np.array(y)

    # fit the model to the data by least squares, starting from all-ones parameters
    z, covariance = curve_fit(func, x, y, [1.] * df)
    # predict y values of the original data using the fit
    p_y = func(x, *z)

    # create series of new test x values to predict for
    p_x = np.linspace(min(x), max(x), 100)

    # number of samples in the original fit
    n = x.size
    # alpha: 1 minus the wanted probability
    alpha = 1. - conf
    # two-sided t quantile with n - df - 1 degrees of freedom
    t = t_distr.ppf(1. - alpha / 2., n - df - 1)

    # mean of x
    mean_x = np.mean(x)
    # Error sum of squares
    sse = sum((y - p_y)**2)
    # Error mean square (estimate of the variance)
    mse = sse / (n - df - 1)
    # squared deviation of each test point from the mean
    sdi = (p_x - mean_x)**2
    # sum of squared deviations of x
    sd = sum((x - mean_x)**2)
    # relative individual deviation
    sdi_sd = sdi / sd

    confs = t * np.sqrt(mse * (      1.0 / n + sdi_sd))
    preds = t * np.sqrt(mse * (1.0 + 1.0 / n + sdi_sd))

    # calculate R-square
    sstot = sum((y - np.mean(y))**2)
    sserr = sum((y - func(x, *z))**2)
    r2 = 1 - sserr/sstot

    # now predict y based on test x-values
    p_y = func(p_x, *z)

    return confs, preds, p_x, p_y, z, r2

# example data (the first two definitions of y are leftovers from
# experimenting with other shapes; only the last one is used)
size = 100
x = [i + (4*random() - 2) for i in range(size)]
y = [3.4*i**3 + (random()*10 - 5)*i - 3*i**2 + 43 for i in range(size)]
y = [2.4**(.2*i) + (random()*1000 - 500) + 7 for i in range(size)]
y = [np.log(1 + i**((i + 1.) / (random()*20 + i + 1))) for i in range(size)]

x = np.array(x)
y = np.array(y) 

func_string = "A*x^3+ B*x^2+C*x+D"
func_string = "A^(x*B)+C"
func_string = "np.log(A*x^B)"
func_restring = re.sub("[A-Z]", "%s", func_string)

confs, preds, p_x, p_y, z, r2 = uncertainty_band(x, y, func_string)

# plot sample data
dots = plt.plot(x, y, 'o', mec='none', color='red', label='Sample observations', alpha=.8)

# plot line of best fit
fit_line = plt.plot(p_x, p_y, color='darkgreen', lw=2, label='Regression line')

# plot confidence limits
plt.fill_between(p_x, p_y + confs, p_y - confs, color='darkgreen', alpha=0.3)
plt.fill_between(p_x, p_y - preds, p_y + preds, color='orangered', alpha=0.2)

p1 = Rectangle((0, 0), 1, 1, fc="darkgreen", alpha=.3)
p2 = Rectangle((0, 0), 1, 1, fc="orangered", alpha=.2)
plt.legend(dots + fit_line + [p1, p2], 
           ['almost random dots',
            'Fit ($R^2=%.3f$):\ny = $%s$' % (r2,
                                re.sub(r'\^(\([^)(]+\))', r'^{\1}',
                                re.sub(r'\^([0-9.]+)', r'^{\1}',
                                       func_restring.replace('np.', '') % tuple(
                                ["%.2g" % i for i in z]))).replace('*', r'\times ')),
                '95% Confidence band', 
                '95% Prediction band'], loc='upper left', frameon=False, bbox_to_anchor=[1,1])

plt.xlim((min(x), max(x)))


log(0.47*x^{1.1})
Out[29]:
(1.3099251190151153, 98.974685401182271)
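Since only func_string changes, the same call works for any of the candidate models defined above; for instance, fitting the cubic template instead:

In [ ]:
# same data, different model template: the cubic polynomial defined earlier
confs, preds, p_x, p_y, z, r2 = uncertainty_band(x, y, "A*x^3+B*x^2+C*x+D")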

In [ ]: