In [1]:
from scipy import random, optimize, std
from matplotlib import pyplot
%matplotlib inline
import numpy
import csv
Set of measurements $\left\{Y_{k}\right\}$
at known locations $\left\{X_{k}\right\}$
Gaussian uncertainty $\sigma=1.0$
$p=30\%$ chance of adding constant factor $5.0$
fit a line $y=a_{0}+a_{1}x$
we want to find the parameters of the above function that maximize the probability of the measured data given the known points
$p\left(\vec{a}\bigr|\left\{Y_{k},X_{k},\sigma_{k}\right\}\right) \propto p\left(\left\{Y_{k}\right\}\bigr|\vec{a},\left\{X_{k},\sigma_{k}\right\}\right)p\left(\vec{a}\bigr|\left\{\sigma_{k}\right\}\right)$
we will assume a uniform or mostly flat prior, ignore the probability of the measurements, and drop the $\sigma$ and $X$ terms, so the likelihood will dominate, and we want to maximize
$p\left(\vec{a}\bigr|\left\{Y_{k}\right\}\right) \propto p\left(\left\{Y_{k}\right\}\bigr|\vec{a}\right)$
if the data measurements are independent, then the probability density is separable:
$p\left(\vec{a}\bigr|\left\{Y_{k}\right\}\right) \propto \prod_{k}p\left(y_{k}|\vec{a}\right)$
But instead of the standard standalone gaussian distribution, for each data point, we have two mutually exclusive cases. They can be added because they are disjoint, and since they form an exhaustive set, the sum of their probabilities, integrated over the whole parameter space, should equal 1. Since we're just doing a maximization we don't care about normalization, but this means we can write the likelihood for each data point as:
$p\left(\vec{a}\bigr|y_{k}\right) \propto\left(1-p\right)\exp{\left[-\frac{\left(y_{k}-\left(a_{0}+a_{1}x_{k}\right)\right)^{2}}{2\sigma^{2}}\right]}+p\exp{\left[-\frac{\left(y_{k}-\left(5+a_{0}+a_{1}x_{k}\right)\right)^{2}}{2\sigma^{2}}\right]}$
In [2]:
sigma_meas = 1.0  # standard deviation of measurements
p_err = 0.30      # probability of experimental mistake occurring
err_add = 5.0     # additive increase error

# Parse whitespace-delimited (x, y) pairs, skipping blank/short/non-numeric rows.
# BUG FIX: the original used short-circuit `and` chaining
#   float(row[0]) and float(row[1]) and x_data.append(...)
# so any row where x or y parsed to 0.0 skipped the x-append while the
# y-append still ran, silently misaligning the two arrays. A one-field row
# also raised IndexError (not caught by `except ValueError`). Both fields are
# now converted before either list is touched, and the file is closed via
# a context manager.
x_data = []
y_data = []
with open('hw2prob1-data.txt', 'rt') as data_file:
    reader = csv.reader(data_file, delimiter=' ')
    for row in reader:
        if len(row) < 2:
            continue
        try:
            x_val = float(row[0])
            y_val = float(row[1])
        except ValueError:
            continue
        x_data.append(x_val)
        y_data.append(y_val)
x_data = numpy.asarray(x_data)
y_data = numpy.asarray(y_data)
print(x_data)
print(y_data)
In [3]:
def negative_posterior(a, x, y, sigma, p, err=5.0):
    """Negative (unnormalized) posterior for the line y = a[0] + a[1]*x.

    Each data point's likelihood is a two-component mixture: with
    probability (1 - p) the measurement is clean, with probability p it
    carries an additive offset `err` (the experimental mistake).

    Parameters
    ----------
    a : sequence of two floats, (intercept a0, slope a1).
    x, y : array-like, measurement locations and measured values.
    sigma : float, Gaussian measurement uncertainty.
    p : float, probability of the additive mistake.
    err : float, size of the additive mistake. Defaults to 5.0 so existing
        callers are unchanged; previously this constant was hard-coded even
        though the script defines it as `err_add`.

    Returns
    -------
    float: the product of per-point mixture likelihoods times -1, so that a
    minimizer such as scipy.optimize.fmin maximizes the posterior.
    """
    x = numpy.asarray(x, dtype=float)
    y = numpy.asarray(y, dtype=float)
    # Vectorized mixture likelihood (the original looped point by point).
    clean = numpy.exp(-(y - (a[0] + a[1] * x)) ** 2 / (2 * sigma ** 2))
    shifted = numpy.exp(-(y - (err + a[0] + a[1] * x)) ** 2 / (2 * sigma ** 2))
    return -numpy.prod((1 - p) * clean + p * shifted)
In [4]:
# Evaluate the posterior on an (a0, a1) grid; sign is flipped back to
# positive since negative_posterior returns -posterior for the minimizer.
a0 = numpy.arange(-2.0, 14.0, 0.05)
a1 = numpy.arange(-1.1, 1.4, 0.05)
z = numpy.array([[-negative_posterior([intercept, slope],
                                      x_data, y_data, sigma_meas, p_err)
                  for intercept in a0]
                 for slope in a1])
contour_plot = pyplot.contour(a0, a1, z)
pyplot.colorbar()
pyplot.xlabel('$a_0$', fontsize=20)
pyplot.ylabel('$a_1$', fontsize=20)
pyplot.title('Posterior probability (unnormalized)', fontsize=15)
Out[4]:
In [7]:
# First optimization attempt, starting from a naive initial guess.
guess = [[0.1, 0.1]]
first_fit = optimize.fmin(negative_posterior, guess[0],
                          args=(x_data, y_data, sigma_meas, p_err))
a = [first_fit]
print(a)
y = [a[0][0] * numpy.ones(len(x_data)) + a[0][1] * x_data]
In [8]:
def chi_square(a, x, y):
    """Sum of squared residuals of y against the line a[0] + a[1]*x.

    Used as an ordinary least-squares objective to obtain a data-driven
    starting guess for the posterior optimization.

    Parameters
    ----------
    a : sequence of two floats, (intercept a0, slope a1).
    x, y : array-like, measurement locations and measured values.

    Returns
    -------
    float: sum over k of (y[k] - (a[0] + a[1]*x[k]))**2.
    """
    x = numpy.asarray(x, dtype=float)
    y = numpy.asarray(y, dtype=float)
    # Vectorized residual computation (original looped and pre-allocated).
    residuals = y - (a[0] + a[1] * x)
    return numpy.sum(residuals ** 2)
In [9]:
# A least-squares fit supplies a second, data-driven starting guess.
ls_start = optimize.fmin(chi_square, guess[0], args=(x_data, y_data))
guess.append(ls_start)
print(guess[1])
In [16]:
# Re-run the posterior optimization from the least-squares solution.
second_fit = optimize.fmin(negative_posterior, guess[1],
                           args=(x_data, y_data, sigma_meas, p_err))
a.append(second_fit)
print(a[1])
y.append(a[1][0] * numpy.ones(len(x_data)) + a[1][1] * x_data)
In [17]:
# Third attempt: hand-picked start near one mode of the posterior surface.
guess.append([1.9, 0.75])
third_fit = optimize.fmin(negative_posterior, guess[2],
                          args=(x_data, y_data, sigma_meas, p_err))
a.append(third_fit)
print(a[2])
y.append(a[2][0] * numpy.ones(len(x_data)) + a[2][1] * x_data)
In [19]:
# Fourth attempt: start near the other mode of the posterior surface.
# BUG FIX: the original assigned guess[3], but guess only has indices 0-2
# on a fresh Restart-&-Run-All, raising IndexError (it only "worked" because
# of out-of-order execution — note the non-sequential In[] counts).
guess.append([11.0, -0.6])
fourth_fit = optimize.fmin(negative_posterior, guess[3],
                           args=(x_data, y_data, sigma_meas, p_err))
a.append(fourth_fit)
print(a[3])
y.append(a[3][0] * numpy.ones(len(x_data)) + a[3][1] * x_data)
Plot results
In [20]:
# Plot the data (with and without the +5 mistake removed) and all four fits.
pyplot.title("y vs x")
pyplot.xlabel("x")
pyplot.ylabel("y")
plots = [pyplot.plot(x_data, y_data, 'bo', label="Measured")]
plots.append(pyplot.plot(x_data, y_data - 5, 'mo', label="Measured (error)"))
plots.append(pyplot.plot(x_data, y[0], 'r-', label="Guess i"))
plots.append(pyplot.plot(x_data, y[1], 'g-', label="Guess ii"))
plots.append(pyplot.plot(x_data, y[2], 'y-', label="Guess iii"))
plots.append(pyplot.plot(x_data, y[3], 'c-', label="Guess iv"))
pyplot.axis([2.0, 10.0, 1.0, 10.0])
# BUG FIX: the original passed `plots` (a list of [Line2D] lists) as the
# first positional argument, which legend() treats as the handles/labels
# sequence, producing a broken legend. Every plot above already sets
# label=, so legend() needs no positional arguments at all.
pyplot.legend(loc='center left', bbox_to_anchor=(1., 0.5))
Out[20]:
The legend isn't working properly. The legend should read: y_meas, y_remove_error, first guess, second, third, fourth
In [21]:
# Overlay the four fitted (a0, a1) pairs on the posterior surface.
# BUG FIX: the original initialized params_x/params_y as `[]*l`, which is a
# no-op (always the empty list); the index loop is replaced by comprehensions.
params_x = [fit[0] for fit in a]
params_y = [fit[1] for fit in a]
pyplot.contourf(a0, a1, z)
pyplot.plot(params_x, params_y, 'ko')
pyplot.xlabel('$a_0$', fontsize=20)
pyplot.ylabel('$a_1$', fontsize=20)
pyplot.title("Fit param locations")
Out[21]: