This notebook was developed from July 2014 through September 2014. The primary goal of the development of this notebook was to create a model that accurately estimates the probability that a given star will succeed or fail in its acquisition. Specifically, the model in this notebook attempts to provide a posterior predictive distribution for a given star utilizing the magnitude of the star as well as the estimated warm pixel fraction for that observation.
In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import acqstattools as ast
import pystan
from astropy.time import Time
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Pointer to historical star acquisition data
datalocation = 'data/acq_table.npy'
# Enter a training period from the dataset
train_start = 2011.5
train_end = 2014.5
# Enter a test period from the dataset for validation
test_start = 2011.5
test_end = 2014.5
fileout = "acqstats_bp6_draws_trained_{}_to_{}.pkl".format(train_start, train_end)
analysis_description = """
Chandra Star Acquisition Analysis Results
- Method: Bayesian Binomial Probit Regression
- Second Order Polynomial of Star Magnitude
- Warm Pixel Fraction
- Interactions between Mag & WPF
Train Period: {0} to {1}
Test Period : {2} to {3}
File Output : {4}
""".format(train_start, train_end, test_start, test_end, fileout)
In [3]:
#Loading data from file
acq_data = np.load(datalocation)
#Adding fields required for analysis
acq_data = ast.add_column(acq_data, 'tstart_jyear' , np.zeros(len(acq_data)))
acq_data = ast.add_column(acq_data, 'year' , np.zeros(len(acq_data)))
acq_data = ast.add_column(acq_data, 'failure' , np.zeros(len(acq_data)))
acq_data['tstart_jyear'] = Time(acq_data['tstart'], format='cxcsec').jyear
acq_data['year'] = np.floor(acq_data.tstart_jyear)
acq_data['failure'][acq_data['obc_id']=="NOID"] = 1.
In [4]:
bad_obsids = [
# Venus
2411,2414,6395,7306,7307,7308,7309,7311,7312,7313,7314,7315,7317,7318,7406,583,
7310,9741,9742,9743,9744,9745,9746,9747,9749,9752,9753,9748,7316,15292,16499,
16500,16501,16503,16504,16505,16506,16502,
# multi-obi catalogs
943,897,60879,2042,60881,800,1900,2365,906,2010,3182,2547,380,3057,2077,60880,
2783,1578,1561,4175,3764
]
for badid in bad_obsids:
acq_data = acq_data[acq_data['obsid']!=badid]
In [5]:
#Subsetting up the training data
traindata = ast.subset_range_tstart_jyear(acq_data, train_start, train_end)
train_success = traindata[traindata['obc_id']=='ID']
train_fail = traindata[traindata['obc_id']=='NOID']
Below you can change the Warm Pixel Fraction that we're initially testing (the notebook shows cases of other warm pixel fractions later). I'm subsetting the data this far up in the notebook so that we can get a visualization of what the training data looks like for the samples we're trying to estimate. A good idea is to use the newest data as out of sample test data to attempt to see how we'll perform in the future.
In [6]:
#Subsetting the Test Data
test_data = ast.subset_range_tstart_jyear(acq_data, test_start, test_end)
#Finding the mean warmpixel fraction in the test data
wp_test = np.mean(test_data['warm_pix'])
#Subsetting based on this warm pixel fraction
wp_range = ast.subset_range_warmpix(test_data, wp_test)
Before we start we also need a function that estimates error bars for each bin that we'll eventually put together. Normal statistics textbooks would set up the interval as $\hat p \pm z^{*} \sqrt{\frac{\hat p(1-\hat p)}{n}}$ where $n$ is the total number of trials and $\hat p$ in our case would be the sum of the failures over the total number of observations. This represents the results from the Central Limit Theorem. Unfortunately this does not work well for small $n$ or extreme probabilities, which is a majority of dataset. Thus in order to calculate our intervals, we'll use Wilson's method with a continuity correction which is better for small $n$ and extreme probabilities.
http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval
In [7]:
def wilsoninterval(failures, total_count):
p = failures.astype(float)/total_count # Estimated Probability of Failure
q = 1 - p # Complement (1 - p)
z = 1.96 # Z Score corresponding to 95% Interval
n = total_count # Total observed
L = (2*n*p + z**2 - 1 - z*np.sqrt(z**2 - 2 - (1/n) + 4*p*(n*q + 1))) / (2*(n + z**2)) # Lower Bound
U = (2*n*p + z**2 + 1 + z*np.sqrt(z**2 - 2 - (1/n) + 4*p*(n*q + 1))) / (2*(n + z**2)) # Upper Bound
return U, L
Included are the following plots:
In [8]:
################################
# Visualizing the Training Data
################################
nbins = np.arange(6,11.75,0.25)
wp_train_range = ast.subset_range_warmpix(traindata, wp_test)
# Histograms of the successes/failures for the training range.
F = plt.figure()
p1 = plt.hist(train_success['mag'], bins=nbins, color='b', alpha=0.5, label="Successes")
p2 = plt.hist(train_fail['mag'], bins=nbins, color='r', alpha=0.5, label="Failures")
lgd = plt.legend()
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} (All Data)'.format(train_start, train_end))
plt.grid()
plt.show()
# Histogram of the observed warm pixel fractions for the training data.
F = plt.figure()
p1 = plt.hist(traindata['warm_pix'])
F.set_size_inches(10,3)
plt.xlabel('Warm Pixel Fraction')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} (All Data)'.format(train_start, train_end))
plt.grid()
plt.show()
# Line chart of the proportion of failed stars binned by star magnitude.
failure_counts, bin_edges = np.histogram(train_fail['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(train_success['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(traindata['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
pupper, plower = wilsoninterval(failure_counts, total_counts)
bin_middle = bin_edges[1:] - 0.125 # Centering the bin counts for the plots. Learned this was important
F = plt.figure()
p1, = plt.plot(bin_middle, percent_fail)
p2, = plt.plot(bin_middle, plower, linestyle='--', color='red')
p3, = plt.plot(bin_middle, pupper, linestyle='--', color='red')
plt.legend([p1, p2],
["Observed Data", "95% Interval"], loc=2)
plt.ylim((0.,1.))
plt.xlim((6.,11.))
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Proportion Failed')
plt.title('Star Acquisitions for Time Range: {} - {} (All Data)'.format(train_start, train_end))
plt.grid()
plt.show()
# Histograms of a subset of successes/failures for the training range for a given warm pixel fraction
F = plt.figure()
plt.hist(wp_train_range[wp_train_range['obc_id']=="ID"]['mag'], bins=nbins, color='b', alpha=0.5, label="Successes")
plt.hist(wp_train_range[wp_train_range['obc_id']=="NOID"]['mag'], bins=nbins, color='r', alpha=0.5, label="Failures")
lgd = plt.legend()
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} (Warm Pix Subset)'.format(train_start, train_end))
plt.grid()
plt.show()
# Histograms of the window of eventually tested warm pixel fraction
F = plt.figure()
plt.hist(wp_train_range['warm_pix'])
F.set_size_inches(10,3)
plt.xlabel('Warm Pixel Fraction')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} '.format(train_start, train_end))
plt.grid()
plt.show()
# Summary Table
print """Warm Pixel Fraction Range: {0:<3.2f} - {1:3.2f}""".format(wp_test - 0.01, wp_test + 0.01)
print """Average WPF : {}\n""".format(np.mean(wp_train_range['warm_pix']))
print """{0:^15} | {1:^15} | {2:^15} | {3:^15}""".format("Mag Bin", "Percent Failed", "Total Observed", "Failure Count")
print """{}""".format("-"*70)
for i in np.arange(0, len(total_counts)):
print """{0:<15} | {1:^15.3f} | {2:^15} | {3:^15}""".format("{0:<5.2f} - {1:>5.2f}".format(nbins[i+1] - 0.25, nbins[i+1]),
100*percent_fail[i],
total_counts[i],
failure_counts[i])
# Line chart of the proportion of failed stars binned by star magnitude for the given warm pixel fraction range
failure_counts, bin_edges = np.histogram(wp_train_range[wp_train_range['obc_id']=="NOID"]['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(wp_train_range[wp_train_range['obc_id']=="ID"]['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(wp_train_range['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
pupper, plower = wilsoninterval(failure_counts, total_counts)
bin_middle = bin_edges[1:] - 0.125
F = plt.figure()
p1, = plt.plot(bin_middle, percent_fail)
p2, = plt.plot(bin_middle, plower, linestyle='--', color='red')
p3, = plt.plot(bin_middle, pupper, linestyle='--', color='red')
plt.xlim((8,11))
plt.ylim((0,1))
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Proportion Failed')
plt.title('Star Acquisitions for Time Range: {} - {}'.format(train_start, train_end))
plt.grid()
plt.show()
Included are the following plots:
In [9]:
#############################
# Test Data Visualization
#############################
nbins = np.arange(6,11.75,0.25)
test_data = ast.subset_range_tstart_jyear(acq_data, test_start, test_end)
wp_range = ast.subset_range_warmpix(test_data, wp_test)
# Histograms of the successes/failures for observations in the observed test range.
F = plt.figure()
plt.hist(test_data[test_data['obc_id']=="ID"]['mag'], bins=nbins, color='b', alpha=0.5, label="Successes")
plt.hist(test_data[test_data['obc_id']=="NOID"]['mag'], bins=nbins, color='r', alpha=0.5, label="Failures")
lgd = plt.legend()
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} (All Data)'.format(test_start, test_end))
plt.grid()
plt.show()
# Histogram of the observed warm pixel fractions for the test data.
F = plt.figure()
p1 = plt.hist(test_data['warm_pix'])
F.set_size_inches(10,3)
plt.xlabel('Warm Pixel Fraction')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {} (All Data)'.format(test_start, test_end))
plt.grid()
plt.show()
# Line chart of the proportion of failed stars binned by star magnitude.
failure_counts, bin_edges = np.histogram(test_data[test_data['obc_id']=="NOID"]['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(test_data[test_data['obc_id']=="ID"]['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(test_data['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
pupper, plower = wilsoninterval(failure_counts, total_counts)
bin_middle = bin_edges[1:] - 0.125
F = plt.figure()
p1, = plt.plot(bin_middle, percent_fail)
p2, = plt.plot(bin_middle, plower, linestyle='--', color='red')
p3, = plt.plot(bin_middle, pupper, linestyle='--', color='red')
plt.xlim((8,11))
plt.ylim((0,1))
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Proportion Failed')
plt.title('Star Acquisitions for Time Range: {} - {}'.format(test_start, test_end))
plt.grid()
plt.show()
# Histograms of a subset of successes/failures for the test range for a given warm pixel fraction
F = plt.figure()
plt.hist(wp_range[wp_range['obc_id']=="ID"]['mag'], bins=nbins, color='b', alpha=0.5, label="Successes")
plt.hist(wp_range[wp_range['obc_id']=="NOID"]['mag'], bins=nbins, color='r', alpha=0.5, label="Failures")
lgd = plt.legend()
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {}'.format(test_start, test_end))
plt.grid()
plt.show()
# Histograms of the window of tested warm pixel fraction
F = plt.figure()
plt.hist(wp_range['warm_pix'])
F.set_size_inches(10,3)
plt.xlabel('Warm Pixel Fraction')
plt.ylabel('Frequency')
plt.title('Star Acquisitions for Time Range: {} - {}'.format(test_start, test_end))
plt.grid()
plt.show()
# Summary Table
print """Warm Pixel Fraction Range: {0:<3.2f} - {1:3.2f}""".format(wp_test - 0.01, wp_test + 0.01)
print """Average WPF : {}\n""".format(np.mean(wp_range['warm_pix']))
print """{0:^15} | {1:^15} | {2:^15} | {3:^15}""".format("Mag Bin", "Percent Failed", "Total Observed", "Failure Count")
print """{}""".format("-"*70)
for i in np.arange(0, len(total_counts)):
print """{0:<15} | {1:^15.3f} | {2:^15} | {3:^15}""".format("{0:<5.2f} - {1:>5.2f}".format(nbins[i+1] - 0.25, nbins[i+1]),
100*percent_fail[i],
total_counts[i],
failure_counts[i])
# Line chart of the proportion of failed stars binned by star magnitude for the given warm pixel fraction range
failure_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="NOID"]['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="ID"]['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(wp_range['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
pupper, plower = wilsoninterval(failure_counts, total_counts)
bin_middle = bin_edges[1:] - 0.125
F = plt.figure()
p1, = plt.plot(bin_middle, percent_fail)
p2, = plt.plot(bin_middle, plower, linestyle='--', color='red')
p3, = plt.plot(bin_middle, pupper, linestyle='--', color='red')
plt.xlim((8,11))
plt.ylim((0,1))
F.set_size_inches(10,3)
plt.xlabel('Star Magnitude')
plt.ylabel('Proportion Failed')
plt.title('Star Acquisitions for Time Range: {} - {}'.format(test_start, test_end))
plt.grid()
plt.show()
Below is the PyStan Model that we'll be using. Documentation can be found at the following locations:
To understand the "model" below, use the modeling documentation at the second link. The first link will allow you to understand how to access elements from the model fit as well as what's available.
NOTE: All variables are centered for computational purposes. Without centering a years worth of data goes from completing in 5 mins to completing in 70 minutes. Do NOT uncenter and rerun, you won't be happy. Also if this is being run within an iPython Notebook, I've found it best to Interupt and Restart the Kernel between each running of the PyStan Model.
Response:
Covariates:
Model:
\begin{eqnarray} y_i &\sim& Bernoulli(\Phi(\eta_i)) \\ \eta_i &=& \alpha + \beta_1 \times Mag + \beta_2 \times Mag^2 + \beta_3 \times WarmPix \\ & & + \beta_4 \times Mag \times WarmPix + \beta_5 \times Mag^2 \times WarmPix \\ \beta &\sim& N(0, 5I) \end{eqnarray}Coefficient Descriptions:
\begin{eqnarray} \alpha &:& \mbox{Intercept term (Included in beta vector)} \\ \beta_1 &:& \mbox{First Order Star Magnitude Coefficient} \\ \beta_2 &:& \mbox{Second Order Star Magnitude Coefficient} \\ \beta_3 &:& \mbox{Warm Pixel Fraction Coefficient} \\ \beta_4 &:& \mbox{First Order Interaction Coefficient} \\ \beta_5 &:& \mbox{Second Order Interaction Coefficient} \\ \end{eqnarray}Model Runtime:
Factor of number of observations and number of iterations. Currently 3000 iterations are used on 4 chains. Note as a reference, the Stan modeling language guidelines suggest NOT using 1 chain. After burn in, and merging we have about 6000 samples to play with.
In [10]:
chandra_model = """
data {
int N; // number of observations
int y[N]; // binary outcome for star acquisition event 'n'
real Mag[N]; // predictive feature - star magnitude
real WarmPix[N]; // predictive feature - warm pixel fraction
}
parameters {
real alpha; // intercept coefficient
real beta1; // Star Magnitude coefficient
real beta2; // Second Order Star Mag Coefficient
real beta3; // Warm Pixel Fraction coefficient
real beta4; // Magnitude and WP interaction coefficient
real beta5; // Second Order Interaction Term
}
model {
alpha ~ normal(0,5); // weakly informative
beta1 ~ normal(0,5); // Star Magnitude coefficient
beta2 ~ normal(0,5); // Star Magnitude Second Order
beta3 ~ normal(0,5); // Warm Pixel Fraction coefficient
beta4 ~ normal(0,5); // Magnitude and WP interaction coefficient
beta5 ~ normal(0,5); // M & WP Second Order Interaction
for (n in 1:N)
y[n] ~ bernoulli(Phi(alpha +
beta1 * Mag[n] +
beta2 * Mag[n] * Mag[n] +
beta3 * WarmPix[n] +
beta4 * Mag[n] * WarmPix[n] +
beta5 * Mag[n] * Mag[n] * WarmPix[n])); //
}
"""
chandra_data = {'N': int(len(traindata)),
'y':traindata["failure"].astype(int),
'Mag':traindata["mag"] - np.mean(traindata["mag"]),
'WarmPix':traindata["warm_pix"] - np.mean(traindata["warm_pix"])
}
chandra_fit = pystan.stan(model_code=chandra_model, data=chandra_data, iter=3000, chains=4)
In [11]:
chandra_fit
Out[11]:
In [12]:
chandra_fit.plot().set_size_inches(15,15)
In [13]:
beta_draws = chandra_fit.extract()
In [14]:
### Coefficient Diagnostic Plots
#
# This takes in output from the pystan.extract() function and plots pairwise scatter plots of each coefficient.
# The diagonal of the plot is the histogram that is similar to what is show in the pystan supplied plots.
def plotdiagnostic(draws):
n_coefs = len(draws) - 1
nplots = n_coefs**2
names = []
for d in draws:
names.append(d)
F = plt.figure()
spot = 1
for n in names[:-1]:
for i, d in enumerate(draws):
if i == n_coefs:
break
else:
if n == d:
plt.subplot(n_coefs,n_coefs,spot)
plt.hist(draws[d], alpha=0.5)
plt.grid()
plt.xticks(rotation=45)
if spot <= n_coefs:
plt.title(d)
spot += 1
else:
plt.subplot(n_coefs,n_coefs,spot)
plt.plot(draws[n],draws[d],marker='.',linestyle='', alpha=0.3)
plt.grid()
plt.xticks(rotation=45)
if spot <= n_coefs:
plt.title(d)
spot += 1
F.set_size_inches(15,15)
plt.show()
plotdiagnostic(beta_draws)
In [15]:
# Calculating the posterior mean for each coefficient.
a = np.mean(beta_draws['alpha'])
b1 = np.mean(beta_draws['beta1'])
b2 = np.mean(beta_draws['beta2'])
b3 = np.mean(beta_draws['beta3'])
b4 = np.mean(beta_draws['beta4'])
b5 = np.mean(beta_draws['beta5'])
# Creating a vector with these means
betas = np.array([a, b1, b2, b3, b4, b5])
# Storing the centers of the inputs for later use
m_center = np.mean(traindata['mag'])
wp_center = np.mean(traindata['warm_pix'])
In [16]:
def build_matrix_line(m, wp, m_center, wp_center):
mag = m - m_center
warm = wp - wp_center
return np.array([1, mag, mag**2, warm, mag*warm, mag*mag*warm])
In [17]:
def build_matrix_fixedWP(wp, m_center, wp_center):
mag = np.arange(5.5,13.25,0.25)
Xline = [build_matrix_line(mag[0], wp, m_center, wp_center)]
for i, m in enumerate(mag[1:],start=1):
Xline = np.vstack((Xline, build_matrix_line(mag[i], wp, m_center, wp_center)))
return Xline
In [18]:
def compare_obs_2_calc(wp_test, betas):
nbins = np.arange(6,11.75,0.25)
# Creating a subset of the data based on wp_test input
test_data = ast.subset_range_tstart_jyear(acq_data, test_start, test_end)
wp_range = ast.subset_range_warmpix(test_data, wp_test)
test_matrix = build_matrix_fixedWP(wp_test, m_center, wp_center)
# Calculating the estimated probabilities
y = stats.norm.cdf(np.dot(test_matrix, betas))
x = np.arange(5.5,13.25,0.25)
# Calculating the proportion of failures in each bin and CI
failure_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="NOID"]['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="ID"]['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(wp_range['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
pupper, plower = wilsoninterval(failure_counts, total_counts)
# Centering the bin for plotting
bin_middle = bin_edges[1:] - 0.125
# Calculating the expected failures for each bin based on estimated probabilities
exp_fails = []
exp_sd = []
for i in np.arange(0, len(total_counts)):
mag = ((nbins[i+1] - 0.25) + nbins[i+1])/2.0
pi = stats.norm.cdf(np.dot(build_matrix_line(mag, wp_test, m_center, wp_center), betas))
exp_fail = total_counts[i] * pi
std_dev = np.sqrt(total_counts[i] * pi * (1 - pi))
exp_fails.append(exp_fail)
exp_sd.append(std_dev)
upper = np.array(exp_fails) + 2 * np.array(exp_sd)
lower = np.array(exp_fails) - 2 * np.array(exp_sd)
pltmax = np.max((np.max(upper), np.max(failure_counts)))
# Plotting
F = plt.figure()
plt.subplot(121)
p1, = plt.plot(x,y)
p2, = plt.plot(bin_middle, percent_fail)
p3, = plt.plot(bin_middle, plower, linestyle='--', color='red')
p4, = plt.plot(bin_middle, pupper, linestyle='--', color='red')
plt.grid()
plt.legend([p2,p3,p1], ["Observed: WPF={}".format(wp_test), "Observed Interval", "Est. Probability Curve"], loc=2)
plt.xlabel("Star Magnitude")
plt.ylabel("Proportion Failed")
plt.xlim((8,11))
plt.subplot(122)
p1, = plt.plot(bin_middle, failure_counts, color='green')
p2, = plt.plot(bin_middle, exp_fails, color='b', alpha=0.5)
p3, = plt.plot(bin_middle, upper, color='r', linestyle="--", alpha=0.5)
p4, = plt.plot(bin_middle, lower, color='r', linestyle="--", alpha=0.5)
plt.ylim((0,pltmax))
plt.legend([p1,p2, p3], ["Observed Failures: WPF={}".format(wp_test), "Est. # Failures", "95% Interval"], loc=2)
plt.xlabel("Star Magnitude")
plt.ylabel("Number Failed")
F.set_size_inches(15,5)
plt.grid()
plt.show()
In [19]:
compare_obs_2_calc(wp_test, betas)
In [20]:
def compare_show_errors(wp_test, beta_draws, betas):
nbins = np.arange(6,11.75,0.25)
# Creating a subset of the data based on wp_test input
test_data = ast.subset_range_tstart_jyear(acq_data, test_start, test_end)
wp_range = ast.subset_range_warmpix(test_data, wp_test)
test_matrix = build_matrix_fixedWP(wp_test, m_center, wp_center)
# Calculating the proportion of failures in each bin and CI
failure_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="NOID"]['mag'], bins=nbins)
success_counts, bin_edges = np.histogram(wp_range[wp_range['obc_id']=="ID"]['mag'], bins=nbins)
total_counts, bin_edges = np.histogram(wp_range['mag'], bins=nbins)
percent_fail = failure_counts.astype(float) / total_counts
plower, pupper = wilsoninterval(failure_counts, total_counts)
bin_middle = bin_edges[1:] - 0.125
F = plt.figure()
for n in np.arange(1,len(beta_draws['alpha']),1):
betas_n = beta_draws['alpha'][n], beta_draws['beta1'][n], beta_draws['beta2'][n], beta_draws['beta3'][n], beta_draws['beta4'][n], beta_draws['beta5'][n]
y = stats.norm.cdf(np.dot(test_matrix, betas_n))
x = np.arange(5.5,13.25,0.25)
plt.plot(x,y, alpha=0.5)
p1, = plt.plot(bin_middle, percent_fail, color='green', linewidth=5)
p2, = plt.plot(bin_middle, plower, linestyle='--', color='red', linewidth=3)
p3, = plt.plot(bin_middle, pupper, linestyle='--', color='red', linewidth=3)
plt.legend([p1], ["Observed: WPF={}".format(wp_test)], loc=2)
plt.xlabel("Star Magnitude")
plt.ylabel("Proportion Failed")
plt.xlim((8,11))
F.set_size_inches(15,5)
plt.grid()
plt.show()
# Summary Table
print """Warm Pixel Fraction Range: {0:<3.2f} - {1:3.2f}""".format(wp_test - 0.01, wp_test + 0.01)
print """Average WPF : {}\n""".format(np.mean(wp_range['warm_pix']))
print """{0:^15} | {1:^15} | {2:^15} | {3:^15} | {4:^15} | {5:^15}""".format("Mag Bin",
"Percent Failed",
"Total Observed",
"Failure Count",
"Expected Fails",
"1-Std Dev")
print """{}""".format("-"*105)
for i in np.arange(0, len(total_counts)):
mag = ((nbins[i+1] - 0.25) + nbins[i+1])/2.0
pi = stats.norm.cdf(np.dot(build_matrix_line(mag, wp_test, m_center, wp_center), betas))
exp_fail = total_counts[i] * pi
std_dev = np.sqrt(total_counts[i] * pi * (1 - pi))
print """{0:<15} | {1:^15.3f} | {2:^15} | {3:^15} | {4:^15.1f} | {5:^15}""".format("{0:<5.2f} - {1:>5.2f}".format(nbins[i+1] - 0.25, nbins[i+1]),
100*percent_fail[i],
total_counts[i],
failure_counts[i],
exp_fail,
"+/- {:<4.1f}".format(std_dev))
In [21]:
compare_show_errors(wp_test, beta_draws, betas)
In [22]:
star_mag = 10.5
allbetas = np.vstack((beta_draws['alpha'],
beta_draws['beta1'],
beta_draws['beta2'],
beta_draws['beta3'],
beta_draws['beta4'],
beta_draws['beta5']))
pvalues = stats.norm.cdf(np.dot(build_matrix_line(star_mag, wp_test, m_center, wp_center), allbetas))
F = plt.figure()
plt.hist(pvalues, bins=25, normed=True)
plt.xlim((0,1))
plt.xlabel('Probability of Failure')
if np.min(pvalues>=0.5):
plt.legend(['Star Mag = {}\nWP: {}'.format(star_mag, wp_test)], loc=1)
elif np.max(pvalues>=0.5):
plt.legend(['Star Mag = {}\nWP: {}'.format(star_mag, wp_test)], loc=2)
plt.yticks(visible=False)
plt.xticks(np.arange(0,1.1,0.1),np.arange(0,1.1,0.1))
plt.grid(which='both', axis='x')
plt.show()
In [23]:
compare_obs_2_calc(0.11, betas)
compare_show_errors(0.11, beta_draws, betas)
In [24]:
compare_obs_2_calc(0.12, betas)
compare_show_errors(0.12, beta_draws, betas)
In [25]:
compare_obs_2_calc(0.13, betas)
compare_show_errors(0.13, beta_draws, betas)
In [26]:
compare_obs_2_calc(0.14, betas)
compare_show_errors(0.14, beta_draws, betas)
In [27]:
compare_obs_2_calc(0.15, betas)
compare_show_errors(0.15, beta_draws, betas)
In [28]:
compare_obs_2_calc(0.16, betas)
compare_show_errors(0.16, beta_draws, betas)
In [29]:
compare_obs_2_calc(0.17, betas)
compare_show_errors(0.17, beta_draws, betas)
In [30]:
compare_obs_2_calc(0.18, betas)
compare_show_errors(0.18, beta_draws, betas)
In [31]:
compare_obs_2_calc(0.19, betas)
compare_show_errors(0.19, beta_draws, betas)
Since our model is $p = \Phi(X\beta)$, we can solve for warm pixel faction if we know $p$ and star magnitude.
Here all covariates are centered....,
\begin{eqnarray} p &=& \Phi(X\beta) \\ 0.5 &=& \Phi(\alpha + \beta_1 \times Mag + \beta_2 \times Mag^2 + \beta_3 \times WarmPix + \beta_4 \times Mag \times WarmPix + \beta_5 \times Mag^2 \times WarmPix) \\ \Phi^{-1}(0.5) &=& \alpha + \beta_1 \times Mag + \beta_2 \times Mag^2 + \beta_3 \times WarmPix + \beta_4 \times Mag \times WarmPix + \beta_5 \times Mag^2 \times WarmPix \\ 0 &=& \alpha + \beta_1 \times Mag + \beta_2 \times Mag^2 + \beta_3 \times WarmPix + \beta_4 \times Mag \times WarmPix + \beta_5 \times Mag^2 \times WarmPix \\ - (\alpha + \beta_1 \times Mag + \beta_2 \times Mag^2) &=& (\beta_3 + \beta_4 \times Mag + \beta_5 \times Mag^2) \times WarmPix \\ & & \\ WarmPix_{0.5} &=& \frac{-(\alpha + \beta_1 \times Mag + \beta_2 \times Mag^2)}{(\beta_3 + \beta_4 \times Mag + \beta_5 \times Mag^2)} \\ \end{eqnarray}
In [32]:
def warmpix50(betas, mag, mcenter, wpcenter):
num = -1.0 * (betas[0] + betas[1]*(mag-mcenter) + betas[2]*(mag-mcenter)**2)
denom = betas[3] + betas[4]*(mag-mcenter) + betas[5]*(mag-mcenter)**2
return num/denom + wpcenter
In [33]:
wp50 = []
mags = np.arange(9.75,11.0,0.001)
maxyear = np.max(acq_data['tstart_jyear'])
avgwpf_year = np.mean(ast.subset_range_tstart_jyear(acq_data, maxyear - 1, maxyear)['warm_pix'])
avgwpf_6mo = np.mean(ast.subset_range_tstart_jyear(acq_data, maxyear - 0.5, maxyear)['warm_pix'])
avgwpf_3mo = np.mean(ast.subset_range_tstart_jyear(acq_data, maxyear - 0.25, maxyear)['warm_pix'])
for m in mags:
wp50.append(warmpix50(betas, m, m_center, wp_center))
F = plt.figure()
p1, = plt.plot(mags, wp50)
p2, = plt.plot(mags, avgwpf_year*np.ones(len(mags)), color='r', linestyle='--')
p3, = plt.plot(mags, avgwpf_6mo*np.ones(len(mags)), color='g', linestyle='--')
p4, = plt.plot(mags, avgwpf_3mo*np.ones(len(mags)), color='black', linestyle='--')
plt.grid(b=None, which='both', axis='both')
plt.legend([p1,p2,p3,p4],
['50% Failure Line',
'Most Recent Year WP Average',
'Most Recent 6 mo. WP Average',
'Most Recent 3 mo. WP Average'])
F.set_size_inches(15,5)
plt.xlim(9.75,11.0)
plt.ylabel('Warm Pixel Fraction')
plt.xlabel('Star Magnitude')
plt.xticks(np.arange(9.8,11.0,0.1),np.arange(9.8,11.0,0.1))
plt.title('50% Failure Rate line for Warm Pixel Fraction and Magnitude')
plt.show()
In [34]:
# Now let's plot the distribution of the 50% failure rate.
mags = np.arange(9.75,11.0,0.001)
F = plt.figure()
for n in np.arange(1,len(beta_draws['alpha']),1):
betas_n = beta_draws['alpha'][n], beta_draws['beta1'][n], beta_draws['beta2'][n], beta_draws['beta3'][n], beta_draws['beta4'][n], beta_draws['beta5'][n]
wp50 = []
for m in mags:
wp50.append(warmpix50(betas_n, m, m_center, wp_center))
p1, = plt.plot(mags, wp50)
plt.grid(b=None, which='both', axis='both')
F.set_size_inches(15,5)
plt.xlim(9.75,11.0)
plt.ylabel('Warm Pixel Fraction')
plt.xlabel('Star Magnitude')
plt.xticks(np.arange(9.8,11.0,0.1),np.arange(9.8,11.0,0.1))
plt.title('Distribution of 50% Failure Rate line for Warm Pixel Fraction and Magnitude')
plt.show()
In [35]:
beta_results = {'description':analysis_description,
'names':np.array(['alpha', 'beta1', 'beta2', 'beta3', 'beta4', 'beta5']),
'means':betas,
'draws':beta_draws,
'betamatrix':np.vstack((beta_draws['alpha'],
beta_draws['beta1'],
beta_draws['beta2'],
beta_draws['beta3'],
beta_draws['beta4'],
beta_draws['beta5'])),
'centers':{'mag':m_center,
'warmpix':wp_center}}
In [36]:
import pickle
with open(fileout, 'wb') as f:
pickle.dump(beta_results, f)