In [1]:
import numpy as np
import scipy
import tables
import csv
import datetime as dt
import pandas as pd
from astropy.time import Time
import matplotlib.pyplot as plt
from astropy.time import Time
import itertools as it
%matplotlib inline
In [2]:
# Open the Chandra acquisition-statistics HDF5 file with PyTables.
# NOTE(review): hardcoded absolute local path -- move to a configurable
# DATA_DIR so the notebook runs on other machines.
# NOTE(review): `tables.openFile` is the PyTables 2.x spelling; it was renamed
# `tables.open_file` in PyTables 3.x -- confirm the installed version.
chandra = tables.openFile('/Users/bvegetabile/_ucirvine/2015/winter/cs273/finalproject/acq_stats.h5')
In [3]:
# Read the whole acquisition table into memory as a numpy structured array.
allevents = chandra.root.data.read()
# Last expression in the cell: display the record dtype (field names/types).
allevents.dtype
Out[3]:
http://cxc.cfa.harvard.edu/mta/ASPECT/tool_doc/dev/mica/acquisition_data.html
Columns for Analysis
Response Variable
Identifying Variables
Input Variables of Interest
Creating New Variables
In [4]:
def decimaltime(x):
    """Convert a Chandra timestamp string ("YYYY:DOY:HH:MM:SS.fff",
    day-of-year format) to a decimal Julian year via astropy."""
    stamp = dt.datetime.strptime(x, "%Y:%j:%H:%M:%S.%f")
    return Time(stamp, format='datetime').jyear
def expand_grid(*args, **kwargs):
    """Return a DataFrame containing the Cartesian product of the inputs.

    Positional sequences get integer column labels 0..len(args)-1; keyword
    sequences are labelled with their keyword.

    NOTE: the order of keyword columns follows dict iteration order, which is
    only insertion-ordered on Python 3.7+.
    """
    columns = []
    lst = []
    if args:
        # range()/list(...) instead of xrange()/iterkeys()/itervalues():
        # the Py2-only iterator methods raise AttributeError under Python 3.
        columns += list(range(len(args)))
        lst += list(args)
    if kwargs:
        columns += list(kwargs.keys())
        lst += list(kwargs.values())
    return pd.DataFrame(list(it.product(*lst)), columns=columns)
In [5]:
# Parse each acquisition timestamp ONCE and derive both time features from the
# parsed datetime (the original parsed every string twice: once inside
# decimaltime and once again for the day-of-year).
parsed = [dt.datetime.strptime(x, "%Y:%j:%H:%M:%S.%f") for x in allevents['acq_start']]
# Decimal Julian year of each acquisition start.
dectime = np.array([Time(t, format='datetime').jyear for t in parsed])
# Day of year as an int (tm_yday is identical to int(strftime('%j'))).
dayofyear = np.array([t.timetuple().tm_yday for t in parsed])
# Keep only the columns used in the analysis below.
chandraData = pd.DataFrame(allevents[['obsid',
                                      'agasc_id',
                                      'acq_start',
                                      'mag',
                                      'ra',
                                      'dec',
                                      'n100_warm_frac',
                                      'ccd_temp',
                                      'known_bad',
                                      'acqid']])
chandraData['dectime'] = dectime
chandraData['dayofyear'] = dayofyear
In [6]:
# Split by decimal year: recent era (>= 2011.5), train (< 2014.5), test (rest).
# .copy() because later cells assign new columns ('is_validation', 'failures')
# to these frames; without it those are chained assignments on views and
# raise SettingWithCopyWarning / may silently fail to write.
recentData = chandraData[chandraData['dectime'] >= 2011.5].copy()
trainData = recentData[recentData['dectime'] < 2014.5].copy()
testData = recentData[recentData['dectime'] >= 2014.5].copy()
In [7]:
# Report the sizes of the train/test split (Python 2 print statement).
print """
Number of Training Cases: {0}
Number of Test Cases : {1}
""".format(trainData.shape[0], testData.shape[0])
In [8]:
# Partition the recent era by acquisition outcome.
was_acquired = recentData['acqid'] == True
was_missed = recentData['acqid'] == False
success = recentData[was_acquired]
failed = recentData[was_missed]
In [9]:
# Star magnitude vs warm-pixel fraction, failures overplotted in red.
F, ax = plt.subplots(figsize=(10, 5))
ax.scatter(success['mag'], success['n100_warm_frac'], alpha=0.3)
ax.scatter(failed['mag'], failed['n100_warm_frac'], color='r', alpha=0.3)
ax.set_xlabel('Star Magnitude')
ax.set_ylabel('Fraction of Warm Pixels')
plt.show()
In [10]:
# Sky positions (RA/Dec) of successes vs failures (red).
F, ax = plt.subplots(figsize=(10, 5))
ax.scatter(success['ra'], success['dec'], alpha=0.3)
ax.scatter(failed['ra'], failed['dec'], color='r', alpha=0.3)
ax.set_xlabel('Right Ascension')
ax.set_ylabel('Declination')
ax.set_xlim(0, 360)
plt.show()
In [11]:
# Day-of-year vs RA, failures overplotted in red.
F, ax = plt.subplots(figsize=(10, 5))
ax.scatter(success['dayofyear'], success['ra'], alpha=0.3)
ax.scatter(failed['dayofyear'], failed['ra'], color='r', alpha=0.3)
ax.set_ylabel('Right Ascension')
ax.set_xlabel('Day of Year')
ax.set_xlim(0, 366)
plt.show()
In [12]:
from sklearn.ensemble import RandomForestClassifier
In [13]:
# Seed the RNG so the ~10% validation split is reproducible on re-run
# (the original split changed on every execution).
np.random.seed(0)
trainData['is_validation'] = np.random.uniform(0, 1, len(trainData)) <= .1
train = trainData[trainData['is_validation'] == False]
validate = trainData[trainData['is_validation'] == True]
# Select feature columns by NAME instead of magic positional indices
# [3,4,5,6,7,11], so the cell survives any change in column order.
features = pd.Index(['mag', 'ra', 'dec', 'n100_warm_frac', 'ccd_temp', 'dayofyear'])
features
Out[13]:
In [14]:
# Sweep the number of trees and track train/validation MSE of the predicted
# failure probability.
nEstimates = np.arange(3, 151, 3)
mseTrain = np.empty(len(nEstimates))
mseValidate = np.empty(len(nEstimates))
# Encode labels explicitly: 1 = acquisition failure (acqid == False).
# The original factorized train and validate SEPARATELY; pd.factorize assigns
# codes by order of first appearance, so the two encodings could be opposite,
# silently corrupting the validation MSE.  Encoding is loop-invariant, so it
# is hoisted out of the loop.
yTrainSweep = (train['acqid'] == False).astype(int).values
yValSweep = (validate['acqid'] == False).astype(int).values
for idx, ntrees in enumerate(nEstimates):
    clf = RandomForestClassifier(n_estimators=ntrees)
    clf.fit(train[features], yTrainSweep)
    trainPreds = clf.predict_proba(train[features])
    valPreds = clf.predict_proba(validate[features])
    # Column 1 of predict_proba is P(class 1) = P(failure).
    mseTrain[idx] = np.mean((yTrainSweep - trainPreds[:, 1]) ** 2)
    mseValidate[idx] = np.mean((yValSweep - valPreds[:, 1]) ** 2)
In [15]:
# Train vs validation MSE as a function of forest size.  Labels and a legend
# are added so the figure stands alone (the original had neither).
F, ax = plt.subplots(figsize=(10, 5))
ax.plot(nEstimates, mseTrain, label='Training MSE')
ax.plot(nEstimates, mseValidate, label='Validation MSE')
ax.set_xlabel('Number of Trees')
ax.set_ylabel('MSE of Predicted Failure Probability')
ax.legend()
plt.show()
In [16]:
# Final model: 150 entropy-split trees fit on the full training era.
clf = RandomForestClassifier(n_estimators=150, criterion='entropy')
# Encode labels explicitly (1 = acquisition failure).  The original factorized
# trainData and testData separately; pd.factorize assigns codes by order of
# first appearance, so the test encoding could be the REVERSE of the training
# encoding, silently corrupting every test-set MSE below.
yTrain = (trainData['acqid'] == False).astype(int).values
yTest = (testData['acqid'] == False).astype(int).values
clf.fit(trainData[features], yTrain)
trainPreds = clf.predict_proba(trainData[features])
testPreds = clf.predict_proba(testData[features])
# Brier-style MSE of P(failure) against the 0/1 outcome.
mseTrainFinal = np.mean((yTrain - trainPreds[:, 1]) ** 2)
mseValidateFinal = np.mean((yTest - testPreds[:, 1]) ** 2)
# Same MSE restricted to the actual failures (class 1) only.
trainFail = yTrain == 1
testFail = yTest == 1
mseTrainFailures = np.mean((yTrain[trainFail] - trainPreds[trainFail, 1]) ** 2)
mseTestFailures = np.mean((yTest[testFail] - testPreds[testFail, 1]) ** 2)
In [17]:
# Report overall and failure-only MSE for train and test.  The values in
# parentheses are the failure-class-only MSEs computed in the cell above.
print """
Training Dataset MSE: {0} ({2})
Test Dataset MSE : {1} ({3})
""".format(mseTrainFinal, mseValidateFinal, mseTrainFailures,mseTestFailures)
In [18]:
# Bar chart of the final forest's feature importances, labelled in the same
# order as `features` (mag, ra, dec, n100_warm_frac, ccd_temp, dayofyear).
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(np.arange(0.6, 6.6), clf.feature_importances_)
ax.set_xticks(np.arange(1, 7))
ax.set_xticklabels(('MAG', 'RA', 'DEC', 'FWP', 'TEMP', 'DOY'))
ax.set_title('Feature Importance')
plt.show()
In [19]:
# Enumerate every success/failure pattern for catalogs of 4-8 stars.  Each
# pickN row is one True/False pattern over N stars; maskN gives the number of
# True entries (failures) in that row.
tf = [True, False]
_grids = dict((n, expand_grid(*([tf] * n))) for n in (4, 5, 6, 7, 8))
pick8, pick7, pick6, pick5, pick4 = _grids[8], _grids[7], _grids[6], _grids[5], _grids[4]
mask8 = pick8.sum(axis=1)
mask7 = pick7.sum(axis=1)
mask6 = pick6.sum(axis=1)
mask5 = pick5.sum(axis=1)
mask4 = pick4.sum(axis=1)
In [31]:
# Per-catalog (obsid) comparison of actual vs model-expected failure counts
# on the TRAINING data.
# Dispatch table: catalog size -> (pattern table, failure-count mask).  The
# original used a copy-pasted if-chain that, for an unsupported catalog size,
# silently reused `prob` from the previous iteration (or raised NameError on
# the first) -- an unsupported size is now an explicit error.
pickTables = {8: (pick8, mask8), 7: (pick7, mask7), 6: (pick6, mask6),
              5: (pick5, mask5), 4: (pick4, mask4)}
catalogs = np.unique(trainData['obsid'])
catalogFailures = np.empty(len(catalogs))
# Explicit encoding: 1 = acquisition failure (acqid == False).  pd.factorize
# assigns codes by order of first appearance, so it was not guaranteed that
# code 1 meant failure, which these sums assume.
trainData['failures'] = (trainData['acqid'] == False).astype(int)
for i, c in enumerate(catalogs):
    catalogFailures[i] = np.sum(trainData[trainData['obsid'] == c]['failures'])
expectedFails = np.empty(len(catalogs))
for i, c in enumerate(catalogs):
    stars = trainData[trainData['obsid'] == c]
    starPreds = clf.predict_proba(stars[features])
    nstars = stars.shape[0]
    if nstars not in pickTables:
        raise ValueError('no combination table for a catalog of %d stars' % nstars)
    pick, mask = pickTables[nstars]
    # failureProbs[p] = P(exactly p of the catalog's stars fail), summing the
    # probability of every pattern with p failures (columns where the pattern
    # is True use P(fail) = starPreds[:,1], False use P(success)).
    failureProbs = np.empty(nstars + 1)
    for p in np.arange(0, nstars + 1):
        combos = pick[mask == p]
        prob = np.dot(combos, np.diag(starPreds[:, 1])) + np.dot(1 - combos, np.diag(starPreds[:, 0]))
        failureProbs[p] = np.sum(np.prod(prob, axis=1))
    # Expected number of failures = sum_p p * P(p failures).
    expectedFails[i] = np.dot(np.arange(0, nstars + 1), failureProbs)
In [32]:
# Histogram of (actual - expected) failures per training catalog.
F, ax = plt.subplots(figsize=(8, 5))
ax.hist(catalogFailures - expectedFails, bins=20)
ax.set_xlabel('Actual Failures - Expected Failures')
ax.set_title('Training Data')
plt.show()
In [33]:
# Residual (actual - expected) vs actual failures; x is jittered so integer
# counts do not overplot.
F, ax = plt.subplots(figsize=(8, 5))
jitter = np.random.normal(0, 0.025, size=len(catalogFailures))
ax.scatter(catalogFailures + jitter, catalogFailures - expectedFails)
ax.axhline(y=0, linewidth=2, color='r', linestyle='--')
ax.set_xlabel('Actual Failures (Jitter Added)')
ax.set_ylabel('Actual Failures - Expected Failures')
plt.show()
In [34]:
# Per-catalog (obsid) comparison of actual vs model-expected failure counts
# on the TEST data (same procedure as the training-data cell above).
# Dispatch table: catalog size -> (pattern table, failure-count mask).  The
# original if-chain silently reused `prob` from a previous iteration for any
# unsupported catalog size; that is now an explicit error.
pickTables = {8: (pick8, mask8), 7: (pick7, mask7), 6: (pick6, mask6),
              5: (pick5, mask5), 4: (pick4, mask4)}
catalogs = np.unique(testData['obsid'])
catalogFailures = np.empty(len(catalogs))
# Explicit encoding: 1 = acquisition failure (acqid == False).  pd.factorize's
# order-of-appearance codes did not guarantee this, which the sums assume.
testData['failures'] = (testData['acqid'] == False).astype(int)
for i, c in enumerate(catalogs):
    catalogFailures[i] = np.sum(testData[testData['obsid'] == c]['failures'])
expectedFails = np.empty(len(catalogs))
for i, c in enumerate(catalogs):
    stars = testData[testData['obsid'] == c]
    starPreds = clf.predict_proba(stars[features])
    nstars = stars.shape[0]
    if nstars not in pickTables:
        raise ValueError('no combination table for a catalog of %d stars' % nstars)
    pick, mask = pickTables[nstars]
    # failureProbs[p] = P(exactly p stars fail) over all patterns with p
    # failures (True columns weight P(fail), False columns P(success)).
    failureProbs = np.empty(nstars + 1)
    for p in np.arange(0, nstars + 1):
        combos = pick[mask == p]
        prob = np.dot(combos, np.diag(starPreds[:, 1])) + np.dot(1 - combos, np.diag(starPreds[:, 0]))
        failureProbs[p] = np.sum(np.prod(prob, axis=1))
    # Expected number of failures = sum_p p * P(p failures).
    expectedFails[i] = np.dot(np.arange(0, nstars + 1), failureProbs)
In [35]:
# Histogram of (actual - expected) failures per test catalog.
F, ax = plt.subplots(figsize=(8, 5))
ax.hist(catalogFailures - expectedFails, bins=20)
ax.set_xlabel('Actual Failures - Expected Failures')
plt.show()
In [36]:
# Residual (actual - expected) vs actual failures on the test set; x is
# jittered so integer counts do not overplot.
F, ax = plt.subplots(figsize=(8, 5))
jitter = np.random.normal(0, 0.05, size=len(catalogFailures))
ax.scatter(catalogFailures + jitter, catalogFailures - expectedFails)
ax.axhline(y=0, linewidth=2, color='r', linestyle='--')
ax.set_xlabel('Actual Failures (Jitter Added)')
ax.set_ylabel('Actual Failures - Expected Failures')
plt.show()