In [1]:
## Reading in the Training Data and Instantiating the Photo-z Algorithm
%matplotlib inline
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt
# Load the training catalogue.
data = Table.read('GTR-ADM-QSO-ir-testhighz_findbw_lup_2016_starclean.fits')
# Remove stars: keep only rows with a positive spectroscopic redshift.
qmask = (data['zspec']>0)
qdata = data[qmask]
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print(len(qdata))
# X is in the format needed for all of the sklearn tools (one row per object);
# it just has the colors.
Xtrain = np.vstack([
    qdata['ug'], qdata['gr'], qdata['ri'],
    qdata['iz'], qdata['zs1'], qdata['s1s2'],
]).T
#y = np.array(data['labels'])
# Regression target: the spectroscopic redshift of each training object.
ytrain = np.array(qdata['zspec'])
Since we are running on separate test data, we don't need to do a train_test_split
here. But we will scale the data. Need to remember to scale the test data later!
In [2]:
# For algorithms that need scaled data:
from sklearn.preprocessing import StandardScaler
# The scaler is created and fitted in two steps so the fit call stays the last
# expression of the cell and its return value is shown as the cell output.
# NOTE(review): the regressors later in this notebook are fit on the unscaled
# Xtrain and predict on the unscaled Xtest -- confirm whether the scaled
# inputs were meant to be used instead.
scaler = StandardScaler()
scaler.fit(Xtrain) # Don't cheat - fit only on training data
Out[2]:
Quasar candidates from the legacy KDE algorithm are in
GTR-ADM-QSO-ir-testhighz_kdephotoz_lup_2016_quasar_candidates.dat
Quasar candidates from the Random Forest algorithm are in
GTR-ADM-QSO-ir_good_test_2016_out.fits
Quasar candidates from the RF, SVM, and/or bagging algorithms are in
GTR-ADM-QSO-ir_good_test_2016_out_Stripe82all.fits
The last file covers Stripe 82 only. If we run on the other files, we might want to limit them to Stripe 82 as well to keep the computing time reasonable.
In [3]:
# Load the test catalogue; the next cell reads its classifier prediction
# columns (ypredRFC / ypredSVM / ypredBAG) to select candidates.
testdata = Table.read('GTR-ADM-QSO-ir_good_test_2016_out_Stripe82all.fits')
In [4]:
# Keep an object when at least one of the three classifiers (RF, SVM, bagging)
# assigned it label 0 -- presumably the quasar class; confirm against the
# classification notebook.
qsocandmask = ((testdata['ypredRFC']==0) | (testdata['ypredSVM']==0) | (testdata['ypredBAG']==0))
testdatacand = testdata[qsocandmask]
# Total number of test objects, then number of candidates.  A formatted string
# is used so the output is "N M" on both Python 2 and Python 3 (a bare
# two-argument print statement is a syntax error on Python 3).
print('%d %d' % (len(testdata), len(testdatacand)))
In [5]:
# Color matrix for the candidates, with the columns in the same order as the
# training matrix (ug, gr, ri, iz, zs1, s1s2).
Xtest = np.vstack([
    testdatacand['ug'],
    testdatacand['gr'],
    testdatacand['ri'],
    testdatacand['iz'],
    testdatacand['zs1'],
    testdatacand['s1s2'],
]).T
# Apply the transformation learned on the training data to the test data.
XStest = scaler.transform(Xtest)
Not currently executing the next 2 cells, but putting the code here in case we want to do it later.
In [ ]:
# Read in KDE candidates
dataKDE = Table.read('GTR-ADM-QSO-ir-testhighz_kdephotoz_lup_2016_quasar_candidates.dat', format='ascii')
print(dataKDE.keys())
# Color matrix for the KDE candidates.  This file names the last two colors
# zch1 / ch1ch2 rather than zs1 / s1s2.
XKDE = np.vstack([
    dataKDE['ug'], dataKDE['gr'], dataKDE['ri'],
    dataKDE['iz'], dataKDE['zch1'], dataKDE['ch1ch2'],
]).T
# Report the size only after XKDE exists; printing it before the assignment
# raised a NameError on a fresh kernel.
print(len(XKDE))
In [ ]:
# Read in RF candidates
dataRF = Table.read('GTR-ADM-QSO-ir_good_test_2016_out.fits')
# Parenthesised single-argument prints behave identically on Python 2 and 3.
print(dataRF.keys())
print(len(dataRF))
# Candidates only: keep rows whose predicted label is 0.
maskRF = (dataRF['ypred']==0)
dataRF = dataRF[maskRF]
print(len(dataRF))
# X is in the format needed for all of the sklearn tools, it just has the colors
XRF = np.vstack([
    dataRF['ug'], dataRF['gr'], dataRF['ri'],
    dataRF['iz'], dataRF['zs1'], dataRF['s1s2'],
]).T
In [6]:
import numpy as np
from astroML.linear_model import NadarayaWatson
# Nadaraya-Watson regressor with a Gaussian kernel and h = 0.05; the arguments
# are spelled out by keyword so their roles are visible at the call site.
model = NadarayaWatson(kernel='gaussian', h=0.05)
In [7]:
# Fit the Nadaraya-Watson model on the unscaled training colors (Xtrain)
# against the spectroscopic redshifts (ytrain).
model.fit(Xtrain,ytrain)
Out[7]:
In [8]:
from sklearn.ensemble import RandomForestRegressor
# Fix the seed: a random forest is stochastic, and without a seed the
# photo-z values saved later in the notebook would change from one run to
# the next.
modelRF = RandomForestRegressor(random_state=42)
# Kept as the last expression so the fitted estimator is shown as cell output.
modelRF.fit(Xtrain,ytrain)
Out[8]:
In [9]:
# Random-forest photo-z for every candidate; uses the unscaled Xtest, matching
# the unscaled Xtrain that modelRF was fit on.
zphotRF = modelRF.predict(Xtest)
In [10]:
# Nadaraya-Watson photo-z for every candidate, predicted in a single call on
# the unscaled Xtest.
zphotNW = model.predict(Xtest)
The dask-based cells below are only needed if Xtest is too big to run through model.predict in a single call.
In [9]:
from dask import compute, delayed


def process(Xin):
    """Return the predictions of the notebook-level ``model`` for ``Xin``."""
    return model.predict(Xin)


# Create dask objects: one lazy task per row of Xtest, with each row reshaped
# to a single-row 2-D array before it is handed to process().
dobjs = [delayed(process)(x.reshape(1, -1)) for x in Xtest]
In [ ]:
import dask.threaded
# Evaluate all the delayed tasks with dask's threaded scheduler; the results
# are collected in ypred, one entry per delayed object.
# NOTE(review): newer dask releases replaced the `get=` keyword with
# `scheduler='threads'` -- confirm against the installed dask version before
# re-running this cell.
ypred = compute(*dobjs, get=dask.threaded.get)
In [ ]:
# The dask output is a collection of per-object results; stack them into one
# array and flatten it to a 1-D vector of redshifts.
zphotNW = np.array(ypred).ravel()
In [12]:
# Store both photo-z estimates as columns of the candidate table
# (Nadaraya-Watson first, then random forest).
testdatacand['zphotNW'] = zphotNW
testdatacand['zphotRF'] = zphotRF
In [13]:
# Save the candidate table with its photo-z columns.  overwrite=True lets the
# notebook be re-run from a fresh kernel: without it the write fails once the
# output file from a previous run exists.
testdatacand.write('GTR-ADM-QSO-ir_good_test_2016_out_Stripe82all_zphot.fits', format='fits', overwrite=True)
In [16]:
from densityplot import *
from pylab import *
# u-g color against the Nadaraya-Watson photo-z for the candidates.
# hex_scatter is presumably provided by the densityplot star import above.
fig = plt.figure(figsize=(5, 5))
hex_scatter(
    testdatacand['zphotNW'],
    testdatacand['ug'],
    min_cnt=10,
    levels=2,
    std=True,
    smoothing=1,
    hkwargs={'gridsize': 100, 'cmap': plt.cm.Blues},
    skwargs={'color': 'k'},
)
plt.xlabel('zphot')
plt.ylabel('u-g')
# Optional fixed axis ranges, left disabled:
#plt.xlim([-0.1,5.5])
#plt.ylim([-0.1,5.5])
plt.show()
In [24]:
from astroML.plotting import hist as fancyhist
# Step-style histogram of the random-forest photo-z values.  bins="freedman"
# is handed to astroML's hist -- presumably selecting the Freedman-Diaconis
# bin-width rule; confirm in the astroML docs.  The call is the last
# expression, so its return value is shown as the cell output.
fancyhist(testdatacand['zphotRF'], bins="freedman", histtype="step")
Out[24]: