In [1]:
import numpy as np
import os
import pandas as pd
import h5py

from astrometry.util.fits import fits_table, merge_tables

# To plot pretty figures
%matplotlib inline
#%matplotlib notebook

# to make this notebook's output stable across runs
def reset_graph(seed=7):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

import tensorflow as tf

%load_ext autoreload
%autoreload 2


/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

In [30]:
from obiwan.qa.visual import plotImage, readImage, sliceImage

Visualize the training data

read the hdf5 files


In [123]:
f_dr5= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'dr5'),
                    'r')
f_dr5_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'dr5'),
                    'r')
dr5_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'dr5'))

f_sim= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'sim'),
                 'r')

f_sim_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'sim'),
                 'r')
sim_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'sim'))

In [124]:
keys=dict(dr5=[key for key in f_dr5.keys()],
          sim=[key for key in f_sim.keys()])
          #jpeg=list(f_sim_jpeg.keys()))
len(keys['dr5']),len(dr5_sorted_ids),len(keys['sim']),len(sim_sorted_ids)


Out[124]:
(880, 880, 259, 259)

In [125]:
f_sim['/%s/ivar' % keys['sim'][i]].shape


Out[125]:
(64, 64, 3)

In [126]:
print(sim_sorted_ids.get_columns())
for T in sim_sorted_ids[:10]: print(T.id,T.mag_g)


['id', 'mag_g']
103080843 21.3871759879
370074523 21.9118273261
114185542 22.1511480357
106085338 22.2520302338
61604085 22.4234469734
243535310 22.4476237756
156101043 22.516784148
8341765 22.5456832241
114521399 22.5477962344
85407414 22.6195802363

In [130]:
_= plt.hist(sim_sorted_ids.mag_g,histtype='step',color='b')
_= plt.hist(dr5_sorted_ids.mag_g,histtype='step',color='g')


For each sim mag_g, draw the nearest dr5 mag_g, without replacements


In [151]:
dr5_mags= dr5_sorted_ids.mag_g.copy()
id_dr5_draw= []
for sim_mag in sim_sorted_ids.mag_g:
    i= np.argmin(np.abs(dr5_mags - sim_mag))
    id_dr5_draw.append(dr5_sorted_ids.id[i])
    len_i= len(dr5_mags)
    dr5_mags= np.delete(dr5_mags,dr5_mags[i])
    assert(len_i == len(dr5_mags)+1)


/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: using a non-integer array as obj in delete will result in an error in the future
  import sys

In [157]:
for i in range(20):
    print(dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i]].mag_g, sim_sorted_ids.mag_g[i])


[ 21.39718819] 21.3871759879
[ 21.90782928] 21.9118273261
[ 22.14355278] 22.1511480357
[ 22.217062] 22.2520302338
[ 22.41222191] 22.4234469734
[ 22.4383316] 22.4476237756
[ 22.48746109] 22.516784148
[ 22.48746109] 22.5456832241
[ 22.48746109] 22.5477962344
[ 22.5747509] 22.6195802363
[ 22.62215424] 22.6675529884
[ 22.62044716] 22.6677327543
[ 22.63945389] 22.694754685
[ 22.64107895] 22.7006423466
[ 22.69321251] 22.7392880101
[ 22.69935799] 22.750211746
[ 22.69935799] 22.7524777029
[ 22.7031498] 22.7607775988
[ 22.71958923] 22.7808052859
[ 22.71958923] 22.7834926551

In [191]:
import matplotlib as mpl
fontsize= 15
mpl.rcParams['axes.titlesize'] = fontsize
mpl.rcParams['axes.labelsize']= fontsize
mpl.rcParams['font.size']= fontsize

In [221]:
def fake_real_mosaic(i_start, nrow=8,ncol=7):
    fig,ax= plt.subplots(nrow,ncol,figsize=(ncol*1.5,1.5*nrow))
    plt.subplots_adjust(hspace=0.01,wspace=0.01)
    d_text= dict(rotation='horizontal',
               horizontalalignment='left',
                verticalalignment='center')

    # Real
    i_real=i_start-1
    for row in np.arange(0,nrow,2):
        i_real+=1
        plotImage().imshow(f_dr5_jpeg['/%s/img' % id_dr5_draw[i_real]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_dr5['/%s/img' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_dr5['/%s/ivar' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        ax[row,-1].text(1.05,0.5,'%.2f' % dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i_real]].mag_g,
                        transform=ax[row,-1].transAxes,**d_text)
    # Fake
    i_fake=i_start-1
    for row in np.arange(1,nrow,2):
        i_fake+=1
        plotImage().imshow(f_sim_jpeg['/%s/img' % sim_sorted_ids.id[i_fake]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_sim['/%s/img' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_sim['/%s/ivar' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        ax[row,-1].text(1.05,0.5,'%.2f' % sim_sorted_ids.mag_g[i_fake],
                        transform=ax[row,-1].transAxes,**d_text)


    for row in range(nrow):
        for col in range(ncol):
            ax[row,col].set_xticks([])
            ax[row,col].set_yticks([])

    for col,name in zip(range(ncol),
                        ['g+r+z','g','r','z','ivar(g)','ivar(r)','ivar(z)']):
        tlab= ax[0,col].set_title(name)
    mlab= ax[0,-1].text(1.05,1.1,'mag(g)',
                        transform=ax[0,-1].transAxes,**d_text)
    d_lab= dict(rotation='horizontal',
               horizontalalignment='right',
               verticalalignment='center')
    for row in np.arange(0,nrow,2):
        ylab=ax[row,0].set_ylabel("R",**d_lab)
    for row in np.arange(1,nrow,2):
        ax[row,0].set_ylabel("F",**d_lab)

    plt.savefig('fake_real_mosaic_istart_%d.png' % i_start,
                bbox_extra_artists=[tlab,ylab,mlab], bbox_inches='tight')

Mosaic of training images


In [225]:
# Brightest
fake_real_mosaic(0, nrow=8,ncol=7)



In [223]:
# Faintest
fake_real_mosaic(len(sim_sorted_ids)-int(nrow/2)-1, nrow=8,ncol=7)



In [224]:
# Median
fake_real_mosaic(len(sim_sorted_ids)//2, nrow=8,ncol=7)



In [ ]:
# Loop over all sim sources
for istart in np.arange(0,len(sim_sorted_ids),8):
    fake_real_mosaic(istart, nrow=8,ncol=7)


/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-227-a6e3b923c3b1> in <module>()
      1 for istart in np.arange(0,len(sim_sorted_ids),8):
----> 2     fake_real_mosaic(istart, nrow=8,ncol=7)

<ipython-input-221-ad060ee414cb> in fake_real_mosaic(i_start, nrow, ncol)
     10     for row in np.arange(0,nrow,2):
     11         i_real+=1
---> 12         plotImage().imshow(f_dr5_jpeg['/%s/img' % id_dr5_draw[i_real]],ax[row,0])
     13         for iband,col in enumerate(range(1,4)):
     14             plotImage().imshow(f_dr5['/%s/img' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])

IndexError: list index out of range
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

In [12]:
from glob import glob
fns= glob(os.path.join(os.environ['HOME'],'Downloads',
                'dr5_testtrain/testtrain/121/1211p060/xtrain_*.npy'))
xtrain= [np.load(fn) for fn in fns]
np.vstack(xtrain).shape


Out[12]:
(1408, 64, 64, 6)

In [34]:
jpg= readImage(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'google_ai/legacysurvey-1211p060-image.jpg'),jpeg=True)
sliceImage(jpg,xslice=slice(10,20),yslice=slice(10,20)).shape


Out[34]:
(10, 10, 3)

In [29]:
os.path.basename('/global/cscratch1/sd/kaylanb/obiwan_out/elg_dr5_coadds/coadd/121/1211p060/rs0')


Out[29]:
'rs0'

Load training data (.npy files created with "split_traintest.py" script)


In [7]:
dr= os.path.join(os.environ['HOME'],'Downloads',
                'dr5_testtrain/testtrain/121/1211p060')
xtrain= np.load(os.path.join(dr,'xtrain_1.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_1.npy'))
xtrain.shape,ytrain.shape,len(ytrain[ytrain == 0]),set(ytrain)


Out[7]:
((512, 64, 64, 6), (512,), 250, {0.0, 1.0})

In [26]:
isFake= ytrain == 1
img_inds= [0,2,4]

nrow,ncol=2,8
for band,iband in zip('grz',img_inds):
    fig,ax= plt.subplots(nrow,ncol,figsize=(10,2))
    i=-1
    for row in range(nrow):
        for col in range(ncol):
            i+=1
            plotImage().imshow(xtrain[isFake][i,:,:,iband],ax[row,col])
    fig.suptitle('%s-band' % band)



In [28]:
img_inds= np.array([0,2,4])+1

nrow,ncol=2,8
for band,iband in zip('grz',img_inds):
    fig,ax= plt.subplots(nrow,ncol,figsize=(10,2))
    i=-1
    for row in range(nrow):
        for col in range(ncol):
            i+=1
            plotImage().imshow(xtrain[isFake][i,:,:,iband],ax[row,col])
    fig.suptitle('ivar-%s' % band)


Build the CNN


In [31]:
xtrain[0,...].shape, xtrain.dtype,ytrain.dtype


Out[31]:
((64, 64, 6), dtype('float32'), dtype('float64'))

In [67]:
# Design:
# input, 3x(conv + avg pool), 2x(fc)

height,width,channels = (64,64,6) #images_real.shape

reset_graph()

conv_kwargs= dict(strides=1,
                  padding='SAME',
                  activation=tf.nn.relu)
pool_kwargs= dict(ksize= [1,2,2,1],
                  strides=[1,2,2,1],
                  padding='VALID')

with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None,height,width,channels], name="X") #training data shape
    #X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(tf.int32, shape=[None], name="y") 

init = tf.global_variables_initializer()

# 64x64
with tf.name_scope("layer1"):
    conv1 = tf.layers.conv2d(X, filters=2*channels, kernel_size=7,
                             **conv_kwargs)
    pool1 = tf.nn.avg_pool(conv1, **pool_kwargs)

# 32x32
with tf.name_scope("layer2"):
    conv2 = tf.layers.conv2d(pool1, filters=4*channels, kernel_size=7,
                             **conv_kwargs)
    pool2 = tf.nn.avg_pool(conv2, **pool_kwargs)

# 16x16
with tf.name_scope("layer3"):
    conv3 = tf.layers.conv2d(pool2, filters=8*channels, kernel_size=7,
                             **conv_kwargs)
    pool3 = tf.nn.avg_pool(conv3, **pool_kwargs)
    # next is fc
    pool3_flat = tf.reshape(pool3, 
                            shape=[-1, pool3.shape[1] * pool3.shape[2] * pool3.shape[3]])


with tf.name_scope("fc"):
    fc = tf.layers.dense(pool3_flat, 64, activation=tf.nn.relu, name="fc")

with tf.name_scope("output"):
    logits = tf.layers.dense(fc, 2, name="output") # 2 classes
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

loss_summary= tf.summary.scalar('loss', loss)
accur_summary = tf.summary.scalar('accuracy', accuracy)

In [75]:
logits.op.name, loss.op.name, accuracy.op.name


Out[75]:
('output/output/BiasAdd', 'train/Mean', 'eval/Mean')

In [68]:
def BatchGen(X,y,batch_size=32):
    # if not perfect divide, will drop extra training instances
    N= X.shape[0]
    ind= np.array_split(np.arange(N),N // batch_size)
    for i in ind:
        yield X[i,...],y[i].astype(np.int32) #.reshape(-1,1).astype(np.int32)
        
a=BatchGen(xtrain,ytrain,batch_size=32)
for X_,y_ in a:
    print('batch:',X_.shape,y_.shape)


batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)
batch: (32, 64, 64, 6) (32,)

In [69]:
from datetime import datetime

def get_logdir(root_logdir):
    now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    return "{}/run-{}/".format(root_logdir, now)

Layers (check that size of each layer makes sense)


In [70]:
batch_size=32
X_,y_= xtrain[:batch_size,...],ytrain[:batch_size].astype(np.int32)

with tf.Session() as sess:
    sess.run(init)
    print(sess.run(X, feed_dict={X: X_}).shape)
    print(sess.run(conv1, feed_dict={X: X_}).shape)
    print(sess.run(pool1, feed_dict={X: X_}).shape)
    print(sess.run(conv2, feed_dict={X: X_}).shape)
    print(sess.run(pool2, feed_dict={X: X_}).shape)
    print(sess.run(conv3, feed_dict={X: X_}).shape)
    print(sess.run(pool3, feed_dict={X: X_}).shape)
    print(sess.run(pool3_flat, feed_dict={X: X_}).shape)
    print(sess.run(fc, feed_dict={X: X_}).shape)
    print(sess.run(logits, feed_dict={X: X_}).shape)


(32, 64, 64, 6)
(32, 64, 64, 12)
(32, 32, 32, 12)
(32, 32, 32, 24)
(32, 16, 16, 24)
(32, 16, 16, 48)
(32, 8, 8, 48)
(32, 3072)
(32, 64)
(32, 2)

Train (4 epochs)


In [72]:
xtrain= np.load(os.path.join(dr,'xtrain_1.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_1.npy'))
file_writer = tf.summary.FileWriter(get_logdir(os.path.join(os.environ['HOME'],'Downloads',
                                                "cnn",'logs')), 
                                    tf.get_default_graph())

n_epochs = 4
batch_size = 32
n_batches= ytrain.shape[0]//batch_size + 1

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        batch_index=0
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
            batch_index+=1
            if i % 1 == 0:
                step = epoch * n_batches + batch_index
                file_writer.add_summary(loss_summary.eval(feed_dict={X: X_, y: y_}), 
                                        step)
                file_writer.add_summary(accur_summary.eval(feed_dict={X: X_, y: y_}), 
                                        step)
                
        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                  "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))


0 Train accuracy: 0.46875
1 Train accuracy: 0.78125
2 Train accuracy: 0.75
3 Train accuracy: 0.84375

Restart from checkpoints


In [59]:
xtrain= np.load(os.path.join(dr,'xtrain_2.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_2.npy'))
with tf.Session() as sess:
    # Equiv of sess.run(init)
    saver.restore(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt")) 
    #print('resorted model has accur=',accuracy.eval())
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                  "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))


INFO:tensorflow:Restoring parameters from /Users/kaylan/Downloads/cnn/final.ckpt
0 Train accuracy: 0.90625
1 Train accuracy: 0.90625
2 Train accuracy: 0.90625
3 Train accuracy: 0.90625

Using hdf5 files not the newer npy train/test split ones


In [4]:
f_real= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                    'r') #1165p107
dr5_tractor= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'dr5_hdf5/hdf5/121/1211p060/tractor-1211p060.fits'))


f_fake= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'elg_dr5_coadds/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                    'r')
simcat= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                    'elg_dr5_coadds/hdf5/121/1211p060',
                    'simcat-elg-1211p060-rsALL.fits'))

print(len(f_real.keys()),len(f_fake.keys()),np.min([len(f_real.keys()),len(f_fake.keys())]))


---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-4-45479c8404b2> in <module>()
      1 f_real= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
      2                     'dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5'),
----> 3                     'r') #1165p107
      4 dr5_tractor= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
      5                     'dr5_hdf5/hdf5/121/1211p060/tractor-1211p060.fits'))

~/miniconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, **kwds)
    267             with phil:
    268                 fapl = make_fapl(driver, libver, **kwds)
--> 269                 fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)
    270 
    271                 if swmr_support:

~/miniconda3/envs/tensorflow/lib/python3.5/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
     97         if swmr and swmr_support:
     98             flags |= h5f.ACC_SWMR_READ
---> 99         fid = h5f.open(name, flags, fapl=fapl)
    100     elif mode == 'r+':
    101         fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5f.pyx in h5py.h5f.open()

OSError: Unable to open file (unable to open file: name = '/Users/kaylan/DOWNLOADS/dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [87]:
def get_data(f,num=128):
    """Returns numpy array (num,64,64,6)"""
    return np.array([np.stack([f[key+'/img'],f[key+'/ivar']],axis=-1).reshape((64,64,6))
                     for key in list(f.keys())[:num]])

def get_data_imgonly(f,num=128):
    """Returns numpy array (num,64,64,3)"""
    return np.array([np.reshape(f[key+'/img'],(64,64,3))
                     for key in list(f.keys())[:num]])

real= get_data_imgonly(f_real,n)
real.shape


Out[87]:
(128, 64, 64, 6)

In [92]:
inHdf5= pd.Series(dr5_tractor.objid).isin(list(f_real.keys()))
print(len(dr5_tractor[inHdf5]),len(list(f_real.keys())))
dr5_tractor.cut(inHdf5)
# flux_gt_0= ((inHdf5) &
#             (dr5_tractor.flux_g > 0) &
#             (dr5_tractor.flux_r > 0) &
#             (dr5_tractor.flux_z > 0))
# dr5_tractor.cut(flux_gt_0)


880 880

In [93]:
d= {b:dr5_tractor.get('flux_'+b)
    for b in 'grz'}
df= pd.DataFrame(d)
df= df.apply(lambda x: x.values.byteswap().newbyteorder())
df.describe()


Out[93]:
g r z
count 880.000000 880.000000 880.000000
mean 0.761698 1.529735 2.634772
std 0.746897 1.434992 2.366555
min 0.145401 0.261415 0.613148
25% 0.282475 0.506985 1.009587
50% 0.447857 0.891814 1.629145
75% 0.962490 2.082288 3.528070
max 4.510773 6.133409 14.447383

In [96]:
def flux2mag(nmgy):
    return -2.5 * (np.log10(nmgy) - 9)

assert('g' not in dr5_tractor.get_columns())
assert('g' not in simcat.get_columns())
for b in 'grz':
    dr5_tractor.set(b, flux2mag(dr5_tractor.get('flux_'+b)/dr5_tractor.get('mw_transmission_'+b)))
    simcat.set(b, flux2mag(simcat.get(b+'flux')))
dr5_tractor.set('gr', dr5_tractor.g - dr5_tractor.r)
dr5_tractor.set('rz', dr5_tractor.r - dr5_tractor.z)
simcat.set('gr', simcat.g - simcat.r)
simcat.set('rz', simcat.r - simcat.z)

Real vs. Fake galaxies in TS box


In [99]:
fig,ax=plt.subplots(1,3,figsize=(12,4))
ax[0].scatter(dr5_tractor.rz,dr5_tractor.gr,alpha=0.3,c='b',label='dr5')
ax[1].scatter(simcat.rz, simcat.gr,alpha=0.3,c='g',label='sim')
ax[2].scatter(simcat.rz, simcat.gr,alpha=0.3,c='g',label='sim')
ax[2].scatter(dr5_tractor.rz,dr5_tractor.gr,alpha=0.3,c='b',label='dr5')
# ax.plot(pad['x1'],pad['y1'],'r--')
# ax.plot(pad['x2'],pad['y2'],c='r',ls='--',lw=2)
# ax.plot(pad['x3'],pad['y3'],c='r',ls='--',lw=2)
# ax.plot(pad['x4'],pad['y4'],c='r',ls='--',lw=2)
for i in range(3):
    ax[i].set_xlabel('r-z')
    ax[i].set_ylabel('g-r')
    ax[i].set_ylim(-0.3,2)
    ax[i].set_xlim(-0.6,2.2)
    ax[i].legend()


Real galaxies are brighter


In [97]:
import seaborn as sns
fig,ax= plt.subplots(1,3,figsize=(12,3))
for i,b in enumerate('grz'):
    sns.distplot(dr5_tractor.get(b),kde=False,ax=ax[i],color='b')
    sns.distplot(simcat.get(b),kde=False,ax=ax[i],color='g')
    ax[i].set_xlabel(b + ' mag')


apply cuts at bright end


In [101]:
def inRegion(rz,gr):
    return ((rz > 0.5) &
            (rz < 1.) &
            (gr > 0.5) &
            (gr < 1.))

def isFaint(g,r,z):
    return ((g > 23.) &
            (r > 22.5) &
            (z > 21.5))
    
dr5_keep= ((inRegion(dr5_tractor.rz,dr5_tractor.gr)) & 
           (isFaint(dr5_tractor.g,dr5_tractor.r,dr5_tractor.z)))
sim_keep= ((inRegion(simcat.rz, simcat.gr)) &
           (isFaint(simcat.g,simcat.r,simcat.z)))


fig,ax= plt.subplots(1,3,figsize=(12,3))
for i,b in enumerate('grz'):
    sns.distplot(dr5_tractor.get(b)[dr5_keep],kde=False,ax=ax[i],color='b')
    sns.distplot(simcat.get(b)[sim_keep],kde=False,ax=ax[i],color='g')
    ax[i].set_xlabel(b + ' mag')
            
        
fig,ax= plt.subplots(1,3,figsize=(8,3))
ax[0].scatter(dr5_tractor.rz[dr5_keep], dr5_tractor.gr[dr5_keep],alpha=0.3,c='b',label='dr5')
ax[1].scatter(simcat.rz[sim_keep], simcat.gr[sim_keep],alpha=0.3,c='g',label='sim')
for i in range(3):
    ax[i].set_xlabel('r-z')
    ax[i].set_ylabel('g-r')
    ax[i].set_ylim(-0.3,2)
    ax[i].set_xlim(-0.6,2.2)
    ax[i].legend()
    
print(len(dr5_tractor[dr5_keep]), len(simcat[sim_keep]))


/Users/kaylan1/miniconda3/envs/mlbook/lib/python3.6/site-packages/matplotlib/axes/_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "
135 544

Look at 3D plot of grz, diff spaces filled?

Nearest in grz mag


In [102]:
from scipy import spatial
dr5_grz= np.array([[g,r,z]
                  for g,r,z in zip(dr5_tractor.g[dr5_keep],
                                   dr5_tractor.r[dr5_keep],
                                   dr5_tractor.z[dr5_keep])])
sim_grz= np.array([[g,r,z]
                  for g,r,z in zip(simcat.g[sim_keep],
                                   simcat.r[sim_keep],
                                   simcat.z[sim_keep])])

print(dr5_grz.shape, sim_grz.shape)
dr5_tree = spatial.KDTree(dr5_grz)


(135, 3) (544, 3)

In [103]:
d,ind=dr5_tree.query(sim_grz,p=1)
print(len(ind))
plt.hist(d)


544
Out[103]:
(array([  19.,   47.,  102.,  133.,  120.,   57.,   47.,   13.,    5.,    1.]),
 array([ 0.00694428,  0.03292467,  0.05890506,  0.08488544,  0.11086583,
         0.13684621,  0.1628266 ,  0.18880698,  0.21478737,  0.24076775,
         0.26674814]),
 <a list of 10 Patch objects>)