In [1]:
import numpy as np
import os
import pandas as pd
import h5py
from astrometry.util.fits import fits_table, merge_tables
# To plot pretty figures
%matplotlib inline
#%matplotlib notebook
# to make this notebook's output stable across runs
def reset_graph(seed=7):
    """Clear the default TF graph and seed both TF and numpy RNGs.

    Gives reproducible graph construction / training across notebook re-runs.
    """
    np.random.seed(seed)
    tf.reset_default_graph()
    tf.set_random_seed(seed)
import matplotlib
import matplotlib.pyplot as plt
# Global font sizes for all figures in this notebook.
plt.rcParams.update({'axes.labelsize': 14,
                     'xtick.labelsize': 12,
                     'ytick.labelsize': 12})
import tensorflow as tf
%load_ext autoreload
%autoreload 2
In [30]:
from obiwan.qa.visual import plotImage, readImage, sliceImage
In [123]:
# Open the real (dr5) and simulated (sim) cutout stacks for brick 1211p060:
# img_ivar_grz.hdf5 holds image+invvar planes, jpeg_grz.hdf5 the RGB jpegs,
# sorted_ids.fits the object ids with magnitudes.
# NOTE(review): assumes $HOME/DOWNLOADS contains the google_ai hdf5 layout.
f_dr5= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'dr5'),
                 'r')
f_dr5_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'dr5'),
                 'r')
dr5_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'dr5'))
f_sim= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/img_ivar_grz.hdf5' % 'sim'),
                 'r')
f_sim_jpeg= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/jpeg_grz.hdf5' % 'sim'),
                 'r')
sim_sorted_ids= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                 'google_ai/hdf5/%s/121/1211p060/sorted_ids.fits' % 'sim'))
In [124]:
# Object ids present in each HDF5 stack; should pair up with the sorted_ids tables.
keys = dict(dr5=list(f_dr5.keys()),
            sim=list(f_sim.keys()))
len(keys['dr5']), len(dr5_sorted_ids), len(keys['sim']), len(sim_sorted_ids)
Out[124]:
In [125]:
f_sim['/%s/ivar' % keys['sim'][i]].shape
Out[125]:
In [126]:
# Peek at the table schema and the ten brightest (pre-sorted) entries.
print(sim_sorted_ids.get_columns())
for row in sim_sorted_ids[:10]:
    print(row.id, row.mag_g)
In [130]:
# Overlay the g-band magnitude distributions: sim (blue) vs dr5 (green).
for table, color in ((sim_sorted_ids, 'b'), (dr5_sorted_ids, 'g')):
    _ = plt.hist(table.mag_g, histtype='step', color=color)
In [151]:
# Greedily match every simulated source to the closest-in-g-mag real (dr5)
# source, without replacement.
# Fixed two bugs in the original:
#  * np.delete(dr5_mags, dr5_mags[i]) removed the element at position
#    int(magnitude) instead of position i;
#  * deleting from dr5_mags while still indexing the un-mutated
#    dr5_sorted_ids with the same i returned wrong ids after the first
#    deletion.  We now keep an index array of the still-unmatched rows.
dr5_mags = dr5_sorted_ids.mag_g.copy()
avail = np.arange(len(dr5_mags))  # positions in dr5_sorted_ids not yet matched
id_dr5_draw = []
for sim_mag in sim_sorted_ids.mag_g:
    j = np.argmin(np.abs(dr5_mags[avail] - sim_mag))
    id_dr5_draw.append(dr5_sorted_ids.id[avail[j]])
    len_i = len(avail)
    avail = np.delete(avail, j)
    assert(len_i == len(avail) + 1)
In [157]:
# Spot-check: each matched dr5 magnitude should sit close to its sim magnitude.
for i, sim_mag in enumerate(sim_sorted_ids.mag_g[:20]):
    matched = dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i]]
    print(matched.mag_g, sim_mag)
In [191]:
import matplotlib as mpl
# Bump all figure text to a uniform, publication-friendly size.
fontsize = 15
mpl.rcParams.update({'axes.titlesize': fontsize,
                     'axes.labelsize': fontsize,
                     'font.size': fontsize})
In [221]:
def fake_real_mosaic(i_start, nrow=8,ncol=7):
    """Plot an nrow x ncol mosaic alternating Real (dr5) and Fake (sim) rows.

    Each row shows: jpeg (g+r+z), the g/r/z image planes, the g/r/z invvar
    planes, and the g-band magnitude to the right.  Even rows are real
    sources (labelled "R"), odd rows fake ones ("F"), both starting at
    index `i_start` of their respective tables.  Saves the figure to
    fake_real_mosaic_istart_<i_start>.png.

    NOTE(review): relies on module globals f_dr5, f_dr5_jpeg, f_sim,
    f_sim_jpeg, id_dr5_draw, dr5_sorted_ids, sim_sorted_ids.
    """
    fig,ax= plt.subplots(nrow,ncol,figsize=(ncol*1.5,1.5*nrow))
    plt.subplots_adjust(hspace=0.01,wspace=0.01)
    d_text= dict(rotation='horizontal',
                 horizontalalignment='left',
                 verticalalignment='center')
    # Real rows (even): jpeg, then img planes in cols 1-3, ivar planes in cols 4-6
    i_real=i_start-1
    for row in np.arange(0,nrow,2):
        i_real+=1
        plotImage().imshow(f_dr5_jpeg['/%s/img' % id_dr5_draw[i_real]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_dr5['/%s/img' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_dr5['/%s/ivar' % id_dr5_draw[i_real]][:,:,iband].T,ax[row,col],qs=[3,97])
        # g-band magnitude annotation just outside the last panel
        ax[row,-1].text(1.05,0.5,'%.2f' % dr5_sorted_ids[dr5_sorted_ids.id == id_dr5_draw[i_real]].mag_g,
                        transform=ax[row,-1].transAxes,**d_text)
    # Fake rows (odd): same layout, drawn from the sim stack
    i_fake=i_start-1
    for row in np.arange(1,nrow,2):
        i_fake+=1
        plotImage().imshow(f_sim_jpeg['/%s/img' % sim_sorted_ids.id[i_fake]],ax[row,0])
        for iband,col in enumerate(range(1,4)):
            plotImage().imshow(f_sim['/%s/img' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        for iband,col in enumerate(range(4,7)):
            plotImage().imshow(f_sim['/%s/ivar' % sim_sorted_ids.id[i_fake]][:,:,iband].T,ax[row,col],qs=[3,97])
        ax[row,-1].text(1.05,0.5,'%.2f' % sim_sorted_ids.mag_g[i_fake],
                        transform=ax[row,-1].transAxes,**d_text)
    # Strip all tick marks -- the panels are pure image thumbnails
    for row in range(nrow):
        for col in range(ncol):
            ax[row,col].set_xticks([])
            ax[row,col].set_yticks([])
    # Column headers on the top row (tlab keeps the last title artist for bbox)
    for col,name in zip(range(ncol),
                        ['g+r+z','g','r','z','ivar(g)','ivar(r)','ivar(z)']):
        tlab= ax[0,col].set_title(name)
    mlab= ax[0,-1].text(1.05,1.1,'mag(g)',
                        transform=ax[0,-1].transAxes,**d_text)
    d_lab= dict(rotation='horizontal',
                horizontalalignment='right',
                verticalalignment='center')
    # Row labels: R = real (even rows), F = fake (odd rows)
    for row in np.arange(0,nrow,2):
        ylab=ax[row,0].set_ylabel("R",**d_lab)
    for row in np.arange(1,nrow,2):
        ax[row,0].set_ylabel("F",**d_lab)
    plt.savefig('fake_real_mosaic_istart_%d.png' % i_start,
                bbox_extra_artists=[tlab,ylab,mlab], bbox_inches='tight')
In [225]:
# Brightest sources: mosaic of the first real/fake pairs.
fake_real_mosaic(0, ncol=7, nrow=8)
In [223]:
# Faintest sources: start so the last nrow/2 real/fake pairs fill the mosaic.
# Fixed: `nrow` was referenced before it existed at module scope (it was only
# a parameter of fake_real_mosaic) -- NameError on a fresh kernel run.
nrow = 8
fake_real_mosaic(len(sim_sorted_ids) - int(nrow/2) - 1, nrow=nrow, ncol=7)
In [224]:
# Median-brightness sources: start the mosaic halfway through the table.
fake_real_mosaic(len(sim_sorted_ids) // 2, ncol=7, nrow=8)
In [ ]:
# Render a mosaic page for every group of 8 sim sources.
for istart in range(0, len(sim_sorted_ids), 8):
    fake_real_mosaic(istart, nrow=8, ncol=7)
In [12]:
from glob import glob
# Load every xtrain chunk for the brick and report the stacked shape.
pattern = os.path.join(os.environ['HOME'], 'Downloads',
                       'dr5_testtrain/testtrain/121/1211p060/xtrain_*.npy')
fns = glob(pattern)
xtrain = [np.load(fn) for fn in fns]
np.vstack(xtrain).shape
Out[12]:
In [34]:
# Read the brick jpeg and check that a 10x10 cutout slices as expected.
jpg_path = os.path.join(os.environ['HOME'], 'DOWNLOADS',
                        'google_ai/legacysurvey-1211p060-image.jpg')
jpg = readImage(jpg_path, jpeg=True)
sliceImage(jpg, xslice=slice(10, 20), yslice=slice(10, 20)).shape
Out[34]:
In [29]:
os.path.basename('/global/cscratch1/sd/kaylanb/obiwan_out/elg_dr5_coadds/coadd/121/1211p060/rs0')
Out[29]:
In [7]:
# Load one train chunk; report shapes, count of real (label 0), and label set.
dr = os.path.join(os.environ['HOME'], 'Downloads',
                  'dr5_testtrain/testtrain/121/1211p060')
xtrain = np.load(os.path.join(dr, 'xtrain_1.npy'))
ytrain = np.load(os.path.join(dr, 'ytrain_1.npy'))
xtrain.shape, ytrain.shape, len(ytrain[ytrain == 0]), set(ytrain)
Out[7]:
In [26]:
# Thumbnail grid of the first 16 fake sources, one figure per image band.
isFake = ytrain == 1
img_inds = [0, 2, 4]  # image planes within the 6-channel stack
nrow, ncol = 2, 8
for band, iband in zip('grz', img_inds):
    fig, ax = plt.subplots(nrow, ncol, figsize=(10, 2))
    for row in range(nrow):
        for col in range(ncol):
            plotImage().imshow(xtrain[isFake][row * ncol + col, :, :, iband],
                               ax[row, col])
    fig.suptitle('%s-band' % band)
In [28]:
# Same grid for the invvar planes (they sit right after each image plane).
img_inds = np.array([0, 2, 4]) + 1
nrow, ncol = 2, 8
for band, iband in zip('grz', img_inds):
    fig, ax = plt.subplots(nrow, ncol, figsize=(10, 2))
    for row in range(nrow):
        for col in range(ncol):
            plotImage().imshow(xtrain[isFake][row * ncol + col, :, :, iband],
                               ax[row, col])
    fig.suptitle('ivar-%s' % band)
In [31]:
xtrain[0,...].shape, xtrain.dtype,ytrain.dtype
Out[31]:
In [67]:
# Design: input -> 3x(conv + avg pool) -> fc -> 2-way softmax output.
height,width,channels = (64,64,6)  # 64x64 cutouts, 3 image + 3 ivar planes
reset_graph()
conv_kwargs= dict(strides=1,
                  padding='SAME',
                  activation=tf.nn.relu)
pool_kwargs= dict(ksize= [1,2,2,1],
                  strides=[1,2,2,1],
                  padding='VALID')
with tf.name_scope("inputs"):
    X = tf.placeholder(tf.float32, shape=[None,height,width,channels], name="X")
    y = tf.placeholder(tf.int32, shape=[None], name="y")
# Fixed: the original called tf.global_variables_initializer() here, before
# any variables existed (and then shadowed it below) -- that dead, misleading
# init op is removed; the real one is created after the full graph.
# 64x64 in, 32x32 after pooling
with tf.name_scope("layer1"):
    conv1 = tf.layers.conv2d(X, filters=2*channels, kernel_size=7,
                             **conv_kwargs)
    pool1 = tf.nn.avg_pool(conv1, **pool_kwargs)
# 32x32 -> 16x16
with tf.name_scope("layer2"):
    conv2 = tf.layers.conv2d(pool1, filters=4*channels, kernel_size=7,
                             **conv_kwargs)
    pool2 = tf.nn.avg_pool(conv2, **pool_kwargs)
# 16x16 -> 8x8
with tf.name_scope("layer3"):
    conv3 = tf.layers.conv2d(pool2, filters=8*channels, kernel_size=7,
                             **conv_kwargs)
    pool3 = tf.nn.avg_pool(conv3, **pool_kwargs)
# Flatten for the fully-connected head
pool3_flat = tf.reshape(pool3,
                        shape=[-1, pool3.shape[1] * pool3.shape[2] * pool3.shape[3]])
with tf.name_scope("fc"):
    fc = tf.layers.dense(pool3_flat, 64, activation=tf.nn.relu, name="fc")
with tf.name_scope("output"):
    logits = tf.layers.dense(fc, 2, name="output") # 2 classes: real vs fake
    Y_proba = tf.nn.softmax(logits, name="Y_proba")
with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
loss_summary= tf.summary.scalar('loss', loss)
accur_summary = tf.summary.scalar('accuracy', accuracy)
In [75]:
logits.op.name, loss.op.name, accuracy.op.name
Out[75]:
In [68]:
def BatchGen(X,y,batch_size=32):
    """Yield (X_batch, y_batch) minibatches covering all N training rows.

    np.array_split distributes any remainder across the batches (no
    instances are dropped; some batches are one element larger).  Labels
    are cast to int32 for the sparse-softmax loss.

    Fixed: when N < batch_size the original asked array_split for
    N // batch_size == 0 sections, which raises -- clamp to >= 1.
    """
    N = X.shape[0]
    n_sections = max(1, N // batch_size)
    for ind in np.array_split(np.arange(N), n_sections):
        yield X[ind,...], y[ind].astype(np.int32)
# Sanity-check the generator: every batch should be (<=32, 64, 64, 6) / (<=32,).
a = BatchGen(xtrain, ytrain, batch_size=32)
for X_, y_ in a:
    print('batch:', X_.shape, y_.shape)
In [69]:
from datetime import datetime
def get_logdir(root_logdir):
    """Return a unique run directory under `root_logdir`, stamped with UTC time.

    Keeps TensorBoard runs from one session overwriting another.
    """
    stamp = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    return root_logdir + "/run-" + stamp + "/"
In [70]:
# Push one batch through the graph and print the activation shape at each stage.
batch_size = 32
X_, y_ = xtrain[:batch_size, ...], ytrain[:batch_size].astype(np.int32)
with tf.Session() as sess:
    sess.run(init)
    for tensor in (X, conv1, pool1, conv2, pool2,
                   conv3, pool3, pool3_flat, fc, logits):
        print(sess.run(tensor, feed_dict={X: X_}).shape)
In [72]:
# Train on chunk 1, logging loss/accuracy summaries and checkpointing.
xtrain= np.load(os.path.join(dr,'xtrain_1.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_1.npy'))
file_writer = tf.summary.FileWriter(get_logdir(os.path.join(os.environ['HOME'],'Downloads',
                                                            "cnn",'logs')),
                                    tf.get_default_graph())
n_epochs = 4
batch_size = 32
n_batches= ytrain.shape[0]//batch_size + 1
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        batch_index=0
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
            batch_index+=1
            # Fixed: the original tested `i % 1 == 0` with a stale `i` from
            # another cell (NameError on a fresh run); log every batch using
            # the loop's own counter instead.
            if batch_index % 1 == 0:
                step = epoch * n_batches + batch_index
                file_writer.add_summary(loss_summary.eval(feed_dict={X: X_, y: y_}),
                                        step)
                file_writer.add_summary(accur_summary.eval(feed_dict={X: X_, y: y_}),
                                        step)
        # Accuracy on the last batch of the epoch (not the full train set)
        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                      "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))
In [59]:
# Resume from the final checkpoint and continue training on chunk 2.
# NOTE(review): relies on n_epochs and batch_size defined in the previous
# training cell -- breaks on a fresh kernel if run out of order.
xtrain= np.load(os.path.join(dr,'xtrain_2.npy'))
ytrain= np.load(os.path.join(dr,'ytrain_2.npy'))
with tf.Session() as sess:
    # Equiv of sess.run(init)
    saver.restore(sess, os.path.join(os.environ['HOME'],'Downloads',
                                     "cnn/final.ckpt"))
    #print('resorted model has accur=',accuracy.eval())
    for epoch in range(n_epochs):
        data_gen= BatchGen(xtrain,ytrain,batch_size)
        for X_,y_ in data_gen:
            sess.run(training_op, feed_dict={X: X_, y: y_})
        # Accuracy on the last batch only -- quick per-epoch progress signal
        acc_train = accuracy.eval(feed_dict={X: X_, y: y_})
        print(epoch, "Train accuracy:", acc_train)
        if epoch % 2 == 0:
            save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                                      "cnn/checkpoint.ckpt"))
    save_path = saver.save(sess, os.path.join(os.environ['HOME'],'Downloads',
                                              "cnn/final.ckpt"))
In [4]:
# Open the real (dr5) cutouts + tractor catalog and the injected-source
# (obiwan) cutouts + simcat truth table for brick 1211p060, then compare
# the sample sizes.
f_real= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                  'dr5_hdf5/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                  'r') #1165p107
dr5_tractor= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                        'dr5_hdf5/hdf5/121/1211p060/tractor-1211p060.fits'))
f_fake= h5py.File(os.path.join(os.environ['HOME'],'DOWNLOADS',
                  'elg_dr5_coadds/hdf5/121/1211p060/img_ivar_grz.hdf5'),
                  'r')
simcat= fits_table(os.path.join(os.environ['HOME'],'DOWNLOADS',
                   'elg_dr5_coadds/hdf5/121/1211p060',
                   'simcat-elg-1211p060-rsALL.fits'))
print(len(f_real.keys()),len(f_fake.keys()),np.min([len(f_real.keys()),len(f_fake.keys())]))
In [87]:
def get_data(f, num=128):
    """Return array (num,64,64,6): image and invvar planes interleaved per band.

    `f` is an h5py-like mapping with '<key>/img' and '<key>/ivar' datasets,
    each (64,64,3).
    """
    stacks = []
    for key in list(f.keys())[:num]:
        pair = np.stack([f[key + '/img'], f[key + '/ivar']], axis=-1)
        stacks.append(pair.reshape((64, 64, 6)))
    return np.array(stacks)
def get_data_imgonly(f, num=128):
    """Return array (num,64,64,3): the image planes only."""
    imgs = [np.reshape(f[key + '/img'], (64, 64, 3))
            for key in list(f.keys())[:num]]
    return np.array(imgs)
# Fixed: `n` came from a deleted cell (NameError on a fresh run); request the
# default number of cutouts explicitly instead.
real = get_data_imgonly(f_real, num=128)
real.shape
Out[87]:
In [92]:
# Keep only tractor rows whose objid has a cutout in the hdf5 stack.
real_keys = list(f_real.keys())
inHdf5 = pd.Series(dr5_tractor.objid).isin(real_keys)
print(len(dr5_tractor[inHdf5]), len(real_keys))
dr5_tractor.cut(inHdf5)
In [93]:
# Summary statistics of the grz fluxes.
df = pd.DataFrame({b: dr5_tractor.get('flux_' + b)
                   for b in 'grz'})
# FITS arrays are big-endian; byteswap so pandas/numpy treat them natively.
df = df.apply(lambda col: col.values.byteswap().newbyteorder())
df.describe()
Out[93]:
In [96]:
def flux2mag(nmgy):
    """Convert flux in nanomaggies to AB magnitude: m = 22.5 - 2.5*log10(f)."""
    return 22.5 - 2.5 * np.log10(nmgy)
# Add g/r/z magnitude and g-r / r-z color columns to both catalogs.
# Guard against running this cell twice (columns would already exist).
assert('g' not in dr5_tractor.get_columns())
assert('g' not in simcat.get_columns())
for b in 'grz':
    # Tractor fluxes are divided by mw_transmission_<b> (extinction correction)
    # before conversion; simcat stores fluxes as '<band>flux' -- presumably
    # already extinction-free (NOTE(review): verify against obiwan docs).
    dr5_tractor.set(b, flux2mag(dr5_tractor.get('flux_'+b)/dr5_tractor.get('mw_transmission_'+b)))
    simcat.set(b, flux2mag(simcat.get(b+'flux')))
dr5_tractor.set('gr', dr5_tractor.g - dr5_tractor.r)
dr5_tractor.set('rz', dr5_tractor.r - dr5_tractor.z)
simcat.set('gr', simcat.g - simcat.r)
simcat.set('rz', simcat.r - simcat.z)
In [99]:
# Color-color comparison: dr5 alone, sim alone, then both overlaid
# (dr5 drawn on top in the third panel).
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(dr5_tractor.rz, dr5_tractor.gr, alpha=0.3, c='b', label='dr5')
ax[1].scatter(simcat.rz, simcat.gr, alpha=0.3, c='g', label='sim')
ax[2].scatter(simcat.rz, simcat.gr, alpha=0.3, c='g', label='sim')
ax[2].scatter(dr5_tractor.rz, dr5_tractor.gr, alpha=0.3, c='b', label='dr5')
for axis in ax:
    axis.set_xlabel('r-z')
    axis.set_ylabel('g-r')
    axis.set_ylim(-0.3, 2)
    axis.set_xlim(-0.6, 2.2)
    axis.legend()
In [97]:
import seaborn as sns
# Magnitude distributions per band: dr5 (blue) vs sim (green).
fig, ax = plt.subplots(1, 3, figsize=(12, 3))
for axis, band in zip(ax, 'grz'):
    sns.distplot(dr5_tractor.get(band), kde=False, ax=axis, color='b')
    sns.distplot(simcat.get(band), kde=False, ax=axis, color='g')
    axis.set_xlabel(band + ' mag')
In [101]:
def inRegion(rz, gr):
    """Elementwise True where colors fall in the 0.5 < r-z < 1, 0.5 < g-r < 1 box."""
    in_rz = (rz > 0.5) & (rz < 1.)
    in_gr = (gr > 0.5) & (gr < 1.)
    return in_rz & in_gr
def isFaint(g, r, z):
    """Elementwise True where the source is fainter than g>23, r>22.5, z>21.5."""
    return (g > 23.) & (r > 22.5) & (z > 21.5)
# Select faint sources inside the color box in both catalogs, then compare
# their magnitude distributions and color-color positions.
dr5_keep= ((inRegion(dr5_tractor.rz,dr5_tractor.gr)) &
           (isFaint(dr5_tractor.g,dr5_tractor.r,dr5_tractor.z)))
sim_keep= ((inRegion(simcat.rz, simcat.gr)) &
           (isFaint(simcat.g,simcat.r,simcat.z)))
fig,ax= plt.subplots(1,3,figsize=(12,3))
for i,b in enumerate('grz'):
    sns.distplot(dr5_tractor.get(b)[dr5_keep],kde=False,ax=ax[i],color='b')
    sns.distplot(simcat.get(b)[sim_keep],kde=False,ax=ax[i],color='g')
    ax[i].set_xlabel(b + ' mag')
# Color-color scatter of the selected subsets (third panel is left empty)
fig,ax= plt.subplots(1,3,figsize=(8,3))
ax[0].scatter(dr5_tractor.rz[dr5_keep], dr5_tractor.gr[dr5_keep],alpha=0.3,c='b',label='dr5')
ax[1].scatter(simcat.rz[sim_keep], simcat.gr[sim_keep],alpha=0.3,c='g',label='sim')
for i in range(3):
    ax[i].set_xlabel('r-z')
    ax[i].set_ylabel('g-r')
    ax[i].set_ylim(-0.3,2)
    ax[i].set_xlim(-0.6,2.2)
    ax[i].legend()
print(len(dr5_tractor[dr5_keep]), len(simcat[sim_keep]))
In [102]:
from scipy import spatial
# (N,3) arrays of g,r,z magnitudes for the selected sources in each catalog.
dr5_grz = np.vstack([dr5_tractor.g[dr5_keep],
                     dr5_tractor.r[dr5_keep],
                     dr5_tractor.z[dr5_keep]]).T
sim_grz = np.vstack([simcat.g[sim_keep],
                     simcat.r[sim_keep],
                     simcat.z[sim_keep]]).T
print(dr5_grz.shape, sim_grz.shape)
# KD-tree over the dr5 magnitudes for nearest-neighbor matching.
dr5_tree = spatial.KDTree(dr5_grz)
In [103]:
# Nearest dr5 neighbor (Manhattan metric, p=1) for every sim source;
# d is the match distance in mag space, ind the matched dr5 row.
d,ind=dr5_tree.query(sim_grz,p=1)
print(len(ind))
plt.hist(d)
Out[103]: