In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns; sns.set(style="white", font_scale=2)

import numpy as np
import pandas as pd
from astropy.io import fits
import glob

import sklearn
import sklearn.metrics

import keras
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers.noise import GaussianNoise

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.preprocessing.image import random_channel_shift
from keras.optimizers import SGD, Adam
from keras import backend as K
K.set_image_data_format('channels_first')
K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=10, 
                                                   inter_op_parallelism_threads=10)))

import scipy.ndimage as ndi

import matplotlib.patches as patches

import pathlib


Using TensorFlow backend.

In [2]:
# Figure defaults: 80 dpi everywhere, a 6 x 3.6 inch canvas, white background.
mpl.rcParams.update({
    "savefig.dpi": 80,
    "figure.dpi": 80,
    "figure.figsize": np.array((10, 6)) * .6,
    "figure.facecolor": "white",
})

In [3]:
%load_ext autoreload
%autoreload 2

In [50]:
# my modules that are DNN specific
import os, sys
dwarfz_package_dir = pathlib.Path \
                            .cwd() \
                            .parent / "DNN"

if str(dwarfz_package_dir) not in sys.path:
    sys.path.insert(0, str(dwarfz_package_dir))
    sys.path.insert(0, str(dwarfz_package_dir.parent.parent))
                    
import dwarfz

# these actually exist in "../DNN" which is why I had to add it to the path above
import preprocessing
import geometry

To Do

  1. Read in fits images
  2. Apply pre-processing stretch (asinh)
  3. crop image smaller
  4. Combine filters into one cube
  5. Create training set with labels
  6. Set up keras model
  7. Poke around at the results

0) Get files


In [5]:
# images_dir = pathlib.Path(preprocessing.images_dir)
images_dir = pathlib.Path.home() / "dwarfz" / "galaxies_narrowband"
images_dir


Out[5]:
PosixPath('/Users/egentry/dwarfz/galaxies_narrowband')

In [6]:
# Each galaxy lives in a subdirectory named by its HSC id, under either
# "target/" or "contaminant/".  Skip hidden entries (names starting with ".").
HSC_ids_target = {int(entry.name)
                  for entry in images_dir.glob("target/*")
                  if not entry.name.startswith(".")}

HSC_ids_contaminant = {int(entry.name)
                       for entry in images_dir.glob("contaminant/*")
                       if not entry.name.startswith(".")}

# Ids appearing under BOTH labels are ambiguous -- drop them from both sets.
ambiguous_ids = HSC_ids_target & HSC_ids_contaminant
HSC_ids_target = HSC_ids_target - ambiguous_ids
HSC_ids_contaminant = HSC_ids_contaminant - ambiguous_ids

HSC_ids = HSC_ids_target | HSC_ids_contaminant

HSC_ids_target      = np.array(list(HSC_ids_target))
HSC_ids_contaminant = np.array(list(HSC_ids_contaminant))
HSC_ids             = np.array(list(HSC_ids))


print(len(HSC_ids_target))
print(len(HSC_ids_contaminant))
print(len(HSC_ids), len(HSC_ids_target) + len(HSC_ids_contaminant))


HSC_ids = np.array(sorted(HSC_ids)) # enforce a deterministic ordering


6278
18999
25277 25277

In [7]:
HSC_id = HSC_ids[1] # for when I need a single sample galaxy

In [8]:
bands = ["g", "r", "i", "z", "y"]

1) Read in fits image


In [9]:
get_image = lambda HSC_id, band: preprocessing.get_image(HSC_id, band, images_dir=images_dir/"*"/str(HSC_id))

In [10]:
image, flux_mag_0 = get_image(HSC_id, "g")
print("image size: {} x {}".format(*image.shape))
image


image size: 239 x 239
Out[10]:
array([[-0.02194131,  0.00391446,  0.01770653, ..., -0.01461175,
        -0.01249455,  0.01746454],
       [-0.01601904, -0.02720453, -0.06848946, ...,  0.04288237,
         0.00534859,  0.02132502],
       [-0.01800561, -0.05129848, -0.06990851, ...,  0.03650333,
         0.01508071, -0.03739795],
       ...,
       [ 0.02909144,  0.03687791,  0.04685813, ...,  0.03878131,
        -0.04980357, -0.0178718 ],
       [ 0.01575067, -0.01513606,  0.02736958, ...,  0.01000878,
         0.00148029,  0.01590817],
       [ 0.02265622,  0.04364862, -0.00685509, ..., -0.01821685,
         0.01330067,  0.01832419]], dtype=float32)

In [11]:
preprocessing.image_plotter(image)



In [12]:
preprocessing.image_plotter(np.log(image))


/Users/egentry/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log
  """Entry point for launching an IPython kernel.

2) Apply stretch

We're using (negative) asinh magnitudes, as implemented by the HSC collaboration.

To see more about the asinh magnitude system, see: Lupton, Gunn and Szalay (1999), as used for SDSS. (It's explicitly given in the SDSS Algorithms documentation as well as this overview page.)

To see the source of our code, see: the HSC color image creator

And for reference, a common form of this stretch is: $$ \mathrm{mag}_\mathrm{asinh} = - \left(\frac{2.5}{\ln(10)}\right) \left(\mathrm{asinh}\left(\frac{f/f_0}{2b}\right) + \ln(b) \right)$$ for dimensionless softening parameter $b$, and reference flux (f_0).


In [13]:
def scale(x, fluxMag0):
    """Apply the HSC asinh ("Lupton") stretch to a flux image.

    Parameters
    ----------
    x : array-like of flux values (not modified)
    fluxMag0 : flux zero point of the image (``FLUXMAG0`` from the header)

    Returns
    -------
    Stretched array: fluxes rescaled to a mag-19 zero point, asinh-compressed,
    then linearly mapped so the HSC display range [u_min, u_max] -> [0, 1].
    """
    ### adapted from https://hsc-gitlab.mtk.nao.ac.jp/snippets/23
    mag0 = 19
    # renamed from `scale` -- the original local shadowed the function name
    flux_rescale = 10 ** (0.4 * mag0) / fluxMag0
    # BUG FIX: out-of-place multiply; the original `x *= scale` silently
    # mutated the caller's image array in place.
    x = x * flux_rescale

    u_min = -0.05
    u_max = 2. / 3.
    u_a = np.exp(10.)   # dimensionless softening parameter (b in the text)

    x = np.arcsinh(u_a*x) / np.arcsinh(u_a)
    x = (x - u_min) / (u_max - u_min)

    return x

In [14]:
image_scaled = preprocessing.scale(image, flux_mag_0)

In [15]:
preprocessing.image_plotter(image_scaled)



In [16]:
sns.distplot(image_scaled.flatten())
plt.title("Distribution of Transformed Intensities")


Out[16]:
Text(0.5, 1.0, 'Distribution of Transformed Intensities')

In [17]:
# Show the stretched (full-size) image in every filter for the sample galaxy.
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    stretched = preprocessing.scale(raw_image, zero_point)

    plt.figure()
    preprocessing.image_plotter(stretched)
    plt.title("{} band".format(band))
    plt.colorbar()


3) Crop Image

TODO: verify that odd image sizes (e.g. the 239 x 239 inputs) are cropped correctly — check whether `get_cutout` centers properly when the size difference is odd.


In [18]:
pre_transformed_image_size  = 150
post_transformed_image_size = 75

In [19]:
cutout = preprocessing.get_cutout(image_scaled, post_transformed_image_size)
cutout.shape


Out[19]:
(75, 75)

In [20]:
preprocessing.image_plotter(cutout)



In [21]:
# Same per-band display as above, but cropped to the final cutout size.
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    stretched = preprocessing.scale(raw_image, zero_point)
    cropped = preprocessing.get_cutout(stretched, post_transformed_image_size)

    plt.figure()
    preprocessing.image_plotter(cropped)
    plt.title("{} band".format(band))
    plt.colorbar()


4) Combine filters into cube


In [22]:
# Build per-band lists of raw images, zero points, and stretched cutouts
# for the sample galaxy (appending instead of pre-sized None lists).
images = []
flux_mag_0s = []
cutouts = []
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    images.append(raw_image)
    flux_mag_0s.append(zero_point)

    stretched = preprocessing.scale(raw_image, zero_point)
    cutouts.append(
        preprocessing.get_cutout(stretched, post_transformed_image_size)
    )

In [23]:
cutout_cube = np.array(cutouts)
cutout_cube.shape


Out[23]:
(5, 75, 75)

In [24]:
# must transform into [0,1] for plt.imshow
# the HSC standard tool accomplishes this by clipping instead.
plt.imshow(preprocessing.transform_0_1(cutout_cube[(4,2,0),:,:].transpose(1,2,0)) )


Out[24]:
<matplotlib.image.AxesImage at 0x135f0ffd0>

In [25]:
# Intensity distribution per band.  cutout_cube has shape (bands, h, w), so
# cutout_cube[i] is band i directly -- the original
# `cutout_cube[:,:,:].transpose(1,2,0)[:,:,i]` selected the same slice the
# long way around.
for i, band in enumerate(bands):
    sns.distplot(cutout_cube[i].flatten(), label=band)
# one legend call after the loop replaces the redundant per-iteration calls
plt.legend(loc="best")


5) Load Training Set Labels


In [26]:
# Label table: one row per HSC id, `target` True iff the id was in the
# target directory.  np.isin does the membership test vectorized, instead
# of the original O(n^2) `id in <numpy array>` scan per row.
df = pd.DataFrame(data={
    "HSC_id": HSC_ids,
    "target": np.isin(HSC_ids, HSC_ids_target),
})

df = df.set_index("HSC_id")

df.head()


Out[26]:
target
HSC_id
43158176442354210 False
43158176442354294 False
43158176442354356 False
43158176442354418 True
43158176442354561 False

In [27]:
df.mean()


Out[27]:
target    0.248368
dtype: float64

In [28]:
def load_image_mappable(HSC_id):
    """Load, stretch, and crop all bands for one galaxy.

    Returns an array of shape (len(bands), pre_transformed_image_size,
    pre_transformed_image_size) -- one stretched cutout per band, in the
    order of the module-level ``bands`` list.
    """
    cutouts = []
    for band in bands:
        raw_image, zero_point = get_image(HSC_id, band)
        stretched = preprocessing.scale(raw_image, zero_point)
        cutouts.append(
            preprocessing.get_cutout(stretched, pre_transformed_image_size))
    return np.array(cutouts)

In [29]:
X = np.empty((len(HSC_ids[:400]), 5, 
              pre_transformed_image_size, pre_transformed_image_size))

In [30]:
X.shape


Out[30]:
(400, 5, 150, 150)

In [31]:
# Fill X with one image cube per galaxy, reporting progress ~20 times.
# max(1, ...) avoids ZeroDivisionError when X holds fewer than 20 galaxies;
# the stride is also hoisted out of the loop (it is loop-invariant).
progress_stride = max(1, X.shape[0] // 20)
for i in range(X.shape[0]):
    HSC_id = HSC_ids[i]
    X[i] = load_image_mappable(HSC_id)

    if i % progress_stride == 0:
        print("loading # {} / {}".format(i, X.shape[0]), end="\r", flush=True)


loading # 380 / 400

In [32]:
X_full = X
X_small = X[:,(4,2,0),:,:] # drop down to 3 bands (y, i, g)

In [33]:
X.shape


Out[33]:
(400, 5, 150, 150)

In [34]:
Y = df.loc[HSC_ids].target.values
Y


Out[34]:
array([False, False, False, ..., False, False, False])

In [35]:
Y.mean()


Out[35]:
0.24836808165525973

load photometric features


In [51]:
# Load the three dwarfz reference databases: the COSMOS catalog, the HSC
# forced photometry, and the COSMOS<->HSC cross-match table.
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default) / "COSMOS_reference.sqlite"
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

HSC_filename = pathlib.Path(dwarfz.data_dir_default) / "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)

matches_filename = pathlib.Path(dwarfz.data_dir_default) / "matches.sqlite3"
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

# Keep only confirmed matches, then pull COSMOS quantities over by index.
combined = matches_df[matches_df.match].copy()
combined["ra"]       = COSMOS.df.loc[combined.index].ra
combined["dec"]      = COSMOS.df.loc[combined.index].dec
combined["photo_z"]  = COSMOS.df.loc[combined.index].photo_z
combined["log_mass"] = COSMOS.df.loc[combined.index].mass_med

# HSC cmodel flux / flag / magnitude columns for all five broad bands.
photometry_cols = [
    "gcmodel_flux","gcmodel_flux_err","gcmodel_flux_flags", "gcmodel_mag",
    "rcmodel_flux","rcmodel_flux_err","rcmodel_flux_flags", "rcmodel_mag",
    "icmodel_flux","icmodel_flux_err","icmodel_flux_flags", "icmodel_mag",
    "zcmodel_flux","zcmodel_flux_err","zcmodel_flux_flags", "zcmodel_mag",
    "ycmodel_flux","ycmodel_flux_err","ycmodel_flux_flags", "ycmodel_mag",
]

# Align HSC photometry to the match table via the HSC-side ids.
for col in photometry_cols:
    combined[col] = HSC.df.loc[combined.catalog_2_ids][col].values
    
    
# Adjacent-band colors.
combined["g_minus_r"] = combined.gcmodel_mag - combined.rcmodel_mag
combined["r_minus_i"] = combined.rcmodel_mag - combined.icmodel_mag
combined["i_minus_z"] = combined.icmodel_mag - combined.zcmodel_mag
combined["z_minus_y"] = combined.zcmodel_mag - combined.ycmodel_mag

# Keep rows with finite colors & i magnitude and no flux-quality flags set.
mask =    np.isfinite(combined["g_minus_r"]) & np.isfinite(combined["r_minus_i"]) \
        & np.isfinite(combined["i_minus_z"]) & np.isfinite(combined["z_minus_y"]) \
        & np.isfinite(combined["icmodel_mag"]) \
        & (~combined.gcmodel_flux_flags) & (~combined.rcmodel_flux_flags) \
        & (~combined.icmodel_flux_flags) & (~combined.zcmodel_flux_flags) \
        & (~combined.ycmodel_flux_flags)

combined = combined[mask]

# FRANKENZ photometric redshifts, joined on the HSC object id.
df_frankenz = pd.read_sql_table("photo_z",
                                "sqlite:///{}".format(
                                    pathlib.Path(dwarfz.data_dir_default)
                                                 / "HSC_matched_to_FRANKENZ.sqlite"),
                                index_col="object_id")

# NOTE(review): this .head() is mid-cell, so it displays nothing -- no-op.
df_frankenz.head()

combined = combined.join(df_frankenz[["photoz_best", "photoz_risk_best"]],
                         on="catalog_2_ids")

# Final feature table fed to the classifier alongside the CNN features.
photometry_features = combined.loc[:,["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
                                      "icmodel_mag",
                                      "photoz_best",
                                      "photoz_risk_best" # The risk of photoz_best being outside of the range z_true +- 0.15(1+z_true). It ranges from 0 (safe) to 1(risky)
                                     ]]

photometry_features

In [37]:
training_set_labels_filename = "../data/galaxy_images_training/2017_09_26-dwarf_galaxy_scores.csv"
# HSC_id -> COSMOS_id lookup: dedupe on HSC_id, index by it, keep only the
# COSMOS_id column (same steps as before, written as one method chain).
HSC_to_COSMOS = (
    pd.read_csv(training_set_labels_filename)
      .drop_duplicates("HSC_id")
      .set_index("HSC_id")
      [["COSMOS_id"]]
)
HSC_to_COSMOS.head()


Out[37]:
COSMOS_id
HSC_id
43158322471244656 628457
43158605939114836 919771
43159142810013665 444239
43158734788125011 569427
43158863637144621 369948

In [63]:
photometry_features_aligned = photometry_features.loc[HSC_to_COSMOS.loc[HSC_ids].values.flatten()]
photometry_features_aligned.head()


Out[63]:
g_minus_r r_minus_i i_minus_z z_minus_y icmodel_mag photoz_best photoz_risk_best
catalog_1_ids
387020 0.241915 0.287563 0.580751 -0.461952 25.067200 0.75 0.189435
389379 0.815916 0.300808 0.021614 -0.038185 23.948193 0.36 0.158907
390478 1.774567 1.506727 0.083956 -0.385647 24.193956 0.85 0.183067
391721 0.493143 0.108091 0.332539 0.058878 25.082439 2.61 0.458986
394884 1.182629 0.388001 0.275812 -0.031630 23.426733 0.50 0.168350

Geometry!


In [65]:
X[0].shape


Out[65]:
(5, 150, 150)

In [66]:
h = pre_transformed_image_size
w = pre_transformed_image_size
# Random affine transform (rotation + small translation) for augmentation demo.
transform_matrix = geometry.create_random_transform_matrix(h, w,
                                                  include_rotation=True,
                                                  translation_size = .01,
                                                  verbose=False)

# 3-channel (y, i, g) view of the first galaxy's cube
x_tmp = X[0][(4,2,0),:,:]

result = geometry.apply_transform_new(x_tmp, transform_matrix, 
                            channel_axis=0, fill_mode="constant", cval=np.max(x_tmp))

# BUG FIX: crop the *transformed* image. The original cropped `x_tmp`,
# silently discarding the transform computed just above.
result = preprocessing.get_cutout(result, post_transformed_image_size)
plt.imshow(preprocessing.transform_0_1(result.transpose(1,2,0)))


Out[66]:
<matplotlib.image.AxesImage at 0x133462748>

In [67]:
import ipywidgets
ipywidgets.interact(preprocessing.transform_plotter,
                    X = ipywidgets.fixed(X_small),
                    rotation_degrees = ipywidgets.IntSlider(min=0, max=360, step=15, value=45),
                    dx_after = ipywidgets.IntSlider(min=-15, max=15),
                    dy_after = ipywidgets.IntSlider(min=-15, max=15),
                    color = ipywidgets.fixed(True),
                    shear_degrees = ipywidgets.IntSlider(min=0, max=90, step=5, value=0),
                    zoom_x = ipywidgets.FloatSlider(min=.5, max=2, value=1),
                    crop = ipywidgets.Checkbox(value=True)
                    )


Out[67]:
<function preprocessing.transform_plotter(X, reflect_x=False, rotation_degrees=45, dx_after=0, dy_after=0, shear_degrees=0, zoom_x=1, crop=False, color=True)>

5b) Split training and testing set


In [68]:
# Deterministic 80/20 split: shuffle all indices with a fixed seed, take the
# first 20% as the test set, the complement as the training set.
np.random.seed(1)

randomized_indices = np.arange(Y.shape[0])
np.random.shuffle(randomized_indices)

testing_fraction = 0.2
n_testing = int(testing_fraction * Y.shape[0])
testing_set_indices = randomized_indices[:n_testing]
# np.setdiff1d replaces the set()-roundtrip idiom; note it returns the
# complement in sorted order (the original order was arbitrary set order,
# so nothing downstream can have depended on it).
training_set_indices = np.setdiff1d(randomized_indices, testing_set_indices)

In [69]:
print(testing_set_indices.size, Y[testing_set_indices].mean())


5055 0.247675568743818

In [70]:
print(training_set_indices.size, Y[training_set_indices].mean())


20222 0.24854119276036

In [71]:
p = Y[training_set_indices].mean()
prior_loss = sklearn.metrics.log_loss(Y[testing_set_indices], 
                                      [p]*testing_set_indices.size)
prior_loss


Out[71]:
0.5597690655312428

6b) Adapt NumpyArrayIterator

The original only allowed 1, 3 or 4 channel images. I have 5 channel images.

Also, I want to change the way that augmentation is happening


In [72]:
from data_generator_directory import DirectoryIterator

6c) Adapt ImageDataGenerator

The original only allowed 1, 3 or 4 channel images. I have 5 channel images. Also, I'm adjusting the way that the affine transformations work for the data augmentation


In [73]:
from data_generator_directory import ImageDataGeneratorLoadable

6d) Create Data Generator


In [106]:
print('Using real-time data augmentation.')

h_before, w_before = X[0,0].shape
print("image shape before: ({},{})".format(h_before, w_before))

h_after = post_transformed_image_size
w_after = post_transformed_image_size
print("image shape after:  ({},{})".format(h_after, w_after))

# get a closure that binds the image size to get_cutout
postprocessing_function = lambda image: preprocessing.get_cutout(image, post_transformed_image_size)

# this will do preprocessing and realtime data augmentation
datagen = ImageDataGeneratorLoadable(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=True, # randomly apply a reflection (in x)
    with_reflection_y=True, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
    width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
    postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(post_transformed_image_size,post_transformed_image_size)
)   

# this will do preprocessing and realtime data augmentation
from data_generator import ImageDataGenerator
datagen_features = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=False, # randomly apply a reflection (in x)
    with_reflection_y=False, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
#     width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
#     height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
#     postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(1,1),
)


Using real-time data augmentation.
image shape before: (150,150)
image shape after:  (75,75)

In [75]:
# datagen.fit()

7) Set up keras model


In [76]:
with_one_by_one = False
trainable = False

input_shape = cutout_cube.shape

nb_dense = 64

In [77]:
from keras.applications import inception_v3, inception_resnet_v2, vgg19

In [79]:
feature_extractor = Sequential()
extractor_input_shape = tuple([3, *input_shape[1:]])
extractor_layers = vgg19.VGG19(include_top=False,
                                input_shape=extractor_input_shape
                               )
for layer in extractor_layers.layers:
    layer.trainable = False
feature_extractor.add(extractor_layers)

feature_extractor.compile(optimizer= Adam(lr=1),
               loss="mean_squared_error")

In [80]:
reextract_features = False
# BUG FIX: np.save appends ".npy" when the name lacks that suffix, so the
# original "vgg_features.h5" was actually written as "vgg_features.h5.npy"
# while both the is_file() check and np.load looked for "vgg_features.h5":
# the cache could never hit (and the name collided with the HDF cell below).
# Use an explicit .npy filename for this numpy round-trip.
features_filename = "vgg_features.npy"

if reextract_features or (not pathlib.Path(features_filename).is_file()):
    # Extract frozen-VGG19 features for every galaxy (slow; hence the cache).
    features = np.zeros((Y.size, 512, 2, 2))
    progress_stride = max(1, features.shape[0] // 20)  # guard tiny runs
    for i, HSC_id in enumerate(HSC_ids):
        x_tmp = np.array([datagen.standardize(load_image_mappable(HSC_id))])
        # feed only the 3 bands (y, i, g) that the pretrained net expects
        features[i] = feature_extractor.predict(x_tmp[:,(4,2,0),:,:])

        if i % progress_stride == 0:
            print("loading # {} / {}".format(i, features.shape[0]), end="\r", flush=True)
    np.save(features_filename, features, allow_pickle=False)
else:
    features = np.load(features_filename)
    
features.shape


Out[80]:
(25277, 512, 2, 2)

In [ ]:
reextract_features = False
features_filename = "vgg_features.h5"

if reextract_features or (not pathlib.Path(features_filename).is_file()):
    
    features = np.zeros((Y.size, 512, 2, 2))
    for i, HSC_id in enumerate(HSC_ids):
        x_tmp = np.array([datagen.standardize(load_image_mappable(HSC_id))])
        features[i] = feature_extractor.predict(x_tmp[:,(4,2,0),:,:])

        if i%(features.shape[0]//20)==0:
            print("loading # {} / {}".format(i, features.shape[0]), end="\r", flush=True)
    df_vgg = pd.DataFrame(data={
        "HSC_id" : HSC_ids,
        "vgg_features" : [feature_row for feature_row in features]
    })
    df_vgg.to_hdf(features_filename, "data")
else:
    df_vgg = pd.read_hdf(features_filename)
    features = np.array([row for row in df_vgg.vgg_features.values])    
features.shape

In [81]:
# model = Sequential()

# # # # 1x1 convolution to make sure we only have 3 channels
# n_channels_for_pretrained=3
# one_by_one = Conv2D(n_channels_for_pretrained, 1, padding='same',
#                     input_shape=input_shape)
# if not with_one_by_one:
#     one_by_one.trainable = False
# model.add(one_by_one)

# pretrained_input_shape = tuple([n_channels_for_pretrained, *input_shape[1:]])
# pretrained_layers = vgg19.VGG19(include_top=False,
#                                 input_shape=pretrained_input_shape
#                                )
# for layer in pretrained_layers.layers:
#     layer.trainable = trainable
# model.add(pretrained_layers)


# model.add(Flatten())
# model.add(Dense(2*nb_dense, activation="relu"))
# model.add(Dense(nb_dense, activation="relu"))
# model.add(Dense(1, activation="sigmoid"))

In [83]:
learning_rate = 0.00025
decay = 1e-5
momentum = 0.9

# sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=True)

adam = Adam(lr=learning_rate)

In [135]:
!head $logger_filename


head: training_transfer.with_photometry.log: No such file or directory

In [137]:
!head training_transfer.log.old












In [134]:
logger_filename = "training_transfer"
if trainable:
    logger_filename += "-trainable" 
if with_one_by_one:
    logger_filename += "-1x1"
logger_filename += ".with_photometry.log"


# model.compile(loss='binary_crossentropy', 
# #               optimizer=sgd, 
#               optimizer=adam,
# #               metrics=["accuracy"]
#              )

# can only manually set weights _after_ compiling
# one_by_one_weights = np.zeros((1,1,5,3))
# for i in range(3):
#     one_by_one_weights[0,0,4-2*i,i] = 1 # chooses y i g with correct color order
# one_by_one.set_weights([one_by_one_weights,
#                         np.zeros(3)])

print("using logger_filename: ", logger_filename)

# if os.path.exists(logger_filename):
#     logger_filename_tmp = logger_filename + ".old"
#     os.rename(logger_filename, logger_filename_tmp)


using logger_filename:  training_transfer.with_photometry.log

In [109]:
features_combined.shape


Out[109]:
(25277, 2055)

In [123]:
# Dense classification head on top of the concatenated
# (VGG features + photometry) vector, which arrives shaped (n_features, 1, 1).
n_input_features = np.prod(features.shape[1:]) + photometry_features.shape[1]

model = Sequential()
model.add(Flatten(input_shape=(n_input_features, 1, 1)))
model.add(Dense(2*nb_dense, activation="relu"))
model.add(Dense(nb_dense, activation="relu"))
model.add(Dense(1, activation="sigmoid"))   # P(target)

model.compile(optimizer= Adam(lr=learning_rate),
               loss="binary_crossentropy")

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
flatten_2 (Flatten)          (None, 2055)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               263168    
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
=================================================================
Total params: 271,489
Trainable params: 271,489
Non-trainable params: 0
_________________________________________________________________

In [124]:
earlystopping = EarlyStopping(monitor='loss',
                              patience=35,
                              verbose=1,
                              mode='auto' )

In [125]:
csv_logger = keras.callbacks.CSVLogger(logger_filename,
                                       append=True)

8) Run basic keras model


In [126]:
goal_batch_size = 128
steps_per_epoch = max(2, training_set_indices.size//goal_batch_size)
batch_size = training_set_indices.size//steps_per_epoch
print("steps_per_epoch: ", steps_per_epoch)
print("batch_size: ", batch_size)
epoch_to_save_state = 50
epochs = 100
verbose = 1


steps_per_epoch:  157
batch_size:  128

In [127]:
prior_loss


Out[127]:
0.5597690655312428

In [128]:
# Flatten each galaxy's VGG feature maps and append the aligned photometric
# features.  Deriving the flattened width from the array itself replaces the
# hard-coded 512*2*2, so this survives a different extractor output shape.
features_combined = np.hstack([features.reshape(features.shape[0], -1),
                               photometry_features_aligned])
# trailing unit axes: the data generator expects image-ranked (n, c, 1, 1) input
features_combined = features_combined.reshape(*features_combined.shape, 1, 1)
print(features_combined.shape)


(25277, 2055, 1, 1)

In [174]:
epochs=200


# Generator over the training rows; validation data is passed directly.
args = (datagen_features.flow(features_combined[training_set_indices], 
                              Y[training_set_indices],
                              batch_size=batch_size,
                             ),
       )

kwargs = dict(steps_per_epoch=steps_per_epoch,
              validation_data=(features_combined[testing_set_indices], 
                               Y[testing_set_indices]),
              verbose=verbose,
              callbacks=[csv_logger],
             )

# Train one epoch at a time so weights can be checkpointed after EVERY epoch
# (initial_epoch/epochs bookkeeping keeps the CSV logger's epoch numbers
# monotone across the calls).
for epoch_i in range(epochs):
    history = model.fit_generator(*args,
                                  initial_epoch=epoch_i,
                                  epochs=epoch_i+1,
                                  **kwargs,
                                  )

    model.save_weights("weights/cached_weights.extracted_features.with_photometry.{:>04d}.h5".format(epoch_i))


Epoch 1/1
  2/157 [..............................] - ETA: 20:53 - loss: 0.4380
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-174-ec80bc48bdc9> in <module>
     19                                   initial_epoch=epoch_i,
     20                                   epochs=epoch_i+1,
---> 21                                   **kwargs,
     22                                   )
     23 

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1416             use_multiprocessing=use_multiprocessing,
   1417             shuffle=shuffle,
-> 1418             initial_epoch=initial_epoch)
   1419 
   1420     @interfaces.legacy_generator_methods_support

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    179             batch_index = 0
    180             while steps_done < steps_per_epoch:
--> 181                 generator_output = next(output_generator)
    182 
    183                 if not hasattr(generator_output, '__len__'):

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/utils/data_utils.py in get(self)
    593         try:
    594             while self.is_running():
--> 595                 inputs = self.queue.get(block=True).get()
    596                 self.queue.task_done()
    597                 if inputs is not None:

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    662 
    663     def get(self, timeout=None):
--> 664         self.wait(timeout)
    665         if not self.ready():
    666             raise TimeoutError

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    659 
    660     def wait(self, timeout=None):
--> 661         self._event.wait(timeout)
    662 
    663     def get(self, timeout=None):

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [138]:
logged_history = pd.read_csv(logger_filename)
logged_history


Out[138]:
epoch loss val_loss
0 1 0.5487 0.5481
1 2 0.5362 0.5308
2 3 0.5214 0.5136
3 4 0.5130 0.5036
4 5 0.5040 0.5211
5 6 0.4996 0.4989
6 7 0.4904 0.4871
7 8 0.4823 0.4779
8 9 0.4804 0.4813
9 10 0.4720 0.4758
10 11 0.4687 0.4679
11 12 0.4643 0.4787
12 13 0.4583 0.4569
13 14 0.4529 0.4573
14 15 0.4549 0.4594
15 16 0.4545 0.4579
16 17 0.4458 0.4510
17 18 0.4430 0.4482
18 19 0.4405 0.4580
19 20 0.4389 0.4438
20 21 0.4362 0.4426
21 22 0.4380 0.4461
22 23 0.4291 0.4402
23 24 0.4288 0.4430
24 25 0.4294 0.4422
25 26 0.4255 0.4482
26 27 0.4269 0.4372
27 28 0.4197 0.4527
28 29 0.4236 0.4557
29 30 0.4231 0.4555
30 31 0.4201 0.4354
31 32 0.4177 0.4348
32 33 0.4198 0.4680
33 34 0.4137 0.4368
34 35 0.4164 0.4400
35 36 0.4084 0.4331
36 37 0.4068 0.4365
37 38 0.4093 0.4495
38 39 0.4100 0.4363
39 40 0.4067 0.4326
40 41 0.4047 0.4479
41 42 0.4001 0.4346
42 43 0.4058 0.4314
43 44 0.4020 0.4401
44 45 0.3998 0.4374
45 46 0.3964 0.4397
46 47 0.4015 0.4314
47 48 0.3926 0.4422
48 49 0.3986 0.4563
49 50 0.3926 0.4389

In [142]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

        
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(logged_history["val_loss"], label="Validation")
    plt.plot(logged_history["loss"], label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)

    plt.tight_layout()
    plot_filename = pathlib.Path("plots_for_thesis") / "learning_curve-with_photometry"
    plt.savefig(plot_filename.with_suffix(".png"))
    plt.savefig(plot_filename.with_suffix(".pdf"))



In [141]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

    simple_conv = lambda x: np.convolve(x, np.ones(5)/5, mode="valid")
    
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(simple_conv(logged_history["val_loss"]),
             label="Validation")
    plt.plot(simple_conv(logged_history["loss"]),
             label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)


9) Look at validation results


In [165]:
color_CNN = "b"

linestyle_CNN = "solid"

linewidth=3

In [150]:
overwrite = True
use_cached_if_exists = False
class_probs_filename = "class_probs.with_photometry.csv"

if use_cached_if_exists and pathlib.Path(class_probs_filename).is_file():
    df_class_probs = pd.read_csv(class_probs_filename)
else:
    # BUG FIX: `model3` was never defined anywhere in this notebook -- the
    # trained network is `model`.  Also dropped the expensive
    # `X_transformed = [datagen.standardize(...)]` computation, whose result
    # was never used.
    class_probs = model.predict_proba(features_combined).flatten()

    # set membership is O(1) per lookup; the original tested
    # `HSC_id in <numpy array>` which is a linear scan per row
    testing_id_set = set(HSC_ids[testing_set_indices])
    df_class_probs = pd.DataFrame({
        "HSC_id": HSC_ids,
        "CNN_prob": class_probs,
        "testing": [HSC_id in testing_id_set for HSC_id in HSC_ids],
        "target": Y,
    })

    if overwrite or (not pathlib.Path(class_probs_filename).is_file()):
        df_class_probs.to_csv(class_probs_filename, index=False)


df_testing = df_class_probs[df_class_probs.testing]

df_class_probs.head()

In [173]:
print(df_class_probs.target.mean())
print(df_class_probs[~df_class_probs.testing].target.mean())

print(df_testing.CNN_prob.mean())


0.24836808165525973
0.24854119276036
0.1659968

In [158]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    sns.distplot(df_testing[df_testing.target].CNN_prob, color="g", label="targets")
    sns.distplot(df_testing[~df_testing.target].CNN_prob, color="b", label="contaminants")

    plt.xlabel("p(target | image)")
    plt.ylabel("density (galaxies)")

    plt.xlim(0, .7)
    plt.axvline(df_testing.target.mean(), linestyle="dashed", color="black", label="prior\n(from training set)")
    plt.axvline(.5, linestyle="dotted", color="black", label="50/50")

    plt.legend(
        loc="upper left",
        bbox_to_anchor=(1, 1),
    )



In [166]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# ROC curve for the CNN on the held-out testing set.
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    fpr, tpr, _ = metrics.roc_curve(df_testing.target, df_testing.CNN_prob)
    roc_auc = roc_auc_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(fpr, tpr, label="CNN (AUC = {:.2})".format(roc_auc), 
             color=color_CNN, linewidth=linewidth,
            )
    # Chance level: the diagonal.
    plt.plot([0,1], [0,1], linestyle="dotted", color="black", label="Random guessing",
             linewidth=linewidth,
            )

    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    plt.legend(loc="best")
    
    plt.tight_layout()
    # Ensure the output directory exists -- plt.savefig raises
    # FileNotFoundError if it does not.
    pathlib.Path("plots_for_thesis").mkdir(parents=True, exist_ok=True)
    plot_filename = "plots_for_thesis/ROC-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")



In [170]:
from sklearn.metrics import average_precision_score

# Purity-completeness (precision-recall) curve on the held-out testing set.
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    precision, recall, _ = metrics.precision_recall_curve(df_testing.target, df_testing.CNN_prob)
    pr_auc = average_precision_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(recall, precision, label="AUC = {:.3}".format(pr_auc), color=color_CNN, linewidth=linewidth)
    
    # Baseline purity for random guessing is the class prior.
    # NOTE(review): this uses Y[testing_set_indices] while the curve uses
    # df_testing.target -- these should be the same set; confirm.
    plt.plot([0,1], [Y[testing_set_indices].mean()]*2, linestyle="dotted", color="black", 
             label="Random guessing",
             linewidth=linewidth,
            )


    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("Completeness")
    plt.ylabel("Purity ")

    plt.legend(loc="best")
    
    plt.tight_layout()
    # Ensure the output directory exists -- plt.savefig raises
    # FileNotFoundError if it does not.
    pathlib.Path("plots_for_thesis").mkdir(parents=True, exist_ok=True)
    plot_filename = "plots_for_thesis/purity_completeness-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")



In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Apply a threshold

For now, just use a threshold using the prior class probability (estimated from the training set)

Note: under a symmetric loss function, this isn't as good as simply thresholding at 0.5 (the Bayes-optimal cutoff).


In [ ]:
# Threshold the continuous CNN scores at the training-set class prior
# (fraction of positives in the training split).
# NOTE(review): `class_probs` was predicted for the full `features_combined`
# sample, but later cells compare `predicted_classes` against
# Y[testing_set_indices] -- confirm the lengths/ordering actually match.
predicted_classes = class_probs > (Y[training_set_indices].mean())
predicted_classes.mean()

In [ ]:
# sklearn convention: rows = actual class, columns = predicted class.
confusion_matrix = metrics.confusion_matrix(Y[testing_set_indices], predicted_classes)
confusion_matrix

In [ ]:
# Compare the true positive count with how many objects the thresholded
# classifier labelled positive.
print("number of dwarfs (true)     : ", Y[testing_set_indices].sum())
print("number of dwarfs (predicted): ", predicted_classes.sum())

In [ ]:
import itertools
# adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

# Row-normalize so each row (actual class) sums to 1.
confusion_matrix_normalized = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]

plt.imshow(confusion_matrix_normalized, interpolation='nearest',cmap="gray_r")
# plt.title(title)
plt.colorbar()
tick_marks = np.arange(2)
classes = [False, True]
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Annotate each cell with the raw (integer) count; flip text color to
# white where the normalized cell is dark (> 0.5 on the gray_r colormap).
fmt = 'd'
thresh = 1 / 2.
for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])):
    plt.text(j, i, format(confusion_matrix[i, j], fmt),
             fontdict={"size":20},
             horizontalalignment="center",
             color="white" if confusion_matrix_normalized[i, j] > thresh else "black")

plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.tight_layout()

In [ ]:
# Per-object table of true label, predicted label, and whether they disagree.
# NOTE(review): the last column formats a comparison result with ':d' --
# assumes it coerces cleanly to an integer (0/1); confirm with the actual
# dtypes of Y and predicted_classes.
print("  i - Y_true[i] - Y_pred[i] -  error?")
print("-------------------------------------")
for i in range(predicted_classes.size):
    print("{:>3} -    {:>1d}      -    {:d}      -   {:>2d}".format(i, 
                                                    Y[testing_set_indices][i], 
                                                    predicted_classes[i],
                                                    (Y[testing_set_indices][i] != predicted_classes[i]), 
                                                   ))

Analyze Errors


In [ ]:
# Display the array of HSC ids (rich output of the bare expression).
HSC_ids

In [ ]:
# Peek at the catalog rows corresponding to the testing-set objects.
df.loc[HSC_ids[testing_set_indices]].head()

In [ ]:
# COSMOS reference catalog, located in the dwarfz package's data directory.
COSMOS_filename = os.path.join(
    dwarfz.data_dir_default, "COSMOS_reference.sqlite")
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

In [ ]:
# HSC forced-photometry catalog from the same data directory.
HSC_filename = os.path.join(
    dwarfz.data_dir_default, "HSC_COSMOS_median_forced.sqlite3")
HSC = dwarfz.datasets.HSC(HSC_filename)

In [ ]:
# Precomputed COSMOS <-> HSC cross-match table.
matches_filename = os.path.join(
    dwarfz.data_dir_default, "matches.sqlite3")
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

In [ ]:
# Keep only the successful matches, then copy per-object quantities over
# from the COSMOS catalog.
combined = matches_df[matches_df.match].copy()

# Look up the matched COSMOS rows once instead of re-indexing
# COSMOS.df for every column (the original repeated the .loc five times).
cosmos_matched = COSMOS.df.loc[combined.index]
combined["ra"]       = cosmos_matched.ra
combined["dec"]      = cosmos_matched.dec
combined["photo_z"]  = cosmos_matched.photo_z
combined["log_mass"] = cosmos_matched.mass_med
combined["active"]   = cosmos_matched.classification

# Re-key by the catalog-2 (HSC-side) ids so rows can be looked up by
# HSC id in the cells below.
combined = combined.set_index("catalog_2_ids")

combined.head()

In [ ]:
# Catalog features for the testing-set objects, selected by HSC id.
df_features_testing = combined.loc[HSC_ids[testing_set_indices]]

In [ ]:
# df_features_testing is already restricted to HSC_ids[testing_set_indices]
# (see the cell above), so this .loc is a redundant-but-harmless reindex.
# .copy() makes df_tmp an independent frame, so the "error" assignment
# below cannot trigger pandas' SettingWithCopyWarning / silently fail.
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

# Object counts in the photo-z / stellar-mass plane.
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
#            C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Number of objects",
            )

plt.suptitle("All Objects")

In [ ]:
# Same selection as the previous cell; .copy() avoids assigning into a
# .loc slice (SettingWithCopyWarning / possibly lost assignment).
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

# NOTE(review): hexbin requires C to have the same length as x and y,
# but `class_probs` covers the full sample while x/y are masked
# testing-set columns -- confirm these align (or subset class_probs).
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )


plt.suptitle("All Objects")

^^ Huh, that's a pretty uniform-looking spread. It doesn't really seem to be trending in a useful direction (either near the desired boundaries or as you go further away).


In [ ]:
# Same selection, restricted to false positives (error == -1); .copy()
# avoids assigning into a .loc slice (SettingWithCopyWarning).
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
mask &= (df_tmp.error == -1)

print(sum(mask))

# NOTE(review): hexbin requires C to have the same length as x and y,
# but `class_probs` covers the full sample while x/y are masked
# testing-set columns -- confirm these align (or subset class_probs).
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )

plt.suptitle("False Positives")

In [ ]: