In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns; sns.set(style="white", font_scale=2)

import numpy as np
import pandas as pd
from astropy.io import fits
import glob

import sklearn
import sklearn.metrics

import keras
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers.noise import GaussianNoise

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.preprocessing.image import random_channel_shift
from keras.optimizers import SGD, Adam
from keras import backend as K
K.set_image_data_format('channels_first')
K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=10, 
                                                   inter_op_parallelism_threads=10)))

import scipy.ndimage as ndi

import matplotlib.patches as patches

import pathlib


Using TensorFlow backend.

In [2]:
# Figure defaults: 80 dpi everywhere, a 6 x 3.6 inch canvas, white background.
mpl.rcParams.update({
    "savefig.dpi": 80,
    "figure.dpi": 80,
    "figure.figsize": np.array((10, 6)) * .6,
    "figure.facecolor": "white",
})

In [3]:
%load_ext autoreload
%autoreload 2

In [50]:
# my modules that are DNN specific
import os, sys
dwarfz_package_dir = pathlib.Path \
                            .cwd() \
                            .parent / "DNN"

if str(dwarfz_package_dir) not in sys.path:
    sys.path.insert(0, str(dwarfz_package_dir))
    sys.path.insert(0, str(dwarfz_package_dir.parent.parent))
                    
import dwarfz

# these actually exist in "../DNN" which is why I had to add it to the path above
import preprocessing
import geometry

To Do

  1. Read in fits images
  2. Apply pre-processing stretch (asinh)
  3. crop image smaller
  4. Combine filters into one cube
  5. Create training set with labels
  6. Set up keras model
  7. Poke around at the results

0) Get files


In [5]:
# images_dir = pathlib.Path(preprocessing.images_dir)
images_dir = pathlib.Path.home() / "dwarfz" / "galaxies_narrowband"
images_dir


Out[5]:
PosixPath('/Users/egentry/dwarfz/galaxies_narrowband')

In [6]:
# Each galaxy lives in a subdirectory named by its HSC id, under either
# "target/" or "contaminant/".  Skip hidden entries (names starting with ".").
HSC_ids_target = {int(entry.name)
                  for entry in images_dir.glob("target/*")
                  if not entry.name.startswith(".")}

HSC_ids_contaminant = {int(entry.name)
                       for entry in images_dir.glob("contaminant/*")
                       if not entry.name.startswith(".")}

# Ids appearing under BOTH labels are ambiguous -- drop them from both sets.
ambiguous_ids = HSC_ids_target & HSC_ids_contaminant
HSC_ids_target = HSC_ids_target - ambiguous_ids
HSC_ids_contaminant = HSC_ids_contaminant - ambiguous_ids

HSC_ids = HSC_ids_target | HSC_ids_contaminant

HSC_ids_target      = np.array(list(HSC_ids_target))
HSC_ids_contaminant = np.array(list(HSC_ids_contaminant))
HSC_ids             = np.array(list(HSC_ids))


print(len(HSC_ids_target))
print(len(HSC_ids_contaminant))
print(len(HSC_ids), len(HSC_ids_target) + len(HSC_ids_contaminant))


HSC_ids = np.array(sorted(HSC_ids)) # enforce a deterministic ordering


6278
18999
25277 25277

In [7]:
HSC_id = HSC_ids[1] # for when I need a single sample galaxy

In [8]:
bands = ["g", "r", "i", "z", "y"]

1) Read in fits image


In [9]:
get_image = lambda HSC_id, band: preprocessing.get_image(HSC_id, band, images_dir=images_dir/"*"/str(HSC_id))

In [10]:
image, flux_mag_0 = get_image(HSC_id, "g")
print("image size: {} x {}".format(*image.shape))
image


image size: 239 x 239
Out[10]:
array([[-0.02194131,  0.00391446,  0.01770653, ..., -0.01461175,
        -0.01249455,  0.01746454],
       [-0.01601904, -0.02720453, -0.06848946, ...,  0.04288237,
         0.00534859,  0.02132502],
       [-0.01800561, -0.05129848, -0.06990851, ...,  0.03650333,
         0.01508071, -0.03739795],
       ...,
       [ 0.02909144,  0.03687791,  0.04685813, ...,  0.03878131,
        -0.04980357, -0.0178718 ],
       [ 0.01575067, -0.01513606,  0.02736958, ...,  0.01000878,
         0.00148029,  0.01590817],
       [ 0.02265622,  0.04364862, -0.00685509, ..., -0.01821685,
         0.01330067,  0.01832419]], dtype=float32)

In [11]:
preprocessing.image_plotter(image)



In [12]:
preprocessing.image_plotter(np.log(image))


/Users/egentry/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log
  """Entry point for launching an IPython kernel.

2) Apply stretch

We're using (negative) asinh magnitudes, as implemented by the HSC collaboration.

To see more about the asinh magnitude system, see: Lupton, Gunn and Szalay (1999), as used for SDSS. (It's explicitly given in the SDSS Algorithms documentation as well as this overview page.)

To see the source of our code, see: the HSC color image creator

And for reference, a common form of this stretch is: $$ \mathrm{mag}_\mathrm{asinh} = - \left(\frac{2.5}{\ln(10)}\right) \left(\mathrm{asinh}\left(\frac{f/f_0}{2b}\right) + \ln(b) \right)$$ for dimensionless softening parameter $b$, and reference flux (f_0).


In [13]:
def scale(x, fluxMag0):
    """Apply the HSC asinh ("Lupton") stretch to a flux image.

    Parameters
    ----------
    x : array-like of flux values (not modified)
    fluxMag0 : flux zero point of the image (``FLUXMAG0`` from the header)

    Returns
    -------
    Stretched array: fluxes rescaled to a mag-19 zero point, asinh-compressed,
    then linearly mapped so the HSC display range [u_min, u_max] -> [0, 1].
    """
    ### adapted from https://hsc-gitlab.mtk.nao.ac.jp/snippets/23
    mag0 = 19
    # renamed from `scale` -- the original local shadowed the function name
    flux_rescale = 10 ** (0.4 * mag0) / fluxMag0
    # BUG FIX: out-of-place multiply; the original `x *= scale` silently
    # mutated the caller's image array in place.
    x = x * flux_rescale

    u_min = -0.05
    u_max = 2. / 3.
    u_a = np.exp(10.)   # dimensionless softening parameter (b in the text)

    x = np.arcsinh(u_a*x) / np.arcsinh(u_a)
    x = (x - u_min) / (u_max - u_min)

    return x

In [14]:
image_scaled = preprocessing.scale(image, flux_mag_0)

In [15]:
preprocessing.image_plotter(image_scaled)



In [16]:
sns.distplot(image_scaled.flatten())
plt.title("Distribution of Transformed Intensities")


Out[16]:
Text(0.5, 1.0, 'Distribution of Transformed Intensities')

In [17]:
# Show the stretched (full-size) image in every filter for the sample galaxy.
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    stretched = preprocessing.scale(raw_image, zero_point)

    plt.figure()
    preprocessing.image_plotter(stretched)
    plt.title("{} band".format(band))
    plt.colorbar()


3) Crop Image

TODO: verify that odd image sizes (e.g. the 239 x 239 inputs) are cropped correctly — check whether `get_cutout` centers properly when the size difference is odd.


In [18]:
pre_transformed_image_size  = 150
post_transformed_image_size = 75

In [19]:
cutout = preprocessing.get_cutout(image_scaled, post_transformed_image_size)
cutout.shape


Out[19]:
(75, 75)

In [20]:
preprocessing.image_plotter(cutout)



In [21]:
# Same per-band display as above, but cropped to the final cutout size.
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    stretched = preprocessing.scale(raw_image, zero_point)
    cropped = preprocessing.get_cutout(stretched, post_transformed_image_size)

    plt.figure()
    preprocessing.image_plotter(cropped)
    plt.title("{} band".format(band))
    plt.colorbar()


4) Combine filters into cube


In [22]:
# Build per-band lists of raw images, zero points, and stretched cutouts
# for the sample galaxy (appending instead of pre-sized None lists).
images = []
flux_mag_0s = []
cutouts = []
for band in bands:
    raw_image, zero_point = get_image(HSC_id, band)
    images.append(raw_image)
    flux_mag_0s.append(zero_point)

    stretched = preprocessing.scale(raw_image, zero_point)
    cutouts.append(
        preprocessing.get_cutout(stretched, post_transformed_image_size)
    )

In [23]:
cutout_cube = np.array(cutouts)
cutout_cube.shape


Out[23]:
(5, 75, 75)

In [24]:
# must transform into [0,1] for plt.imshow
# the HSC standard tool accomplishes this by clipping instead.
plt.imshow(preprocessing.transform_0_1(cutout_cube[(4,2,0),:,:].transpose(1,2,0)) )


Out[24]:
<matplotlib.image.AxesImage at 0x135f0ffd0>

In [25]:
# Intensity distribution per band.  cutout_cube has shape (bands, h, w), so
# cutout_cube[i] is band i directly -- the original
# `cutout_cube[:,:,:].transpose(1,2,0)[:,:,i]` selected the same slice the
# long way around.
for i, band in enumerate(bands):
    sns.distplot(cutout_cube[i].flatten(), label=band)
# one legend call after the loop replaces the redundant per-iteration calls
plt.legend(loc="best")


5) Load Training Set Labels


In [26]:
# Label table: one row per HSC id, `target` True iff the id was in the
# target directory.  np.isin does the membership test vectorized, instead
# of the original O(n^2) `id in <numpy array>` scan per row.
df = pd.DataFrame(data={
    "HSC_id": HSC_ids,
    "target": np.isin(HSC_ids, HSC_ids_target),
})

df = df.set_index("HSC_id")

df.head()


Out[26]:
target
HSC_id
43158176442354210 False
43158176442354294 False
43158176442354356 False
43158176442354418 True
43158176442354561 False

In [27]:
df.mean()


Out[27]:
target    0.248368
dtype: float64

In [28]:
def load_image_mappable(HSC_id):
    """Load, stretch, and crop all bands for one galaxy.

    Returns an array of shape (len(bands), pre_transformed_image_size,
    pre_transformed_image_size) -- one stretched cutout per band, in the
    order of the module-level ``bands`` list.
    """
    cutouts = []
    for band in bands:
        raw_image, zero_point = get_image(HSC_id, band)
        stretched = preprocessing.scale(raw_image, zero_point)
        cutouts.append(
            preprocessing.get_cutout(stretched, pre_transformed_image_size))
    return np.array(cutouts)

In [29]:
X = np.empty((len(HSC_ids[:400]), 5, 
              pre_transformed_image_size, pre_transformed_image_size))

In [30]:
X.shape


Out[30]:
(400, 5, 150, 150)

In [31]:
# Fill X with one image cube per galaxy, reporting progress ~20 times.
# max(1, ...) avoids ZeroDivisionError when X holds fewer than 20 galaxies;
# the stride is also hoisted out of the loop (it is loop-invariant).
progress_stride = max(1, X.shape[0] // 20)
for i in range(X.shape[0]):
    HSC_id = HSC_ids[i]
    X[i] = load_image_mappable(HSC_id)

    if i % progress_stride == 0:
        print("loading # {} / {}".format(i, X.shape[0]), end="\r", flush=True)


loading # 380 / 400

In [32]:
X_full = X
X_small = X[:,(4,2,0),:,:] # drop down to 3 bands (y, i, g)

In [33]:
X.shape


Out[33]:
(400, 5, 150, 150)

In [34]:
Y = df.loc[HSC_ids].target.values
Y


Out[34]:
array([False, False, False, ..., False, False, False])

In [35]:
Y.mean()


Out[35]:
0.24836808165525973

load photometric features


In [51]:
# Load the three dwarfz reference databases: the COSMOS catalog, the HSC
# forced photometry, and the COSMOS<->HSC cross-match table.
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default) / "COSMOS_reference.sqlite"
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

HSC_filename = pathlib.Path(dwarfz.data_dir_default) / "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)

matches_filename = pathlib.Path(dwarfz.data_dir_default) / "matches.sqlite3"
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

# Keep only confirmed matches, then pull COSMOS quantities over by index.
combined = matches_df[matches_df.match].copy()
combined["ra"]       = COSMOS.df.loc[combined.index].ra
combined["dec"]      = COSMOS.df.loc[combined.index].dec
combined["photo_z"]  = COSMOS.df.loc[combined.index].photo_z
combined["log_mass"] = COSMOS.df.loc[combined.index].mass_med

# HSC cmodel flux / flag / magnitude columns for all five broad bands.
photometry_cols = [
    "gcmodel_flux","gcmodel_flux_err","gcmodel_flux_flags", "gcmodel_mag",
    "rcmodel_flux","rcmodel_flux_err","rcmodel_flux_flags", "rcmodel_mag",
    "icmodel_flux","icmodel_flux_err","icmodel_flux_flags", "icmodel_mag",
    "zcmodel_flux","zcmodel_flux_err","zcmodel_flux_flags", "zcmodel_mag",
    "ycmodel_flux","ycmodel_flux_err","ycmodel_flux_flags", "ycmodel_mag",
]

# Align HSC photometry to the match table via the HSC-side ids.
for col in photometry_cols:
    combined[col] = HSC.df.loc[combined.catalog_2_ids][col].values
    
    
# Adjacent-band colors.
combined["g_minus_r"] = combined.gcmodel_mag - combined.rcmodel_mag
combined["r_minus_i"] = combined.rcmodel_mag - combined.icmodel_mag
combined["i_minus_z"] = combined.icmodel_mag - combined.zcmodel_mag
combined["z_minus_y"] = combined.zcmodel_mag - combined.ycmodel_mag

# Keep rows with finite colors & i magnitude and no flux-quality flags set.
mask =    np.isfinite(combined["g_minus_r"]) & np.isfinite(combined["r_minus_i"]) \
        & np.isfinite(combined["i_minus_z"]) & np.isfinite(combined["z_minus_y"]) \
        & np.isfinite(combined["icmodel_mag"]) \
        & (~combined.gcmodel_flux_flags) & (~combined.rcmodel_flux_flags) \
        & (~combined.icmodel_flux_flags) & (~combined.zcmodel_flux_flags) \
        & (~combined.ycmodel_flux_flags)

combined = combined[mask]

# FRANKENZ photometric redshifts, joined on the HSC object id.
df_frankenz = pd.read_sql_table("photo_z",
                                "sqlite:///{}".format(
                                    pathlib.Path(dwarfz.data_dir_default)
                                                 / "HSC_matched_to_FRANKENZ.sqlite"),
                                index_col="object_id")

# NOTE(review): this .head() is mid-cell, so it displays nothing -- no-op.
df_frankenz.head()

combined = combined.join(df_frankenz[["photoz_best", "photoz_risk_best"]],
                         on="catalog_2_ids")

# Final feature table fed to the classifier alongside the CNN features.
photometry_features = combined.loc[:,["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
                                      "icmodel_mag",
                                      "photoz_best",
                                      "photoz_risk_best" # The risk of photoz_best being outside of the range z_true +- 0.15(1+z_true). It ranges from 0 (safe) to 1(risky)
                                     ]]

photometry_features

In [37]:
training_set_labels_filename = "../data/galaxy_images_training/2017_09_26-dwarf_galaxy_scores.csv"
# HSC_id -> COSMOS_id lookup: dedupe on HSC_id, index by it, keep only the
# COSMOS_id column (same steps as before, written as one method chain).
HSC_to_COSMOS = (
    pd.read_csv(training_set_labels_filename)
      .drop_duplicates("HSC_id")
      .set_index("HSC_id")
      [["COSMOS_id"]]
)
HSC_to_COSMOS.head()


Out[37]:
COSMOS_id
HSC_id
43158322471244656 628457
43158605939114836 919771
43159142810013665 444239
43158734788125011 569427
43158863637144621 369948

In [63]:
photometry_features_aligned = photometry_features.loc[HSC_to_COSMOS.loc[HSC_ids].values.flatten()]
photometry_features_aligned.head()


Out[63]:
g_minus_r r_minus_i i_minus_z z_minus_y icmodel_mag photoz_best photoz_risk_best
catalog_1_ids
387020 0.241915 0.287563 0.580751 -0.461952 25.067200 0.75 0.189435
389379 0.815916 0.300808 0.021614 -0.038185 23.948193 0.36 0.158907
390478 1.774567 1.506727 0.083956 -0.385647 24.193956 0.85 0.183067
391721 0.493143 0.108091 0.332539 0.058878 25.082439 2.61 0.458986
394884 1.182629 0.388001 0.275812 -0.031630 23.426733 0.50 0.168350

Geometry!


In [65]:
X[0].shape


Out[65]:
(5, 150, 150)

In [66]:
h = pre_transformed_image_size
w = pre_transformed_image_size
# Random affine transform (rotation + small translation) for augmentation demo.
transform_matrix = geometry.create_random_transform_matrix(h, w,
                                                  include_rotation=True,
                                                  translation_size = .01,
                                                  verbose=False)

# 3-channel (y, i, g) view of the first galaxy's cube
x_tmp = X[0][(4,2,0),:,:]

result = geometry.apply_transform_new(x_tmp, transform_matrix, 
                            channel_axis=0, fill_mode="constant", cval=np.max(x_tmp))

# BUG FIX: crop the *transformed* image. The original cropped `x_tmp`,
# silently discarding the transform computed just above.
result = preprocessing.get_cutout(result, post_transformed_image_size)
plt.imshow(preprocessing.transform_0_1(result.transpose(1,2,0)))


Out[66]:
<matplotlib.image.AxesImage at 0x133462748>

In [67]:
import ipywidgets
ipywidgets.interact(preprocessing.transform_plotter,
                    X = ipywidgets.fixed(X_small),
                    rotation_degrees = ipywidgets.IntSlider(min=0, max=360, step=15, value=45),
                    dx_after = ipywidgets.IntSlider(min=-15, max=15),
                    dy_after = ipywidgets.IntSlider(min=-15, max=15),
                    color = ipywidgets.fixed(True),
                    shear_degrees = ipywidgets.IntSlider(min=0, max=90, step=5, value=0),
                    zoom_x = ipywidgets.FloatSlider(min=.5, max=2, value=1),
                    crop = ipywidgets.Checkbox(value=True)
                    )


Out[67]:
<function preprocessing.transform_plotter(X, reflect_x=False, rotation_degrees=45, dx_after=0, dy_after=0, shear_degrees=0, zoom_x=1, crop=False, color=True)>

5b) Split training and testing set


In [68]:
# Deterministic 80/20 split: shuffle all indices with a fixed seed, take the
# first 20% as the test set, the complement as the training set.
np.random.seed(1)

randomized_indices = np.arange(Y.shape[0])
np.random.shuffle(randomized_indices)

testing_fraction = 0.2
n_testing = int(testing_fraction * Y.shape[0])
testing_set_indices = randomized_indices[:n_testing]
# np.setdiff1d replaces the set()-roundtrip idiom; note it returns the
# complement in sorted order (the original order was arbitrary set order,
# so nothing downstream can have depended on it).
training_set_indices = np.setdiff1d(randomized_indices, testing_set_indices)

In [69]:
print(testing_set_indices.size, Y[testing_set_indices].mean())


5055 0.247675568743818

In [70]:
print(training_set_indices.size, Y[training_set_indices].mean())


20222 0.24854119276036

In [71]:
p = Y[training_set_indices].mean()
prior_loss = sklearn.metrics.log_loss(Y[testing_set_indices], 
                                      [p]*testing_set_indices.size)
prior_loss


Out[71]:
0.5597690655312428

6b) Adapt NumpyArrayIterator

The original only allowed 1, 3 or 4 channel images. I have 5 channel images.

Also, I want to change the way that augmentation is happening


In [72]:
from data_generator_directory import DirectoryIterator

6c) Adapt ImageDataGenerator

The original only allowed 1, 3 or 4 channel images. I have 5 channel images. Also, I'm adjusting the way that the affine transformations work for the data augmentation


In [73]:
from data_generator_directory import ImageDataGeneratorLoadable

6d) Create Data Generator


In [106]:
print('Using real-time data augmentation.')

h_before, w_before = X[0,0].shape
print("image shape before: ({},{})".format(h_before, w_before))

h_after = post_transformed_image_size
w_after = post_transformed_image_size
print("image shape after:  ({},{})".format(h_after, w_after))

# get a closure that binds the image size to get_cutout
postprocessing_function = lambda image: preprocessing.get_cutout(image, post_transformed_image_size)

# this will do preprocessing and realtime data augmentation
datagen = ImageDataGeneratorLoadable(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=True, # randomly apply a reflection (in x)
    with_reflection_y=True, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
    width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
    postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(post_transformed_image_size,post_transformed_image_size)
)   

# this will do preprocessing and realtime data augmentation
from data_generator import ImageDataGenerator
datagen_features = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=False, # randomly apply a reflection (in x)
    with_reflection_y=False, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
#     width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
#     height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
#     postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(1,1),
)


Using real-time data augmentation.
image shape before: (150,150)
image shape after:  (75,75)

In [75]:
# datagen.fit()

7) Set up keras model


In [76]:
with_one_by_one = False
trainable = False

input_shape = cutout_cube.shape

nb_dense = 64

In [77]:
from keras.applications import inception_v3, inception_resnet_v2, vgg19

In [79]:
feature_extractor = Sequential()
extractor_input_shape = tuple([3, *input_shape[1:]])
extractor_layers = vgg19.VGG19(include_top=False,
                                input_shape=extractor_input_shape
                               )
for layer in extractor_layers.layers:
    layer.trainable = False
feature_extractor.add(extractor_layers)

feature_extractor.compile(optimizer= Adam(lr=1),
               loss="mean_squared_error")

In [80]:
reextract_features = False
# BUG FIX: np.save appends ".npy" when the name lacks that suffix, so the
# original "vgg_features.h5" was actually written as "vgg_features.h5.npy"
# while both the is_file() check and np.load looked for "vgg_features.h5":
# the cache could never hit (and the name collided with the HDF cell below).
# Use an explicit .npy filename for this numpy round-trip.
features_filename = "vgg_features.npy"

if reextract_features or (not pathlib.Path(features_filename).is_file()):
    # Extract frozen-VGG19 features for every galaxy (slow; hence the cache).
    features = np.zeros((Y.size, 512, 2, 2))
    progress_stride = max(1, features.shape[0] // 20)  # guard tiny runs
    for i, HSC_id in enumerate(HSC_ids):
        x_tmp = np.array([datagen.standardize(load_image_mappable(HSC_id))])
        # feed only the 3 bands (y, i, g) that the pretrained net expects
        features[i] = feature_extractor.predict(x_tmp[:,(4,2,0),:,:])

        if i % progress_stride == 0:
            print("loading # {} / {}".format(i, features.shape[0]), end="\r", flush=True)
    np.save(features_filename, features, allow_pickle=False)
else:
    features = np.load(features_filename)
    
features.shape


Out[80]:
(25277, 512, 2, 2)

In [ ]:
reextract_features = False
features_filename = "vgg_features.h5"

if reextract_features or (not pathlib.Path(features_filename).is_file()):
    
    features = np.zeros((Y.size, 512, 2, 2))
    for i, HSC_id in enumerate(HSC_ids):
        x_tmp = np.array([datagen.standardize(load_image_mappable(HSC_id))])
        features[i] = feature_extractor.predict(x_tmp[:,(4,2,0),:,:])

        if i%(features.shape[0]//20)==0:
            print("loading # {} / {}".format(i, features.shape[0]), end="\r", flush=True)
    df_vgg = pd.DataFrame(data={
        "HSC_id" : HSC_ids,
        "vgg_features" : [feature_row for feature_row in features]
    })
    df_vgg.to_hdf(features_filename, "data")
else:
    df_vgg = pd.read_hdf(features_filename)
    features = np.array([row for row in df_vgg.vgg_features.values])    
features.shape

In [81]:
# model = Sequential()

# # # # 1x1 convolution to make sure we only have 3 channels
# n_channels_for_pretrained=3
# one_by_one = Conv2D(n_channels_for_pretrained, 1, padding='same',
#                     input_shape=input_shape)
# if not with_one_by_one:
#     one_by_one.trainable = False
# model.add(one_by_one)

# pretrained_input_shape = tuple([n_channels_for_pretrained, *input_shape[1:]])
# pretrained_layers = vgg19.VGG19(include_top=False,
#                                 input_shape=pretrained_input_shape
#                                )
# for layer in pretrained_layers.layers:
#     layer.trainable = trainable
# model.add(pretrained_layers)


# model.add(Flatten())
# model.add(Dense(2*nb_dense, activation="relu"))
# model.add(Dense(nb_dense, activation="relu"))
# model.add(Dense(1, activation="sigmoid"))

In [83]:
learning_rate = 0.00025
decay = 1e-5
momentum = 0.9

# sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=True)

adam = Adam(lr=learning_rate)

In [135]:
!head $logger_filename


head: training_transfer.with_photometry.log: No such file or directory

In [137]:
!head training_transfer.log.old












In [134]:
logger_filename = "training_transfer"
if trainable:
    logger_filename += "-trainable" 
if with_one_by_one:
    logger_filename += "-1x1"
logger_filename += ".with_photometry.log"


# model.compile(loss='binary_crossentropy', 
# #               optimizer=sgd, 
#               optimizer=adam,
# #               metrics=["accuracy"]
#              )

# can only manually set weights _after_ compiling
# one_by_one_weights = np.zeros((1,1,5,3))
# for i in range(3):
#     one_by_one_weights[0,0,4-2*i,i] = 1 # chooses y i g with correct color order
# one_by_one.set_weights([one_by_one_weights,
#                         np.zeros(3)])

print("using logger_filename: ", logger_filename)

# if os.path.exists(logger_filename):
#     logger_filename_tmp = logger_filename + ".old"
#     os.rename(logger_filename, logger_filename_tmp)


using logger_filename:  training_transfer.with_photometry.log

In [109]:
features_combined.shape


Out[109]:
(25277, 2055)

In [123]:
# Dense classification head on top of the concatenated
# (VGG features + photometry) vector, which arrives shaped (n_features, 1, 1).
n_input_features = np.prod(features.shape[1:]) + photometry_features.shape[1]

model = Sequential()
model.add(Flatten(input_shape=(n_input_features, 1, 1)))
model.add(Dense(2*nb_dense, activation="relu"))
model.add(Dense(nb_dense, activation="relu"))
model.add(Dense(1, activation="sigmoid"))   # P(target)

model.compile(optimizer= Adam(lr=learning_rate),
               loss="binary_crossentropy")

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
flatten_2 (Flatten)          (None, 2055)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               263168    
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
=================================================================
Total params: 271,489
Trainable params: 271,489
Non-trainable params: 0
_________________________________________________________________

In [124]:
earlystopping = EarlyStopping(monitor='loss',
                              patience=35,
                              verbose=1,
                              mode='auto' )

In [125]:
csv_logger = keras.callbacks.CSVLogger(logger_filename,
                                       append=True)

8) Run basic keras model


In [126]:
goal_batch_size = 128
steps_per_epoch = max(2, training_set_indices.size//goal_batch_size)
batch_size = training_set_indices.size//steps_per_epoch
print("steps_per_epoch: ", steps_per_epoch)
print("batch_size: ", batch_size)
epoch_to_save_state = 50
epochs = 100
verbose = 1


steps_per_epoch:  157
batch_size:  128

In [127]:
prior_loss


Out[127]:
0.5597690655312428

In [128]:
# Flatten each galaxy's VGG feature maps and append the aligned photometric
# features.  Deriving the flattened width from the array itself replaces the
# hard-coded 512*2*2, so this survives a different extractor output shape.
features_combined = np.hstack([features.reshape(features.shape[0], -1),
                               photometry_features_aligned])
# trailing unit axes: the data generator expects image-ranked (n, c, 1, 1) input
features_combined = features_combined.reshape(*features_combined.shape, 1, 1)
print(features_combined.shape)


(25277, 2055, 1, 1)

In [174]:
epochs=200


# Generator over the training rows; validation data is passed directly.
args = (datagen_features.flow(features_combined[training_set_indices], 
                              Y[training_set_indices],
                              batch_size=batch_size,
                             ),
       )

kwargs = dict(steps_per_epoch=steps_per_epoch,
              validation_data=(features_combined[testing_set_indices], 
                               Y[testing_set_indices]),
              verbose=verbose,
              callbacks=[csv_logger],
             )

# Train one epoch at a time so weights can be checkpointed after EVERY epoch
# (initial_epoch/epochs bookkeeping keeps the CSV logger's epoch numbers
# monotone across the calls).
for epoch_i in range(epochs):
    history = model.fit_generator(*args,
                                  initial_epoch=epoch_i,
                                  epochs=epoch_i+1,
                                  **kwargs,
                                  )

    model.save_weights("weights/cached_weights.extracted_features.with_photometry.{:>04d}.h5".format(epoch_i))


Epoch 1/1
  2/157 [..............................] - ETA: 20:53 - loss: 0.4380
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-174-ec80bc48bdc9> in <module>
     19                                   initial_epoch=epoch_i,
     20                                   epochs=epoch_i+1,
---> 21                                   **kwargs,
     22                                   )
     23 

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1416             use_multiprocessing=use_multiprocessing,
   1417             shuffle=shuffle,
-> 1418             initial_epoch=initial_epoch)
   1419 
   1420     @interfaces.legacy_generator_methods_support

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    179             batch_index = 0
    180             while steps_done < steps_per_epoch:
--> 181                 generator_output = next(output_generator)
    182 
    183                 if not hasattr(generator_output, '__len__'):

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/utils/data_utils.py in get(self)
    593         try:
    594             while self.is_running():
--> 595                 inputs = self.queue.get(block=True).get()
    596                 self.queue.task_done()
    597                 if inputs is not None:

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    662 
    663     def get(self, timeout=None):
--> 664         self.wait(timeout)
    665         if not self.ready():
    666             raise TimeoutError

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    659 
    660     def wait(self, timeout=None):
--> 661         self._event.wait(timeout)
    662 
    663     def get(self, timeout=None):

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [138]:
logged_history = pd.read_csv(logger_filename)
logged_history


Out[138]:
epoch loss val_loss
0 1 0.5487 0.5481
1 2 0.5362 0.5308
2 3 0.5214 0.5136
3 4 0.5130 0.5036
4 5 0.5040 0.5211
5 6 0.4996 0.4989
6 7 0.4904 0.4871
7 8 0.4823 0.4779
8 9 0.4804 0.4813
9 10 0.4720 0.4758
10 11 0.4687 0.4679
11 12 0.4643 0.4787
12 13 0.4583 0.4569
13 14 0.4529 0.4573
14 15 0.4549 0.4594
15 16 0.4545 0.4579
16 17 0.4458 0.4510
17 18 0.4430 0.4482
18 19 0.4405 0.4580
19 20 0.4389 0.4438
20 21 0.4362 0.4426
21 22 0.4380 0.4461
22 23 0.4291 0.4402
23 24 0.4288 0.4430
24 25 0.4294 0.4422
25 26 0.4255 0.4482
26 27 0.4269 0.4372
27 28 0.4197 0.4527
28 29 0.4236 0.4557
29 30 0.4231 0.4555
30 31 0.4201 0.4354
31 32 0.4177 0.4348
32 33 0.4198 0.4680
33 34 0.4137 0.4368
34 35 0.4164 0.4400
35 36 0.4084 0.4331
36 37 0.4068 0.4365
37 38 0.4093 0.4495
38 39 0.4100 0.4363
39 40 0.4067 0.4326
40 41 0.4047 0.4479
41 42 0.4001 0.4346
42 43 0.4058 0.4314
43 44 0.4020 0.4401
44 45 0.3998 0.4374
45 46 0.3964 0.4397
46 47 0.4015 0.4314
47 48 0.3926 0.4422
48 49 0.3986 0.4563
49 50 0.3926 0.4389

In [142]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

        
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(logged_history["val_loss"], label="Validation")
    plt.plot(logged_history["loss"], label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)

    plt.tight_layout()
    plot_filename = pathlib.Path("plots_for_thesis") / "learning_curve-with_photometry"
    plt.savefig(plot_filename.with_suffix(".png"))
    plt.savefig(plot_filename.with_suffix(".pdf"))



In [141]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

    simple_conv = lambda x: np.convolve(x, np.ones(5)/5, mode="valid")
    
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(simple_conv(logged_history["val_loss"]),
             label="Validation")
    plt.plot(simple_conv(logged_history["loss"]),
             label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)


9) Look at validation results


In [165]:
color_CNN = "b"

linestyle_CNN = "solid"

linewidth=3

In [150]:
overwrite = True
use_cached_if_exists = False
class_probs_filename = "class_probs.with_photometry.csv"

if use_cached_if_exists and pathlib.Path(class_probs_filename).is_file():
    df_class_probs = pd.read_csv(class_probs_filename)
else:
    # BUG FIX: `model3` was never defined anywhere in this notebook -- the
    # trained network is `model`.  Also dropped the expensive
    # `X_transformed = [datagen.standardize(...)]` computation, whose result
    # was never used.
    class_probs = model.predict_proba(features_combined).flatten()

    # set membership is O(1) per lookup; the original tested
    # `HSC_id in <numpy array>` which is a linear scan per row
    testing_id_set = set(HSC_ids[testing_set_indices])
    df_class_probs = pd.DataFrame({
        "HSC_id": HSC_ids,
        "CNN_prob": class_probs,
        "testing": [HSC_id in testing_id_set for HSC_id in HSC_ids],
        "target": Y,
    })

    if overwrite or (not pathlib.Path(class_probs_filename).is_file()):
        df_class_probs.to_csv(class_probs_filename, index=False)


df_testing = df_class_probs[df_class_probs.testing]

df_class_probs.head()

In [173]:
print(df_class_probs.target.mean())
print(df_class_probs[~df_class_probs.testing].target.mean())

print(df_testing.CNN_prob.mean())


0.24836808165525973
0.24854119276036
0.1659968

In [158]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    sns.distplot(df_testing[df_testing.target].CNN_prob, color="g", label="targets")
    sns.distplot(df_testing[~df_testing.target].CNN_prob, color="b", label="contaminants")

    plt.xlabel("p(target | image)")
    plt.ylabel("density (galaxies)")

    plt.xlim(0, .7)
    plt.axvline(df_testing.target.mean(), linestyle="dashed", color="black", label="prior\n(from training set)")
    plt.axvline(.5, linestyle="dotted", color="black", label="50/50")

    plt.legend(
        loc="upper left",
        bbox_to_anchor=(1, 1),
    )



In [166]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# ROC curve for the CNN on the held-out testing set.
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    fpr, tpr, _ = metrics.roc_curve(df_testing.target, df_testing.CNN_prob)
    roc_auc = roc_auc_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(fpr, tpr, label="CNN (AUC = {:.2})".format(roc_auc), 
             color=color_CNN, linewidth=linewidth,
            )
    # Chance level: the diagonal.
    plt.plot([0,1], [0,1], linestyle="dotted", color="black", label="Random guessing",
             linewidth=linewidth,
            )

    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    plt.legend(loc="best")
    
    plt.tight_layout()
    # Ensure the output directory exists -- plt.savefig raises
    # FileNotFoundError if it does not.
    pathlib.Path("plots_for_thesis").mkdir(parents=True, exist_ok=True)
    plot_filename = "plots_for_thesis/ROC-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")



In [170]:
from sklearn.metrics import average_precision_score

# Purity-completeness (precision-recall) curve on the held-out testing set.
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    precision, recall, _ = metrics.precision_recall_curve(df_testing.target, df_testing.CNN_prob)
    pr_auc = average_precision_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(recall, precision, label="AUC = {:.3}".format(pr_auc), color=color_CNN, linewidth=linewidth)
    
    # Baseline purity for random guessing is the class prior.
    # NOTE(review): this uses Y[testing_set_indices] while the curve uses
    # df_testing.target -- these should be the same set; confirm.
    plt.plot([0,1], [Y[testing_set_indices].mean()]*2, linestyle="dotted", color="black", 
             label="Random guessing",
             linewidth=linewidth,
            )


    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("Completeness")
    plt.ylabel("Purity ")

    plt.legend(loc="best")
    
    plt.tight_layout()
    # Ensure the output directory exists -- plt.savefig raises
    # FileNotFoundError if it does not.
    pathlib.Path("plots_for_thesis").mkdir(parents=True, exist_ok=True)
    plot_filename = "plots_for_thesis/purity_completeness-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")



In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Apply a threshold

For now, just use a threshold using the prior class probability (estimated from the training set)

Note: under a symmetric loss function, this isn't as good as simply thresholding at 0.5 (the Bayes-optimal cutoff).


In [ ]:
# Threshold the continuous CNN scores at the training-set class prior
# (fraction of positives in the training split).
# NOTE(review): `class_probs` was predicted for the full `features_combined`
# sample, but later cells compare `predicted_classes` against
# Y[testing_set_indices] -- confirm the lengths/ordering actually match.
predicted_classes = class_probs > (Y[training_set_indices].mean())
predicted_classes.mean()

In [ ]:
# sklearn convention: rows = actual class, columns = predicted class.
confusion_matrix = metrics.confusion_matrix(Y[testing_set_indices], predicted_classes)
confusion_matrix

In [ ]:
# Compare the true positive count with how many objects the thresholded
# classifier labelled positive.
print("number of dwarfs (true)     : ", Y[testing_set_indices].sum())
print("number of dwarfs (predicted): ", predicted_classes.sum())

In [ ]:
import itertools
# adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

# Row-normalize so each row (actual class) sums to 1.
confusion_matrix_normalized = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]

plt.imshow(confusion_matrix_normalized, interpolation='nearest',cmap="gray_r")
# plt.title(title)
plt.colorbar()
tick_marks = np.arange(2)
classes = [False, True]
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Annotate each cell with the raw (integer) count; flip text color to
# white where the normalized cell is dark (> 0.5 on the gray_r colormap).
fmt = 'd'
thresh = 1 / 2.
for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])):
    plt.text(j, i, format(confusion_matrix[i, j], fmt),
             fontdict={"size":20},
             horizontalalignment="center",
             color="white" if confusion_matrix_normalized[i, j] > thresh else "black")

plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.tight_layout()

In [ ]:
# Per-object table of true label, predicted label, and whether they disagree.
# NOTE(review): the last column formats a comparison result with ':d' --
# assumes it coerces cleanly to an integer (0/1); confirm with the actual
# dtypes of Y and predicted_classes.
print("  i - Y_true[i] - Y_pred[i] -  error?")
print("-------------------------------------")
for i in range(predicted_classes.size):
    print("{:>3} -    {:>1d}      -    {:d}      -   {:>2d}".format(i, 
                                                    Y[testing_set_indices][i], 
                                                    predicted_classes[i],
                                                    (Y[testing_set_indices][i] != predicted_classes[i]), 
                                                   ))

Analyze Errors


In [ ]:
# Display the array of HSC ids (rich output of the bare expression).
HSC_ids

In [ ]:
# Peek at the catalog rows corresponding to the testing-set objects.
df.loc[HSC_ids[testing_set_indices]].head()

In [ ]:
# COSMOS reference catalog, located in the dwarfz package's data directory.
COSMOS_filename = os.path.join(
    dwarfz.data_dir_default, "COSMOS_reference.sqlite")
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

In [ ]:
# HSC forced-photometry catalog from the same data directory.
HSC_filename = os.path.join(
    dwarfz.data_dir_default, "HSC_COSMOS_median_forced.sqlite3")
HSC = dwarfz.datasets.HSC(HSC_filename)

In [ ]:
# Precomputed COSMOS <-> HSC cross-match table.
matches_filename = os.path.join(
    dwarfz.data_dir_default, "matches.sqlite3")
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

In [ ]:
# Keep only the successful matches, then copy per-object quantities over
# from the COSMOS catalog.
combined = matches_df[matches_df.match].copy()

# Look up the matched COSMOS rows once instead of re-indexing
# COSMOS.df for every column (the original repeated the .loc five times).
cosmos_matched = COSMOS.df.loc[combined.index]
combined["ra"]       = cosmos_matched.ra
combined["dec"]      = cosmos_matched.dec
combined["photo_z"]  = cosmos_matched.photo_z
combined["log_mass"] = cosmos_matched.mass_med
combined["active"]   = cosmos_matched.classification

# Re-key by the catalog-2 (HSC-side) ids so rows can be looked up by
# HSC id in the cells below.
combined = combined.set_index("catalog_2_ids")

combined.head()

In [ ]:
# Catalog features for the testing-set objects, selected by HSC id.
df_features_testing = combined.loc[HSC_ids[testing_set_indices]]

In [ ]:
# df_features_testing is already restricted to HSC_ids[testing_set_indices]
# (see the cell above), so this .loc is a redundant-but-harmless reindex.
# .copy() makes df_tmp an independent frame, so the "error" assignment
# below cannot trigger pandas' SettingWithCopyWarning / silently fail.
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

# Object counts in the photo-z / stellar-mass plane.
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
#            C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Number of objects",
            )

plt.suptitle("All Objects")

In [ ]:
# Same selection as the previous cell; .copy() avoids assigning into a
# .loc slice (SettingWithCopyWarning / possibly lost assignment).
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

# NOTE(review): hexbin requires C to have the same length as x and y,
# but `class_probs` covers the full sample while x/y are masked
# testing-set columns -- confirm these align (or subset class_probs).
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )


plt.suptitle("All Objects")

^^ Huh, that's a pretty uniform-looking spread. It doesn't really seem to be trending in a useful direction (either near the desired boundaries or as you go further away).


In [ ]:
# Same selection, restricted to false positives (error == -1); .copy()
# avoids assigning into a .loc slice (SettingWithCopyWarning).
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]].copy()
# error: +1 = false negative (missed), -1 = false positive, 0 = correct
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
mask &= (df_tmp.error == -1)

print(sum(mask))

# NOTE(review): hexbin requires C to have the same length as x and y,
# but `class_probs` covers the full sample while x/y are masked
# testing-set columns -- confirm these align (or subset class_probs).
plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

# Red box: photo_z < 0.15 and 8 < log M_star < 9 (presumably the
# dwarf-selection region -- confirm against the selection criteria).
plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )

plt.suptitle("False Positives")

In [ ]: