In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns; sns.set(style="white", font_scale=2)

import numpy as np
import pandas as pd
from astropy.io import fits
import glob

import sklearn
import sklearn.metrics

import keras
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers.noise import GaussianNoise

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.preprocessing.image import random_channel_shift
from keras.optimizers import SGD, Adam
from keras import backend as K
K.set_image_data_format('channels_first')
K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=10, 
                                                   inter_op_parallelism_threads=10)))

import scipy.ndimage as ndi

import matplotlib.patches as patches

import pathlib


Using TensorFlow backend.

In [2]:
mpl.rcParams['savefig.dpi'] = 80
mpl.rcParams['figure.dpi'] = 80
mpl.rcParams['figure.figsize'] = np.array((10,6))*.6
mpl.rcParams['figure.facecolor'] = "white"

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# my modules that are DNN specific
import os, sys
dwarfz_package_dir = pathlib.Path \
                            .cwd() \
                            .parent / "DNN"

if str(dwarfz_package_dir) not in sys.path:
    sys.path.insert(0, str(dwarfz_package_dir))
    sys.path.insert(0, str(dwarfz_package_dir.parent.parent))
                    
import dwarfz

# these actually exist in "../DNN" which is why I had to add it to the path above
import preprocessing
import geometry

To Do

  1. Read in fits images
  2. Apply pre-processing stretch (asinh)
  3. crop image smaller
  4. Combine filters into one cube
  5. Create training set with labels
  6. Set up keras model
  7. Poke around at the results

0) Get files


In [5]:
# images_dir = pathlib.Path(preprocessing.images_dir)
images_dir = pathlib.Path.home() / "dwarfz" / "galaxies_narrowband"
images_dir


Out[5]:
PosixPath('/Users/egentry/dwarfz/galaxies_narrowband')

In [6]:
HSC_ids_target = {int(subdir.name)
                  for subdir in images_dir.glob("target/*")
                  if subdir.name[0] != "."}

HSC_ids_contaminant = {int(subdir.name)
                       for subdir in images_dir.glob("contaminant/*")
                       if subdir.name[0] != "."}

remove_ids = HSC_ids_target & HSC_ids_contaminant
HSC_ids_target -= remove_ids
HSC_ids_contaminant -= remove_ids

HSC_ids = HSC_ids_target | HSC_ids_contaminant

HSC_ids_target      = np.array(list(HSC_ids_target))
HSC_ids_contaminant = np.array(list(HSC_ids_contaminant))
HSC_ids             = np.array(list(HSC_ids))


print(len(HSC_ids_target))
print(len(HSC_ids_contaminant))
print(len(HSC_ids), len(HSC_ids_target) + len(HSC_ids_contaminant))


HSC_ids = np.array(sorted(HSC_ids)) # enforce a deterministic ordering


6278
18999
25277 25277

In [7]:
HSC_id = HSC_ids[1] # for when I need a single sample galaxy

In [8]:
bands = ["g", "r", "i", "z", "y"]

1) Read in fits image


In [9]:
get_image = lambda HSC_id, band: preprocessing.get_image(HSC_id, band, images_dir=images_dir/"*"/str(HSC_id))

In [10]:
image, flux_mag_0 = get_image(HSC_id, "g")
print("image size: {} x {}".format(*image.shape))
image


image size: 239 x 239
Out[10]:
array([[-0.02194131,  0.00391446,  0.01770653, ..., -0.01461175,
        -0.01249455,  0.01746454],
       [-0.01601904, -0.02720453, -0.06848946, ...,  0.04288237,
         0.00534859,  0.02132502],
       [-0.01800561, -0.05129848, -0.06990851, ...,  0.03650333,
         0.01508071, -0.03739795],
       ...,
       [ 0.02909144,  0.03687791,  0.04685813, ...,  0.03878131,
        -0.04980357, -0.0178718 ],
       [ 0.01575067, -0.01513606,  0.02736958, ...,  0.01000878,
         0.00148029,  0.01590817],
       [ 0.02265622,  0.04364862, -0.00685509, ..., -0.01821685,
         0.01330067,  0.01832419]], dtype=float32)

In [11]:
preprocessing.image_plotter(image)



In [12]:
preprocessing.image_plotter(np.log(image))


/Users/egentry/anaconda3/envs/tf36/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log
  """Entry point for launching an IPython kernel.

2) Apply stretch

We're using (negative) asinh magnitudes, as implemented by the HSC collaboration.

To read more about the asinh magnitude system, see Lupton, Gunn & Szalay (1999), as used for SDSS. (It's given explicitly in the SDSS Algorithms documentation as well as this overview page.)

The source of our code is the HSC color image creator.

And for reference, a common form of this stretch is: $$ \mathrm{mag}_\mathrm{asinh} = - \left(\frac{2.5}{\ln(10)}\right) \left(\mathrm{asinh}\left(\frac{f/f_0}{2b}\right) + \ln(b) \right)$$ for dimensionless softening parameter $b$ and reference flux $f_0$.
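
As a quick sanity check of the formula above, here is a minimal NumPy sketch of the textbook asinh ("Luptitude") magnitude, separate from the HSC-specific scaling used below; the value of $b$ is purely illustrative:

In [ ]:
import numpy as np

def asinh_mag(f_over_f0, b=1e-10):
    """Textbook asinh magnitude for a flux ratio f/f0 and softening parameter b (illustrative)."""
    return -(2.5 / np.log(10)) * (np.arcsinh(f_over_f0 / (2 * b)) + np.log(b))

# well above the softening scale this matches the usual Pogson magnitude:
print(asinh_mag(1e-6), -2.5 * np.log10(1e-6))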


In [13]:
def scale(x, fluxMag0):
    ### adapted from https://hsc-gitlab.mtk.nao.ac.jp/snippets/23
    mag0 = 19
    scale = 10 ** (0.4 * mag0) / fluxMag0
    x *= scale

    u_min = -0.05
    u_max = 2. / 3.
    u_a = np.exp(10.)

    x = np.arcsinh(u_a*x) / np.arcsinh(u_a)
    x = (x - u_min) / (u_max - u_min)

    return x

In [14]:
image_scaled = preprocessing.scale(image, flux_mag_0)

In [15]:
preprocessing.image_plotter(image_scaled)



In [16]:
sns.distplot(image_scaled.flatten())
plt.title("Distribution of Transformed Intensities")


Out[16]:
Text(0.5, 1.0, 'Distribution of Transformed Intensities')

In [17]:
for band in bands:
    image, flux_mag_0 = get_image(HSC_id, band)
    image_scaled = preprocessing.scale(image, flux_mag_0)

    plt.figure()
    preprocessing.image_plotter(image_scaled)
    plt.title("{} band".format(band))
    plt.colorbar()


3) Crop Image

Am I properly handling odd numbers?
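
As a quick check on that question, here is a minimal sketch of a center crop; `center_crop` is a hypothetical stand-in, not necessarily how preprocessing.get_cutout is implemented:

In [ ]:
import numpy as np

def center_crop(image, size):
    """Take a size x size window about the image center (integer division picks the offset)."""
    h, w = image.shape[-2:]
    top = (h - size) // 2
    left = (w - size) // 2
    return image[..., top:top + size, left:left + size]

for n in (239, 240):  # odd and even input sizes
    print(n, center_crop(np.zeros((n, n)), 75).shape)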


In [18]:
pre_transformed_image_size  = 150
post_transformed_image_size = 75

In [19]:
cutout = preprocessing.get_cutout(image_scaled, post_transformed_image_size)
cutout.shape


Out[19]:
(75, 75)

In [20]:
preprocessing.image_plotter(cutout)



In [21]:
for band in bands:
    image, flux_mag_0 = get_image(HSC_id, band)
    image_scaled = preprocessing.scale(image, flux_mag_0)
    cutout = preprocessing.get_cutout(image_scaled, post_transformed_image_size)

    plt.figure()
    preprocessing.image_plotter(cutout)
    plt.title("{} band".format(band))
    plt.colorbar()


4) Combine filters into cube


In [22]:
images = [None]*len(bands)
flux_mag_0s = [None]*len(bands)
cutouts = [None]*len(bands)
for i, band in enumerate(bands):
    images[i], flux_mag_0s[i] = get_image(HSC_id, band)
    
    cutouts[i] = preprocessing.get_cutout(
        preprocessing.scale(images[i], flux_mag_0s[i]), 
        post_transformed_image_size
    )

In [23]:
cutout_cube = np.array(cutouts)
cutout_cube.shape


Out[23]:
(5, 75, 75)

In [24]:
# must transform into [0,1] for plt.imshow
# the HSC standard tool accomplishes this by clipping instead.
plt.imshow(preprocessing.transform_0_1(cutout_cube[(4,2,0),:,:].transpose(1,2,0)) )


Out[24]:
<matplotlib.image.AxesImage at 0x12c2afe80>
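
For reference, a minimal sketch contrasting the two approaches mentioned in the comment above (assuming preprocessing.transform_0_1 is a min-max rescale; the clipping version mimics what the HSC tool does):

In [ ]:
import numpy as np

def to_0_1_minmax(x):
    # rescale into [0, 1] using the array's own min and max
    return (x - x.min()) / (x.max() - x.min())

def to_0_1_clip(x):
    # simply clip anything outside [0, 1], as the HSC color-image tool does
    return np.clip(x, 0.0, 1.0)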

In [25]:
for i, band in enumerate(bands):
    sns.distplot(cutout_cube[:,:,:].transpose(1,2,0)[:,:,i].flatten(), label=band)
    plt.legend(loc="best")


5) Load Training Set Labels


In [26]:
df = pd.DataFrame(data={
    "HSC_id" :HSC_ids,
    "target" : [HSC_id in HSC_ids_target for HSC_id in HSC_ids]
})

df = df.set_index("HSC_id")

df.head()


Out[26]:
target
HSC_id
43158176442354210 False
43158176442354294 False
43158176442354356 False
43158176442354418 True
43158176442354561 False

In [27]:
df.mean()


Out[27]:
target    0.248368
dtype: float64

In [28]:
def load_image_mappable(HSC_id):
    images      = [None]*len(bands)
    flux_mag_0s = [None]*len(bands)
    cutouts     = [None]*len(bands)
    for j, band in enumerate(bands):
        images[j], flux_mag_0s[j] = get_image(HSC_id, band)
        cutouts[j] = preprocessing.get_cutout(
            preprocessing.scale(images[j], flux_mag_0s[j]),
            pre_transformed_image_size)
    cutout_cube = np.array(cutouts)
    return cutout_cube

In [29]:
X = np.empty((len(HSC_ids[:400]), 5, 
              pre_transformed_image_size, pre_transformed_image_size))

In [30]:
X.shape


Out[30]:
(400, 5, 150, 150)

In [31]:
for i in range(X.shape[0]):
    HSC_id = HSC_ids[i]
    X[i] = load_image_mappable(HSC_id)
    
    if i%(X.shape[0]//20)==0:
        print("loading # {} / {}".format(i, X.shape[0]), end="\r", flush=True)


loading # 380 / 400

In [32]:
X_full = X
X_small = X[:,(4,2,0),:,:] # drop down to 3 bands (y, i, g)

In [33]:
X.shape


Out[33]:
(400, 5, 150, 150)

In [34]:
Y = df.loc[HSC_ids].target.values
Y


Out[34]:
array([False, False, False, ..., False, False, False])

In [35]:
Y.mean()


Out[35]:
0.24836808165525973

load photometric features


In [42]:
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default) / "COSMOS_reference.sqlite"
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

HSC_filename = pathlib.Path(dwarfz.data_dir_default) / "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)

matches_filename = pathlib.Path(dwarfz.data_dir_default) / "matches.sqlite3"
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

combined = matches_df[matches_df.match].copy()
combined["ra"]       = COSMOS.df.loc[combined.index].ra
combined["dec"]      = COSMOS.df.loc[combined.index].dec
combined["photo_z"]  = COSMOS.df.loc[combined.index].photo_z
combined["log_mass"] = COSMOS.df.loc[combined.index].mass_med

photometry_cols = [
    "gcmodel_flux","gcmodel_flux_err","gcmodel_flux_flags", "gcmodel_mag",
    "rcmodel_flux","rcmodel_flux_err","rcmodel_flux_flags", "rcmodel_mag",
    "icmodel_flux","icmodel_flux_err","icmodel_flux_flags", "icmodel_mag",
    "zcmodel_flux","zcmodel_flux_err","zcmodel_flux_flags", "zcmodel_mag",
    "ycmodel_flux","ycmodel_flux_err","ycmodel_flux_flags", "ycmodel_mag",
]

for col in photometry_cols:
    combined[col] = HSC.df.loc[combined.catalog_2_ids][col].values
    
    
combined["g_minus_r"] = combined.gcmodel_mag - combined.rcmodel_mag
combined["r_minus_i"] = combined.rcmodel_mag - combined.icmodel_mag
combined["i_minus_z"] = combined.icmodel_mag - combined.zcmodel_mag
combined["z_minus_y"] = combined.zcmodel_mag - combined.ycmodel_mag

mask =    np.isfinite(combined["g_minus_r"]) & np.isfinite(combined["r_minus_i"]) \
        & np.isfinite(combined["i_minus_z"]) & np.isfinite(combined["z_minus_y"]) \
        & np.isfinite(combined["icmodel_mag"]) \
        & (~combined.gcmodel_flux_flags) & (~combined.rcmodel_flux_flags) \
        & (~combined.icmodel_flux_flags) & (~combined.zcmodel_flux_flags) \
        & (~combined.ycmodel_flux_flags)

combined = combined[mask]

df_frankenz = pd.read_sql_table("photo_z",
                                "sqlite:///{}".format(
                                    pathlib.Path(dwarfz.data_dir_default)
                                                 / "HSC_matched_to_FRANKENZ.sqlite"),
                                index_col="object_id")

df_frankenz.head()

combined = combined.join(df_frankenz[["photoz_best", "photoz_risk_best"]],
                         on="catalog_2_ids")

photometry_features = combined.loc[:,["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
                                      "icmodel_mag",
                                      "photoz_best",
                                      "photoz_risk_best" # The risk of photoz_best being outside of the range z_true +- 0.15(1+z_true). It ranges from 0 (safe) to 1(risky)
                                     ]]

photometry_features.head()


Out[42]:
g_minus_r r_minus_i i_minus_z z_minus_y icmodel_mag photoz_best photoz_risk_best
catalog_1_ids
219656 0.904190 0.614107 -0.192213 -1.643913 25.473045 1.66 0.456715
219741 0.620031 0.076033 -0.445490 0.817816 25.845751 1.60 0.448231
219743 0.749485 0.457653 0.204971 -0.212778 24.897820 0.69 0.145915
219744 0.548159 -0.039301 0.487778 -1.281529 26.055256 3.37 0.351811
219769 -1.042015 -3.360178 4.475088 0.235411 30.648498 0.91 0.680943

In [43]:
training_set_labels_filename = "../data/galaxy_images_training/2017_09_26-dwarf_galaxy_scores.csv"
HSC_to_COSMOS = pd.read_csv(training_set_labels_filename)
HSC_to_COSMOS = HSC_to_COSMOS.drop_duplicates("HSC_id")
HSC_to_COSMOS = HSC_to_COSMOS.set_index("HSC_id")
HSC_to_COSMOS = HSC_to_COSMOS[["COSMOS_id"]]
HSC_to_COSMOS.head()


Out[43]:
COSMOS_id
HSC_id
43158322471244656 628457
43158605939114836 919771
43159142810013665 444239
43158734788125011 569427
43158863637144621 369948

In [44]:
photometry_features_aligned = photometry_features.loc[HSC_to_COSMOS.loc[HSC_ids].values.flatten()]
photometry_features_aligned.head()


Out[44]:
g_minus_r r_minus_i i_minus_z z_minus_y icmodel_mag photoz_best photoz_risk_best
catalog_1_ids
387020 0.241915 0.287563 0.580751 -0.461952 25.067200 0.75 0.189435
389379 0.815916 0.300808 0.021614 -0.038185 23.948193 0.36 0.158907
390478 1.774567 1.506727 0.083956 -0.385647 24.193956 0.85 0.183067
391721 0.493143 0.108091 0.332539 0.058878 25.082439 2.61 0.458986
394884 1.182629 0.388001 0.275812 -0.031630 23.426733 0.50 0.168350

Geometry!


In [45]:
X[0].shape


Out[45]:
(5, 150, 150)

In [46]:
h = pre_transformed_image_size
w = pre_transformed_image_size
transform_matrix = geometry.create_random_transform_matrix(h, w,
                                                  include_rotation=True,
                                                  translation_size = .01,
                                                  verbose=False)

x_tmp = X[0][(4,2,0),:,:]

result = geometry.apply_transform_new(x_tmp, transform_matrix, 
                            channel_axis=0, fill_mode="constant", cval=np.max(x_tmp))

result = preprocessing.get_cutout(result, post_transformed_image_size)
plt.imshow(preprocessing.transform_0_1(result.transpose(1,2,0)))


Out[46]:
<matplotlib.image.AxesImage at 0x1199684a8>

In [47]:
import ipywidgets
ipywidgets.interact(preprocessing.transform_plotter,
                    X = ipywidgets.fixed(X_small),
                    rotation_degrees = ipywidgets.IntSlider(min=0, max=360, step=15, value=45),
                    dx_after = ipywidgets.IntSlider(min=-15, max=15),
                    dy_after = ipywidgets.IntSlider(min=-15, max=15),
                    color = ipywidgets.fixed(True),
                    shear_degrees = ipywidgets.IntSlider(min=0, max=90, step=5, value=0),
                    zoom_x = ipywidgets.FloatSlider(min=.5, max=2, value=1),
                    crop = ipywidgets.Checkbox(value=True)
                    )


Out[47]:
<function preprocessing.transform_plotter(X, reflect_x=False, rotation_degrees=45, dx_after=0, dy_after=0, shear_degrees=0, zoom_x=1, crop=False, color=True)>

5b) Split training and testing set


In [48]:
np.random.seed(1)

randomized_indices = np.arange(Y.shape[0])
np.random.shuffle(randomized_indices)

testing_fraction = 0.2
testing_set_indices = randomized_indices[:int(testing_fraction*Y.shape[0])]
training_set_indices = np.array(list(set([*randomized_indices]) - set([*testing_set_indices])))

In [49]:
print(testing_set_indices.size, Y[testing_set_indices].mean())


5055 0.247675568743818

In [50]:
print(training_set_indices.size, Y[training_set_indices].mean())


20222 0.24854119276036

In [51]:
p = Y[training_set_indices].mean()
prior_loss = sklearn.metrics.log_loss(Y[testing_set_indices], 
                                      [p]*testing_set_indices.size)
prior_loss


Out[51]:
0.5597690655312428

6b) Adapt NumpyArrayIterator

The original only allowed 1-, 3-, or 4-channel images; I have 5-channel images.

I also want to change the way the augmentation is performed.
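
For context, a minimal sketch of the kind of change involved; the exact Keras internals vary by version, and this only illustrates relaxing the 1/3/4-channel restriction rather than reproducing the contents of data_generator_directory:

In [ ]:
import numpy as np

def check_channels(x, channels_axis=1, allowed=(1, 3, 4, 5)):
    """Hypothetical version of the stock channel check, with 5-band cubes allowed."""
    n_channels = x.shape[channels_axis]
    if n_channels not in allowed:
        raise ValueError("unsupported number of channels: {}".format(n_channels))
    return x

check_channels(np.zeros((32, 5, 75, 75))).shape  # a 5-band, channels-first batch now passes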


In [52]:
from data_generator_directory import DirectoryIterator

6c) Adapt ImageDataGenerator

The original only allowed 1-, 3-, or 4-channel images; I have 5-channel images. I'm also adjusting the way the affine transformations work for the data augmentation.
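
As a reminder of the underlying math, here is a minimal sketch of one such affine transform; geometry.create_random_transform_matrix presumably composes matrices like this, but the sketch is illustrative, not its actual implementation:

In [ ]:
import numpy as np

def rotation_about_center(theta_degrees, h, w):
    """Rotate about the image center: translate to origin, rotate, translate back (3x3 homogeneous matrices)."""
    theta = np.deg2rad(theta_degrees)
    cy, cx = (h - 1) / 2.0, (w - 1) / 2.0
    to_origin   = np.array([[1, 0, -cy], [0, 1, -cx], [0, 0, 1]])
    rotate      = np.array([[np.cos(theta), -np.sin(theta), 0],
                            [np.sin(theta),  np.cos(theta), 0],
                            [0, 0, 1]])
    from_origin = np.array([[1, 0, cy], [0, 1, cx], [0, 0, 1]])
    return from_origin @ rotate @ to_origin

rotation_about_center(45, 150, 150).round(3)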


In [53]:
from data_generator_directory import ImageDataGeneratorLoadable

6d) Create Data Generator


In [54]:
print('Using real-time data augmentation.')

h_before, w_before = X[0,0].shape
print("image shape before: ({},{})".format(h_before, w_before))

h_after = post_transformed_image_size
w_after = post_transformed_image_size
print("image shape after:  ({},{})".format(h_after, w_after))

# get a closure that binds the image size to get_cutout
postprocessing_function = lambda image: preprocessing.get_cutout(image, post_transformed_image_size)

# this will do preprocessing and realtime data augmentation
datagen = ImageDataGeneratorLoadable(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=True, # randomly apply a reflection (in x)
    with_reflection_y=True, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
    width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
    height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
    postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(post_transformed_image_size,post_transformed_image_size)
)   

# this will do preprocessing and realtime data augmentation
from data_generator import ImageDataGenerator
datagen_features = ImageDataGenerator(
    featurewise_center=False,  # set input mean to 0 over the dataset
    samplewise_center=False,  # set each sample mean to 0
    featurewise_std_normalization=False,  # divide inputs by std of the dataset
    samplewise_std_normalization=False,  # divide each input by its std
    zca_whitening=False,  # apply ZCA whitening
    with_reflection_x=False, # randomly apply a reflection (in x)
    with_reflection_y=False, # randomly apply a reflection (in y)
    with_rotation=False, # randomly apply a rotation
#     width_shift_range=0.002,  # randomly shift images horizontally (fraction of total width)
#     height_shift_range=0.002,  # randomly shift images vertically (fraction of total height)
#     postprocessing_function=postprocessing_function, # get a cutout of the processed image
    output_image_shape=(1,1),
)


Using real-time data augmentation.
image shape before: (150,150)
image shape after:  (75,75)

In [55]:
# datagen.fit()

7) Set up keras model


In [56]:
with_one_by_one = False
trainable = False

input_shape = cutout_cube.shape

nb_dense = 64

In [57]:
from keras.applications import inception_v3, inception_resnet_v2, vgg19

In [79]:
feature_extractor = Sequential()
extractor_input_shape = tuple([3, *input_shape[1:]])
extractor_layers = vgg19.VGG19(include_top=False,
                                input_shape=extractor_input_shape
                               )
for layer in extractor_layers.layers:
    layer.trainable = False
feature_extractor.add(extractor_layers)

feature_extractor.compile(optimizer= Adam(lr=1),
               loss="mean_squared_error")

In [59]:
reextract_features = False
features_filename = "vgg_features.h5"

if reextract_features or (not pathlib.Path(features_filename).is_file()):
    
    features = np.zeros((Y.size, 512, 2, 2))
    for i, HSC_id in enumerate(HSC_ids):
        x_tmp = np.array([datagen.standardize(load_image_mappable(HSC_id))])
        features[i] = feature_extractor.predict(x_tmp[:,(4,2,0),:,:])

        if i%(features.shape[0]//20)==0:
            print("loading # {} / {}".format(i, features.shape[0]), end="\r", flush=True)
    df_vgg = pd.DataFrame(data={
        "HSC_id" : HSC_ids,
        "vgg_features" : [feature_row for feature_row in features]
    })
#     df_vgg.to_hdf(features_filename, "data")
else:
    df_vgg = pd.read_hdf(features_filename)
    features = np.array([row for row in df_vgg.vgg_features.values])    
features.shape


Out[59]:
(25277, 512, 2, 2)

In [60]:
learning_rate = 0.00025
decay = 1e-5
momentum = 0.9

# sgd = SGD(lr=learning_rate, decay=decay, momentum=momentum, nesterov=True)

adam = Adam(lr=learning_rate)

In [137]:
!head training_transfer.log.old












In [134]:
logger_filename = "training_transfer"
if trainable:
    logger_filename += "-trainable" 
if with_one_by_one:
    logger_filename += "-1x1"
logger_filename += ".log"


# model.compile(loss='binary_crossentropy', 
# #               optimizer=sgd, 
#               optimizer=adam,
# #               metrics=["accuracy"]
#              )

# can only manually set weights _after_ compiling
# one_by_one_weights = np.zeros((1,1,5,3))
# for i in range(3):
#     one_by_one_weights[0,0,4-2*i,i] = 1 # chooses y i g with correct color order
# one_by_one.set_weights([one_by_one_weights,
#                         np.zeros(3)])

print("using logger_filename: ", logger_filename)

# if os.path.exists(logger_filename):
#     logger_filename_tmp = logger_filename + ".old"
#     os.rename(logger_filename, logger_filename_tmp)


using logger_filename:  training_transfer.with_photometry.log

In [109]:
features_combined.shape


Out[109]:
(25277, 2055)

In [123]:
model = Sequential()
model.add(Flatten(input_shape=(np.prod(features.shape[1:]) + photometry_features.shape[1], 1, 1)))
model.add(Dense(2*nb_dense, activation="relu"))
model.add(Dense(nb_dense, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer= Adam(lr=learning_rate),
               loss="binary_crossentropy")

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
flatten_2 (Flatten)          (None, 2055)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               263168    
_________________________________________________________________
dense_5 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
=================================================================
Total params: 271,489
Trainable params: 271,489
Non-trainable params: 0
_________________________________________________________________

In [124]:
earlystopping = EarlyStopping(monitor='loss',
                              patience=35,
                              verbose=1,
                              mode='auto' )

In [125]:
csv_logger = keras.callbacks.CSVLogger(logger_filename,
                                       append=True)

8) Run basic keras model


In [126]:
goal_batch_size = 128
steps_per_epoch = max(2, training_set_indices.size//goal_batch_size)
batch_size = training_set_indices.size//steps_per_epoch
print("steps_per_epoch: ", steps_per_epoch)
print("batch_size: ", batch_size)
epoch_to_save_state = 50
epochs = 100
verbose = 1


steps_per_epoch:  157
batch_size:  128

In [127]:
prior_loss


Out[127]:
0.5597690655312428

In [128]:
features_combined = np.hstack([features.reshape(-1, 512*2*2), photometry_features_aligned])
features_combined = features_combined.reshape(*features_combined.shape, 1, 1)
print(features_combined.shape)


(25277, 2055, 1, 1)

In [174]:
epochs=200


args = (datagen_features.flow(features_combined[training_set_indices], 
                              Y[training_set_indices],
                              batch_size=batch_size,
                             ),
       )

kwargs = dict(steps_per_epoch=steps_per_epoch,
              validation_data=(features_combined[testing_set_indices], 
                               Y[testing_set_indices]),
              verbose=verbose,
              callbacks=[csv_logger],
             )

for epoch_i in range(epochs):
    history = model.fit_generator(*args,
                                  initial_epoch=epoch_i,
                                  epochs=epoch_i+1,
                                  **kwargs,
                                  )

    model.save_weights("weights/cached_weights.extracted_features.with_photometry.{:>04d}.h5".format(epoch_i))


Epoch 1/1
  2/157 [..............................] - ETA: 20:53 - loss: 0.4380
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-174-ec80bc48bdc9> in <module>
     19                                   initial_epoch=epoch_i,
     20                                   epochs=epoch_i+1,
---> 21                                   **kwargs,
     22                                   )
     23 

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name + '` call to the ' +
     90                               'Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1416             use_multiprocessing=use_multiprocessing,
   1417             shuffle=shuffle,
-> 1418             initial_epoch=initial_epoch)
   1419 
   1420     @interfaces.legacy_generator_methods_support

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    179             batch_index = 0
    180             while steps_done < steps_per_epoch:
--> 181                 generator_output = next(output_generator)
    182 
    183                 if not hasattr(generator_output, '__len__'):

~/anaconda3/envs/tf36/lib/python3.6/site-packages/keras/utils/data_utils.py in get(self)
    593         try:
    594             while self.is_running():
--> 595                 inputs = self.queue.get(block=True).get()
    596                 self.queue.task_done()
    597                 if inputs is not None:

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    662 
    663     def get(self, timeout=None):
--> 664         self.wait(timeout)
    665         if not self.ready():
    666             raise TimeoutError

~/anaconda3/envs/tf36/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    659 
    660     def wait(self, timeout=None):
--> 661         self._event.wait(timeout)
    662 
    663     def get(self, timeout=None):

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

~/anaconda3/envs/tf36/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [62]:
# logged_history = pd.read_csv(logger_filename)
logged_history = pd.read_csv("training_transfer.log")

logged_history


Out[62]:
epoch loss val_loss
0 0 0.561169 0.559210
1 1 0.556649 0.557379
2 2 0.553965 0.550001
3 3 0.551831 0.546692
4 4 0.548141 0.552239
5 5 0.546731 0.542289
6 6 0.545093 0.540325
7 7 0.545430 0.539295
8 8 0.542962 0.542445
9 9 0.541900 0.549355
10 10 0.542598 0.539548
11 11 0.540675 0.539088
12 12 0.539242 0.537772
13 13 0.539168 0.538849
14 14 0.537349 0.540864
15 15 0.536133 0.540466
16 16 0.537476 0.534967
17 17 0.534984 0.541928
18 18 0.536063 0.540574
19 19 0.535426 0.534597
20 20 0.535670 0.534120
21 21 0.534769 0.535426
22 22 0.533496 0.538302
23 23 0.533971 0.536321
24 24 0.533663 0.534151
25 25 0.532628 0.535383
26 26 0.531716 0.531847
27 27 0.531484 0.536293
28 28 0.530476 0.531211
29 29 0.530417 0.538820
... ... ... ...
370 370 0.402363 0.721434
371 371 0.405923 0.710017
372 372 0.402852 0.731157
373 373 0.405399 0.705745
374 374 0.405876 0.728483
375 375 0.402458 0.704780
376 376 0.401710 0.772596
377 377 0.404992 0.728114
378 378 0.402443 0.763202
379 379 0.403148 0.720218
380 380 0.401499 0.758650
381 381 0.401686 0.735867
382 382 0.403551 0.742887
383 383 0.398348 0.735743
384 384 0.398581 0.745191
385 385 0.399005 0.736384
386 386 0.401024 0.768526
387 387 0.396491 0.723762
388 388 0.397051 0.792783
389 389 0.399833 0.738814
390 390 0.402004 0.730398
391 391 0.396699 0.728464
392 392 0.397151 0.775350
393 393 0.399727 0.761854
394 394 0.397626 0.749709
395 395 0.397351 0.726964
396 396 0.395427 0.749004
397 397 0.394903 0.734681
398 398 0.399013 0.759718
399 399 0.396927 0.771454

400 rows × 3 columns


In [63]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

        
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(logged_history["val_loss"], label="Validation")
    plt.plot(logged_history["loss"], label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)

    plt.tight_layout()
    plot_filename = pathlib.Path("plots_for_thesis") / "learning_curve-with_photometry"
    plt.savefig(plot_filename.with_suffix(".png"))
    plt.savefig(plot_filename.with_suffix(".pdf"))



In [64]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):

    simple_conv = lambda x: np.convolve(x, np.ones(5)/5, mode="valid")
    
    plt.axhline(prior_loss, label="Prior", 
                linestyle="dashed", color="black")
    
    plt.plot(simple_conv(logged_history["val_loss"]),
             label="Validation")
    plt.plot(simple_conv(logged_history["loss"]),
             label="Training")
    
    plt.xlabel("Epoch")
    plt.ylabel("Loss\n(avg. binary cross-entropy)")
    
    plt.legend()
    
#     plt.ylim(bottom=.56, top=.62)


9) Look at validation results


In [165]:
color_CNN = "b"

linestyle_CNN = "solid"

linewidth=3

In [150]:
overwrite = True
use_cached_if_exists = False
class_probs_filename = "class_probs.with_photometry.csv"

if use_cached_if_exists and pathlib.Path(class_probs_filename).is_file():
    df_class_probs = pd.read_csv(class_probs_filename)
else:
    X_transformed = np.array([datagen.standardize(X_img)
                              for X_img in X])
    class_probs = model3.predict_proba(features_combined).flatten()
    
    df_class_probs = pd.DataFrame({
        "HSC_id": HSC_ids,
        "CNN_prob": class_probs,
        "testing": [HSC_id in HSC_ids[testing_set_indices] for HSC_id in HSC_ids],
        "target": Y,
    })

    if overwrite or (not pathlib.Path(class_probs_filename).is_file()):
        df_class_probs.to_csv(class_probs_filename, index=False)


df_testing = df_class_probs[df_class_probs.testing]

df_class_probs.head()

In [173]:
print(df_class_probs.target.mean())
print(df_class_probs[~df_class_probs.testing].target.mean())

print(df_testing.CNN_prob.mean())


0.24836808165525973
0.24854119276036
0.1659968

In [158]:
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    sns.distplot(df_testing[df_testing.target].CNN_prob, color="g", label="targets")
    sns.distplot(df_testing[~df_testing.target].CNN_prob, color="b", label="contaminants")

    plt.xlabel("p(target | image)")
    plt.ylabel("density (galaxies)")

    plt.xlim(0, .7)
    plt.axvline(df_testing.target.mean(), linestyle="dashed", color="black", label="prior\n(from training set)")
    plt.axvline(.5, linestyle="dotted", color="black", label="50/50")

    plt.legend(
        loc="upper left",
        bbox_to_anchor=(1, 1),
    )



In [166]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score

with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    fpr, tpr, _ = metrics.roc_curve(df_testing.target, df_testing.CNN_prob)
    roc_auc = roc_auc_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(fpr, tpr, label="CNN (AUC = {:.2})".format(roc_auc), 
             color=color_CNN, linewidth=linewidth,
            )
    plt.plot([0,1], [0,1], linestyle="dotted", color="black", label="Random guessing",
             linewidth=linewidth,
            )

    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

    plt.legend(loc="best")
    
    plt.tight_layout()
    plot_filename = "plots_for_thesis/ROC-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")



In [170]:
from sklearn.metrics import average_precision_score
with mpl.rc_context(rc={"figure.figsize": (10,6)}):
    precision, recall, _ = metrics.precision_recall_curve(df_testing.target, df_testing.CNN_prob)
    pr_auc = average_precision_score(df_testing.target, df_testing.CNN_prob)

    plt.plot(recall, precision, label="AUC = {:.3}".format(pr_auc), color=color_CNN, linewidth=linewidth)
    
    plt.plot([0,1], [Y[testing_set_indices].mean()]*2, linestyle="dotted", color="black", 
             label="Random guessing",
             linewidth=linewidth,
            )


    plt.xlim(0,1)
    plt.ylim(0,1)

    plt.xlabel("Completeness")
    plt.ylabel("Purity ")

    plt.legend(loc="best")
    
    plt.tight_layout()
    plot_filename = "plots_for_thesis/purity_completeness-CNN"
    plt.savefig(plot_filename + ".pdf")
    plt.savefig(plot_filename + ".png")




Apply a threshold

For now, just threshold at the prior class probability (estimated from the training set).

Note that under a symmetric loss function this isn't as good as simply thresholding at 0.5, which is the Bayes-optimal cutoff in that case.
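
For intuition, a minimal sketch comparing the two cutoffs on the held-out set under a symmetric (0/1) loss; this is only an illustration, not part of the analysis pipeline:

In [ ]:
y_true = df_testing.target.values
p_hat  = df_testing.CNN_prob.values

for cutoff in (Y[training_set_indices].mean(), 0.5):
    accuracy = ((p_hat > cutoff) == y_true).mean()
    print("cutoff = {:.3f}  ->  accuracy = {:.3f}".format(cutoff, accuracy))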


In [ ]:
predicted_classes = class_probs[testing_set_indices] > (Y[training_set_indices].mean())
predicted_classes.mean()

In [ ]:
confusion_matrix = metrics.confusion_matrix(Y[testing_set_indices], predicted_classes)
confusion_matrix

In [ ]:
print("number of dwarfs (true)     : ", Y[testing_set_indices].sum())
print("number of dwarfs (predicted): ", predicted_classes.sum())

In [ ]:
import itertools
# adapted from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

confusion_matrix_normalized = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]

plt.imshow(confusion_matrix_normalized, interpolation='nearest',cmap="gray_r")
# plt.title(title)
plt.colorbar()
tick_marks = np.arange(2)
classes = [False, True]
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

fmt = 'd'
thresh = 1 / 2.
for i, j in itertools.product(range(confusion_matrix.shape[0]), range(confusion_matrix.shape[1])):
    plt.text(j, i, format(confusion_matrix[i, j], fmt),
             fontdict={"size":20},
             horizontalalignment="center",
             color="white" if confusion_matrix_normalized[i, j] > thresh else "black")

plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.tight_layout()

In [ ]:
print("  i - Y_true[i] - Y_pred[i] -  error?")
print("-------------------------------------")
for i in range(predicted_classes.size):
    print("{:>3} -    {:>1d}      -    {:d}      -   {:>2d}".format(i, 
                                                    Y[testing_set_indices][i], 
                                                    predicted_classes[i],
                                                    (Y[testing_set_indices][i] != predicted_classes[i]), 
                                                   ))

Analyze Errors


In [ ]:
HSC_ids

In [ ]:
df.loc[HSC_ids[testing_set_indices]].head()

In [ ]:
COSMOS_filename = os.path.join(dwarfz.data_dir_default, 
                               "COSMOS_reference.sqlite")
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)

In [ ]:
HSC_filename = os.path.join(dwarfz.data_dir_default, 
                            "HSC_COSMOS_median_forced.sqlite3")
HSC = dwarfz.datasets.HSC(HSC_filename)

In [ ]:
matches_filename = os.path.join(dwarfz.data_dir_default, 
                                "matches.sqlite3")
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)

In [ ]:
combined = matches_df[matches_df.match].copy()
combined["ra"]       = COSMOS.df.loc[combined.index].ra
combined["dec"]      = COSMOS.df.loc[combined.index].dec
combined["photo_z"]  = COSMOS.df.loc[combined.index].photo_z
combined["log_mass"] = COSMOS.df.loc[combined.index].mass_med
combined["active"]   = COSMOS.df.loc[combined.index].classification

combined = combined.set_index("catalog_2_ids")

combined.head()

In [ ]:
df_features_testing = combined.loc[HSC_ids[testing_set_indices]]

In [ ]:
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]]
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
#            C=class_probs,
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Number of objects",
            )

plt.suptitle("All Objects")

In [ ]:
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]]
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
# mask &= (df_tmp.error == -1)

print(sum(mask))

plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs[testing_set_indices][mask],
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )


plt.suptitle("All Objects")

^^ Huh, that's a pretty uniform-looking spread. It doesn't really seem to be trending in a useful direction (either near the desired boundaries or as you go further away).


In [ ]:
df_tmp = df_features_testing.loc[HSC_ids[testing_set_indices]]
df_tmp["error"] =   np.array(Y[testing_set_indices], dtype=int) \
                  - np.array(predicted_classes, dtype=int)


mask = (df_tmp.photo_z < .5)
mask &= (df_tmp.error == -1)

print(sum(mask))

plt.hexbin(df_tmp.photo_z[mask], 
           df_tmp.log_mass[mask],
           C=class_probs[testing_set_indices][mask],
           gridsize=20,
           cmap="Blues",
           vmin=0,
          )

plt.xlabel("photo z")
plt.ylabel("log M_star")

plt.gca().add_patch(
    patches.Rectangle([0, 8], 
                      .15, 1, 
                      fill=False, 
                      linewidth=3,
                      color="red",
                     ),
)

plt.colorbar(label="Average Predicted\nProb. within bin",
            )

plt.suptitle("False Positives")

In [ ]: