In [1]:
    
import os
# os.environ['KERAS_BACKEND'] = 'tensorflow'
import glob
import h5py
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
from bokeh.io import output_notebook
from dask import delayed
import dask.array as da
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler, ProgressBar, visualize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import standardize
import keras
from keras.models import Model
from keras.layers import (Input, Conv2D, MaxPooling2D, Cropping2D, Flatten,
                          Dense, Dropout)
from keras.layers.advanced_activations import LeakyReLU
from keras.utils import to_categorical
from livelossplot import PlotLossesKeras
import comptools as comp
output_notebook()
%matplotlib inline
    
    
    
    
In [2]:
    
config = 'IC86.2012'
num_groups = 2
comp_list = comp.get_comp_list(num_groups=num_groups)
energybins = comp.get_energybins(config=config)
log_energy_min = energybins.log_energy_min
log_energy_max = energybins.log_energy_max
    
In [3]:
    
df_sim_train, df_sim_test = comp.load_sim(config=config,
                                          log_energy_min=log_energy_min,
                                          log_energy_max=log_energy_max,
                                          test_size=0.5,
                                          verbose=True)
    
    
In [4]:
    
feature_list, feature_labels = comp.get_training_features()
    
In [5]:
    
feature_list += ['lap_x', 'lap_y']
feature_list
    
    Out[5]:
In [6]:
    
scaler = StandardScaler()
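# Fit the scaler on the training set only, then apply the same transformation to the test set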
df_sim_train[feature_list] = scaler.fit_transform(df_sim_train[feature_list])
df_sim_test[feature_list] = scaler.transform(df_sim_test[feature_list])
    
In [6]:
    
# X = df_sim_train[feature_list].values
# y = df_sim_train['comp_target_{}'.format(num_groups)].values
    
In [7]:
    
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=2)
    
In [7]:
    
file_pattern = os.path.join(comp.paths.comp_data_dir,
                            config,
                            'sim',
                            'charge_dist_hdf',
                            '*.hdf')
file_pattern
    
    Out[7]:
In [8]:
    
filenames = glob.glob(file_pattern)
    
In [9]:
    
len(filenames)
    
    Out[9]:
In [10]:
    
def generate_data(filenames, df_sim, features, labels, batchsize=3):
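    """Yield (inputs, targets) batches for Keras fit_generator.

    For each batch of rows in df_sim, the charge-distribution images are
    looked up by event_id in the HDF5 files, normalized to unit total charge,
    and paired with the tabular features and one-hot composition targets.
    """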
    
    batch_id = np.arange(len(df_sim)) // batchsize
    # Loop over the data indefinitely so the generator can keep feeding Keras across epochs
    while True:
        for batch_idx, batch_df in df_sim.groupby(batch_id):
            X_batch = []
            targets_batch = []
            charge_dist_batch = []
            for file_idx, filename in enumerate(filenames):
                with h5py.File(filename, mode='r') as f:
                    array = f['/charge_dist'].value
                    event_id = f['/event_id'].value
                array = np.nan_to_num(array, copy=False)
                # Decode byte strings
                event_id = np.array([i.decode('utf-8') for i in event_id])
                # Boolean mask selecting events that pass cuts
                pass_cuts_mask = np.isin(event_id, batch_df.index)
                if pass_cuts_mask.sum() == 0:
                    continue
                array_pass_cuts = array[pass_cuts_mask]
                event_id_pass_cuts = event_id[pass_cuts_mask]
                num_images = array_pass_cuts.shape[0]
                assert array_pass_cuts.shape[0] == event_id_pass_cuts.shape[0]
                # Normalize each image
                array_pass_cuts = array_pass_cuts / array_pass_cuts.reshape((num_images, -1)).sum(axis=1)[:, None, None]
                X_batch.extend(batch_df.loc[event_id_pass_cuts, features].values)
                targets_batch.extend(to_categorical(labels.loc[event_id_pass_cuts].values, num_classes=num_groups))
                charge_dist_batch.extend(array_pass_cuts)
            charge_dist_batch = np.asarray(charge_dist_batch)
            new_shape = tuple(list(charge_dist_batch.shape) + [1])
            charge_dist_batch = charge_dist_batch.reshape(new_shape)
            inputs = [charge_dist_batch, np.asarray(X_batch)]
            targets_batch = np.array(targets_batch)
            yield (inputs, targets_batch)
    
In [26]:
    
sample_generator = generate_data(filenames,
                                 df_sim_train.iloc[:100],
                                 feature_list,
                                 df_sim_train.iloc[:100]['comp_target_{}'.format(num_groups)],
                                 batchsize=10)
    
In [27]:
    
for inputs, targets in sample_generator:
    print([i.shape for i in inputs], targets.shape)
    break  # generate_data loops forever, so check a single batch and stop
    
    
    
In [77]:
    
# dask_arrays = []
# for fn in filenames:
#     with h5py.File(fn, mode='r') as f:
#         d = f['/charge_dist'].value
# #         d = d / d.max()
#         d = np.nan_to_num(d)
#         d = np.array([i / i.sum() for i in d])
#     x = da.from_array(d, chunks=100)
#     dask_arrays.append(x)
# x = da.concatenate(dask_arrays, axis=0)  # concatenate arrays along first axis
    
In [79]:
    
# with ProgressBar() as _, Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
#     hists = x.compute(num_workers=20)
    
    
Reshape charge distribution array for use in Keras
In [83]:
    
# print(f'current_shape = {hists.shape}')
# new_shape = tuple(list(hists.shape) + [1])
# hists = hists.reshape(new_shape)
# print(f'new_shape = {hists.shape}')
    
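For reference, the reshape above just appends a length-1 channel axis. A minimal equivalent sketch with a dummy array (the 24x24 image size is an assumption matching the model input further down):

# dummy stands in for the hists array
dummy = np.zeros((5, 24, 24))
dummy = dummy[..., np.newaxis]  # same as reshape(list(shape) + [1])
print(dummy.shape)              # (5, 24, 24, 1) -- the trailing channel axis Keras expects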
    
In [84]:
    
# dask_arrays = []
# for fn in filenames:
#     with h5py.File(fn, mode='r') as f:
#         d = f['/event_id'].value
#         # Byte strings
#         d = np.array([i.decode('utf-8') for i in d])
#     x = da.from_array(d, chunks=100)
#     dask_arrays.append(x)
# event_ids = da.concatenate(dask_arrays, axis=0)  # concatenate arrays along first axis
    
In [85]:
    
# with ProgressBar():
#     event_ids = event_ids.compute(num_workers=20)
    
    
In [86]:
    
event_ids
    
    Out[86]:
In [87]:
    
event_ids[0]
    
    Out[87]:
In [88]:
    
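# Boolean masks marking which events (by event_id) belong to the train / test DataFrames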
train_mask = np.isin(event_ids, df_sim_train.index)
test_mask = np.isin(event_ids, df_sim_test.index)
    
In [89]:
    
df_sim_train.index.shape, df_sim_test.index.shape
    
    Out[89]:
In [90]:
    
df_sim_train.index.intersection(df_sim_test.index)
    
    Out[90]:
In [91]:
    
np.sum(train_mask), np.sum(test_mask)
    
    Out[91]:
In [92]:
    
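# Index the histograms by event_id with xarray so they can be aligned with the DataFrames below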
hist_array_train = xr.DataArray(hists[train_mask], coords={'dim_0': event_ids[train_mask]})
hist_array_test = xr.DataArray(hists[test_mask], coords={'dim_0': event_ids[test_mask]})
    
In [93]:
    
hist_array_train
    
    Out[93]:
In [94]:
    
hist_array_train = hist_array_train.loc[df_sim_train.index]
hist_array_test = hist_array_test.loc[df_sim_test.index]
    
In [95]:
    
for idx in range(10):
    h = hist_array_train[idx].values
    h = h.reshape(h.shape[:-1])  # drop the trailing channel axis
    h_norm = h / h.max()
    h_norm[h_norm == 0] = np.nan
#     plt.imshow(h_norm, origin='lower')
    plt.imshow(np.log10(h_norm), origin='lower')
    plt.colorbar()
    plt.xlabel('X [m]')
    plt.ylabel('Y [m]')
    true_comp = df_sim_train.iloc[idx]['MC_comp']
    true_log_energy = df_sim_train.iloc[idx]['MC_log_energy']
    plt.title(true_comp + ' ({:0.2f})'.format(true_log_energy))
    plt.grid()
    plt.show()
    
In [40]:
    
inputs = Input(shape=(24, 24, 1), name='hist_input')
x = Conv2D(12, (9, 9), padding='same')(inputs)
x = LeakyReLU(alpha=0.3)(x)
x = Conv2D(12, (9, 9), padding='same')(x)
x = LeakyReLU(alpha=0.3)(x)
x = MaxPooling2D(pool_size=(6, 6))(x)
x = Dropout(0.2)(x)
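# NB: the next Conv2D is applied to `inputs` directly, so the first
# conv/pool block above is not actually connected to the model output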
x = Conv2D(24, (9, 9), padding='same')(inputs)
x = LeakyReLU(alpha=0.3)(x)
x = Conv2D(24, (9, 9), padding='same')(x)
x = LeakyReLU(alpha=0.3)(x)
x = MaxPooling2D(pool_size=(6, 6))(x)
x = Dropout(0.2)(x)
# x = Conv2D(64, (3, 3), padding='same')(inputs)
# x = LeakyReLU(alpha=0.3)(x)
# x = Conv2D(64, (3, 3), padding='same')(x)
# x = LeakyReLU(alpha=0.3)(x)
# x = MaxPooling2D(pool_size=(2, 2))(x)
# x = Dropout(0.2)(x)
# x = Conv2D(128, (3, 3), padding='same')(inputs)
# x = LeakyReLU(alpha=0.3)(x)
# x = Conv2D(128, (3, 3), padding='same')(x)
# x = LeakyReLU(alpha=0.3)(x)
# x = MaxPooling2D(pool_size=(2, 2))(x)
# x = Dropout(0.2)(x)
flatten_output = Flatten()(x)
auxiliary_input = Input(shape=(len(feature_list),), name='aux_input')
x = keras.layers.concatenate([flatten_output, auxiliary_input])
x = Dense(256, activation='relu')(x)
x = Dropout(0.4)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.4)(x)
predictions = Dense(num_groups, activation='softmax')(x)
# This creates a model with two inputs: the charge-distribution image
# and the auxiliary features
model = Model(inputs=[inputs, auxiliary_input], outputs=predictions)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
    
In [41]:
    
model.summary()
    
    
In [42]:
    
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))
    
    Out[42]:
In [43]:
    
# X_stand = standardize(X)
    
In [44]:
    
np.random.seed(2)
# n_samples = -1
batch_size = 64
train_sample = df_sim_train.sample(n=8000, random_state=2)
train_generator = generate_data(filenames,
                                train_sample,
                                feature_list,
                                train_sample['comp_target_{}'.format(num_groups)],
                                batchsize=batch_size)
test_sample = df_sim_test.sample(n=1000, random_state=2)
test_generator = generate_data(filenames,
                               test_sample,
                               feature_list,
                               test_sample['comp_target_{}'.format(num_groups)],
                               batchsize=batch_size)
steps_per_epoch = int(train_sample.shape[0] / batch_size)
validation_steps = int(test_sample.shape[0] / batch_size)
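# Note: with a plain Python generator, use_multiprocessing=True can duplicate
# batches across workers; a keras.utils.Sequence would avoid this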
hist = model.fit_generator(train_generator,
                           validation_data=test_generator,
                           epochs=20,
                           steps_per_epoch=steps_per_epoch,
                           validation_steps=validation_steps,
                           use_multiprocessing=True,
                           workers=10,
                           callbacks=[PlotLossesKeras()])
    
    
In [ ]:
    
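# Alternative in-memory fit: uses X_stand from the commented-out standardize
# cell above and a one-hot y_cat that is not defined anywhere in this notebook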
np.random.seed(2)
n_samples = -1
hist = model.fit([hist_array_train[:n_samples], X_stand[:n_samples]], y_cat[:n_samples],
                 epochs=10,
                 batch_size=128,
                 validation_split=0.3,
                 callbacks=[PlotLossesKeras()])
    
    
    
In [133]:
    
hist
    
    Out[133]:
In [134]:
    
hist.history
    
    Out[134]:
In [ ]: