LSTM recurrent neural network for classifying astronomical object names

by Leon Chen (@transcranial)

Data collection

Code for collecting star and planet names from Wikipedia is below. Code for collecting the names of galaxies, quasars, asteroids, and comets is elsewhere.


In [39]:
import requests
from bs4 import BeautifulSoup
import re

In [23]:
# collect links to the per-constellation "List of stars in ..." pages
links = []
res = requests.get('https://en.wikipedia.org/wiki/Lists_of_stars_by_constellation')
soup = BeautifulSoup(res.text)
for link in soup.find_all('a'):
    href = link.get('href', '')
    if href.startswith('/wiki/List_of_stars_in_'):
        links.append('https://en.wikipedia.org' + href)

In [ ]:
for link in links:
    res = requests.get(link)
    soup = BeautifulSoup(res.text)
    # the first table on each page lists the stars;
    # print the first cell (the star name) of every data row
    for row in soup.table.children:
        if row.name == 'tr':
            for rowc in row.children:
                if rowc.name == 'td':
                    print(row.td.text)
                    break

In [ ]:
links = ['https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_radial_velocity',
         'https://en.wikipedia.org/wiki/List_of_transiting_exoplanets',
         'https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_microlensing',
         'https://en.wikipedia.org/wiki/List_of_directly_imaged_exoplanets',
         'https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_timing']
# process one exoplanet list at a time (change the index to cover each page);
# print the first cell (the planet name) of every data row in the first table
res = requests.get(links[4])
soup = BeautifulSoup(res.text)
for row in soup.find_all('table')[0].children:
    if row.name == 'tr':
        for rowc in row.children:
            if rowc.name == 'td':
                print(row.td.text)
                break
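
The scraped names for all six categories were combined into a single CSV with one (name, label) pair per row, which the next section reads back. A minimal sketch of that step, assuming the per-category names have been gathered into Python lists (star_names and planet_names below are hypothetical variable names, not part of the original notebook):

In [ ]:
import csv
import codecs

# hypothetical: star_names / planet_names hold the names printed by the
# scraping cells above; the other categories come from the code "elsewhere"
rows = [(name, 'star') for name in star_names] + \
       [(name, 'planet') for name in planet_names]

with codecs.open('astronomical_objects.csv', 'w', 'utf8') as f:
    csv.writer(f).writerows(rows)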

Preprocess data


In [1]:
import csv
import codecs

names = []
labels = []

with codecs.open('astronomical_objects.csv', 'r', 'utf8') as f:
    reader = csv.reader(f)
    for line in reader:
        names.append(line[0])
        labels.append(line[1])

In [2]:
# normalize newlines, drop empty names, and remove duplicate (name, label) pairs

names = [name.replace('\n', ' ') for name in names]

objs = []
for obj in list(zip(names, labels)):
    if len(obj[0].strip()) != 0 and obj not in objs:
        objs.append(obj)
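
The membership test against a growing list makes this de-duplication quadratic in the number of rows; that is fine at ~14k names, but an order-preserving variant backed by a set (a sketch producing the same objs) scales better:

In [ ]:
# same result as the cell above, but O(1) membership tests via a set
seen = set()
objs = []
for obj in zip(names, labels):
    if obj[0].strip() and obj not in seen:
        seen.add(obj)
        objs.append(obj)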

In [3]:
len(objs)


Out[3]:
14215

In [4]:
import random
random.shuffle(objs)

In [5]:
names = []
labels = []

for n, l in objs:
    names.append(n)
    labels.append(l)

In [72]:
# build the character and label vocabularies; the set orderings are arbitrary
# but stay fixed for this session, and the index mappings are exported at the end
char_list = list(set(''.join(names)))
labels_list = list(set(labels))

In [73]:
char_indices = dict((c, i) for i, c in enumerate(char_list))
indices_char = dict((i, c) for i, c in enumerate(char_list))
label_indices = dict((l, i) for i, l in enumerate(labels_list))
indices_label = dict((i, l) for i, l in enumerate(labels_list))

In [74]:
print(label_indices)


{'comet': 0, 'asteroid': 1, 'quasar': 2, 'galaxy': 5, 'star': 4, 'planet': 3}

In [77]:
MAX_LENGTH = 0
for n in names:
    if len(n) > MAX_LENGTH:
        MAX_LENGTH = len(n)
print(MAX_LENGTH)


47

In [78]:
if MAX_LENGTH < 50:
    MAX_LENGTH = 50

In [79]:
def name_to_char_seq(name):
    # map each character to its integer index, then left-pad with zeros
    # to MAX_LENGTH (note: 0 is also a valid character index here)
    name_chars = list(name)
    name_chars_indices = list(map(lambda char: char_indices[char], name_chars))
    return sequence.pad_sequences([name_chars_indices], maxlen=MAX_LENGTH)[0]
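
As a quick check (assuming the cell above has been run and every character of the name appears in char_indices), a short name maps to a handful of indices left-padded with zeros up to MAX_LENGTH:

In [ ]:
# e.g. 'Vega' -> 50-element vector: 46 leading zeros, then 4 character indices
seq = name_to_char_seq('Vega')
print(len(seq), seq[-4:])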

In [80]:
X = []
y = []

for n, l in zip(names, labels):
    X.append(name_to_char_seq(n))
    y.append(label_indices[l])
    
X = np.array(X).astype(np.uint8)
y = np_utils.to_categorical(np.array(y)).astype(np.bool)

print(X.shape, y.shape)


(14215, 50) (14215, 6)

In [ ]:
# sklearn.cross_validation was renamed sklearn.model_selection in later scikit-learn releases
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

Train RNN classifier


In [10]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, JZS1, JZS2, JZS3
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils


Using gpu device 3: Tesla K80 (CNMeM is disabled)

In [83]:
batch_size = 32
nb_epoch = 100

model = Sequential()
model.add(Embedding(len(char_list), 64, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(64, init='glorot_uniform', inner_init='orthogonal',
               activation='tanh', inner_activation='hard_sigmoid', 
               return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(len(labels_list)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode='categorical')

early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='astro_lstm_keras_weights.hdf5', 
                               verbose=1, 
                               save_best_only=True)

model.fit(X_train, y_train, 
          batch_size=batch_size, 
          nb_epoch=nb_epoch, 
          show_accuracy=True,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])


Train on 12793 samples, validate on 1422 samples
Epoch 1/100
12793/12793 [==============================] - 24s - loss: 0.4070 - acc: 0.8983 - val_loss: 0.1755 - val_acc: 0.9430
Epoch 00000: val_loss improved from inf to 0.17550, saving model to astro_lstm_keras_weights.hdf5
Epoch 2/100
12793/12793 [==============================] - 24s - loss: 0.1495 - acc: 0.9552 - val_loss: 0.1259 - val_acc: 0.9620
Epoch 00001: val_loss improved from 0.17550 to 0.12592, saving model to astro_lstm_keras_weights.hdf5
Epoch 3/100
12793/12793 [==============================] - 23s - loss: 0.1122 - acc: 0.9671 - val_loss: 0.1063 - val_acc: 0.9662
Epoch 00002: val_loss improved from 0.12592 to 0.10634, saving model to astro_lstm_keras_weights.hdf5
Epoch 4/100
12793/12793 [==============================] - 23s - loss: 0.0902 - acc: 0.9723 - val_loss: 0.0910 - val_acc: 0.9698
Epoch 00003: val_loss improved from 0.10634 to 0.09104, saving model to astro_lstm_keras_weights.hdf5
Epoch 5/100
12793/12793 [==============================] - 23s - loss: 0.0741 - acc: 0.9764 - val_loss: 0.0746 - val_acc: 0.9761
Epoch 00004: val_loss improved from 0.09104 to 0.07461, saving model to astro_lstm_keras_weights.hdf5
Epoch 6/100
12793/12793 [==============================] - 24s - loss: 0.0654 - acc: 0.9796 - val_loss: 0.0713 - val_acc: 0.9775
Epoch 00005: val_loss improved from 0.07461 to 0.07127, saving model to astro_lstm_keras_weights.hdf5
Epoch 7/100
12793/12793 [==============================] - 23s - loss: 0.0606 - acc: 0.9815 - val_loss: 0.0615 - val_acc: 0.9810
Epoch 00006: val_loss improved from 0.07127 to 0.06148, saving model to astro_lstm_keras_weights.hdf5
Epoch 8/100
12793/12793 [==============================] - 23s - loss: 0.0518 - acc: 0.9859 - val_loss: 0.0562 - val_acc: 0.9852
Epoch 00007: val_loss improved from 0.06148 to 0.05622, saving model to astro_lstm_keras_weights.hdf5
Epoch 9/100
12793/12793 [==============================] - 23s - loss: 0.0455 - acc: 0.9875 - val_loss: 0.0542 - val_acc: 0.9831
Epoch 00008: val_loss improved from 0.05622 to 0.05418, saving model to astro_lstm_keras_weights.hdf5
Epoch 10/100
12793/12793 [==============================] - 23s - loss: 0.0427 - acc: 0.9875 - val_loss: 0.0532 - val_acc: 0.9845
Epoch 00009: val_loss improved from 0.05418 to 0.05323, saving model to astro_lstm_keras_weights.hdf5
Epoch 11/100
12793/12793 [==============================] - 23s - loss: 0.0366 - acc: 0.9898 - val_loss: 0.0436 - val_acc: 0.9887
Epoch 00010: val_loss improved from 0.05323 to 0.04361, saving model to astro_lstm_keras_weights.hdf5
Epoch 12/100
12793/12793 [==============================] - 23s - loss: 0.0369 - acc: 0.9899 - val_loss: 0.0476 - val_acc: 0.9873
Epoch 00011: val_loss did not improve
Epoch 13/100
12793/12793 [==============================] - 23s - loss: 0.0337 - acc: 0.9901 - val_loss: 0.0390 - val_acc: 0.9909
Epoch 00012: val_loss improved from 0.04361 to 0.03902, saving model to astro_lstm_keras_weights.hdf5
Epoch 14/100
12793/12793 [==============================] - 23s - loss: 0.0261 - acc: 0.9932 - val_loss: 0.0361 - val_acc: 0.9902
Epoch 00013: val_loss improved from 0.03902 to 0.03606, saving model to astro_lstm_keras_weights.hdf5
Epoch 15/100
12793/12793 [==============================] - 23s - loss: 0.0256 - acc: 0.9936 - val_loss: 0.0477 - val_acc: 0.9852
Epoch 00014: val_loss did not improve
Epoch 16/100
12793/12793 [==============================] - 23s - loss: 0.0306 - acc: 0.9902 - val_loss: 0.0403 - val_acc: 0.9887
Epoch 00015: val_loss did not improve
Epoch 17/100
12793/12793 [==============================] - 23s - loss: 0.0222 - acc: 0.9942 - val_loss: 0.0288 - val_acc: 0.9923
Epoch 00016: val_loss improved from 0.03606 to 0.02883, saving model to astro_lstm_keras_weights.hdf5
Epoch 18/100
12793/12793 [==============================] - 23s - loss: 0.0221 - acc: 0.9936 - val_loss: 0.0321 - val_acc: 0.9916
Epoch 00017: val_loss did not improve
Epoch 19/100
12793/12793 [==============================] - 23s - loss: 0.0202 - acc: 0.9949 - val_loss: 0.0328 - val_acc: 0.9909
Epoch 00018: val_loss did not improve
Epoch 20/100
12793/12793 [==============================] - 23s - loss: 0.0167 - acc: 0.9962 - val_loss: 0.0342 - val_acc: 0.9909
Epoch 00019: val_loss did not improve
Epoch 21/100
12793/12793 [==============================] - 23s - loss: 0.0132 - acc: 0.9968 - val_loss: 0.0354 - val_acc: 0.9887
Epoch 00020: val_loss did not improve
Epoch 22/100
12793/12793 [==============================] - 23s - loss: 0.0144 - acc: 0.9962 - val_loss: 0.0331 - val_acc: 0.9909
Epoch 00021: val_loss did not improve
Epoch 23/100
12793/12793 [==============================] - 23s - loss: 0.0205 - acc: 0.9951 - val_loss: 0.0372 - val_acc: 0.9909
Epoch 00022: early stopping
Epoch 00022: val_loss did not improve
Out[83]:
<keras.callbacks.History at 0x7fc499fc15f8>

In [96]:
%%time
model.predict(X[0:1,:])


CPU times: user 15.7 ms, sys: 28 ms, total: 43.7 ms
Wall time: 43 ms
Out[96]:
array([[  1.50226856e-06,   5.04170603e-05,   1.35677372e-04,
          1.68430142e-05,   9.99785244e-01,   1.03284810e-05]])
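
The same preprocessing plus the indices_label mapping turns the model into a simple name classifier. A sketch (the example name is illustrative; characters missing from char_indices would need separate handling):

In [ ]:
def classify_name(name):
    # preprocess exactly as during training, then decode the argmax class
    x = np.array([name_to_char_seq(name)]).astype(np.uint8)
    class_idx = model.predict_classes(x, verbose=0)[0]
    return indices_label[class_idx]

print(classify_name('51 Pegasi b'))  # a planet-style name, for illustration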

In [84]:
from sklearn.metrics import confusion_matrix, classification_report

model.load_weights('astro_lstm_keras_weights.hdf5')
preds = model.predict_classes(X_test, batch_size=64, verbose=0)

print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
print('')
print(confusion_matrix(np.argmax(y_test, axis=1), preds))


             precision    recall  f1-score   support

      comet       1.00      1.00      1.00      2984
   asteroid       0.98      0.98      0.98       222
     quasar       0.94      0.89      0.92       208
     planet       1.00      0.98      0.99       325
       star       1.00      1.00      1.00     10219
     galaxy       0.94      0.95      0.95       257

avg / total       1.00      1.00      1.00     14215


[[ 2982     0     0     0     0     2]
 [    0   218     0     0     3     1]
 [    0     0   185     0    17     6]
 [    0     1     0   319     5     0]
 [    0     2    10     1 10200     6]
 [    0     1     1     0    10   245]]

In [1]:
import h5py
import json
import gzip

layer_name_dict = {
    'Dense': 'denseLayer',
    'Dropout': 'dropoutLayer',
    'Flatten': 'flattenLayer',
    'Embedding': 'embeddingLayer',
    'BatchNormalization': 'batchNormalizationLayer',
    'LeakyReLU': 'leakyReLULayer',
    'PReLU': 'parametricReLULayer',
    'ParametricSoftplus': 'parametricSoftplusLayer',
    'ThresholdedLinear': 'thresholdedLinearLayer',
    'ThresholdedReLu': 'thresholdedReLuLayer',
    'LSTM': 'rLSTMLayer',
    'GRU': 'rGRULayer',
    'JZS1': 'rJZS1Layer',
    'JZS2': 'rJZS2Layer',
    'JZS3': 'rJZS3Layer',
    'Convolution2D': 'convolution2DLayer',
    'MaxPooling2D': 'maxPooling2DLayer'
}

layer_params_dict = {
    'Dense': ['weights', 'activation'],
    'Dropout': ['p'],
    'Flatten': [],
    'Embedding': ['weights', 'mask_zero'],
    'BatchNormalization': ['weights', 'epsilon'],
    'LeakyReLU': ['alpha'],
    'PReLU': ['weights'],
    'ParametricSoftplus': ['weights'],
    'ThresholdedLinear': ['theta'],
    'ThresholdedReLu': ['theta'],
    'LSTM': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'GRU': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS1': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS2': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS3': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'Convolution2D': ['weights', 'nb_filter', 'nb_row', 'nb_col', 'border_mode', 'subsample', 'activation'],
    'MaxPooling2D': ['pool_size', 'stride', 'ignore_border']
}

layer_weights_dict = {
    'Dense': ['W', 'b'],
    'Embedding': ['E'],
    'BatchNormalization': ['gamma', 'beta', 'mean', 'std'],
    'PReLU': ['alphas'],
    'ParametricSoftplus': ['alphas', 'betas'],
    'LSTM': ['W_xi', 'W_hi', 'b_i', 'W_xc', 'W_hc', 'b_c', 'W_xf', 'W_hf', 'b_f', 'W_xo', 'W_ho', 'b_o'],
    'GRU': ['W_xz', 'W_hz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h'],
    'JZS1': ['W_xz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_hh', 'b_h', 'Pmat'],
    'JZS2': ['W_xz', 'W_hz', 'b_z', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h', 'Pmat'],
    'JZS3': ['W_xz', 'W_hz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h'],
    'Convolution2D': ['W', 'b']
}

def serialize(model_json_file, weights_hdf5_file, save_filepath, compress):
    with open(model_json_file, 'r') as f:
        model_metadata = json.load(f)
    weights_file = h5py.File(weights_hdf5_file, 'r')

    layers = []

    num_activation_layers = 0
    for k, layer in enumerate(model_metadata['layers']):
        if layer['name'] == 'Activation':
            num_activation_layers += 1
            prev_layer_name = model_metadata['layers'][k-1]['name']
            idx_activation = layer_params_dict[prev_layer_name].index('activation')
            layers[k-num_activation_layers]['parameters'][idx_activation] = layer['activation']
            continue

        layer_params = []

        for param in layer_params_dict[layer['name']]:
            if param == 'weights':
                weights = {}
                weight_names = layer_weights_dict[layer['name']]
                for p, name in enumerate(weight_names):
                    weights[name] = weights_file.get('layer_{}/param_{}'.format(k, p)).value.tolist()
                layer_params.append(weights)
            else:
                layer_params.append(layer[param])

        layers.append({
            'layerName': layer_name_dict[layer['name']],
            'parameters': layer_params
        })

    if compress:
        with gzip.open(save_filepath, 'wb') as f:
            f.write(json.dumps(layers).encode('utf8'))
    else:
        with open(save_filepath, 'w') as f:
            json.dump(layers, f)

In [144]:
import json
model_metadata = json.loads(model.to_json())

with open('astro_lstm_keras_model.json', 'w') as f:
    json.dump(model_metadata, f)

In [145]:
model_metadata


Out[145]:
{'class_mode': 'categorical',
 'layers': [{'W_constraint': None,
   'W_regularizer': None,
   'activity_regularizer': None,
   'init': 'uniform',
   'input_dim': 112,
   'input_length': 50,
   'input_shape': [112],
   'mask_zero': True,
   'name': 'Embedding',
   'output_dim': 64},
  {'activation': 'tanh',
   'forget_bias_init': 'one',
   'init': 'glorot_uniform',
   'inner_activation': 'hard_sigmoid',
   'inner_init': 'orthogonal',
   'input_dim': None,
   'input_length': None,
   'name': 'LSTM',
   'output_dim': 64,
   'return_sequences': False,
   'truncate_gradient': -1},
  {'name': 'Dropout', 'p': 0.5},
  {'W_constraint': None,
   'W_regularizer': None,
   'activation': 'linear',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'init': 'glorot_uniform',
   'input_dim': None,
   'name': 'Dense',
   'output_dim': 6},
  {'activation': 'softmax', 'beta': 0.1, 'name': 'Activation', 'target': 0}],
 'loss': 'categorical_crossentropy',
 'name': 'Sequential',
 'optimizer': {'beta_1': 0.9,
  'beta_2': 0.999,
  'epsilon': 1e-08,
  'lr': 0.0010000000474974513,
  'name': 'Adam'},
 'theano_mode': None}

In [2]:
serialize('astro_lstm_keras_model.json', 
          'astro_lstm_keras_weights.hdf5', 
          'astro_lstm_model_params.json.gz', 
          True)
serialize('astro_lstm_keras_model.json', 
          'astro_lstm_keras_weights.hdf5', 
          'astro_lstm_model_params.json', 
          False)
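
A quick sanity check on the export is to read the gzipped JSON back and look at the layer names (a sketch; the expected order follows from the serialize function above):

In [ ]:
import gzip
import json

with gzip.open('astro_lstm_model_params.json.gz', 'rb') as f:
    exported = json.loads(f.read().decode('utf8'))

# one entry per non-Activation layer:
# ['embeddingLayer', 'rLSTMLayer', 'dropoutLayer', 'denseLayer']
print([layer['layerName'] for layer in exported])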

Export sample data


In [11]:
import csv
import codecs

names = []
labels = []
with codecs.open('astronomical_objects.csv', 'r', 'utf8') as f:
    reader = csv.reader(f)
    for line in reader:
        names.append(line[0])
        labels.append(line[1])
        
# get rid of duplicates
names = [name.replace('\n', ' ') for name in names]
objs = []
for obj in list(zip(names, labels)):
    if len(obj[0].strip()) != 0 and obj not in objs:
        objs.append(obj)        
names = []
labels = []
for n, l in objs:
    names.append(n)
    labels.append(l)
    
y = []
label_indices = {'comet': 0, 'asteroid': 1, 'quasar': 2, 'galaxy': 5, 'star': 4, 'planet': 3}
for n, l in zip(names, labels):
    y.append(label_indices[l])
y = np.array(y).astype(np.uint8)

In [13]:
import numpy as np

indices = []
for i in range(6):
    indices.append(np.random.choice(np.where(y == i)[0], 100))
indices = np.concatenate(tuple(indices))
np.random.shuffle(indices)
indices = indices.tolist()

In [15]:
import json
import gzip

names_rand = []
labels_rand = []
for i in indices:
    names_rand.append(names[i])
    labels_rand.append(labels[i])

with gzip.open('astro_lstm_sample_data.json.gz', 'wb') as f:
    f.write(json.dumps({'names': names_rand, 'labels': labels_rand}).encode('utf8'))
with open('astro_lstm_sample_data.json', 'w') as f:
    json.dump({'names': names_rand, 'labels': labels_rand}, f)

In [107]:
with open('char_label_indices.json', 'w') as f:
    json.dump({'chars': char_indices, 'labels': indices_label}, f)
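
Loading the exported files back is a cheap way to confirm that the names, labels, and index mappings round-trip intact (a sketch; note that JSON turns the integer keys of indices_label into strings):

In [ ]:
import json

with open('char_label_indices.json') as f:
    exported_indices = json.load(f)
with open('astro_lstm_sample_data.json') as f:
    sample = json.load(f)

print(len(exported_indices['chars']), exported_indices['labels'])
print(sample['names'][0], '->', sample['labels'][0])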
