In [39]:
import requests
from bs4 import BeautifulSoup
import re
In [23]:
links = []
res = requests.get('https://en.wikipedia.org/wiki/Lists_of_stars_by_constellation')
soup = BeautifulSoup(res.text, 'html.parser')
for link in soup.find_all('a'):
    try:
        # keep only links to the per-constellation star lists
        if link['href'].startswith('/wiki/List_of_stars_in_'):
            links.append('https://en.wikipedia.org' + link['href'])
    except KeyError:
        # skip anchor tags that have no href attribute
        pass
In [ ]:
for link in links:
    res = requests.get(link)
    soup = BeautifulSoup(res.text, 'html.parser')
    for row in soup.table.children:
        if row.name == 'tr':
            for rowc in row.children:
                if rowc.name == 'td':
                    # print the first column (the star's name) of each data row
                    print(row.td.text)
                    break
In [ ]:
links = ['https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_radial_velocity',
         'https://en.wikipedia.org/wiki/List_of_transiting_exoplanets',
         'https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_microlensing',
         'https://en.wikipedia.org/wiki/List_of_directly_imaged_exoplanets',
         'https://en.wikipedia.org/wiki/List_of_exoplanets_detected_by_timing']
res = requests.get(links[4])
soup = BeautifulSoup(res.text, 'html.parser')
for row in soup.find_all('table')[0].children:
    if row.name == 'tr':
        for rowc in row.children:
            if rowc.name == 'td':
                # print the first column (the planet's name) of each data row
                print(row.td.text)
                break
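The step that actually writes astronomical_objects.csv never appears in the notebook; the later cells just read it. A minimal sketch of what it might have looked like, assuming the loops above are changed to accumulate (name, label) pairs into a list rather than printing them (the scraped variable is hypothetical):

import csv
import codecs

# Hypothetical accumulator: (name, label) pairs such as
# ('Alpha Centauri A', 'star') or ('PSR B1257+12 b', 'planet'),
# collected by scraping loops like the ones above.
scraped = []

with codecs.open('astronomical_objects.csv', 'w', 'utf8') as f:
    writer = csv.writer(f)
    for name, label in scraped:
        writer.writerow([name, label])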
In [1]:
import csv
import codecs
names = []
labels = []
with codecs.open('astronomical_objects.csv', 'r', 'utf8') as f:
    reader = csv.reader(f)
    for line in reader:
        names.append(line[0])
        labels.append(line[1])
In [2]:
# strip newlines, then get rid of duplicates and empty names
names = [name.replace('\n', ' ') for name in names]
objs = []
for obj in list(zip(names, labels)):
    if len(obj[0].strip()) != 0 and obj not in objs:
        objs.append(obj)
In [3]:
len(objs)
Out[3]:
In [4]:
import random
random.shuffle(objs)
In [5]:
names = []
labels = []
for n, l in objs:
    names.append(n)
    labels.append(l)
In [72]:
char_list = list(set(''.join(names)))
labels_list = list(set(labels))
In [73]:
# reserve index 0 for the padding token (pad_sequences pads with 0, and the
# Embedding layer below is built with mask_zero=True), so real characters start at 1
char_indices = dict((c, i + 1) for i, c in enumerate(char_list))
indices_char = dict((i + 1, c) for i, c in enumerate(char_list))
label_indices = dict((l, i) for i, l in enumerate(labels_list))
indices_label = dict((i, l) for i, l in enumerate(labels_list))
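Since pad_sequences fills with zeros and the Embedding layer below uses mask_zero=True, index 0 must stay reserved for padding. A one-line sanity check (illustrative):

assert 0 not in char_indices.values()  # 0 is reserved for the padding token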
In [74]:
print(label_indices)
In [77]:
MAX_LENGTH = max(len(n) for n in names)
print(MAX_LENGTH)
In [78]:
# enforce a minimum input width of 50 characters
if MAX_LENGTH < 50:
    MAX_LENGTH = 50
In [79]:
def name_to_char_seq(name):
    # map each character to its index, then left-pad with zeros to MAX_LENGTH
    name_chars_indices = [char_indices[char] for char in name]
    return sequence.pad_sequences([name_chars_indices], maxlen=MAX_LENGTH)[0]
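For illustration, what the encoder produces for an arbitrary name (assuming all of its characters appear in char_list):

seq = name_to_char_seq('Alpha Centauri')
print(len(seq))    # MAX_LENGTH (50)
print(seq[-14:])   # the character indices; everything before them is zero padding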
In [80]:
X = []
y = []
for n, l in zip(names, labels):
    X.append(name_to_char_seq(n))
    y.append(label_indices[l])
X = np.array(X).astype(np.uint8)
# one-hot encode the labels
y = np_utils.to_categorical(np.array(y)).astype(np.bool)
print(X.shape, y.shape)
In [ ]:
# note: in scikit-learn >= 0.18 this lives in sklearn.model_selection
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
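With six classes of very different sizes, a plain random split can leave the rarer classes underrepresented in the test set. A stratified variant, assuming a scikit-learn version whose train_test_split accepts the stratify argument:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=np.argmax(y, axis=1))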
In [10]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, JZS1, JZS2, JZS3  # JZS1-3 exist only in Keras 0.x
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing import sequence
from keras.utils import np_utils
In [83]:
batch_size = 32
nb_epoch = 100
model = Sequential()
# vocabulary size is len(char_list) + 1 because index 0 is reserved for padding
model.add(Embedding(len(char_list) + 1, 64, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(64, init='glorot_uniform', inner_init='orthogonal',
               activation='tanh', inner_activation='hard_sigmoid',
               return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(len(labels_list)))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode='categorical')
early_stopping = EarlyStopping(patience=5, verbose=1)
checkpointer = ModelCheckpoint(filepath='astro_lstm_keras_weights.hdf5',
                               verbose=1,
                               save_best_only=True)
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          show_accuracy=True,
          verbose=1,
          shuffle=True,
          validation_split=0.1,
          callbacks=[early_stopping, checkpointer])
Out[83]:
In [96]:
%%time
model.predict(X[0:1,:])
Out[96]:
In [84]:
from sklearn.metrics import confusion_matrix, classification_report
# reload the best weights saved by the checkpointer before evaluating
model.load_weights('astro_lstm_keras_weights.hdf5')
preds = model.predict_classes(X_test, batch_size=64, verbose=0)
print('')
print(classification_report(np.argmax(y_test, axis=1), preds, target_names=labels_list))
print('')
print(confusion_matrix(np.argmax(y_test, axis=1), preds))
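A small convenience helper for classifying a single name string with the trained model; the function is not part of the original notebook and assumes every character of its input appeared in the training data:

def classify_name(name):
    # encode the name, add a batch dimension, and map the argmax back to a label
    seq = name_to_char_seq(name)[np.newaxis, :]
    probs = model.predict(seq, verbose=0)[0]
    return indices_label[int(np.argmax(probs))]

print(classify_name('51 Pegasi b'))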
In [1]:
import h5py
import json
import gzip
layer_name_dict = {
    'Dense': 'denseLayer',
    'Dropout': 'dropoutLayer',
    'Flatten': 'flattenLayer',
    'Embedding': 'embeddingLayer',
    'BatchNormalization': 'batchNormalizationLayer',
    'LeakyReLU': 'leakyReLULayer',
    'PReLU': 'parametricReLULayer',
    'ParametricSoftplus': 'parametricSoftplusLayer',
    'ThresholdedLinear': 'thresholdedLinearLayer',
    'ThresholdedReLu': 'thresholdedReLuLayer',
    'LSTM': 'rLSTMLayer',
    'GRU': 'rGRULayer',
    'JZS1': 'rJZS1Layer',
    'JZS2': 'rJZS2Layer',
    'JZS3': 'rJZS3Layer',
    'Convolution2D': 'convolution2DLayer',
    'MaxPooling2D': 'maxPooling2DLayer'
}
layer_params_dict = {
    'Dense': ['weights', 'activation'],
    'Dropout': ['p'],
    'Flatten': [],
    'Embedding': ['weights', 'mask_zero'],
    'BatchNormalization': ['weights', 'epsilon'],
    'LeakyReLU': ['alpha'],
    'PReLU': ['weights'],
    'ParametricSoftplus': ['weights'],
    'ThresholdedLinear': ['theta'],
    'ThresholdedReLu': ['theta'],
    'LSTM': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'GRU': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS1': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS2': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'JZS3': ['weights', 'activation', 'inner_activation', 'return_sequences'],
    'Convolution2D': ['weights', 'nb_filter', 'nb_row', 'nb_col', 'border_mode', 'subsample', 'activation'],
    'MaxPooling2D': ['pool_size', 'stride', 'ignore_border']
}
layer_weights_dict = {
    'Dense': ['W', 'b'],
    'Embedding': ['E'],
    'BatchNormalization': ['gamma', 'beta', 'mean', 'std'],
    'PReLU': ['alphas'],
    'ParametricSoftplus': ['alphas', 'betas'],
    'LSTM': ['W_xi', 'W_hi', 'b_i', 'W_xc', 'W_hc', 'b_c', 'W_xf', 'W_hf', 'b_f', 'W_xo', 'W_ho', 'b_o'],
    'GRU': ['W_xz', 'W_hz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h'],
    'JZS1': ['W_xz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_hh', 'b_h', 'Pmat'],
    'JZS2': ['W_xz', 'W_hz', 'b_z', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h', 'Pmat'],
    'JZS3': ['W_xz', 'W_hz', 'b_z', 'W_xr', 'W_hr', 'b_r', 'W_xh', 'W_hh', 'b_h'],
    'Convolution2D': ['W', 'b']
}
def serialize(model_json_file, weights_hdf5_file, save_filepath, compress):
    with open(model_json_file, 'r') as f:
        model_metadata = json.load(f)
    weights_file = h5py.File(weights_hdf5_file, 'r')
    layers = []
    num_activation_layers = 0
    for k, layer in enumerate(model_metadata['layers']):
        if layer['name'] == 'Activation':
            # fold a standalone Activation layer into the preceding layer's
            # 'activation' parameter instead of emitting a separate layer
            num_activation_layers += 1
            prev_layer_name = model_metadata['layers'][k-1]['name']
            idx_activation = layer_params_dict[prev_layer_name].index('activation')
            layers[k-num_activation_layers]['parameters'][idx_activation] = layer['activation']
            continue
        layer_params = []
        for param in layer_params_dict[layer['name']]:
            if param == 'weights':
                # read this layer's weight arrays out of the HDF5 checkpoint
                weights = {}
                weight_names = layer_weights_dict[layer['name']]
                for p, name in enumerate(weight_names):
                    weights[name] = weights_file.get('layer_{}/param_{}'.format(k, p)).value.tolist()
                layer_params.append(weights)
            else:
                layer_params.append(layer[param])
        layers.append({
            'layerName': layer_name_dict[layer['name']],
            'parameters': layer_params
        })
    weights_file.close()
    if compress:
        with gzip.open(save_filepath, 'wb') as f:
            f.write(json.dumps(layers).encode('utf8'))
    else:
        with open(save_filepath, 'w') as f:
            json.dump(layers, f)
In [144]:
import json
model_metadata = json.loads(model.to_json())
with open('astro_lstm_keras_model.json', 'w') as f:
    json.dump(model_metadata, f)
In [145]:
model_metadata
Out[145]:
In [2]:
serialize('astro_lstm_keras_model.json',
          'astro_lstm_keras_weights.hdf5',
          'astro_lstm_model_params.json.gz',
          True)
serialize('astro_lstm_keras_model.json',
          'astro_lstm_keras_weights.hdf5',
          'astro_lstm_model_params.json',
          False)
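A quick round-trip check on the serialized output (illustrative; the layerName values come from layer_name_dict above, and the Activation layer is folded into the Dense layer during serialization):

with gzip.open('astro_lstm_model_params.json.gz', 'rb') as f:
    layers = json.loads(f.read().decode('utf8'))
print([layer['layerName'] for layer in layers])
# expect something like: ['embeddingLayer', 'rLSTMLayer', 'dropoutLayer', 'denseLayer']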
In [11]:
import csv
import codecs
import numpy as np
names = []
labels = []
with codecs.open('astronomical_objects.csv', 'r', 'utf8') as f:
    reader = csv.reader(f)
    for line in reader:
        names.append(line[0])
        labels.append(line[1])
# strip newlines, then get rid of duplicates and empty names
names = [name.replace('\n', ' ') for name in names]
objs = []
for obj in list(zip(names, labels)):
    if len(obj[0].strip()) != 0 and obj not in objs:
        objs.append(obj)
names = []
labels = []
for n, l in objs:
    names.append(n)
    labels.append(l)
y = []
label_indices = {'comet': 0, 'asteroid': 1, 'quasar': 2, 'galaxy': 5, 'star': 4, 'planet': 3}
for n, l in zip(names, labels):
    y.append(label_indices[l])
y = np.array(y).astype(np.uint8)
In [13]:
import numpy as np
indices = []
# draw 100 objects per class (np.random.choice samples with replacement by default)
for i in range(6):
    indices.append(np.random.choice(np.where(y == i)[0], 100))
indices = np.concatenate(tuple(indices))
np.random.shuffle(indices)
indices = indices.tolist()
In [15]:
import json
import gzip
names_rand = []
labels_rand = []
for i in indices:
    names_rand.append(names[i])
    labels_rand.append(labels[i])
with gzip.open('astro_lstm_sample_data.json.gz', 'wb') as f:
    f.write(json.dumps({'names': names_rand, 'labels': labels_rand}).encode('utf8'))
with open('astro_lstm_sample_data.json', 'w') as f:
    json.dump({'names': names_rand, 'labels': labels_rand}, f)
In [107]:
with open('char_label_indices.json', 'w') as f:
    json.dump({'chars': char_indices, 'labels': indices_label}, f)