In [ ]:
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
import spectraldl.preprocessing as preprocessing
In [ ]:
with h5py.File('data/data.hdf5', 'r') as f:
    spectra = f['spectra']
    # create the dataset matrix: one row of 140 resampled fluxes per spectrum
    X = np.zeros((len(spectra), 140), dtype=np.float64)
    y = np.zeros((len(spectra), ), dtype=np.int8)
    for i, (name, data) in enumerate(spectra.items()):
        waves = data[0]
        fluxes = data[1]
        # convert air wavelengths to vacuum, convolve and resample the fluxes
        vac_waves = preprocessing.air2vacuum(waves)
        conv_fluxes = preprocessing.convolve_spectrum(fluxes)
        X[i, :] = preprocessing.resample_spectrum(vac_waves, conv_fluxes)
        # store the label
        y[i] = data.attrs['label']

# remove unknown spectra (label 2)
X = X[y != 2]
y = y[y != 2]
# relabel double-peaked spectra from 3 to 2, keeping the labels contiguous
y[y == 3] = 2
np.unique(y, return_counts=True)
In [ ]:
# first split: hold out 10% of the data as the test set
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, stratify=y)
# second split: hold out 20% of the remaining data for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2, stratify=y_tr)
train = np.unique(y_tr, return_counts=True)
valid = np.unique(y_val, return_counts=True)
test = np.unique(y_te, return_counts=True)
train, valid, test
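Note that the second split takes 20% of the remaining 90%, so the final proportions are roughly 72% train, 18% validation and 10% test. A quick check (a minimal sketch using only the variables defined above):
In [ ]:
# effective split fractions of the whole dataset (expected: ~0.72, ~0.18, ~0.10)
n = len(y)
print(len(y_tr) / n, len(y_val) / n, len(y_te) / n)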
In [ ]:
with h5py.File('data/data.hdf5', 'a') as f:
    # persist the full matrix and all three splits
    f.create_dataset('X', data=X, dtype=np.float64)
    f.create_dataset('y', data=y, dtype=np.int8)
    f.create_dataset('X_tr', data=X_tr, dtype=np.float64)
    f.create_dataset('y_tr', data=y_tr, dtype=np.int8)
    f.create_dataset('X_val', data=X_val, dtype=np.float64)
    f.create_dataset('y_val', data=y_val, dtype=np.int8)
    f.create_dataset('X_te', data=X_te, dtype=np.float64)
    f.create_dataset('y_te', data=y_te, dtype=np.int8)
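The splits can later be read back directly from the same file; a minimal sketch (assuming the cell above has already been run):
In [ ]:
with h5py.File('data/data.hdf5', 'r') as f:
    X_tr, y_tr = f['X_tr'][...], f['y_tr'][...]
print(X_tr.shape, y_tr.shape)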
In [ ]:
import os
import h5py
import numpy as np
import astropy.io.fits as fits
import spectraldl.preprocessing as preprocessing
import spectraldl.lamost as lamost
def lam_files_generator():
    '''Generate paths of LAMOST FITS files.'''
    # the Docker container's '/lamost' directory is mounted
    # from antares:/data/public/LAMOST-DR1/fits
    for dirpath, _, filenames in os.walk('/lamost'):
        for filename in filenames:
            if filename.endswith('.fits'):
                yield os.path.join(dirpath, filename)

def get_hdf5_datasets(hdf5):
    '''Create (or recreate) the LAMOST datasets in the HDF5 file.'''
    # delete the datasets if they already exist
    for name in ('X_lam', 'id_lam'):
        if name in hdf5:
            del hdf5[name]
    # see the /data/public/LAMOST-DR1/count file:
    # $ find /data/public/LAMOST-DR1/fits/ -name '*.fits' | wc -l
    size = 2202000
    # fluxes matrix
    X_lam = hdf5.create_dataset('X_lam', (size, 140), dtype=np.float64)
    # variable-length unicode datatype for the file paths
    str_dtype = h5py.special_dtype(vlen=str)
    id_lam = hdf5.create_dataset('id_lam', (size, ), dtype=str_dtype)
    return X_lam, id_lam

def create_lamost_matrix():
    with h5py.File('data/data.hdf5', 'a') as hdf5:
        # get the datasets
        X_lam, id_lam = get_hdf5_datasets(hdf5)
        for i, path in enumerate(lam_files_generator()):
            print(path)
            try:
                with fits.open(path) as hdulist:
                    fluxes = lamost.get_fluxes(hdulist)
                    waves = lamost.get_waves(hdulist)
                    X_lam[i, :] = preprocessing.resample_spectrum(waves, fluxes)
                    id_lam[i] = path
            # catch any exception so that progress is not lost
            except Exception as e:
                print(e)
                continue
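The export is then started with a plain call; note that it walks the whole /lamost tree (over two million FITS files), so expect a long run:
In [ ]:
create_lamost_matrix()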