Dataset

This notebook creates the data matrices: the labelled Ondřejov dataset with its train, validation and test splits, and the unlabelled LAMOST DR1 matrix.

Ondřejov Datasets


In [ ]:
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
import spectraldl.preprocessing as preprocessing

In [ ]:
with h5py.File('data/data.hdf5', 'r') as f:
    spectra = f['spectra']
    
    # preallocate the dataset matrix (140 flux values per spectrum)
    # and the label vector
    X = np.zeros((len(spectra), 140), dtype=np.float64)
    y = np.zeros((len(spectra), ), dtype=np.int8)
    
    for i, (name, data) in enumerate(spectra.items()):
        waves = data[0]
        fluxes = data[1]
        
        # convert air to vacuum wavelengths, smooth the fluxes
        # and resample onto the common 140-point grid
        vac_waves = preprocessing.air2vacuum(waves)
        conv_fluxes = preprocessing.convolve_spectrum(fluxes)
        X[i, :] = preprocessing.resample_spectrum(vac_waves, conv_fluxes)
        
        # store label
        y[i] = data.attrs['label']

# remove unknown spectra (have label 2)
X = X[y != 2]
y = y[y != 2]
# change label of double-peak spectra (from 3 to 2)
y[y == 3] = 2

np.unique(y, return_counts=True)
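
For orientation, the sketch below shows what the three preprocessing calls could look like. This is a minimal illustration, not spectraldl's actual implementation: the VALD3 air-to-vacuum formula, the Gaussian kernel width and the resampling grid bounds are all assumptions.


In [ ]:
import numpy as np
from scipy.ndimage import gaussian_filter1d

def air2vacuum_sketch(air_waves):
    '''Air-to-vacuum conversion of wavelengths in Angstroms
    (VALD3 / Piskunov variant of the standard formula; assumption,
    spectraldl may use a different conversion).'''
    s2 = (1e4 / air_waves) ** 2
    n = (1 + 0.00008336624212083
         + 0.02408926869968 / (130.1065924522 - s2)
         + 0.0001599740894897 / (38.92568793293 - s2))
    return air_waves * n

def convolve_spectrum_sketch(fluxes, sigma=2):
    '''Smooth fluxes with a Gaussian kernel (sigma is illustrative).'''
    return gaussian_filter1d(fluxes, sigma)

def resample_spectrum_sketch(waves, fluxes, n_points=140,
                             start=6519, end=6732):
    '''Linearly interpolate fluxes onto a uniform n_points grid
    (the H-alpha region bounds here are illustrative).'''
    grid = np.linspace(start, end, n_points)
    return np.interp(grid, waves, fluxes)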

In [ ]:
# hold out 10% of the data as the test set (stratified by label)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, stratify=y)

# hold out 20% of the remaining training data (18% of the total)
# as the validation set
X_tr, X_val, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2, stratify=y_tr)

train = np.unique(y_tr, return_counts=True)
valid = np.unique(y_val, return_counts=True)
test = np.unique(y_te, return_counts=True)

train, valid, test
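
Because both splits are stratified, each subset should preserve the class ratios of the whole dataset. A quick sanity check using only the arrays defined above:


In [ ]:
for name, labels in [('train', y_tr), ('valid', y_val), ('test', y_te)]:
    counts = np.bincount(labels)
    print(name, counts / counts.sum())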

In [ ]:
with h5py.File('data/data.hdf5', 'a') as f:
    f.create_dataset('X', data=X)
    f.create_dataset('y', data=y)
    f.create_dataset('X_tr', data=X_tr)
    f.create_dataset('y_tr', data=y_tr)
    f.create_dataset('X_val', data=X_val)
    f.create_dataset('y_val', data=y_val)
    f.create_dataset('X_te', data=X_te)
    f.create_dataset('y_te', data=y_te)
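
The matrices can later be read back with plain slicing; indexing an HDF5 dataset with [...] loads it fully into memory as a NumPy array:


In [ ]:
with h5py.File('data/data.hdf5', 'r') as f:
    X_tr, y_tr = f['X_tr'][...], f['y_tr'][...]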

LAMOST DR1 Matrix

The LAMOST DR1 FITS files should be mounted into the Docker container at /lamost/, e.g. with -v /data/public/LAMOST-DR1/fits:/lamost.

Be patient when running this code: it processes roughly 2.2 million spectra and takes a long time. It was originally a standalone script.
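
Before launching the full run, it may help to verify the mount and count the FITS files. This mirrors the find | wc -l count referenced in the code below:


In [ ]:
import os

n_fits = sum(filename.endswith('.fits')
             for _, _, filenames in os.walk('/lamost')
             for filename in filenames)
n_fits  # should be 2202000 according to the LAMOST-DR1 count file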


In [ ]:
import os
import h5py
import numpy as np
import astropy.io.fits as fits
import spectraldl.preprocessing as preprocessing
import spectraldl.lamost as lamost


def lam_files_generator():
    '''Generate paths of LAMOST FITS files.'''
    # Docker container '/lamost' directory is mounted
    # from antares:/data/public/LAMOST-DR1/fits
    for dirpath, _, filenames in os.walk('/lamost'):
        for filename in filenames:
            if filename.endswith('.fits'):
                yield os.path.join(dirpath, filename)

def get_hdf5_datasets(hdf5):
    '''Create (or recreate) the flux matrix and identifier datasets.'''
    # delete datasets if they exist (each checked separately so that
    # one missing dataset cannot skip deletion of the other)
    for name in ('X_lam', 'id_lam'):
        if name in hdf5:
            del hdf5[name]

    # see /data/public/LAMOST-DR1/count file 
    # $ find /data/public/LAMOST-DR1/fits/ -name '*.fits' | wc -l
    size = 2202000

    # fluxes matrix
    X_lam = hdf5.create_dataset('X_lam', (size, 140), dtype=np.float64)

    # create variable-len unicode datatype
    str_dtype = h5py.special_dtype(vlen=str)
    id_lam = hdf5.create_dataset('id_lam', (size, ), dtype=str_dtype)
    
    return X_lam, id_lam

def create_lamost_matrix():
    with h5py.File('data/data.hdf5', 'a') as hdf5:
        # get the datasets
        X_lam, id_lam = get_hdf5_datasets(hdf5)

        for i, path in enumerate(lam_files_generator()):
            # report progress only occasionally; printing all 2.2M
            # paths would overwhelm the notebook output
            if i % 10000 == 0:
                print(i, path)

            try:
                with fits.open(path) as hdulist:
                    fluxes = lamost.get_fluxes(hdulist)
                    waves = lamost.get_waves(hdulist)
                    X_lam[i, :] = preprocessing.resample_spectrum(waves, fluxes)
                    id_lam[i] = path
            # catch any exception so that a corrupt file does not
            # abort the whole run
            except Exception as e:
                print(e)
                continue
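
The function above is defined but not yet invoked; run it with the call below. Note that the loop index is tied to the file generator, so an interrupted run starts over from the beginning.


In [ ]:
create_lamost_matrix()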