In [1]:
%matplotlib inline
In [2]:
import numpy as np
import collections
import math
import matplotlib.pyplot as plt
import h5py
import csv
In [3]:
# Path of the CSV file with the spectrum labels; it is read with csv.DictReader in the next cell.
LABELS_FILE = 'data/ondrejov-dataset.csv'
In [4]:
# Read the whole labels CSV into memory: one dict per row, keyed by the
# column names taken from the header line.
with open(LABELS_FILE, newline='') as labels_csv:
    reader = csv.DictReader(labels_csv)
    labels = list(reader)
In [14]:
# Class distribution of the labelled spectra: printed summary plus bar chart.
# Label codes as used by the plots in this notebook:
# 0 = emission, 1 = absorption, 2 = unknown, 3 = double-peak.
CLASS_NAMES = {0: 'emission', 1: 'absorption', 2: 'unknown', 3: 'double-peak'}

counts = collections.Counter(row['label'] for row in labels)
print(counts)
for label, cnt in counts.items():
    percents = cnt / len(labels) * 100
    print('{} is {}%'.format(label, round(percents, 2)))

# Derive the tick names from the labels that are actually present, so a bar
# can never be paired with the wrong class name (the previous hard-coded
# tuple only matched when all four classes occurred in the CSV).
ordered_labels = sorted(counts, key=int)
idx = np.arange(len(ordered_labels))
plt.bar(idx, [counts[label] for label in ordered_labels])
plt.xticks(idx, [CLASS_NAMES.get(int(label), label) for label in ordered_labels])
plt.ylabel('number of spectra')
plt.xlabel('class')
plt.title('portion of each class in Ondřejov dataset');
In [6]:
# HDF5 file with the spectra; its 'spectra' entry is indexed by spectrum
# identifier in the cells below.
SPECTRA_FILE = 'data/data.hdf5'
# Open explicitly in 'r+' (read/write, file must already exist): a later cell
# stores each label as an attribute on its spectrum, which needs write access,
# and h5py's default mode depends on the installed version (read-only since
# h5py 3.0). 'r+' also fails fast on a wrong path instead of creating an
# empty file.
f = h5py.File(SPECTRA_FILE, 'r+')
spectra = f['spectra']
In [7]:
def plot_class(spectrum, ax, class_name):
    """Draw one example spectrum on ``ax`` and annotate the panel.

    ``spectrum[0]`` is plotted on the x axis (labelled as wavelength) and
    ``spectrum[1]`` on the y axis (labelled as flux). ``class_name`` becomes
    the panel title. A faint vertical line at 6562.8 is added and listed in
    the legend as H-alpha.
    """
    wavelengths, flux = spectrum[0], spectrum[1]
    ax.plot(wavelengths, flux)
    ax.set_title(class_name)
    ax.set_xlabel('wavelength (Angstrom)')
    ax.set_ylabel('flux')
    h_alpha_marker = dict(x=6562.8, color='black', label='H-alpha', alpha=0.25)
    ax.axvline(**h_alpha_marker)
    ax.legend()
# One stacked panel per example spectrum; identifiers and class names are
# paired by position.
idents = ['lb160035', 'a201403300026', 'si220021']
classes = ['emission', 'absorption', 'double-peak']
fig, axs = plt.subplots(3, 1)
for panel, ax in enumerate(axs):
    plot_class(spectra[idents[panel]], ax, classes[panel])
fig.tight_layout()
In [16]:
# Store every CSV label as an integer 'label' attribute on its spectrum.
for row in labels:
    # The spectrum key is the part of the CSV id after the last '/'.
    ident = row['id'].rpartition('/')[2]
    spectra[ident].attrs['label'] = int(row['label'])
In [9]:
# Overlay all spectra of a class on one panel, one panel per class.
fig, (ax0, ax1, ax3) = plt.subplots(3, 1)
# List index = label code; label 2 (unknown) has no panel and is skipped.
axs = [ax0, ax1, None, ax3]
panel_titles = ['emission', 'absorption', None, 'double-peak']
# Title and label each panel so the figure can be read on its own.
for ax, title in zip(axs, panel_titles):
    if ax is None:
        continue
    ax.set_title(title)
    ax.set_xlabel('wavelength (Angstrom)')
    ax.set_ylabel('flux')
for ident, data in spectra.items():
    # `data` already is this spectrum, so read the attribute from it directly
    # instead of looking it up in the file a second time.
    label = int(data.attrs['label'])
    if label == 2:
        continue
    axs[label].plot(data[0], data[1], alpha=0.1, lw=0.5)
fig.tight_layout()
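This analysis shows that the highest starting wavelength among all spectra is 6518.4272 Angstrom (the code below calls this value the infimum, because it is the lower bound of the wavelength range covered by every spectrum). That is fairly high, but H-alpha lies at 6562.8 Angstrom and H-alpha is the main feature. Starting there may shorten the range of values and thus speed up training. I also reviewed some spectra, and this start is far enough from H-alpha. Therefore 6519 Angstrom should be chosen as the starting wavelength.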
In [10]:
# Starting wavelength of every spectrum, keyed by identifier; used below to
# find the spectrum that starts at the highest value.
# Element [0, 0] of a spectrum is its first wavelength.
wave_starts = {ident: data[0, 0] for ident, data in spectra.items()}
In [11]:
# Histogram of the starting wavelengths; the last line echoes the bin counts
# and bin edges as the cell output.
# The patches are bound to `_patches` rather than `_`, because assigning to
# `_` shadows IPython's "last output" variable.
starts_n, starts_bins, _patches = plt.hist(list(wave_starts.values()))
plt.title('wavelength starts')
plt.xlabel('wavelength (Angstrom)')
plt.ylabel('number of spectra')
starts_n, starts_bins
Out[11]:
In [12]:
# The lower bound of the wavelength range shared by all spectra is the
# largest starting wavelength; take the maximum directly instead of sorting
# the whole mapping, reversing it and indexing the first element.
infimum = max(wave_starts.values())
# Round up so the chosen start lies inside every spectrum.
print('infimum:', math.ceil(infimum), 'Angstrom')
In [13]:
# Ten (identifier, starting wavelength) pairs with the highest starts,
# highest first.
by_start = sorted(wave_starts.items(), key=lambda item: item[1])
by_start[::-1][:10]
Out[13]:
In [14]:
def plot_spectrum(ident):
    """Plot the spectrum stored under ``ident`` via ``plt.plot``.

    Row 0 of the stored spectrum is used as x and row 1 as y. The curve is
    labelled with ``ident`` so that a following ``plt.legend()`` names it.
    """
    rows = spectra[ident]
    x_values, y_values = rows[0], rows[1]
    plt.plot(x_values, y_values, label=ident)
# Overlay a handful of spectra in one figure for visual comparison.
for ident in ('la220044', 'a201504060008', 'a201504060037', 'td210007', 'qd260023'):
    plot_spectrum(ident)
plt.legend();
In [15]:
# Ending wavelength of every spectrum, keyed by identifier; used below to
# find the spectrum that ends at the lowest value.
# Element [0, -1] of a spectrum is its last wavelength.
wave_ends = {ident: data[0, -1] for ident, data in spectra.items()}
In [16]:
# Histogram of the ending wavelengths; the last line echoes the bin counts
# and bin edges as the cell output.
# The patches are bound to `_patches` rather than `_`, because assigning to
# `_` shadows IPython's "last output" variable.
ends_n, ends_bins, _patches = plt.hist(list(wave_ends.values()))
plt.title('wavelength ends')
plt.xlabel('wavelength (Angstrom)')
plt.ylabel('number of spectra')
ends_n, ends_bins
Out[16]:
In [17]:
# The upper bound of the wavelength range shared by all spectra is the
# smallest ending wavelength; take the minimum directly instead of sorting
# the whole mapping and indexing the first element.
supremum = min(wave_ends.values())
# Round down so the chosen end lies inside every spectrum.
print('supremum:', math.floor(supremum), 'Angstrom')
In [18]:
# Ten (identifier, ending wavelength) pairs with the lowest ends, lowest first.
by_end = sorted(wave_ends.items(), key=lambda item: item[1])
by_end[:10]
Out[18]:
In [19]:
# Overlay two spectra in one figure for visual comparison.
for ident in ('pb060015', 'lb160035'):
    plot_spectrum(ident)
plt.legend();
In [20]:
# Close the HDF5 file handle that was opened earlier in the notebook.
f.close()