In [1]:
import sklearn.linear_model, h5py, numpy, crowdastro.crowd.util
In [20]:
# Load the feature matrix, the Norris labels and the train/test split sets.
# `dataset[()]` reads the whole dataset into memory; it replaces the
# `Dataset.value` attribute, which was deprecated and then removed in h5py 3.0.
with h5py.File('/Users/alger/data/Crowdastro/all_training_data_01_05_17.h5', 'r') as training_h5:
    features = training_h5['features'][()]
    labels = training_h5['norris_labels'][()]
    train_sets = training_h5['sets/RGZ & Norris/train'][()]
    test_sets = training_h5['sets/RGZ & Norris/test'][()]
# Read five consecutive columns (indices 2 to 6) of the numeric SWIRE table.
with h5py.File('/Users/alger/data/Crowdastro/crowdastro-swire.h5', 'r') as swire_h5:
    swire_features = swire_h5['/swire/cdfs/numeric'][:, 2:7]
# Per-band substitute flux, written into `swire_features` wherever an entry
# holds a placeholder instead of a measurement. Keys 36/45/58/80/24 are used
# for columns 0-4 respectively.
# NOTE(review): presumably the Spitzer 3.6/4.5/5.8/8.0/24 um sensitivity limits
# in the catalogue's flux units — confirm against the survey documentation.
SPITZER_SENSITIVITIES = {
    36: 7.3,
    45: 9.7,
    58: 27.5,
    80: 32.5,
    24: 450,
}

# Replace placeholder fluxes (-99 or exactly 0) with the band's value above.
# Zeros are replaced in every column, including columns 0 and 1, which the
# earlier copy-pasted assignments skipped: the colour features take log10 of
# columns 0-3, and log10(0) is -inf.
for column, band in enumerate((36, 45, 58, 80, 24)):
    fluxes = swire_features[:, column]
    placeholder = (fluxes == -99) | (fluxes == 0)
    swire_features[placeholder, column] = SPITZER_SENSITIVITIES[band]
# Log-flux differences between every pair of the first four columns.
# The logarithm is taken once and reused for all six pairs.
log_flux = numpy.log10(swire_features[:, :4])
colour_colour_36_45 = log_flux[:, 0] - log_flux[:, 1]
colour_colour_36_58 = log_flux[:, 0] - log_flux[:, 2]
colour_colour_36_80 = log_flux[:, 0] - log_flux[:, 3]
colour_colour_45_58 = log_flux[:, 1] - log_flux[:, 2]
colour_colour_45_80 = log_flux[:, 1] - log_flux[:, 3]
colour_colour_58_80 = log_flux[:, 2] - log_flux[:, 3]
In [16]:
# Train LR on the 36-45 and 45-58 features.
# NOTE(review): this assumes `features` holds those colour features — confirm
# against the contents of the training HDF5 file.
#
# Standardise once, into a new array. Binding a second name to `features` and
# normalising through it in place would rewrite the shared array on every
# fold, which makes the cell non-idempotent and changes what later cells see.
# The mean and standard deviation are taken over all rows, not just the
# training rows of each split.
features_std = features - features.mean(axis=0)
features_std /= features_std.std(axis=0)

# One balanced-accuracy score per train/test split.
bas = []
for train, test in zip(train_sets, test_sets):
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(features_std[train], labels[train])
    p = lr.predict(features_std[test])
    bas.append(crowdastro.crowd.util.balanced_accuracy(labels[test], p))

# Mean and spread of the balanced accuracy across splits.
print(numpy.mean(bas), numpy.std(bas))
In [21]:
# Train LR on all flux ratio features.
#
# The design matrix does not depend on the split, so it is assembled and
# standardised once here rather than on every fold. Columns are the original
# `features` columns followed by the six colour features; the stacking
# requires `features` and the colour arrays to have the same number of rows.
features_all = numpy.vstack([
    features.T,
    colour_colour_36_45,
    colour_colour_36_58,
    colour_colour_36_80,
    colour_colour_45_58,
    colour_colour_45_80,
    colour_colour_58_80,
]).T
# In-place is safe here: vstack returned a fresh array, not a view of `features`.
features_all -= features_all.mean(axis=0)
features_all /= features_all.std(axis=0)

# One balanced-accuracy score per train/test split.
bas_all = []
for train, test in zip(train_sets, test_sets):
    lr = sklearn.linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(features_all[train], labels[train])
    p = lr.predict(features_all[test])
    bas_all.append(crowdastro.crowd.util.balanced_accuracy(labels[test], p))

# Mean and spread of the balanced accuracy across splits.
print(numpy.mean(bas_all), numpy.std(bas_all))
In [28]:
# Show the first five and the last six coefficients of the most recently
# fitted model (`lr` is whatever the last executed training loop left behind).
coefficients = lr.coef_.ravel()
[*coefficients[:5], *coefficients[-6:]]
Out[28]:
In [ ]: