This notebook examines whether the colours $w_1 - w_2$ and $w_2 - w_3$ are good features. There are indications that these colours correlate with whether a galaxy hosts an AGN. It also looks at whether fluxes are more useful than magnitudes, i.e., whether we should exponentiate the magnitudes.
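For reference, the exponentiation used below follows the usual astronomical magnitude-flux relation (assuming the catalogue magnitudes use the standard convention), so exponentiating a colour corresponds to taking a flux ratio:

$$F \propto 10^{-0.4\,m}, \qquad \frac{F_1}{F_2} = 10^{-0.4\,(m_1 - m_2)}.$$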
In [11]:
import h5py, numpy, sklearn.linear_model, sklearn.cross_validation, sklearn.metrics
In [30]:
with h5py.File('../data/training.h5') as f:
    # Columns 0-3 are the raw magnitudes, column 4 is the distance feature,
    # and the remaining columns are image features.
    raw_astro_features = f['features'][:, :4]
    dist_features = f['features'][:, 4]
    image_features = f['features'][:, 5:]
    # Colour features (magnitude differences).
    w1_w2 = raw_astro_features[:, 0] - raw_astro_features[:, 1]
    w2_w3 = raw_astro_features[:, 1] - raw_astro_features[:, 2]
    features_linear = f['features'][:]
    features_nonlinear = numpy.hstack([
        raw_astro_features,
        dist_features.reshape((-1, 1)),
        w1_w2.reshape((-1, 1)),
        w2_w3.reshape((-1, 1)),
        image_features,
    ])
    # Magnitudes converted to (relative) fluxes.
    features_exp = numpy.hstack([
        numpy.power(10, -0.4 * raw_astro_features),
        dist_features.reshape((-1, 1)),
        image_features,
    ])
    # Fluxes plus exponentiated colours (i.e. flux ratios).
    features_nlexp = numpy.hstack([
        numpy.power(10, -0.4 * raw_astro_features),
        numpy.power(10, -0.4 * w1_w2.reshape((-1, 1))),
        numpy.power(10, -0.4 * w2_w3.reshape((-1, 1))),
        dist_features.reshape((-1, 1)),
        image_features,
    ])
    labels = f['labels'].value
In [15]:
x_train, x_test, t_train, t_test = sklearn.cross_validation.train_test_split(
    numpy.arange(raw_astro_features.shape[0]), labels, test_size=0.2)
In [18]:
lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lr.fit(features_linear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lr.predict(features_linear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Linear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)
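The quantity computed above is the balanced accuracy, i.e. the mean of the per-class recalls, $\mathrm{BA} = \frac{1}{2}\left(\frac{TP}{P} + \frac{TN}{N}\right)$, where $P$ and $N$ are the numbers of positive and negative test examples (the row sums of the confusion matrix). This down-weights the majority class, which matters if the classes are imbalanced, hence also the `class_weight='balanced'` setting on the classifier.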
In [17]:
lrnl = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrnl.fit(features_nonlinear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrnl.predict(features_nonlinear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Nonlinear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)
So they may be somewhat useful features (but not very). What about the fact that they're magnitudes?
In [25]:
lrexp = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrexp.fit(features_exp[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrexp.predict(features_exp[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Exponentiated features, balanced accuracy: {:.02%}'.format(ba))
print(cm)
In [27]:
lrnlexp = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrnlexp.fit(features_nlexp[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrnlexp.predict(features_nlexp[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Exponentiated nonlinear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)
Those are promising results, but we need to rerun this a few times with different training and test sets to get some error bars.
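The cell below does this by hand with repeated `train_test_split` calls. Alternatively (a minimal sketch, assuming the same old `sklearn.cross_validation` API used above, and reusing the `labels` and `features_linear` arrays already defined), the repeated splits could be expressed with `StratifiedShuffleSplit` and `cross_val_score` plus a custom scorer:

```python
import sklearn.cross_validation, sklearn.linear_model, sklearn.metrics

def balanced_accuracy_score(t_true, t_pred):
    # Mean of the per-class recalls, as computed in the cells above.
    cm = sklearn.metrics.confusion_matrix(t_true, t_pred)
    n, p = cm.sum(axis=1)
    return (cm[1, 1] / p + cm[0, 0] / n) / 2

scorer = sklearn.metrics.make_scorer(balanced_accuracy_score)
splits = sklearn.cross_validation.StratifiedShuffleSplit(labels, n_iter=10, test_size=0.2)
scores = sklearn.cross_validation.cross_val_score(
    sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced'),
    features_linear, labels, scoring=scorer, cv=splits)
print('Linear features: ({:.02f} +- {:.02f})%'.format(scores.mean() * 100, scores.std() * 100))
```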
In [33]:
def balanced_accuracy(lr, x_test, t_test):
    # Balanced accuracy: mean of the per-class recalls.
    cm = sklearn.metrics.confusion_matrix(t_test, lr.predict(x_test))
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    return ba

def test_feature_set(features, x_train, t_train, x_test, t_test):
    lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
    lr.fit(features[x_train], t_train)
    return balanced_accuracy(lr, features[x_test], t_test)
linear_ba = []
nonlinear_ba = []
exp_ba = []
nonlinear_exp_ba = []
n_trials = 10
for trial in range(n_trials):
    print('Trial {}/{}'.format(trial + 1, n_trials))
    x_train, x_test, t_train, t_test = sklearn.cross_validation.train_test_split(
        numpy.arange(raw_astro_features.shape[0]), labels, test_size=0.2)
    linear_ba.append(test_feature_set(features_linear, x_train, t_train, x_test, t_test))
    nonlinear_ba.append(test_feature_set(features_nonlinear, x_train, t_train, x_test, t_test))
    exp_ba.append(test_feature_set(features_exp, x_train, t_train, x_test, t_test))
    nonlinear_exp_ba.append(test_feature_set(features_nlexp, x_train, t_train, x_test, t_test))
In [35]:
print('Linear features: ({:.02f} +- {:.02f})%'.format(
    numpy.mean(linear_ba) * 100, numpy.std(linear_ba) * 100))
print('Nonlinear features: ({:.02f} +- {:.02f})%'.format(
    numpy.mean(nonlinear_ba) * 100, numpy.std(nonlinear_ba) * 100))
print('Exponentiated features: ({:.02f} +- {:.02f})%'.format(
    numpy.mean(exp_ba) * 100, numpy.std(exp_ba) * 100))
print('Exponentiated nonlinear features: ({:.02f} +- {:.02f})%'.format(
    numpy.mean(nonlinear_exp_ba) * 100, numpy.std(nonlinear_exp_ba) * 100))