Nonlinear Astro Features

This notebook examines whether $w_1 - w_2$ and $w_2 - w_3$ are good features. There are indications that these colours may be correlated with whether galaxies contain AGNs. It also looks at whether fluxes are more useful than magnitudes, i.e., whether we should exponentiate the magnitudes.
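
For reference, a magnitude $m$ corresponds to a relative flux of $10^{-0.4 m}$; "exponentiating the magnitudes" below means applying exactly this transformation to each magnitude column.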


In [11]:
import h5py, numpy, sklearn.linear_model, sklearn.cross_validation, sklearn.metrics

In [30]:
with h5py.File('../data/training.h5') as f:
    # Columns 0-3 are the raw astronomical magnitudes, column 4 is the
    # distance feature, and the remaining columns are image features.
    raw_astro_features = f['features'][:, :4]
    dist_features = f['features'][:, 4]
    image_features = f['features'][:, 5:]
    
    # Colour features: differences between adjacent magnitude bands.
    w1_w2 = raw_astro_features[:, 0] - raw_astro_features[:, 1]
    w2_w3 = raw_astro_features[:, 1] - raw_astro_features[:, 2]
    
    # Baseline: all of the stored features, unchanged.
    features_linear = f['features'][:]
    # Baseline plus the two colour features.
    features_nonlinear = numpy.hstack([
            raw_astro_features,
            dist_features.reshape((-1, 1)),
            w1_w2.reshape((-1, 1)),
            w2_w3.reshape((-1, 1)),
            image_features,
    ])
    # Magnitudes converted to relative fluxes: flux ~ 10 ** (-0.4 * magnitude).
    features_exp = numpy.hstack([
            numpy.power(10, -0.4 * raw_astro_features),
            dist_features.reshape((-1, 1)),
            image_features,
    ])
    # As above, but with the colour features exponentiated as well.
    features_nlexp = numpy.hstack([
            numpy.power(10, -0.4 * raw_astro_features),
            numpy.power(10, -0.4 * w1_w2.reshape((-1, 1))),
            numpy.power(10, -0.4 * w2_w3.reshape((-1, 1))),
            dist_features.reshape((-1, 1)),
            image_features,
    ])
    labels = f['labels'].value

In [15]:
x_train, x_test, t_train, t_test = sklearn.cross_validation.train_test_split(
        numpy.arange(raw_astro_features.shape[0]), labels, test_size=0.2)

In [18]:
lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lr.fit(features_linear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lr.predict(features_linear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)  # row sums: number of actual negatives and positives
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2  # balanced accuracy: mean of per-class recall
print('Linear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)


Linear features, balanced accuracy: 88.20%
[[4114  268]
 [  78  368]]
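
As a check, this balanced accuracy follows directly from the confusion matrix above: the true negative rate is $4114 / 4382 \approx 0.939$, the true positive rate is $368 / 446 \approx 0.825$, and their mean is $\approx 0.882$, matching the 88.20% printed above.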

In [17]:
lrnl = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrnl.fit(features_nonlinear[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrnl.predict(features_nonlinear[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Nonlinear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)


Nonlinear features, balanced accuracy: 88.52%
[[4103  279]
 [  74  372]]

So the colour features may be useful, but only marginally. What about the fact that these are magnitudes rather than fluxes?
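
Note that exponentiation is a strongly nonlinear transformation: a difference of one magnitude corresponds to a flux ratio of $10^{0.4} \approx 2.512$, so a linear classifier sees rather different structure in flux space than in magnitude space.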


In [25]:
lrexp = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrexp.fit(features_exp[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrexp.predict(features_exp[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Exponentiated features, balanced accuracy: {:.02%}'.format(ba))
print(cm)


Exponentiated features, balanced accuracy: 89.10%
[[4124  258]
 [  71  375]]

In [27]:
lrnlexp = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
lrnlexp.fit(features_nlexp[x_train], t_train)
cm = sklearn.metrics.confusion_matrix(t_test, lrnlexp.predict(features_nlexp[x_test]))
tp = cm[1, 1]
n, p = cm.sum(axis=1)
tn = cm[0, 0]
ba = (tp / p + tn / n) / 2
print('Exponentiated nonlinear features, balanced accuracy: {:.02%}'.format(ba))
print(cm)


Exponentiated nonlinear features, balanced accuracy: 89.35%
[[4107  275]
 [  67  379]]

Those are promising results, but we need to rerun this a few times with different training and testing sets to get some error bars.


In [33]:
def balanced_accuracy(lr, x_test, t_test):
    # Balanced accuracy: the mean of the per-class recalls, so that the rare
    # positive class is weighted equally with the dominant negative class.
    cm = sklearn.metrics.confusion_matrix(t_test, lr.predict(x_test))
    tp = cm[1, 1]
    n, p = cm.sum(axis=1)  # row sums: number of actual negatives and positives
    tn = cm[0, 0]
    ba = (tp / p + tn / n) / 2
    return ba

def test_feature_set(features, x_train, t_train, x_test, t_test):
    # Fit a logistic regression on one feature matrix and return its
    # balanced accuracy on the held-out test indices.
    lr = sklearn.linear_model.LogisticRegression(C=100.0, class_weight='balanced')
    lr.fit(features[x_train], t_train)
    return balanced_accuracy(lr, features[x_test], t_test)

linear_ba = []
nonlinear_ba = []
exp_ba = []
nonlinear_exp_ba = []

# Repeat the split / fit / evaluate cycle to get a spread of scores.
n_trials = 10
for trial in range(n_trials):
    print('Trial {}/{}'.format(trial + 1, n_trials))
    x_train, x_test, t_train, t_test = sklearn.cross_validation.train_test_split(
        numpy.arange(raw_astro_features.shape[0]), labels, test_size=0.2)
    linear_ba.append(test_feature_set(features_linear, x_train, t_train, x_test, t_test))
    nonlinear_ba.append(test_feature_set(features_nonlinear, x_train, t_train, x_test, t_test))
    exp_ba.append(test_feature_set(features_exp, x_train, t_train, x_test, t_test))
    nonlinear_exp_ba.append(test_feature_set(features_nlexp, x_train, t_train, x_test, t_test))


[0.87926733193277307, 0.88709672756497815, 0.88338354869339275, 0.89836929366341134, 0.87160312684947727, 0.87634941658782717, 0.87592505060404435, 0.89230593847556916, 0.89844451547113535, 0.86800810449011045]
[0.8515625, 0.88978171720807331, 0.87759469717527616, 0.89028843970020444, 0.86731233971197474, 0.86569082308420053, 0.87744690610655063, 0.8868778280542986, 0.89511513933914744, 0.85676049910927476]
[0.88248424369747902, 0.90201871711218939, 0.90199786319816277, 0.89785600726777193, 0.90828134247386072, 0.901931251970987, 0.90776139078849805, 0.894419306184012, 0.90695059586320259, 0.89731786414315262]
[0.88453584558823528, 0.90356851867109755, 0.90017429356059719, 0.9004042698160345, 0.90953590451765631, 0.90642144433932514, 0.90758745067882418, 0.89259532077589532, 0.90993466812778712, 0.89731786414315262]

In [35]:
print('Linear features: ({:.02f} +- {:.02f})%'.format(
        numpy.mean(linear_ba) * 100, numpy.std(linear_ba) * 100))
print('Nonlinear features: ({:.02f} +- {:.02f})%'.format(
        numpy.mean(nonlinear_ba) * 100, numpy.std(nonlinear_ba) * 100))
print('Exponentiated features: ({:.02f} +- {:.02f})%'.format(
        numpy.mean(exp_ba) * 100, numpy.std(exp_ba) * 100))
print('Exponentiated nonlinear features: ({:.02f} +- {:.02f})%'.format(
        numpy.mean(nonlinear_exp_ba) * 100, numpy.std(nonlinear_exp_ba) * 100))


Linear features: (88.31 +- 1.02)%
Nonlinear features: (87.58 +- 1.43)%
Exponentiated features: (90.01 +- 0.73)%
Exponentiated nonlinear features: (90.12 +- 0.77)%
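
Since each trial evaluates all four feature sets on the same train/test split, one quick way to judge whether the gain over the linear baseline exceeds the trial-to-trial scatter is to look at the paired differences. A minimal sketch, reusing the lists collected in the loop above:


In [ ]:
diff = numpy.array(nonlinear_exp_ba) - numpy.array(linear_ba)
print('Exponentiated nonlinear - linear: ({:.02f} +- {:.02f})%'.format(
        numpy.mean(diff) * 100, numpy.std(diff) * 100))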

In [ ]: