In [38]:
    
import sys
import astropy.io.ascii as asc
import astropy.coordinates
import crowdastro.crowd.util
import cmocean
import h5py
import keras.models
import numpy
import scipy.spatial
import scipy.stats
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import pipeline
# Try to set up matplotlib fonts.
import matplotlib
# http://bkanuka.com/articles/native-latex-plots/
def figsize(scale):
    fig_width_pt = 240.0
    inches_per_pt = 1.0/72.27
    golden_mean = (numpy.sqrt(5.0)-1.0)/2.0
    fig_width = fig_width_pt*inches_per_pt*scale
    fig_height = fig_width*golden_mean
    fig_size = [fig_width,fig_height]
    return fig_size
pgf_with_latex = {
    "pgf.texsystem": "pdflatex",
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": [],
    "font.sans-serif": [],
    "font.monospace": [],
    "axes.labelsize": 12,
    "font.size": 12,
    "legend.fontsize": 12,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "figure.figsize": figsize(0.9),
    "pgf.preamble": [
        r"\usepackage[utf8x]{inputenc}",
        r"\usepackage[T1]{fontenc}",
    ]
}
matplotlib.rcParams.update(pgf_with_latex)
swire_names_cdfs, swire_coords_cdfs, swire_features_cdfs = pipeline.generate_swire_features(field='cdfs')
swire_names_elais, swire_coords_elais, swire_features_elais = pipeline.generate_swire_features(field='elais')
swire_labels_cdfs = pipeline.generate_swire_labels(swire_names_cdfs, swire_coords_cdfs, field='cdfs')
swire_labels_elais = pipeline.generate_swire_labels(swire_names_elais, swire_coords_elais, field='elais')
(atlas_train_sets_cdfs, atlas_test_sets_cdfs), (swire_train_sets_cdfs, swire_test_sets_cdfs) = pipeline.generate_data_sets(swire_coords_cdfs, swire_labels_cdfs, field='cdfs')
(atlas_train_sets_elais, atlas_test_sets_elais), (swire_train_sets_elais, swire_test_sets_elais) = pipeline.generate_data_sets(swire_coords_elais, swire_labels_elais, field='elais')
table = asc.read('/Users/alger/data/Crowdastro/one-table-to-rule-them-all.tbl')
FALLOFF_SIGMA = 1 / 120
SEARCH_RADIUS = 1 / 60
import matplotlib.pyplot as plt
%matplotlib inline
    
In [2]:
    
swire_tree = scipy.spatial.KDTree(swire_coords_cdfs)
    
In [3]:
    
swire_name_to_index = {j:i for i, j in enumerate(swire_names_cdfs)}
    
In [4]:
    
_atlas_to_nearby_features = {}
_atlas_to_nearby_names = {}
_atlas_to_nearby_gaussians = {}
def get_xid_acc(classifier, quadrant):
    n_correct = 0
    n_total = 0
    for atlas in atlas_test_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0]:
        row = table[table['Key'] == atlas][0]
        ra = row['Component RA (Franzen)']
        dec = row['Component DEC (Franzen)']
        swire = row['Source SWIRE (Norris)']
        if not swire.startswith('SWIRE'):
            # Skip objects without a SWIRE host.
            continue
        # Find candidate hosts.
        if atlas in _atlas_to_nearby_features:
            nearby_features = _atlas_to_nearby_features[atlas]
            nearby_names = _atlas_to_nearby_names[atlas]
            nearby_gaussians = _atlas_to_nearby_gaussians[atlas]
        else:
            nearby = swire_tree.query_ball_point(numpy.array([ra, dec]), SEARCH_RADIUS)
            nearby_features = swire_features_cdfs[nearby]
            scoords = astropy.coordinates.SkyCoord(ra=ra, dec=dec, unit='deg')
            nearby_scoords = astropy.coordinates.SkyCoord(
                ra=swire_coords_cdfs[nearby, 0], dec=swire_coords_cdfs[nearby, 1], unit='deg')
            separations = numpy.array(scoords.separation(nearby_scoords).deg)
            nearby_gaussians = scipy.stats.norm.pdf(separations, scale=FALLOFF_SIGMA)
            nearby_names = [swire_names_cdfs[n] for n in nearby]
            _atlas_to_nearby_features[atlas] = nearby_features
            _atlas_to_nearby_names[atlas] = nearby_names
            _atlas_to_nearby_gaussians[atlas] = nearby_gaussians
        if not nearby_names:
            # No candidates.
            continue
        # Predict class probabilities.
        atpreds = classifier.predict_proba(nearby_features)
        if len(atpreds.shape) == 1:  # CNN
            pass
        elif atpreds.shape[1] == 2:  # LR
            atpreds = atpreds[:, 1]
        else:  # RF
            atpreds = atpreds[:, 0]
        # Multiply by Gaussians.
        atpreds *= nearby_gaussians
        # Cross-identify.
        name = nearby_names[numpy.argmax(atpreds)]
        n_correct += name == swire
        n_total += 1
    return n_correct / n_total
    
In [5]:
    
def get_gct_accs(classifier, quadrant, fit_threshold=True) -> ('balanced accuracy', 'auc'):
    train_features = swire_features_cdfs[swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant]]
    test_features = swire_features_cdfs[swire_test_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant]]
    train_labels = swire_labels_cdfs[swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant], 0]
    test_labels = swire_labels_cdfs[swire_test_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant], 0]
    # Pick a new threshold that maximises balanced accuracy.
    if fit_threshold:
        thresholds = numpy.linspace(0.4, 0.7, 100)
        pred_probs = classifier.predict_proba(train_features)
        if len(pred_probs.shape) == 1:  # CNN
            pass
        elif pred_probs.shape[1] == 2:  # LR
            pred_probs = pred_probs[:, 1]
        else:  # RF
            pred_probs = pred_probs[:, 0]
        threshold_bas = []
        for threshold in thresholds:
            try:
                ba = crowdastro.crowd.util.balanced_accuracy(
                    train_labels,
                    pred_probs > threshold)
            except ValueError:
                ba = 0.5
            threshold_bas.append(ba)
        best_threshold = thresholds[numpy.argmax(threshold_bas)]
    else:
        best_threshold = 0.5
    # Test on SWIRE (GCT).
    pred_probs = classifier.predict_proba(test_features)
    if len(pred_probs.shape) == 1:  # CNN
        pass
    elif pred_probs.shape[1] == 2:  # LR
        pred_probs = pred_probs[:, 1]
    else:  # RF
        pred_probs = pred_probs[:, 0]
    pred_labels = pred_probs > best_threshold
    auc = sklearn.metrics.roc_auc_score(test_labels, pred_probs)
    ba = crowdastro.crowd.util.balanced_accuracy(test_labels, pred_labels)
    return ba, auc
    
In [6]:
    
# Generate some classifiers and test them.
numpy.random.seed(0)
accs_gct = []
accs_xid = []
aucs_gct = []
n_max = swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris']].sum(axis=0).min()
# Train 16 LR models (four per quadrant).
# I want an ensemble of models so that any correlations we get by using the same model
# twice, but noised the second time around, are averaged out.
lrs = {q: [] for q in range(4)}
for _ in range(4):
    for quadrant in range(4):
        print('Quadrant:', quadrant)
        lr = sklearn.linear_model.LogisticRegression(class_weight='balanced',
                                                     C=100000.0)
        train = list(swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0])
        train_features = swire_features_cdfs[train]
        train_labels = swire_labels_cdfs[train, 0]
        lr.fit(train_features, train_labels)
        lrs[quadrant].append(lr)
for i in range(40):
    print(i)
    for q, lrs_ in lrs.items():
        for lr in lrs_:
            ba, auc = get_gct_accs(lr, q)
            acc = get_xid_acc(lr, q)
            accs_xid.append(acc)
            accs_gct.append(ba)
            aucs_gct.append(auc)
            # Noise the LR weights to drop the accuracies.
            lr.coef_ += numpy.random.normal(scale=lr.coef_.std() / 100, size=lr.coef_.shape)
    
    
In [7]:
    
def density_scatter(xs, ys, cmap='summer', log=False, marker='o', **kwargs):
    xy = numpy.vstack([xs, ys])
    z = scipy.stats.gaussian_kde(xy)(xy)
    if log:
        z = numpy.log(z)
    plt.scatter(xs, ys, c=z, marker=marker, cmap=cmap, **kwargs)
    
In [8]:
    
accs_gct_ = numpy.array(accs_gct)
accs_xid_ = numpy.array(accs_xid)
density_scatter(accs_gct_ * 100, accs_xid_ * 100,
                cmap=cmocean.cm.dense, edgecolors='k')
plt.xlabel('GCT balanced accuracy')
plt.ylabel('X-ID accuracy')
plt.grid(color='lightgrey', axis='y')
# plt.xscale('log')
# plt.yscale('log')
# plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/gct-to-xid-lr.pdf')
    
    
In [9]:
    
aucs_gct_ = numpy.array(aucs_gct)
accs_xid_ = numpy.array(accs_xid)
density_scatter(aucs_gct_ * 100, accs_xid_ * 100,
                cmap=cmocean.cm.dense, edgecolors='k')
plt.xlabel('GCT AUC')
plt.ylabel('X-ID accuracy')
plt.grid(color='lightgrey', axis='y')
    
    
In [10]:
    
aucs_gct_ = numpy.array(aucs_gct)
accs_gct_ = numpy.array(accs_gct)
accs_xid_ = numpy.array(accs_xid)
density_scatter(accs_gct_ * 100, accs_xid_ * 100,
                cmap='Greens', edgecolors='k',
                label='BA', marker='s')
density_scatter(aucs_gct_ * 100, accs_xid_ * 100,
                cmap='Blues', edgecolors='k',
                label='AUC', marker='o')
plt.xlabel('GCT performance')
plt.ylabel('X-ID accuracy')
plt.legend()
plt.grid(color='lightgrey', axis='y')
    
    
In [11]:
    
bin_means = []
bin_stdevs = []
n_bins = 10
bins = numpy.linspace(accs_gct_.min(), accs_gct_.max(), n_bins)
bins_ = numpy.digitize(accs_gct_, bins=bins)
for b in range(n_bins):
    bin_means.append(accs_xid_[bins_ == b].mean())
    bin_stdevs.append(accs_xid_[bins_ == b].std())
bin_means = numpy.array(bin_means)
bin_stdevs = numpy.array(bin_stdevs)
plt.plot(bins, bin_means)
plt.fill_between(bins, bin_means - bin_stdevs, bin_means + bin_stdevs, alpha=0.2)
    
    
    Out[11]:
    
In [12]:
    
# Generate some classifiers and test them.
numpy.random.seed(0)
accs_gct_rf = []
accs_xid_rf = []
aucs_gct_rf = []
rfs = {q: [] for q in range(4)}
n_max = swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris']].sum(axis=0).min()
# Train 16 * 40 RF models (4 * 40 per quadrant).
# We can't add noise to random forests so we'll have to retrain them with subsets of the training data.
for quadrant in range(4):
    print(quadrant)
    train = list(swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], quadrant].nonzero()[0])
    for i in range(4 * 40):
        subset_size = numpy.linspace(10, 500, 4 * 40, dtype=int)[i]
        rf = sklearn.ensemble.RandomForestClassifier(class_weight='balanced',
                                                     criterion='entropy',
                                                     min_samples_leaf=45)
        numpy.random.shuffle(train)
        train_subset = train[:subset_size]
        train_features = swire_features_cdfs[train_subset]
        train_labels = swire_labels_cdfs[train_subset, 0]
        rf.fit(train_features, train_labels)
        rfs[quadrant].append(rf)
        ba, auc = get_gct_accs(rf, q)
        acc = get_xid_acc(rf, q)
        accs_xid_rf.append(acc)
        accs_gct_rf.append(ba)
        aucs_gct_rf.append(auc)
    
    
In [13]:
    
accs_gct_rf_ = numpy.array(accs_gct_rf)
accs_xid_rf_ = numpy.array(accs_xid_rf)[accs_gct_rf_ > 0.5]
accs_gct_rf_ = numpy.array(accs_gct_rf)[accs_gct_rf_ > 0.5]
density_scatter(accs_gct_rf_ * 100, accs_xid_rf_ * 100,
                cmap=cmocean.cm.dense, edgecolors='k')
plt.xlabel('GCT balanced accuracy')
plt.ylabel('X-ID accuracy')
plt.grid(color='lightgrey', axis='y')
# plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/gct-to-xid-rf.pdf')
    
    
In [14]:
    
accs_gct_rf_ = numpy.array(accs_gct_rf)
accs_xid_rf_ = numpy.array(accs_xid_rf)[accs_gct_rf_ > 0.5]
aucs_gct_rf_ = numpy.array(aucs_gct_rf)[accs_gct_rf_ > 0.5]
density_scatter(aucs_gct_rf_ * 100, accs_xid_rf_ * 100,
                cmap=cmocean.cm.dense, edgecolors='k')
plt.xlabel('GCT balanced accuracy')
plt.ylabel('X-ID accuracy')
plt.grid(color='lightgrey', axis='y')
# plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/gct-to-xid-rf.pdf')
    
    
In [15]:
    
accs_gct_rf_ = numpy.array(accs_gct_rf)
aucs_gct_rf_ = numpy.array(aucs_gct_rf)[accs_gct_rf_ > 0.5]
accs_xid_rf_ = numpy.array(accs_xid_rf)[accs_gct_rf_ > 0.5]
accs_gct_rf_ = numpy.array(accs_gct_rf)[accs_gct_rf_ > 0.5]
density_scatter(accs_gct_rf_ * 100, accs_xid_rf_ * 100,
                cmap='Greens', edgecolors='k',
                label='BA', marker='s')
density_scatter(aucs_gct_rf_ * 100, accs_xid_rf_ * 100,
                cmap='Blues', edgecolors='k',
                label='AUC', marker='o')
plt.xlabel('GCT performance')
plt.ylabel('X-ID accuracy')
plt.legend()
plt.grid(color='lightgrey', axis='y')
    
    
In [16]:
    
raise NotImplementedError()
# Generate some classifiers and test them.
numpy.random.seed(0)
accs_gct_cnn = []
accs_xid_cnn = []
aucs_gct_cnn = []
n_max = swire_train_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris']].sum(axis=0).min()
# Generate 4 CNN models (one per quadrant). Noise them to get new classifiers.
cnns = {q: [] for q in range(4)}
for _ in range(1):
    for quadrant in range(4):
        print('Quadrant:', quadrant)
        # Load the trained CNN from file.
        with open('/Users/alger/data/Crowdastro/model_03_06_17.json') as f:
            cnn = keras.models.model_from_json(f.read())
            path = '/Users/alger/data/Crowdastro/weights_03_06_17/weights_{}_norris.h5'.format(quadrant)
            cnn.load_weights(path)
        cnns[quadrant].append(pipeline.CNN(cnn))
for i in range(10):
    print('Iteration', i)
    for q, cnns_ in cnns.items():
        print('Quadrant', q)
        for cnn in cnns_:
            ba, auc = get_gct_accs(cnn, q)
            acc = get_xid_acc(cnn, q)
            accs_xid_cnn.append(acc)
            accs_gct_cnn.append(ba)
            aucs_gct_cnn.append(auc)
            # Noise the CNN weights to drop the accuracies.
            weights = cnn._cnn.get_weights()
            for i in range(len(weights)):
                w = weights[i]
                w += numpy.random.normal(scale=w.std() / 5, size=w.shape)
            cnn._cnn.set_weights(weights)
    
    
In [17]:
    
plt.scatter(accs_gct_cnn, accs_xid_cnn)
    
    
In [18]:
    
rf_line = scipy.stats.linregress(accs_gct_rf_[accs_gct_rf_ > 0.51], accs_xid_rf_[accs_gct_rf_ > 0.51])
lr_line = scipy.stats.linregress(accs_gct_[accs_gct_ > 0.51], accs_xid_[accs_gct_ > 0.51])
print('rf: {}'.format(rf_line.rvalue))
print('lr: {}'.format(lr_line.rvalue))
    
    
In [42]:
    
fs = figsize(0.9)
fs = fs[0] * 1.5, fs[1] * 3
fig = plt.figure(figsize=fs)
plt.subplot(2, 1, 1)
density_scatter(
    accs_gct_rf_[accs_gct_rf_ > 0.51] * 100,
    accs_xid_rf_[accs_gct_rf_ > 0.51] * 100, log=True, cmap='cool')
plt.plot(numpy.array(sorted(accs_gct_rf_[accs_gct_rf_ > 0.51])) * 100,
         numpy.array(sorted(accs_gct_rf_[accs_gct_rf_ > 0.51])) * rf_line.slope * 100 + rf_line.intercept * 100,
         linewidth=1, color='k', linestyle='solid')
plt.plot(numpy.linspace(0, 100, 2), numpy.linspace(0, 100, 2), linestyle='--', color='blue',
         linewidth=1)
plt.gca().tick_params(axis='both', which='major', direction='out', length=5)
plt.minorticks_on()
plt.gca().tick_params(axis='both', which='minor', direction='out', length=3)
plt.ylabel('Cross-identification\naccuracy (per cent)')
plt.xticks([])
plt.annotate('RF', (50, 85))
plt.xlim((45, 100))
plt.ylim((40, 100))
plt.yticks(range(50, 100, 10))
plt.subplot(2, 1, 2)
density_scatter(accs_gct_[accs_gct_ > 0.51] * 100, accs_xid_[accs_gct_ > 0.51] * 100, log=True, cmap='cool')
plt.plot(numpy.array(sorted(accs_gct_[accs_gct_ > 0.51])) * 100,
         numpy.array(sorted(accs_gct_[accs_gct_ > 0.51])) * lr_line.slope * 100 + lr_line.intercept * 100,
         linewidth=1, color='k', linestyle='solid')
plt.xlabel('Classification balanced accuracy\n(per cent)')
plt.ylabel('Cross-identification\naccuracy (per cent)')
plt.xlim((45, 100))
plt.ylim((40, 100))
plt.yticks(range(50, 100, 10))
plt.annotate('LR', (50, 85))
plt.plot(numpy.linspace(0, 100, 2), numpy.linspace(0, 100, 2), linestyle='--', color='blue',
         linewidth=1)
plt.gca().tick_params(axis='both', which='major', direction='out', length=5)
plt.minorticks_on()
plt.gca().tick_params(axis='both', which='minor', direction='out', length=3)
import matplotlib.cm, matplotlib.colors, matplotlib.colorbar
cbar_ax = fig.add_axes([0.75, 0.12, 0.05, 0.76])
norm = matplotlib.colors.Normalize(vmin=0, vmax=1)
cb1 = matplotlib.colorbar.ColorbarBase(cbar_ax,
                                       norm=norm,
                                       orientation='vertical',
                                       cmap='cool')
cb1.set_label('log(classifier density)')
plt.subplots_adjust(hspace=0, right=0.7, left=0.2)
plt.savefig('/Users/alger/repos/crowdastro-projects/ATLAS-CDFS/images/gct-to-xid.pdf')
    
    
In [ ]:
    
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(accs_gct_[accs_gct_ > 0.51], accs_xid_[accs_gct_ > 0.51])
slope_rf, intercept_rf, r_value_rf, p_value_rf, std_err_rf = scipy.stats.linregress(accs_gct_rf_[accs_gct_rf_ > 0.51], accs_xid_rf_[accs_gct_rf_ > 0.51])
print(r_value**2, r_value_rf**2)
print(std_err, std_err_rf)
    
In [ ]:
    
aucs_rf_ = numpy.array(aucs_rf)
xs = numpy.log(aucs_rf_[aucs_rf_ > 0.51])
ys = numpy.log(accs_xid_rf_[aucs_rf_ > 0.51])
plt.scatter(xs, ys)
m, b = numpy.polyfit(xs, ys, 1)
plt.plot(xs, m * xs + b, c='black')
print(m, b)
plt.xlabel('log(GCT AUC)')
plt.ylabel('log(XID accuracy)')
# N = 20
# plt.scatter(xs, numpy.exp(-N / 200) * xs ** N + (1 - numpy.exp(-N / 200)))
    
In [ ]:
    
numpy.exp(0.19)
    
So only the ~8 galaxies closest the radio object matter under my model. What's the expected radius we need to get $N$ galaxies?
Do the units check out?
$$m = \sqrt{\frac{1}{1 \times \frac{1}{m^2}}} = m$$Yup, looks good. What's the average density of galaxies, per square arcmin?
In [66]:
    
densities = []
for atlas in atlas_test_sets_cdfs[:, pipeline.SET_NAMES['RGZ & Norris'], 0].nonzero()[0]:
    row = table[table['Key'] == atlas][0]
    ra = row['Component RA (Franzen)']
    dec = row['Component DEC (Franzen)']
    swire = row['Source SWIRE (Norris)']
    if not swire.startswith('SWIRE'):
        continue
    nearby = swire_tree.query_ball_point(numpy.array([ra, dec]), 1 / 60)
    n = len(nearby)
    area = numpy.pi
    densities.append(n / area)
print(numpy.mean(densities))
    
    
So
$$R_8 = \sqrt{\frac{8}{8 \pi}} = \frac{1}{\sqrt{\pi}} \approx 0.56 \text{ arcmin}$$is the radius in which we expect to find "difficult" examples. What's that look like on an image?
In [71]:
    
import aplpy
    
In [105]:
    
f = aplpy.FITSFigure('/Users/alger/data/RGZ/cdfs/2x2/CI0020_radio.fits')
f.show_grayscale()
f.show_rectangles(*f.pixel2world(100, 100), 2 * 0.56 / 60, 2 * 0.56 / 60, edgecolor='red', linewidth=10)
f.show_rectangles(*f.pixel2world(70, 70),
                  *numpy.abs(numpy.array(f.pixel2world(100, 100)) - numpy.array(f.pixel2world(132, 132))),
                  edgecolor='pink', linewidth=5)
    
    
    
    
In [80]:
    
    
    Out[80]:
In [ ]: