This notebook generates a table of radio components in the CDFS and ELAIS-S1 fields, according to various incarnations of the ATLAS survey. To run it, you will need a MongoDB server with the RGZ database loaded. All other data is fetched from the internet.
In the following cell, specify the MongoDB server details:
In [2]:
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
In this cell, specify if you have access to a crowdastro output file (crowdastro.h5), and if so, where it is:
In [3]:
USING_CROWDASTRO = True
CROWDASTRO_PATH = 'crowdastro-swire.h5'
# To get this file, run `crowdastro import_data --ir swire`.
In this cell, specify if you have access to a CSV of the Fan et al. (2015) cross-identifications, and if so, where it is:
In [4]:
USING_FAN = True
FAN_PATH = 'fan_2015_a.csv'
In this cell, specify if you have access to the 11 January 2014 Franzen catalogue, and if so, where it is:
In [5]:
USING_FRANZEN = True
FRANZEN_PATH = 'ATLASDR3_CDFS_cmpcat_11JAN2014.dat'
Next, we will fetch the resources we need.
In [6]:
NORRIS_COMPONENTS_URI = 'http://www.atnf.csiro.au/people/rnorris/papers/n202/tab4.txt'
NORRIS_CROSS_IDENTIFICATIONS_URI = 'http://www.atnf.csiro.au/people/rnorris/papers/n202/tab6.txt'
MIDDELBERG_COMPONENTS_URI = 'http://iopscience.iop.org/article/10.1086/508275/fulltext/datafile4.txt'
MIDDELBERG_CROSS_IDENTIFICATIONS_URI = 'http://iopscience.iop.org/article/10.1086/508275/fulltext/datafile6.txt'
In [7]:
# Load Norris components.
import requests, io, astropy.io.ascii as asc, astropy.table, pandas
norris_components = astropy.table.Table.from_pandas(
pandas.read_fwf(
io.StringIO(
requests.get(NORRIS_COMPONENTS_URI).text
),
skiprows=[0, 2],
header=0,
widths=map(len, [
' # ',
'Name ',
'Radio RA ',
'Radio dec ',
'err(RA) ',
'err(dec) ',
'Peak Flux ',
'Int flux ',
'Bmaj ',
'Bmin ',
' Bpa ',
' rms ',
])
)
)
norris_components
Out[7]:
In [8]:
# Load Norris cross-identifications.
# This table has inconsistent tabs, so we will have to convert them to "soft tabs".
def replace_tabs(s, tabstop=8):
"""Convert tabs to spaces."""
out = ''
upto = 0
last = None
for c in s:
if c == '\t':
# Fill up to next tabstop.
diff = tabstop - upto % tabstop
if diff == 0:
diff = tabstop
out += ' ' * diff
upto += diff
last = c
continue
last = c
out += c
upto += 1
return out
test_input = ('S001 ATCDFS_J032602.78-284709.0 C001 SWIRE3_J032603.15-284708.5 3:26:02.785 -28:47:09.06 1.4 33.8 21.1 -1.0 -1.0 -1.0 4 looks like a group in irac 1')
test_output = ('S001 ATCDFS_J032602.78-284709.0 C001 SWIRE3_J032603.15-284708.5 3:26:02.785 -28:47:09.06 1.4 33.8 21.1 -1.0 -1.0 -1.0 4 looks like a group in irac 1')
assert test_output == replace_tabs(test_input)
norris_cross_identifications = astropy.table.Table.from_pandas(
pandas.read_fwf(
io.StringIO(
'\n'.join(map(
lambda s: replace_tabs(s, 8),
requests.get(NORRIS_CROSS_IDENTIFICATIONS_URI).text.split('\r\n'))
)
),
skiprows=[0, 2],
header=0,
widths=[8, 32, 20, 28, 16, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 8, 16]
)
)
norris_cross_identifications[700:710]
Out[8]:
In [9]:
# Load Middelberg tables.
middelberg_components = asc.read(MIDDELBERG_COMPONENTS_URI)
print(middelberg_components[0])
middelberg_cross_identifications = asc.read(MIDDELBERG_CROSS_IDENTIFICATIONS_URI)
print(middelberg_cross_identifications[0])
In [10]:
# Convert Middelberg data into columns. There's no catalogue matching to do here so we can
# throw everything in right away.
import astropy.coordinates
_middelberg_component_ids = middelberg_components['ID']
_middelberg_component_names = middelberg_components['Name']
_middelberg_component_positions = [
astropy.coordinates.SkyCoord(ra=(r['RAh'], r['RAm'], r['RAs']),
dec=(-r['DEd'], r['DEm'], r['DEs']),
unit=('hourangle', 'deg'))
for r in middelberg_components
]
_middelberg_component_ras = [r.ra.deg for r in _middelberg_component_positions]
_middelberg_component_decs = [r.dec.deg for r in _middelberg_component_positions]
_middelberg_component_peak_flux = middelberg_components['PFlux']
_middelberg_component_int_flux = middelberg_components['IFlux']
_middelberg_source_ids = middelberg_components['ID']
_middelberg_cid_to_source_id = {}
_middelberg_cid_to_source_name = {}
_middelberg_cid_to_swire = {}
_middelberg_cid_to_source_z = {}
_middelberg_cid_to_source_ra = {}
_middelberg_cid_to_source_dec = {}
for row in middelberg_cross_identifications:
for component in row['CID'].split(','):
component = component.strip()
_middelberg_cid_to_source_id[component] = row['ID']
_middelberg_cid_to_source_name[component] = row['Name']
_middelberg_cid_to_swire[component] = row['SName']
_middelberg_cid_to_source_z[component] = row['z']
pos = astropy.coordinates.SkyCoord(ra=(row['RAh'], row['RAm'], row['RAs']),
dec=(-row['DEd'], row['DEm'], row['DEs']),
unit=('hourangle', 'deg'))
_middelberg_cid_to_source_ra[component] = pos.ra.deg
_middelberg_cid_to_source_dec[component] = pos.dec.deg
_middelberg_component_source_ids = [_middelberg_cid_to_source_id[c] for c in _middelberg_component_ids]
_middelberg_component_source_names = [_middelberg_cid_to_source_name[c] for c in _middelberg_component_ids]
_middelberg_component_swires = [_middelberg_cid_to_swire[c] for c in _middelberg_component_ids]
_middelberg_component_source_zs = [_middelberg_cid_to_source_z[c] for c in _middelberg_component_ids]
_middelberg_component_source_ras = [_middelberg_cid_to_source_ra[c] for c in _middelberg_component_ids]
_middelberg_component_source_decs = [_middelberg_cid_to_source_dec[c] for c in _middelberg_component_ids]
In [13]:
# Load RGZ.
import pymongo, numpy
client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
db = client['radio']
_rgz_sources = []
_rgz_coords = []
_rgz_zids = []
for subject in db.radio_subjects.find({'metadata.survey': 'atlas'}):
source = subject['metadata']['source']
ra, dec = subject['coords']
zid = subject['zooniverse_id']
_rgz_sources.append(source)
_rgz_coords.append((ra, dec))
_rgz_zids.append(zid)
_rgz_coords = numpy.array(_rgz_coords)
In [71]:
# Load consensuses from crowdastro.
import h5py
with h5py.File(CROWDASTRO_PATH, 'r') as crowdastro_h5:
# (atlas_i, ir_i, success, percentage)
_crowdastro_consensus_objects = crowdastro_h5['/atlas/cdfs/consensus_objects']
_crowdastro_zids = [r[0].decode('ascii') for r in crowdastro_h5['/atlas/cdfs/string']]
_crowdastro_swire_names = [r.decode('ascii') for r in crowdastro_h5['/swire/cdfs/string']]
_crowdastro_zid_to_swire = {}
_crowdastro_zid_to_percentages = {}
_crowdastro_zid_to_fit_success = {}
for atlas_i, ir_i, success, percentage in _crowdastro_consensus_objects:
_crowdastro_zid_to_swire[_crowdastro_zids[int(atlas_i)]] = _crowdastro_swire_names[int(ir_i)]
_crowdastro_zid_to_percentages[_crowdastro_zids[int(atlas_i)]] = percentage
_crowdastro_zid_to_fit_success[_crowdastro_zids[int(atlas_i)]] = bool(success)
In [15]:
# Load Franzen.
franzen = asc.read(FRANZEN_PATH)
# Note that multi-component Franzen objects are matched to exactly one RGZ object, which is associated with the
# first component of said object. We will not make the same assumption here.
franzen
Out[15]:
In [27]:
# Match Franzen to Norris.
import scipy.spatial
_franzen_cid_to_norris = {} # Maps Franzen CID -> Norris CID (RGZ uses Franzen CIDs)
_norris_cids = [r['#'] for r in norris_components]
_norris_coords = [astropy.coordinates.SkyCoord(
ra=r['Radio RA'],
dec=r['Radio dec'],
unit=('hourangle', 'deg')) for r in norris_components]
_norris_coords = numpy.array([(p.ra.deg, p.dec.deg) for p in _norris_coords])
_norris_tree = scipy.spatial.KDTree(_norris_coords)
_franzen_coords = numpy.array(list(zip([franzen['RA'], franzen['DEC']])))[:, 0, :].T
_franzen_cids = franzen['ID']
_dists, _indices = _norris_tree.query(_franzen_coords)
_matches = _dists < 5 / 60 / 60
for cid, match, index in zip(_franzen_cids, _matches, _indices):
if not match:
continue
_franzen_cid_to_norris[cid] = _norris_cids[index]
_norris_to_franzen_cid = {j:i for i, j in _franzen_cid_to_norris.items()}
In [21]:
# Load Fan.
fan_cross_identifications = asc.read(FAN_PATH, header_start=0, delimiter=',')
_fan_source_ids = fan_cross_identifications['id']
_fan_id_to_swire = {r['id']:r['swire'] for r in fan_cross_identifications}
# Assuming that CID in Fan = CID in Norris.
_fan_component_to_source = {}
_fan_component_to_swire = {}
for row in fan_cross_identifications:
components = row['radios'].split(',')
for component in components:
component = component.strip()
_fan_component_to_source[component] = row['id']
_fan_component_to_swire[component] = row['swire']
Now, we can construct the table. We will have the following columns:
In [72]:
names = ['Key'] + [
'Component ' + k + ' (Norris)' for k in norris_components.columns.keys()] + [
'Source ' + k + ' (Norris)' for k in norris_cross_identifications.columns.keys()] + [
'Source # (Fan)',
'Source SWIRE Name (Fan)'] + [
'Component ' + k + ' (Franzen)' for k in franzen.columns.keys()] + [
'Component Zooniverse ID (RGZ)',
'Primary Component ID (RGZ)',
'Source SWIRE Name (RGZ)',
'Gaussian Click Fit Success (RGZ)',
'Click Agreement (RGZ)']
names
Out[72]:
In [91]:
import astropy.table
# Component (Norris)
comp_columns = []
for column in norris_components.columns:
comp_columns.append(list(norris_components[column]))
# Source (Norris)
source_columns = []
for column in norris_cross_identifications.columns:
_component_to_value = {}
for row in norris_cross_identifications:
components = row['Component'].split(',')
for component in components:
component = component.strip()
_component_to_value[component] = row[column]
column = []
for component in comp_columns[0]:
column.append(_component_to_value[component])
source_columns.append(column)
# Add in the Fan matches.
fan_sources = [_fan_component_to_source.get(c, '') for c in comp_columns[0]]
fan_swires = [_fan_component_to_swire.get(c, '') for c in comp_columns[0]]
fan_columns = [fan_sources, fan_swires]
# Add in the Franzen matches.
franzen_columns = []
for column in franzen.columns:
_component_to_value = {}
for row in franzen:
component = row['ID']
_component_to_value[component] = row[column]
column = []
for norris_cid in norris_components['#']:
if norris_cid not in _norris_to_franzen_cid:
column.append(float('nan'))
else:
franzen_cid = _norris_to_franzen_cid[norris_cid]
column.append(_component_to_value[franzen_cid])
franzen_columns.append(column)
# Add in all the Franzen objects with no corresponding Norris.
_all_franzen_ids = set(franzen['ID'])
_included_franzen_ids = {i for i in franzen_columns[0] if isinstance(i, str)}
assert 700 < len(_included_franzen_ids) < 800 # Sanity check.
_missing_franzen_ids = _all_franzen_ids - _included_franzen_ids
# Pad existing columns to include new datapoints.
for column in itertools.chain(comp_columns, source_columns, fan_columns):
column.extend([float('nan')] * len(_missing_franzen_ids))
# Fill in the Franzen columns.
for row in franzen:
if row['ID'] in _missing_franzen_ids:
for column, column_name in zip(franzen_columns, franzen.columns):
column.append(row[column_name])
# Add in the RGZ data.
# Columns: Zooniverse ID, Primary Component ID, SWIRE Name, Gaussian Click Fit Success,
# Click Agreement.
# RGZ is a proper subset of Franzen, so all objects in RGZ should already be in the table.
# For multi-component radio objects like CI0001C1..CI0001C5, there is only one RGZ object,
# and it is associated with the first component. Each component here should be associated
# with the same Zooniverse ID, Component ID, and SWIRE Name. The component ID will be the
# ID of the primary component.
_cid_to_zid = dict(zip(_rgz_sources, _rgz_zids))
zooniverse_ids = []
rgz_primary_components = []
rgz_swire_names = []
rgz_successes = []
rgz_percentages = []
import re
for cid in franzen_columns[0]:
if not cid or not isinstance(cid, str):
zooniverse_ids.append('')
rgz_primary_components.append('')
rgz_swire_names.append('')
rgz_successes.append('')
rgz_percentages.append('')
continue
multi_component = re.match(r'(CI\d+)C\d+', cid)
if multi_component:
primary = multi_component.group(1) + 'C1'
else:
primary = cid
if primary not in _cid_to_zid:
zooniverse_ids.append('')
rgz_primary_components.append('')
rgz_swire_names.append('')
rgz_successes.append('')
rgz_percentages.append('')
continue
rgz_primary_components.append(primary)
zooniverse_ids.append(_cid_to_zid[primary])
rgz_swire_names.append(_crowdastro_zid_to_swire.get(_cid_to_zid[primary], ''))
rgz_successes.append(_crowdastro_zid_to_fit_success.get(_cid_to_zid[primary], ''))
rgz_percentages.append(_crowdastro_zid_to_percentages.get(_cid_to_zid[primary], ''))
# Check that all RGZ objects are in the table.
assert all(z in zooniverse_ids for z in _rgz_zids)
assert all(c in zooniverse_ids for c in _crowdastro_zid_to_swire)
rgz_columns = [zooniverse_ids, rgz_primary_components, rgz_swire_names, rgz_successes, rgz_percentages]
# Key
keys = list(range(len(comp_columns[0])))
# Assemble the table data.
data = [keys] + comp_columns + source_columns + fan_columns + franzen_columns + rgz_columns
# Mask everything that is NaN or empty.
for i, column in enumerate(data):
column = numpy.array(column)
try:
masked = numpy.ma.MaskedArray(data=column,
mask=numpy.logical_or(numpy.isnan(column), column == -99.0))
except TypeError:
masked = numpy.ma.MaskedArray(data=column,
mask=numpy.logical_or(column == '', column == 'nan'))
data[i] = masked
# Assemble the whole table.
table = astropy.table.Table(data=data,
names=names)
table
Out[91]:
In [92]:
table.write('one-table-to-rule-them-all.tbl', format='csv')
In [46]:
import matplotlib.pyplot as plt
%matplotlib inline
def decimalify(ras, decs):
from astropy.coordinates import SkyCoord
coords = []
for ra, dec in zip(ras, decs):
sc = SkyCoord(ra=ra, dec=dec, unit=('hourangle', 'deg'))
coords.append((sc.ra.deg, sc.dec.deg))
return zip(*coords)
not_in_franzen = table[[not bool(i) for i in table['Component ID (Franzen)']]]
ras = not_in_franzen['Component Radio RA (Norris)']
decs = not_in_franzen['Component Radio dec (Norris)']
plt.scatter(*decimalify(ras, decs), color='green', marker='+')
plt.show()
In [ ]: