In [1]:
# This changes the current directory to the base saga directory - make sure to run this first!
# This is necessary to be able to import the py files and use the right directories,
# while keeping all the notebooks in their own directory.
import os
import sys
import collections

# Only compute the base dir on the first run of this cell: after the chdir
# below, '..' would no longer point at the saga base directory on a re-run.
if 'saga_base_dir' not in locals():
    saga_base_dir = os.path.abspath('..')
if saga_base_dir not in sys.path:
    # NOTE(review): this chdir's rather than appending to sys.path -- imports
    # presumably work because '' (the cwd) is on sys.path in notebooks; confirm.
    os.chdir(saga_base_dir)
import targeting
In [2]:
from __future__ import print_function, division
import numpy as np
from astropy import units as u
from astropy import table
from astropy.coordinates import SkyCoord
from astropy.visualization import hist as ahist
In [3]:
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
from mpl_toolkits.mplot3d import Axes3D
# Global plotting defaults used throughout the rest of the notebook.
plt.rcParams['image.cmap'] = 'viridis'
plt.rcParams['image.origin'] = 'lower'
# borrow just the seaborn-deep color cycle without switching the whole style sheet
plt.rcParams['axes.prop_cycle'] = style.library['seaborn-deep']['axes.prop_cycle']
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['axes.titlesize'] = plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = plt.rcParams['ytick.labelsize'] = 14
In [4]:
# Load the raw spectra catalog (gzip-compressed FITS table from the SAGA Dropbox).
spec_data_raw = table.Table.read('SAGADropbox/data/saga_spectra_raw.fits.gz')
spec_data_raw
Out[4]:
Hmm, the specobjids are floats, but they should be integers to avoid possible rounding errors during comparison. Let's fix that:
In [5]:
# Just setting the dtype does *not* do the conversion of the values. It instead tells numpy to
# re-interpret the same set of bits as though they were ints.
# So we do the first and third line below to update the values - `astype` *does* do the conversion
spec_data = spec_data_raw.copy()
# re-type the column in place (bit reinterpretation only) ...
spec_data['specobjid'].dtype = int
# ... then overwrite with properly value-converted integers from the raw table
spec_data['specobjid'] = spec_data_raw['specobjid'].astype(int)
spec_data
Out[5]:
First we see if we can match on spectroscopic ID's
In [6]:
# If specobjid were a reliable unique key, these two numbers would agree.
sid = spec_data['specobjid']
(np.unique(sid).size, len(sid))
Out[6]:
Hmm... far too few specobjids. Probably whatever converted to a float above also led to some mis-casting of the IDs. So we'll fall back on doing matching based on sky coordinates.
In [7]:
# Sky positions for every spectrum; the RA/DEC columns are in degrees.
scs = SkyCoord(spec_data['RA'], spec_data['DEC'], unit=u.deg)
This identifies all pairs of coordinates that are within 5" of each other:
In [8]:
# Match the coordinate list against itself: every pair (self-pairs included)
# within 5 arcsec comes back as (idx1[i], idx2[i]) with separation sep2d[i].
idx1, idx2, sep2d, _ = scs.search_around_sky(scs, 5*u.arcsec)
Because we matched scs to itself, this list also includes "self-matches", where a pair is an object and itself. But we want to keep these to make sure we get the single-object "groups".
In [9]:
# Separation histograms: the full 5" match radius (left) and a zoom on <0.4" (right).
fig, (ax1, ax2) = plt.subplots(1, 2)
seps_asec = sep2d.to(u.arcsec).value
ax1.hist(seps_asec, bins=100, range=(0, 5), histtype='step', log=True)
# Set the limits on each axes object explicitly: a bare plt.xlim() acts on the
# *current* axes (ax2 after plt.subplots), so the original (0, 5) limit was
# applied to ax2 and then immediately overwritten -- ax1 never got one.
ax1.set_xlim(0, 5)
ax2.hist(seps_asec, bins=100, range=(0, .4), histtype='step', log=True)
ax2.set_xlim(0, .4)
plt.tight_layout()
Lets have a quick look at the things that are between $.1"$ and $.2"$ to see if they are matches or shreds
In [10]:
# Pull 25 random pairs with separations between 0.1" and 0.2" and print image
# lists for the two sides of each pair so they can be inspected by eye.
close_pairs = (sep2d > .1*u.arcsec) & (sep2d < .2*u.arcsec)
pick25 = np.random.permutation(np.sum(close_pairs))[:25]
scs1 = scs[idx1[close_pairs][pick25]]
scs2 = scs[idx2[close_pairs][pick25]]
print(targeting.sampled_imagelist(scs1, None, n=len(scs1)))
print(targeting.sampled_imagelist(scs2, None, n=len(scs2)))
Inspection of these reveals them to be the same objects. It must be a difference between different DRs or similar. So we define matches as simply anything within $.4"$
In [11]:
# Union the match pairs into connected groups:
# grpdct maps object index -> group number.
grpdct = {}
grpi = 0
for i1, i2 in zip(idx1, idx2):
    if i1 in grpdct:
        if i2 in grpdct:
            # combine the two groups by assigning grp2 items to grp1
            # this block is by far the slowest part so if the data size grows it should be optimized
            grp1 = grpdct[i1]
            grp2 = grpdct[i2]
            if grp1 != grp2:
                # .items() instead of the Python-2-only .iteritems() so this also runs on Python 3
                to_set_to_1 = [i for i, grp in grpdct.items() if grp == grp2]
                for i in to_set_to_1:
                    grpdct[i] = grp1
        else:
            # add i2 to the group i1 is already in
            grpdct[i2] = grpdct[i1]
    else:
        if i2 in grpdct:
            # add i1 to the group i2 is already in
            # (bugfix: this used to do grpdct[i2] = grpdct[i1], which raises a
            # KeyError because i1 is *not* in grpdct in this branch)
            grpdct[i1] = grpdct[i2]
        else:
            # neither seen before: add them both to a new group
            grpdct[i1] = grpdct[i2] = grpi
            grpi += 1
# list() wrappers so np.unique sees a sequence on Python 3 (dict views are not arrays)
len(idx1), len(idx2), len(np.unique(list(grpdct.keys()))), len(np.unique(list(grpdct.values())))
Out[11]:
In [12]:
# Invert grpdct: group number -> list of member object indices.
grpnum_to_group_members = collections.defaultdict(list)
# .items() instead of the Python-2-only .iteritems() so this also runs on Python 3
for k, v in grpdct.items():
    grpnum_to_group_members[v].append(k)
# convert the members into arrays
grpnum_to_group_members = {k: np.array(v) for k, v in grpnum_to_group_members.items()}
In [13]:
# this is the count *before* the IMACS fix:
# (group size -> number of groups of that size; kept for comparison with the next cell)
{1: 88822,
 2: 2460,
 3: 293,
 4: 79,
 5: 19,
 6: 6,
 7: 5,
 8: 1,
 9: 2,
 10: 3,
 13: 1,
 34: 1,
 54: 1,
 64: 1,
 72: 1,
 79: 1}
Out[13]:
In [14]:
# This counts how many groups have each size (group size -> number of such groups).
group_sizes = [len(members) for members in grpnum_to_group_members.values()]
counts = np.bincount(group_sizes)
{size: n for size, n in enumerate(counts) if n != 0}
Out[14]:
79 seems like a lot... What's the deal with the big ones?
In [29]:
# Collect the groups with more than 5 members, sorted by size, so we can
# inspect the biggest ones.
Groupnum = collections.namedtuple('Groupnum', ['nobjs', 'grpnum'])
# .items() instead of the Python-2-only .iteritems() so this also runs on Python 3
big_grps = sorted([Groupnum(len(arr), idx) for idx, arr in grpnum_to_group_members.items() if len(arr) > 5])
big_grps
Out[29]:
In [31]:
# Show all the spectra belonging to the single largest group.
spec_data[grpnum_to_group_members[big_grps[-1].grpnum]]
Out[31]:
The above is just an example but the ones with the problem originally were IMACS targets...? So we have the option of ignoring all IMACS targets.
Marla fixed this after we realized it, though, so the remaining large groups are all MMT calibration stars:
In [37]:
# Show all the spectra belonging to the second-largest group.
spec_data[grpnum_to_group_members[big_grps[-2].grpnum]]
Out[37]:
In [152]:
# For each group pick one "best" (non-IMACS) spectrum, and record which surveys
# contributed via the combined SPEC_REPEAT string. Groups that are IMACS-only
# are dropped entirely.
idxs_to_keep = []
new_repeats = []
# .items() instead of the Python-2-only .iteritems() so this also runs on Python 3
for grpnum, members in grpnum_to_group_members.items():
    notimacs_members = members[spec_data['TELNAME'][members] != 'IMACS']
    if len(notimacs_members) == 0:
        continue
    # keep the member with the *highest* ZQUALITY (4 = best redshift); the
    # original argsort(...)[0] picked the lowest-quality spectrum instead
    idxs_to_keep.append(notimacs_members[np.argmax(spec_data['ZQUALITY'][notimacs_members])])
    new_repeats.append('+'.join(np.unique(spec_data['SPEC_REPEAT'][notimacs_members])))
In [153]:
# Build the unique-object table from the chosen rows.
unique_objs = spec_data[np.array(idxs_to_keep)]
# delete and re-add SPEC_REPEAT so the column is rebuilt from the new combined
# strings (presumably to avoid fixed-width string truncation -- TODO confirm)
del unique_objs['SPEC_REPEAT']
unique_objs['SPEC_REPEAT'] = new_repeats
unique_objs
Out[153]:
In [154]:
# double check that you sometimes get a combination of them together
# (i.e. '+'-joined entries like 'SDSS+MMT' should appear among the unique values)
np.unique(unique_objs['SPEC_REPEAT'])
Out[154]:
The function below just combines all the steps above into one function
In [1]:
from __future__ import print_function, division
import collections
import numpy as np
from astropy import units as u
from astropy import table
from astropy.coordinates import SkyCoord
In [2]:
def find_uniques(infn='saga_spectra_raw.fits.gz', remove_imacs=True, nearenough_sep=5*u.arcsec):
spec_data_raw = table.Table.read(infn)
if spec_data_raw['specobjid'].dtype.kind == 'i':
spec_data = spec_data_raw
else:
#if the specobjid is not an int, convert the type of that column while retaining the value
spec_data = spec_data_raw.copy()
spec_data['specobjid'].dtype = int
spec_data['specobjid'] = spec_data_raw['specobjid'].astype(int)
scs = SkyCoord(spec_data['RA'], spec_data['DEC'], unit=u.deg)
idx1, idx2, sep2d, _ = scs.search_around_sky(scs, nearenough_sep)
# now contruct the groups from the pairs
grpdct = {}
grpi = 0
for i1, i2 in zip(idx1, idx2):
if i1 in grpdct:
if i2 in grpdct:
# combine the two groups by assigning grp2 items to grp1
# this block is by far the slowest part so if the data size grows it should be optimized
grp1 = grpdct[i1]
grp2 = grpdct[i2]
if grp1 != grp2:
to_set_to_1 = [i for i, grp in grpdct.iteritems() if grp==grp2]
for i in to_set_to_1:
grpdct[i] = grp1
else:
#add i2 to the group i1 is already in
grpdct[i2] = grpdct[i1]
else:
if i2 in grpdct:
#add i1 to the group i2 is already in
grpdct[i2] = grpdct[i1]
else:
# add them both to a new group
grpdct[i1] = grpdct[i2] = grpi
grpi += 1
grpnum_to_group_members = collections.defaultdict(list)
for k, v in grpdct.iteritems():
grpnum_to_group_members[v].append(k)
# convert the members into arrays
grpnum_to_group_members = {k:np.array(v) for k, v in grpnum_to_group_members.iteritems()}
# identify which is the "best" spectrum (meaning the first zq=4 spectrum)
idxs_to_keep = []
new_repeats = []
for grpnum, allmembers in grpnum_to_group_members.iteritems():
if remove_imacs:
members = allmembers[spec_data['TELNAME'][allmembers]!='IMACS']
if len(members)==0:
continue
else:
members = allmembers
idxs_to_keep.append(members[np.argsort(spec_data['ZQUALITY'][members])[0]])
new_repeats.append('+'.join(np.unique(spec_data['SPEC_REPEAT'][members])))
# now build the output table from the input
unique_objs = spec_data[np.array(idxs_to_keep)]
del unique_objs['SPEC_REPEAT']
unique_objs['SPEC_REPEAT'] = new_repeats
return unique_objs
# run on the raw table (path is relative to the notebooks directory)
uniq_objs = find_uniques('../saga_spectra_raw.fits.gz')
In [3]:
# display the resulting unique-object table
uniq_objs
Out[3]: