Goal: Gather images of quasars, non-quasar celestial objects, and quasar candidates.
We will use images collected from the Sloan Digital Sky Survey.
Some useful links for the Sloan Digitial Sky Survey:
Other useful links about Quasars
In [ ]:
import urllib
from IPython.display import display, Image
import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
from astropy import units as u
from astropy.coordinates import SkyCoord
from astropy.table import Table
from astroquery.sdss import SDSS
from astroquery.simbad import Simbad
As as test, we will use AstroPy to get a nice image for the project page. We will use SIMBAD to find a random quasar.
In [ ]:
# Limit the number of results we get from our query.
Simbad.ROW_LIMIT = 1000
In [ ]:
result = Simbad.query_criteria('region(box,180d +30d, 8d +8d)', otype='QSO')
In [ ]:
result
Out[ ]:
In [ ]:
# Choose a random quasar.
qnumber = np.random.randint(0, 1000)
# Get that quasar's coordinates, and format them for SkyCoordinate
resultRA = result[qnumber]['RA'].split()
RA = '%sh%sm%ss' % (resultRA[0], resultRA[1], resultRA[2])
resultDEC = result[qnumber]['DEC'].split()
DEC = '%sd%sm%ss' % (resultDEC[0], resultDEC[1], resultDEC[2])
# Convert to a SkyCoordinate
QCoord = SkyCoord(RA, DEC, frame='icrs')
QCoord
Out[ ]:
We will now get an image from the Sloan Digital Sky Survey. The following code follows this tutorial.
In [ ]:
impix = 1024
imsize = 12 * u.arcmin
cutoutbaseurl = 'http://skyservice.pha.jhu.edu/DR12/ImgCutout/getjpeg.aspx'
query_string = urllib.parse.urlencode(dict(
ra=QCoord.ra.deg, dec=QCoord.dec.deg, width=impix, height=impix,
scale=imsize.to(u.arcsec).value / impix))
url = cutoutbaseurl + '?' + query_string
urllib.request.urlretrieve(url, 'Quasar.jpg')
Out[ ]:
In [ ]:
display(Image('Quasar.jpg'))
There is a list of 46420 detected quasars from the Penn State Center for Astrostatistics. We will use their SDSS_quasar.dat data set and the AstroPy python package.
In [ ]:
Quasars = pd.read_fwf('SDSS_quasar.dat')
In [ ]:
Quasars.head()
Out[ ]:
In [ ]:
Quasars.tail() # 46420 rows
Out[ ]:
In [ ]:
coord = SkyCoord(str(Quasars.iloc[1]['R.A.']) + 'd',
str(Quasars.iloc[1]['Dec.']) + 'd', frame='icrs')
In [ ]:
impix = 120
imsize = 1 * u.arcmin
cutoutbaseurl = 'http://skyservice.pha.jhu.edu/DR12/ImgCutout/getjpeg.aspx'
query_string = urllib.parse.urlencode(dict(
ra=coord.ra.deg, dec=coord.dec.deg, width=impix, height=impix,
scale=imsize.to(u.arcsec).value / impix))
url = cutoutbaseurl + '?' + query_string
urllib.request.urlretrieve(url, 'Quasar_1.jpg')
display(Image('Quasar_1.jpg'))
In [ ]:
def get_image(coordinate, name, impix=120):
"""Downloads the image from the SDSS DR12 release as a impix pixel by impix pixel image.
Parameters
----------
coordinate : coordinate of the celestial object as a Sky Coordinate.
name: The name string to save the image as. It will be saved as 'name.jpg'.
"""
imsize = 1 * u.arcmin
cutoutbaseurl = 'http://skyservice.pha.jhu.edu/DR12/ImgCutout/getjpeg.aspx'
query_string = urllib.parse.urlencode(dict(
ra=coord.ra.deg, dec=coord.dec.deg, width=impix, height=impix,
scale=imsize.to(u.arcsec).value / impix))
url = cutoutbaseurl + '?' + query_string
urllib.request.urlretrieve(url, './Images/' + name + '.jpg')
In [ ]:
get_image(coord, 'test1')
# Worked successfully
In [ ]:
# Some data manipulation to get Sky Coordinates for each entry.
# The application of the SkyCoord function will take time.
QuasarLocs = pd.concat([Quasars['R.A.'].apply(lambda x: str(x) + 'd '),
Quasars['Dec.'].apply(lambda x: str(x) + 'd')], axis=1)
QuasarLocs['Coords'] = QuasarLocs[['R.A.', 'Dec.']].apply(
lambda x: SkyCoord(x[0], x[1], frame='icrs'), axis=1)
In [ ]:
QuasarLocs.head()
Out[ ]:
In [ ]:
# We will now download these images from SDSS DR12
for i in range(46420):
get_image(QuasarLocs['Coords'].iloc[i], name='Quasar_' + str(i))
We will use SIMBAD to find objects that are not Quasars or Quasar Candidates. We will sample 200 random regions in the SDSS footprint and take 500 objects from each region.
In [ ]:
# Limit the number of results we get from our query.
Simbad.ROW_LIMIT = 20000
In [ ]:
# We will stay in the 8h to 16h +0d to +60 footprint region of SDSS.
# Note that there are some regions in SDSS DR 12 and DR 13 outside of this range,
# but this range covers a majority of the footprint.
# As the box we form is 8d by 8d, we start at 124d and end at 236d for longitude,
# and start as +4d to +56d in latitude.
NonQuasars = pd.DataFrame()
randcoord = []
for i in range(200):
randcoord.append(str(np.random.randint(128, 237)) +
'd +' + str(np.random.randint(4, 57)) + 'd')
try:
# For otype, QSO are quasars, Q? are quasar candidates, and LeQ are gravitationally lenses quasars.
result = Simbad.query_criteria(
'region(box,' + randcoord[i] + ', 4d +4d)', 'otype != QSO', 'otype != Q?', 'otype != LeQ')
sample = result.to_pandas().sample(500)
NonQuasars = pd.concat([NonQuasars, sample], axis=0)
if i % 10 == 0:
print('At attempt %s' % i)
except:
print('Attempt Failed... retrying')
i = i - 1
In [ ]:
NonQuasars
Out[ ]:
In [ ]:
# Checking for duplicates, which is a possibility in this process.
NonQuasars[NonQuasars.duplicated()]
Out[ ]:
In [ ]:
# As duplicates were found, we will drop all but the first.
NonQuasars = NonQuasars.drop_duplicates(keep='first')
In [ ]:
# Reindexing
NonQuasars.reset_index(inplace=True)
NonQuasars.drop('index', axis=1, inplace=True)
In [ ]:
# Saving a copy of the data to accompany the images.
NonQuasars.to_csv('NonQuasarsData.csv', index=False)
In [ ]:
NonQuasars.head()
Out[ ]:
In [ ]:
NonQuasars.tail() # 94670 rows
Out[ ]:
In [ ]:
def RAtoICRS(RAValue):
"""Converts SIMBAD Right Ascent (RA) format to ICRS format.
Parameters
----------
RAValue : A SIMBAD Right Ascent value in "X Y Z" format for X hours, Y minutes, and Z seconds.
"""
if len(RAValue.split()) == 1:
return '%sh' % (RAValue.split()[0])
elif len(RAValue.split()) == 2:
return '%sh%sm' % (RAValue.split()[0], RAValue.split()[1])
elif len(RAValue.split()) == 3:
return '%sh%sm%ss' % (RAValue.split()[0], RAValue.split()[1], RAValue.split()[2])
else:
return np.nan()
def DECtoICRS(DECValue):
"""Converts SIMBAD Declination (DEC) format to ICRS format.
Parameters
----------
RAValue : A SIMBAD Declination value in "+X Y Z" format for X degrees, Y minutes, and Z seconds.
"""
if len(DECValue.split()) == 1:
return '%sd' % (DECValue.split()[0])
elif len(DECValue.split()) == 2:
return '%sd%sm' % (DECValue.split()[0], DECValue.split()[1])
elif len(DECValue.split()) == 3:
return '%sd%sm%ss' % (DECValue.split()[0], DECValue.split()[1], DECValue.split()[2])
else:
return np.nan()
In [ ]:
# Data manipulation to get Sky Coordinates for each entry.
# Note that SIMBAD gives the values separated as hours, minutes, seconds for RA and degrees, minutes, seconds for Dec
NonQuasarLocs = pd.concat([NonQuasars['RA'].apply(RAtoICRS),
NonQuasars['DEC'].apply(DECtoICRS)], axis=1)
NonQuasarLocs['Coords'] = NonQuasarLocs[['RA', 'DEC']].apply(
lambda x: SkyCoord(x[0], x[1], frame='icrs'), axis=1)
In [ ]:
NonQuasarLocs.head()
Out[ ]:
In [ ]:
# We will now download these images from SDSS DR12
for i in range(94670):
get_image(NonQuasarLocs['Coords'].iloc[i], name='NonQ_' + str(i))
We will now use SIMBAD to identify quasar candidates for analysis with our trained model.
In [ ]:
# As with the Non-quasar data, we will stay in the 8h to 16h +0d to +60 footprint
# region of SDSS. Note that there are some regions in SDSS DR 12 and DR 13 outside
# of this range, but this range covers a majority of the footprint.
# Due to timeout issues from SIMBAD, we will use smaller regions
# of width 10d by +6d to gather the candidates.
QuasarCandidates = pd.DataFrame()
for i in range(12): # Separate longitude into 12 segments of length 10d
for j in range(10): # Separate latitude into 10 segments of length 6d
try:
# For otype Q? are quasar candidates.
result = Simbad.query_criteria(
'region(box,' + str(125 + 10 * i) + 'd +'
+ str(3 + 6 * j) + 'd' + ', 5d +3d)', 'otype = Q?')
QuasarCandidates = pd.concat(
[QuasarCandidates, result.to_pandas()], axis=0)
if (i * 10 + j) % 20 == 0:
print('At attempt %s' % (i * 10 + j))
except:
print('Attempt Failed at i=%s and j=%s.' % (i, j))
In [ ]:
QuasarCandidates.head()
Out[ ]:
In [ ]:
QuasarCandidates.tail()
Out[ ]:
In [ ]:
# Checking for duplication
QuasarCandidates[QuasarCandidates.duplicated()]
Out[ ]:
In [ ]:
# Reindexing
QuasarCandidates.reset_index(inplace=True)
QuasarCandidates.drop('index', axis=1, inplace=True)
In [ ]:
QuasarCandidates.tail() # 5418 rows
Out[ ]:
In [ ]:
# Saving a copy of the data to accompany the images.
QuasarCandidates.to_csv('QuasarCandidatesData.csv', index=False)
In [ ]:
# Data manipulation to get Sky Coordinates for each entry.
# Note that SIMBAD gives the values separated as hours, minutes, seconds for RA and degrees, minutes, seconds for Dec
QuasarCandidateLocs = pd.concat([QuasarCandidates['RA'].apply(RAtoICRS),
QuasarCandidates['DEC'].apply(DECtoICRS)], axis=1)
QuasarCandidateLocs['Coords'] = QuasarCandidateLocs[['RA', 'DEC']].apply(
lambda x: SkyCoord(x[0], x[1], frame='icrs'), axis=1)
In [ ]:
QuasarCandidateLocs.head()
Out[ ]:
In [ ]:
# We will now download these images from SDSS DR12
for i in range(5418):
get_image(QuasarCandidateLocs['Coords'].iloc[i],
name='QuasarCandidate_' + str(i))