Explanation

The HSC data is too large to store as a single SQLite database file on GitHub, so the user needs to fetch it separately from cloning the repository. This notebook is a work in progress to help automate that process and to make sure the final schema is correct.

Sending the query

The HSC data release site provides a command-line tool for querying the database; I've adapted it to run programmatically from within a Python session. Check it out: it's the file hsc_release_query.py. There's a working example of a simple query in sql_tester.ipynb. This notebook rolls everything together: querying the server and combining the subsets into one table. (A quick reference sketch of the calling pattern follows the imports below.)

What gets saved?

This comes in two parts:

1) Get the main HSC table (position, fluxes, flags for each object)

2) Get a list of matched spec-z's

Code

Remember to set your credentials within hsc_credentials.py!


In [ ]:
from __future__ import division, print_function

# give access to importing dwarfz
import os, sys
dwarfz_package_dir = os.getcwd().split("dwarfz")[0]
if dwarfz_package_dir not in sys.path:
    sys.path.insert(0, dwarfz_package_dir)

import dwarfz
from dwarfz.hsc_credentials import credential
from dwarfz.hsc_release_query import query_wrapper
    
# back to regular import statements

import shutil
import glob
import pathlib

import pandas as pd
import numpy as np
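
As a quick reference for the query tool mentioned above, the calling pattern used throughout this notebook looks like the sketch below. The SQL string and output filename are placeholders; the real calls in the later cells are the authoritative usage.


In [ ]:
# Minimal sketch of the query_wrapper calling pattern (placeholder query
# and output filename; see hsc_release_query.py for the full argument list).
sql = "SELECT object_id FROM pdr1_cosmos_widedepth_median.forced LIMIT 10"

preview_results = False
delete_job = True
out_format = "sqlite3"

with open("example.sqlite3", mode="wb") as output_file:
    query_wrapper(credential, sql, preview_results, delete_job,
                  out_format, output_file, nomail=True)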

Get HSC Fluxes

Build the query

This query gets both the fluxes and the magnitudes. The difference shouldn't matter, but having both lets you use whichever is more convenient. My usual practice is to pair the flux flags with the magnitude values, as in the sketch below.
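
For example, a minimal sketch of that pattern (an illustration using the i band; "combined" is the merged table assembled later in this notebook):


In [ ]:
# Sketch: use the i-band flux *flags* to mask out failed measurements,
# then work with the corresponding *magnitude* values. Assumes the flags
# column is boolean-like (True means the measurement failed).
good_i = ~combined["icmodel_flux_flags"].astype(bool)
i_mag = combined.loc[good_i, "icmodel_mag"]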


In [ ]:
sql_base = """
SELECT 
    object_id, 
    ra, dec, 
    detect_is_patch_inner, detect_is_tract_inner, detect_is_primary,
    gcmodel_flux, gcmodel_flux_err, gcmodel_flux_flags, gcmodel_mag,
    rcmodel_flux, rcmodel_flux_err, rcmodel_flux_flags, rcmodel_mag,
    icmodel_flux, icmodel_flux_err, icmodel_flux_flags, icmodel_mag,
    zcmodel_flux, zcmodel_flux_err, zcmodel_flux_flags, zcmodel_mag,
    ycmodel_flux, ycmodel_flux_err, ycmodel_flux_flags, ycmodel_mag
FROM 
    pdr1_cosmos_widedepth_median.forced
LIMIT 
    {}
OFFSET 
    {}
"""

Make the query

The total number of objects is currently hardcoded, so make sure it hasn't changed! The cleaner way would be to run a simple query against the database and count the records, as in the sketch below; for now, hardcoding is simpler.
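
A sketch of that cleaner approach, reusing the same query_wrapper pattern as the cells below (one assumption here: the count comes back as a one-row "table_1", like the other sqlite3 downloads in this notebook):


In [ ]:
# Sketch: ask the server for the current row count instead of trusting
# the hardcoded value, then compare it against n_objects in the next cell.
count_sql = "SELECT COUNT(*) FROM pdr1_cosmos_widedepth_median.forced"

with open("count_tmp.sqlite3", mode="wb") as output_file:
    query_wrapper(credential, count_sql, False, True,
                  "sqlite3", output_file, nomail=True)

n_objects_server = int(pd.read_sql_table(
    "table_1", "sqlite:///count_tmp.sqlite3").iloc[0, 0])
os.remove("count_tmp.sqlite3")
print(n_objects_server)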


In [ ]:
n_objects = 1263503

block_size = 250000
n_blocks = -(-n_objects // block_size)  # ceiling division

In [ ]:
temp_hsc_table_dir = pathlib.Path("partial_hsc_tables")
if not temp_hsc_table_dir.is_dir():
    temp_hsc_table_dir.mkdir()

In [ ]:
limit = block_size

preview_results = False
delete_job = True
out_format = "sqlite3"

for i in range(n_blocks):
    offset = i*block_size
    
    sql = sql_base.format(limit, offset)
    
    output_filename = temp_hsc_table_dir / "tmp_{}.sqlite3".format(i)
    
    print(" ---------------- QUERY {} -------------------- ".format(i+1))
    print(sql)

    with open(output_filename, mode="wb") as output_file:
        query_wrapper(credential, sql, preview_results, delete_job, 
                      out_format, output_file,
                      nomail=True)

Check if it worked


In [ ]:
database_filenames = sorted(temp_hsc_table_dir.glob("tmp_*.sqlite3"))
database_filenames

Combine databases


In [ ]:
dfs = [pd.read_sql_table("table_1", "sqlite:///{}".format(database_filename),
                         index_col="object_id")
       for database_filename in database_filenames]
assert sum(df.shape[0] for df in dfs) == n_objects

combined = pd.concat(dfs)
assert combined.shape[0] == n_objects

del dfs
combined.head()

In [ ]:
for filename in database_filenames:
    os.remove(filename)

# remove the temp directory, but only if it's now empty
if not any(temp_hsc_table_dir.iterdir()):
    temp_hsc_table_dir.rmdir()

In [ ]:
combined.keys()

In [ ]:
hsc_database_filename = "HSC_COSMOS_median_forced.sqlite3"
hsc_database_filename_old = hsc_database_filename + ".old"

if os.path.exists(hsc_database_filename):
    try:
        shutil.move(hsc_database_filename, hsc_database_filename_old)
        combined.to_sql("hsc", "sqlite:///{}".format(hsc_database_filename))
    except:
        # in case there's an error during writing, don't overwrite/delete the existing database
        shutil.move(hsc_database_filename_old, hsc_database_filename)
        raise
    else:
        # only delete if combining went successfully
        os.remove(hsc_database_filename_old)
else:
    combined.to_sql("hsc", "sqlite:///{}".format(hsc_database_filename))

Match HSC objects to COSMOS objects

Every COSMOS galaxy will be in exactly one pair; HSC galaxies can be in zero, one, or more pairs.
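
dwarfz.matching.Matches does the real work in the cells below; conceptually it's a nearest-neighbour match on the sky, along the lines of this astropy sketch (an illustration only, meant to be run after the COSMOS and HSC datasets are loaded; it assumes their ra/dec columns are in degrees):


In [ ]:
# Conceptual sketch of the matching semantics: each COSMOS object gets its
# single nearest HSC neighbour, so an HSC object can appear zero, one, or
# many times among the resulting pairs.
from astropy.coordinates import SkyCoord
import astropy.units as u

cosmos_coords = SkyCoord(ra=COSMOS.df.ra.values * u.deg,
                         dec=COSMOS.df.dec.values * u.deg)
hsc_coords = SkyCoord(ra=HSC.df.ra.values * u.deg,
                      dec=HSC.df.dec.values * u.deg)

nearest_idx, sep2d, _ = cosmos_coords.match_to_catalog_sky(hsc_coords)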


In [ ]:
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default) / "COSMOS_reference.sqlite"
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)
COSMOS.df.head()

In [ ]:
HSC_filename = pathlib.Path(dwarfz.data_dir_default) / "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)
HSC.df.head()

In [ ]:
matches = dwarfz.matching.Matches(COSMOS.df, HSC.df)

In [ ]:
matches_filename = pathlib.Path(dwarfz.data_dir_default) / "matches.sqlite3"
if not matches_filename.exists():
    matches.save_to_filename(matches_filename)

Check matches


In [ ]:
print("threshold (error) : {:>5.2f}".format(matches.threshold_error))
print("threshold (match) : {:>5.2f}".format(matches.threshold_match))

In [ ]:
print("overall completeness : {:.2f} %".format(100*np.mean(matches.df.match[~matches.df.error])))

In [ ]:
print("min separation: {:.4f} [arcsec]".format(min(matches.df.sep)))
print("max separation: {:.4f} [arcsec]".format(max(matches.df.sep)))

Get spec-z's matched to HSC objects

Build the query


In [ ]:
redshifts_sql = """
SELECT 
    object_id, specz_id,
    d_pos, 
    specz_ra, specz_dec,
    specz_redshift, specz_redshift_err, specz_flag_homogeneous
FROM
    pdr1_cosmos_widedepth_median.specz
"""

Make the query


In [ ]:
preview_results = False
delete_job = True
out_format = "sqlite3"

output_filename = "specz.{}".format(out_format)
print(output_filename)

with open(output_filename, mode="wb") as output_file:
    query_wrapper(credential, redshifts_sql, preview_results, delete_job, 
                  out_format, output_file,
                  nomail=True,
                 )

Check if it worked


In [ ]:
!ls -lh specz.sqlite3

In [ ]:
df = pd.read_sql_table("table_1",
                       "sqlite:///{}".format("specz.sqlite3"),
                       index_col="object_id")

# keep only objects flagged as part of the homogeneous spec-z sample
df = df[df.specz_flag_homogeneous]

df.head()

Get FRANKEN-Z photo-z's, and then match to HSC

Build the query

There are no photo-z's for the "fake" COSMOS-field Wide images; that catalog was originally UltraDeep, degraded to be Wide-like. To match the photo-z catalogs most closely, I'd want to look in the UltraDeep dataset; but to prepare most correctly for running on the true Wide data, I'll pull my photo-z's from the Deep layer instead. (Note: no photo-z's have been publicly released for the Wide data within the COSMOS field, as of 8 June 2017.)


In [ ]:
photoz_sql = """
SELECT
  pdr1_deep.forced.object_id,
  pdr1_deep.forced.ra,
  pdr1_deep.forced.dec,
  pdr1_deep.photoz_frankenz.photoz_best,
  pdr1_deep.photoz_frankenz.photoz_risk_best
FROM
  pdr1_deep.forced
INNER JOIN pdr1_deep.photoz_frankenz 
    ON pdr1_deep.photoz_frankenz.object_id=pdr1_deep.forced.object_id
WHERE (ra BETWEEN 149.25 AND 151.25) AND (dec BETWEEN 1.4 AND 3);
"""

Make the query


In [ ]:
preview_results = False
delete_job = True
out_format = "sqlite3"

output_filename = "photoz_tmp.{}".format(out_format)
print(output_filename)

with open(output_filename, mode="wb") as output_file:
    query_wrapper(credential, photoz_sql, preview_results, delete_job, 
                  out_format, output_file,
                  nomail=True,
                 )

Check if it worked


In [ ]:
!ls -lh photoz_tmp.sqlite3

In [ ]:
df = pd.read_sql_table("table_1", 
                       "sqlite:///{}".format("photoz_tmp.sqlite3"), 
                       index_col="object_id")

df.head()

In [ ]:
df.to_sql("FRANKENZ", "sqlite:///franken_z-DEEP-COSMOS.sqlite3",
          if_exists="replace")

In [ ]:
os.remove("photoz_tmp.sqlite3")

Cross reference FRANKENZ ids to general HSC ids


In [ ]:
HSC_filename = pathlib.Path(dwarfz.data_dir_default) /  "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)

In [ ]:
matches = dwarfz.matching.Matches(HSC.df, df)
matches.df["HSC_ids"] = matches.df.index
matches.df["FRANKENZ_ids"] = matches.df.catalog_2_ids

matches.df.head()

In [ ]:
HSC.df.join(matches.df).join(df[["photoz_best", 
                                 "photoz_risk_best"]], 
                             on="FRANKENZ_ids").head()

Copy index column to a new data frame, then only add desired columns


In [ ]:
HSC_photo_zs = HSC.df.copy()[[]] # only copy index column
HSC_photo_zs = HSC_photo_zs.join(matches.df[["FRANKENZ_ids"]])
HSC_photo_zs = HSC_photo_zs.join(df[["photoz_best", "photoz_risk_best"]],
                                 on="FRANKENZ_ids")
HSC_photo_zs.head()

In [ ]:
HSC_photo_zs.to_sql("photo_z", 
                    "sqlite:///HSC_matched_to_FRANKENZ.sqlite",
                    if_exists="replace",
                   )
