In [1]:
# give access to importing dwarfz
import os, sys
dwarfz_package_dir = os.getcwd().split("dwarfz")[0]
if dwarfz_package_dir not in sys.path:
sys.path.insert(0, dwarfz_package_dir)
import dwarfz
# back to regular import statements
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set(context="poster", style="ticks", font_scale=1.4)
import numpy as np
import pandas as pd
import glob
import shutil
from scipy.special import expit
In [2]:
# COSMOS reference catalog (provides ra/dec, photo_z, and mass_med used below)
COSMOS_filename = os.path.join(dwarfz.data_dir_default, "COSMOS_reference.sqlite")
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)
In [3]:
# HSC median-forced photometry over the COSMOS field (cmodel fluxes/mags/flags)
HSC_filename = os.path.join(dwarfz.data_dir_default, "HSC_COSMOS_median_forced.sqlite3")
HSC = dwarfz.datasets.HSC(HSC_filename)
In [4]:
# Precomputed COSMOS <-> HSC cross-match; `match` flags successful matches and
# `catalog_2_ids` holds the matched HSC object ids
matches_filename = os.path.join(dwarfz.data_dir_default, "matches.sqlite3")
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)
In [5]:
# Build the combined catalog: one row per successfully matched COSMOS object,
# carrying COSMOS truth columns and HSC photometry.
combined = matches_df[matches_df.match].copy()

# Hoist the COSMOS row lookup so `.loc` is evaluated once, not once per column.
cosmos_rows = COSMOS.df.loc[combined.index]
combined["ra"] = cosmos_rows.ra
combined["dec"] = cosmos_rows.dec
combined["photo_z"] = cosmos_rows.photo_z
combined["log_mass"] = cosmos_rows.mass_med

photometry_cols = [
    "gcmodel_flux","gcmodel_flux_err","gcmodel_flux_flags", "gcmodel_mag",
    "rcmodel_flux","rcmodel_flux_err","rcmodel_flux_flags", "rcmodel_mag",
    "icmodel_flux","icmodel_flux_err","icmodel_flux_flags", "icmodel_mag",
    "zcmodel_flux","zcmodel_flux_err","zcmodel_flux_flags", "zcmodel_mag",
    "ycmodel_flux","ycmodel_flux_err","ycmodel_flux_flags", "ycmodel_mag",
]

# Single HSC row lookup for all photometry columns (the original re-ran the
# row selection for every column, and used chained indexing).
hsc_rows = HSC.df.loc[combined.catalog_2_ids]
for col in photometry_cols:
    combined[col] = hsc_rows[col].values
In [6]:
# Adjacent-band colors from the cmodel magnitudes (g-r, r-i, i-z, z-y).
for blue_band, red_band in [("g", "r"), ("r", "i"), ("i", "z"), ("z", "y")]:
    color_col = "{}_minus_{}".format(blue_band, red_band)
    combined[color_col] = (combined["{}cmodel_mag".format(blue_band)]
                           - combined["{}cmodel_mag".format(red_band)])
In [7]:
# Keep only galaxies with finite colors and i-band magnitude, and with no
# flux-measurement flags set in any band.
finite_cols = ["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y", "icmodel_mag"]
flag_cols = ["gcmodel_flux_flags", "rcmodel_flux_flags", "icmodel_flux_flags",
             "zcmodel_flux_flags", "ycmodel_flux_flags"]

mask = (np.isfinite(combined[finite_cols]).all(axis=1)
        & ~combined[flag_cols].any(axis=1))
combined = combined[mask]
In [8]:
# Load the FRANKENZ photo-z table, indexed by HSC object id.
frankenz_db_path = os.path.join(dwarfz.data_dir_default,
                                "HSC_matched_to_FRANKENZ.sqlite")
df_frankenz = pd.read_sql_table("photo_z",
                                "sqlite:///" + frankenz_db_path,
                                index_col="object_id")
df_frankenz.head()
Out[8]:
In [9]:
# Attach the FRANKENZ photo-z estimate and its risk score, keyed on the
# HSC object id of each matched galaxy
combined = combined.join(df_frankenz[["photoz_best", "photoz_risk_best"]],
                         on="catalog_2_ids")
In [10]:
# Dwarf definition: nearby (COSMOS photo-z < 0.15) and low stellar mass
# (8 < log10(M*) < 9, exclusive on both ends)
low_z = (combined.photo_z < .15)
low_mass = (combined.log_mass > 8) & (combined.log_mass < 9)
In [11]:
# Positive-class label for the classifier; the mean is the fraction of the
# cleaned matched sample that are low-z, low-mass dwarfs (class balance)
combined["low_z_low_mass"] = (low_z & low_mass)
combined.low_z_low_mass.mean()
Out[11]:
In [13]:
filename = "galaxy_images_training/2017_09_26-dwarf_galaxy_scores.csv"
In [14]:
!head -n 20 $filename
In [15]:
# RF scores table; downstream cells use its HSC_id, dwarf_prob and
# low_z_low_mass columns
df_dwarf_prob = pd.read_csv(filename)
df_dwarf_prob.head()
Out[15]:
In [16]:
theoretical_probs=np.linspace(0,1,num=20+1)
In [17]:
empirical_probs_RF = np.empty(theoretical_probs.size-1)
In [18]:
# For each reported-probability bin [lo, hi), compute the actual fraction of
# true dwarfs among galaxies whose RF score falls in that bin.
bin_edges = zip(theoretical_probs[:-1], theoretical_probs[1:])
for i, (lo, hi) in enumerate(bin_edges):
    in_bin = (df_dwarf_prob.dwarf_prob >= lo) & (df_dwarf_prob.dwarf_prob < hi)
    empirical_probs_RF[i] = df_dwarf_prob.low_z_low_mass[in_bin].mean()
In [19]:
color_RF = "g"
label_RF = "Random Forest"

# Distribution of reported RF scores, using the same bins as the calibration
plt.hist(df_dwarf_prob.dwarf_prob, bins=theoretical_probs)
plt.xlim(0,1)
plt.yscale("log")
plt.ylabel("counts")

plt.figure()
# Reliability diagram. `plt.step` already draws a step curve, so the old
# `linestyle="steps"` argument was dropped: modern matplotlib rejects it
# (linestyle only accepts dash patterns; "steps" was a pre-2.0 alias for
# drawstyle). The first bin value is duplicated so the curve starts at x=0.
plt.step(theoretical_probs, [empirical_probs_RF[0], *empirical_probs_RF],
         color=color_RF, label=label_RF)
# Ideal (perfectly calibrated) reference, drawn in step style.
# NOTE(review): the -.05 offset (one bin width) appears intended to align the
# stepped ideal line with the binned empirical curve -- confirm.
plt.plot(theoretical_probs, theoretical_probs-.05,
         drawstyle="steps", color="black", label="ideal", linestyle="dashed")
plt.xlabel("Reported Probability")
plt.ylabel("Actual (Binned) Probability")
plt.legend(loc="best")
plt.xlim(0,1)
plt.ylim(0,1)
Out[19]:
In [21]:
COSMOS_field_area = 2 # sq. deg.
In [22]:
# Target surface density of training galaxies; previously a bare magic number.
GALAXIES_PER_SQ_DEG = 1000
sample_size = int(GALAXIES_PER_SQ_DEG * COSMOS_field_area)
print("sample size: ", sample_size)
In [23]:
# Take the `sample_size` highest-scoring galaxies as the CNN training sample,
# then report the effective score threshold this implies.
selected_galaxies = (df_dwarf_prob
                     .sort_values("dwarf_prob", ascending=False)
                     .head(sample_size))
score_threshold = selected_galaxies.dwarf_prob.min()
print("threshold: ", score_threshold)
print("galaxies at or above threshold: ", (df_dwarf_prob.dwarf_prob >= score_threshold).sum())
In [24]:
df_dwarf_prob.dwarf_prob.min()
Out[24]:
In [25]:
# Score distribution with the selection threshold overlaid.
score_bins = np.linspace(0, 1, num=100)
plt.hist(df_dwarf_prob.dwarf_prob, bins=score_bins, label="RF score")
plt.axvline(selected_galaxies.dwarf_prob.min(),
            linestyle="dashed", color="black", label="threshold")
plt.xlabel("Dwarf Prob.")
plt.ylabel("counts (galaxies)")
plt.yscale("log")
plt.legend(loc="best")
In [26]:
# how balanced is the CNN training set? What fraction are actually dwarfs?
# (compare against the full-sample dwarf fraction computed earlier)
selected_galaxies.low_z_low_mass.mean()
Out[26]:
For technical details, see: https://hsc-release.mtk.nao.ac.jp/das_quarry/manual.html
In [27]:
# Sky coordinates of the selected galaxies (from the HSC table, keyed by
# HSC_id), to be fed to the HSC DAS quarry cutout service
selected_galaxy_coords = HSC.df.loc[selected_galaxies.HSC_id][["ra", "dec"]]
selected_galaxy_coords.head()
Out[27]:
In [122]:
selected_galaxy_coords.to_csv("galaxy_images_training/2017_09_26-selected_galaxy_coords.csv")
In [29]:
# Parameters for the HSC DAS quarry cutout requests
width = "20asec"  # used for both sw and sh in each request line
filters = ["HSC-G", "HSC-R", "HSC-I", "HSC-Z", "HSC-Y"]  # all five broad bands
rerun = "pdr1_deep"
In [49]:
tmp_filename = "galaxy_images_training/tmp_quarry.txt"
# Strip the "txt" extension but keep the trailing dot, so the `split` batches
# below come out as tmp_quarry.a, tmp_quarry.b, ...
tmp_filename_missing_ext = tmp_filename[:-3]

# Write one quarry request line per (galaxy, band). The "#?" header is
# deliberately omitted here; it is prepended to each split batch afterwards.
with open(tmp_filename, mode="w") as f:
    # print("#? ra dec filter sw sh rerun", file=f)
    print_formatter = " {galaxy.ra:.6f}deg {galaxy.dec:.6f}deg {filter} {width} {width} {rerun}"
    for galaxy in selected_galaxy_coords.itertuples():
        # `band` rather than `filter`, to avoid shadowing the builtin
        for band in filters:
            print(print_formatter.format(galaxy=galaxy,
                                         width=width,
                                         filter=band,
                                         rerun=rerun),
                  file=f)
In [51]:
!head -n 10 $tmp_filename
In [52]:
!wc -l $tmp_filename
In [53]:
!split -a 1 -l 1000 $tmp_filename $tmp_filename_missing_ext
To do: I need to find a way to deal with the script below when there aren't any files to process (i.e. they've already been processed)
In [72]:
# Prepend the quarry "#?" header to every split batch and give it a ".txt"
# extension, removing the headerless original afterwards. A no-op when the
# batches have already been processed (the glob matches nothing).
for batch_path in sorted(glob.glob("galaxy_images_training/tmp_quarry.?")):
    with open(batch_path, mode="r") as src:
        batch_contents = src.read()
    with open(batch_path + ".txt", mode="w") as dst:
        dst.write("#? ra dec filter sw sh rerun\n")
        dst.write(batch_contents)
    os.remove(batch_path)
In [74]:
!ls galaxy_images_training/
1)
First you need to set up your authentication information. Add it to a file like galaxy_images_training/curl_netrc which should look like:
machine hsc-release.mtk.nao.ac.jp login <your username> password <your password>
This allows you to script the curl calls, without being prompted for your password each time
2a)
The curl call (in (2b)) will spit out files into a somewhat unpredictably named directory, like arch-170928-231223. You should rename this to match the batch suffix. You really should do this right away, so you don't get confused. In general I add the rename onto the same line as the curl call:
curl ... | tar xvf - && mv arch-* quarry_files_a
This only works if it finds one arch- directory, but you really shouldn't have multiple arch directories at any given time; that's a recipe for getting your galaxies mixed up.
2b)
Here's the actual curl invocation:
curl --netrc-file galaxy_images_training/curl_netrc https://hsc-release.mtk.nao.ac.jp/das_quarry/cgi-bin/quarryImage --form list=@<coord list filename> | tar xvf -
In [82]:
!head -n 10 galaxy_images_training/tmp_quarry.a.txt
In [92]:
!ls -lhtr galaxy_images_training/quarry_files_a | head -n 11
steps:
1) figure out what suffix we're using (e.g. "a")
2) convert suffix into zero-indexed integer (e.g. 0)
3) determine which HSC-ids corresponded to that file
(suffix_int*200 to (suffix_int+1)*200)
4) swapping the line number prefix with an HSC_id prefix, while preserving the rest of the filename (esp. the band information)
In [93]:
!mkdir -p galaxy_images_training/quarry_files
In [96]:
# Map a split-file suffix letter ("a".."j") to its zero-based batch index.
prefix_map = dict(zip("abcdefghij", range(10)))
In [97]:
# Which batch is being processed in the cells below
prefix = "a"
prefix_int = prefix_map[prefix]
print(prefix_int)
In [109]:
# Collect the quarry line-number prefixes of the cutouts delivered for this
# batch (filenames look like "<line number>-...-<band>-....fits").
batch_pattern = "galaxy_images_training/quarry_files_{prefix}/*".format(prefix=prefix)
files = glob.glob(batch_pattern)
file_numbers = np.array(sorted(int(os.path.basename(f).split("-")[0])
                               for f in files))
In [113]:
# Each batch file holds 1000 request lines, and the quarry numbers its output
# by input line; line 1 is the "#?" header, so valid numbers run 2..1001.
desired_range = np.arange(2,1002)
print("missing file numbers: ", desired_range[~np.isin(desired_range, file_numbers)])
In [114]:
# A batch of 1000 requests covers 200 galaxies (x 5 bands); slice this batch's
# galaxies out of the score-ordered selection.
i_start = prefix_int*200
i_end = (prefix_int+1)*200
print(i_start, i_end)
HSC_ids_for_prefix = selected_galaxies.iloc[i_start:i_end].HSC_id.values
In [115]:
HSC_ids_for_prefix
Out[115]:
In [117]:
# Rename each downloaded cutout from its quarry line-number prefix to the
# HSC id of the galaxy it belongs to, and move it into `quarry_files`.
#
# Quarry numbering starts at 2 (line 1 of each batch file is the "#?" header),
# hence the increment-before-use below.
i_file = 1
for HSC_id in HSC_ids_for_prefix:
    # `band` rather than `filter`, to avoid shadowing the builtin
    for band in filters:
        i_file += 1
        filenames = glob.glob(
            "galaxy_images_training/quarry_files_{prefix}/{i_file}-*-{band}-*.fits".format(
                prefix=prefix,
                i_file=i_file,
                band=band))
        if len(filenames) > 1:
            raise ValueError("too many matching files for i_file={}".format(i_file))
        elif len(filenames) == 0:
            # cutout was never delivered (cf. the missing-file-numbers check)
            print("skipping missing i_file={}".format(i_file))
            continue
        old_filename = filenames[0]
        new_filename = old_filename.replace("/{i_file}-".format(i_file=i_file),
                                            "/{HSC_id}-".format(HSC_id=HSC_id))
        new_filename = new_filename.replace("quarry_files_{prefix}".format(prefix=prefix),
                                            "quarry_files")
        if os.path.exists(old_filename):
            # Move the renamed file into place. (A stale comment here used to
            # claim this was a copy; os.rename is a move.)
            os.rename(old_filename, new_filename)
In [ ]:
In [121]:
# Delete the per-batch request lists now that their cutouts are processed.
for batch_list in glob.glob("galaxy_images_training/tmp_quarry.?.txt"):
    os.remove(batch_list)
In [123]:
for batch_dir in glob.glob("galaxy_images_training/quarry_files_?"):
    # os.rmdir fails on a non-empty directory -- intentional: by this point
    # every renamed cutout should have been moved into `quarry_files`.
    os.rmdir(batch_dir)
In [124]:
# Final cleanup: remove the master request list, if it still exists.
tmp_quarry_path = "galaxy_images_training/tmp_quarry.txt"
if os.path.exists(tmp_quarry_path):
    os.remove(tmp_quarry_path)
In [ ]: