In this notebook we examine the accuracy as a function of magnitude for sources with spectroscopic classifications from DEIMOS COSMOS survey. The DEIMOS set contains $\sim$ 10K sources, and $\sim$ 2.7K sources are crossmatched with PS1 catalog.
The overall accuracy for the classification by the ML model we developed is $\sim$ 95%, but the FoM @FPR=0.05 is lower than 0.4, which is worse than the FoM obtained with HSTxPS1 catalog.
We found the accuracy of the HST classification is also $\sim$ 95%. The performance of the ML model, therefore, is reasonable because the ML model is trained with the HST classification.
In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
In [2]:
_df = pd.read_table('DEIMOS/deimos_10K_March2018/deimos.tbl', header=None)
In [3]:
arr = np.empty((len(_df), len(_df.iloc[0][0].split())), dtype='<U50')
for i in range(len(_df)):
i_row = [k for k in _df.iloc[i][0].split(' ') if (k != '')and(k != ' ')]
for j in range(len(_df.iloc[0][0].split())):
arr[i][j] = i_row[j]
In [4]:
df = pd.DataFrame(arr)
In [5]:
ra = np.array(df[1], dtype=float)
dec = np.array(df[2], dtype=float)
sel = np.array(df[3], dtype=int)
imag = np.array(df[4].replace('null', '-999').replace(' null', '-999'), dtype=float)
kmag = np.array(df[5].replace('null', '-999').replace(' null', '-999'), dtype=float)
zspec = np.array(df[6].replace('null', '-999').replace(' null', '-999'), dtype=float)
Qflag = np.array(df[7].replace('null', '-999').replace(' null', '-999'), dtype=int)
Q = np.array(df[8].replace('null', '-999').replace(' null', '-999'), dtype=float)
In [6]:
np.array(df[9][0:20])
Out[6]:
In [7]:
sgFlag = np.empty(len(df), dtype=int)
for i in range(len(df[9])):
if 'star' in df[9][i]:
sgFlag[i] = 1 # star
elif 'null' in df[9][i]:
sgFlag[i] = -999 # null
else:
sgFlag[i] = 0 # galaxy
if "Remarks" contains "star", the source is classifyed star.
In [41]:
plt.hist(imag[sgFlag!=-999], bins=np.arange(15, 28, 0.2), color='0.8', label='All')
plt.hist(imag[sgFlag==0], bins=np.arange(15, 28, 0.2), alpha=0.5, label='GALAXY')
plt.hist(imag[sgFlag==1], bins=np.arange(15, 28, 0.2), alpha=0.5, label='STAR')
plt.yscale('log')
plt.xlabel('i mag'); plt.ylabel('#')
plt.legend(loc='best')
plt.show()
The distribution of galaxies looks similar to that of the HST COSMOS catalog, but that of stars has a peak at i-mag$\sim$22, which is not shown in that of the HSTxPS1 catalog.
In [9]:
df = pd.DataFrame()
df['ra'] = ra; df['dec'] = dec
df['sel'] = sel
df['imag'] = imag; df['kmag'] = kmag
df['zspec'] = zspec
df['Qflag'] = Qflag; df['Q'] = Q
df['class'] = sgFlag
In [42]:
df[0:10]
Out[42]:
In [11]:
df.to_csv('./DEIMOS/DEIMOS.csv', index=None)
In [12]:
import star_galaxy_models
rf_obj = star_galaxy_models.RandomForestModel()
rf_obj.read_rf_from_pickle()
In [13]:
features = ['wwpsfChiSq', 'wwExtNSigma', 'wwpsfLikelihood',
'wwPSFKronRatio', 'wwPSFKronDist', 'wwPSFApRatio',
'wwmomentRH', 'wwmomentXX', 'wwmomentXY', 'wwmomentYY',
'wwKronRad']
In [14]:
from sklearn.metrics import roc_curve, accuracy_score, auc, make_scorer
In [13]:
ps1_dei = pd.read_csv('./DEIMOS/PS1_DEIMOS_features.csv').drop_duplicates(subset='objid')
print("PS1xDEIMOS catalog constains %i sources."%len(ps1_dei))
In [14]:
ps1_dei_det_mask = np.logical_and(ps1_dei['class'] != -999, (ps1_dei.nDetections>0)&(ps1_dei.wwKronFlux>0))
ps1_dei = ps1_dei[ps1_dei_det_mask]
print("%i sources are classified by both of the DEIMOS and the ML model."%len(ps1_dei))
In [15]:
ps1_df = pd.read_csv('./DEIMOS/HST_COSMOS_features.csv')
In [16]:
dupl_mask = np.empty(len(ps1_dei), dtype=bool)
for i in range(len(dupl_mask)):
dupl_mask[i] = ps1_dei.objid.iloc[i] in np.array(ps1_df.objid)
print("Only %i sources are included both of the PS1xDEIMOS and the PS1xHST catalog..."%np.sum(dupl_mask))
ps1_dei = ps1_dei[~dupl_mask]
#print("%i sources are not contained in PS1xHST catalog."%len(ps1_dei))
In [19]:
kron_mag = -2.5*np.log10(ps1_dei.wwKronFlux/3631)
ps1_dei_features = ps1_dei[features]
ps1_dei_class = ps1_dei['class']
In [20]:
ps1_dei_score = rf_obj.rf_clf_.predict_proba(ps1_dei_features)
ps1_dei_pred = rf_obj.rf_clf_.predict(ps1_dei_features)
In [21]:
print("Overall accuracy of the classification by the ML model is %f"%accuracy_score(ps1_dei_class, ps1_dei_pred))
In [22]:
fpr, tpr, thre = roc_curve(ps1_dei_class, ps1_dei_score[:,1])
plt.grid(linestyle='dotted')
plt.plot(fpr, tpr, 'k-')
#plt.xscale('log'); plt.yscale('log')
plt.xlim(1e-3, 1e-1); plt.ylim(0.1, 1.01)
plt.xlabel('FPR'); plt.ylabel('TPR')
plt.show()
In [23]:
ps1_dei_class = np.array(ps1_dei_class)
ps1_dei_score = np.array(ps1_dei_score)
kron_mag = np.array(kron_mag)
binwidth = 1.5
Nboot = 100
mag_array = np.arange(14 , 23+binwidth, binwidth)
kron_mag = np.array(-2.5*np.log10(ps1_dei['wwKronFlux']/3631))
ml_acc_arr = np.zeros_like(mag_array, dtype=float)
ml_boot_scatt = np.vstack((np.zeros_like(mag_array, dtype=float), np.zeros_like(mag_array, dtype=float)))
for bin_num, binedge in enumerate(mag_array):
bin_sources = np.where((kron_mag >= binedge) & (kron_mag < binedge + binwidth))
ml_acc_arr[bin_num] = accuracy_score(ps1_dei_class[bin_sources],
ps1_dei_pred[bin_sources])
ml_boot_acc = np.empty(Nboot)
for i in range(Nboot):
boot_sources = np.random.choice(bin_sources[0], len(bin_sources[0]),
replace=True)
ml_boot_acc[i] = accuracy_score(ps1_dei_class[boot_sources],
ps1_dei_pred[boot_sources])
ml_boot_scatt[:,bin_num] = np.percentile(ml_boot_acc, [16, 84])
In [24]:
from sklearn.neighbors import KernelDensity
kde_grid = np.linspace(10,26,200)
deimos_stars = np.where(ps1_dei_class == 1)
deimos_gal = np.where(ps1_dei_class == 0)
deimos_kde_gal_norm = len(deimos_gal[0])/len(ps1_dei_class)
deimos_kde_star_norm = 1 - deimos_kde_gal_norm
kde_deimos = KernelDensity(bandwidth=1.059*np.std(kron_mag, ddof=1)*len(kron_mag)**(-0.2),
rtol=1E-4)
kde_deimos.fit(kron_mag[:, np.newaxis])
kde_deimos_stars = KernelDensity(bandwidth=1.059*np.std(kron_mag[deimos_stars], ddof=1)*len(kron_mag[deimos_stars])**(-0.2),
rtol=1E-4)
kde_deimos_stars.fit(kron_mag[deimos_stars[0], np.newaxis])
kde_deimos_gal = KernelDensity(bandwidth=1.059*np.std(kron_mag[deimos_gal], ddof=1)*len(kron_mag[deimos_gal])**(-0.2),
rtol=1E-4)
kde_deimos_gal.fit(kron_mag[deimos_gal[0], np.newaxis])
pdf_deimos = np.exp(kde_deimos.score_samples(kde_grid[:, np.newaxis]))
pdf_deimos_stars = np.exp(kde_deimos_stars.score_samples(kde_grid[:, np.newaxis]))
pdf_deimos_gal = np.exp(kde_deimos_gal.score_samples(kde_grid[:, np.newaxis]))
In [35]:
from matplotlib.ticker import MultipleLocator
#import seaborn as sns
color_dict = {'ml': "black"}
mag_bin_centers = mag_array + binwidth/2
#cmap_star = sns.cubehelix_palette(rot=0.5, light=0.7,dark=0.3,as_cmap=True)
#cmap_gal = sns.cubehelix_palette(start=0.3,rot=-0.5,light=0.7,dark=0.3,as_cmap=True)
fig, ax = plt.subplots(figsize=(8, 5))
ax.grid(linestyle='dotted', zorder=1)
ax.errorbar(mag_bin_centers, ml_acc_arr,
yerr=np.abs(ml_boot_scatt - ml_acc_arr),
ls='-', lw=.75, fmt='o',
color=color_dict['ml'], label="ML model",
linewidth=1.5, markersize=7.5, zorder=5)
# add KDE plots
ax.fill(kde_grid, pdf_deimos + 0.5, alpha=0.4, color="0.7", zorder=2)
ax.fill(kde_grid, pdf_deimos_gal*deimos_kde_gal_norm + 0.5, alpha=0.7, zorder=3)#, color=cmap_gal(0.25))
ax.fill(kde_grid, pdf_deimos_stars*deimos_kde_star_norm + 0.5, alpha=0.7, zorder=4)#, color=cmap_star(0.25))
ax.set_ylim(0.5,1.01)
ax.set_xlim(14, 24)
ax.tick_params(which="both", top=True, right=True, labelsize=15)
ax.set_xlabel('whiteKronMag', fontsize=15)
ax.set_ylabel('Accuracy', fontsize=15)
ax.yaxis.set_minor_locator(MultipleLocator(0.025))
ax.xaxis.set_major_locator(MultipleLocator(2))
ax.xaxis.set_minor_locator(MultipleLocator(0.5))
#ax.legend(bbox_to_anchor=(0.01, 0.3, 1., 0.102), loc=3, fontsize=13)
fig.subplots_adjust(top=0.98,right=0.98,left=0.1,bottom=0.12)
In [201]:
from astropy.table import Table
In [202]:
deimos = pd.read_csv('./DEIMOS/DEIMOS.csv')
In [203]:
hst = Table.read('./DEIMOS/HST_COSMOS.fit').to_pandas()
In [204]:
hstX = np.empty((len(hst), 2), dtype=np.float64)
hstX[:, 0] = hst['ALPHA_J2000']
hstX[:, 1] = hst['DELTA_J2000']
deiX = np.empty((len(deimos), 2), dtype=np.float64)
deiX[:, 0] = deimos['ra']
deiX[:, 1] = deimos['dec']
Cross-matching the sources in the DEIMOS catalog within radius = 0.5 arcsec around those in the HST catalog
In [205]:
from astroML.crossmatch import crossmatch_angular
max_radius = 0.5 / 3600 # 0.5 arcsec
dist, ind = crossmatch_angular(hstX, deiX, max_radius)
match = ~np.isinf(dist)
In [206]:
print("The number of sources cross-matched is %i"%np.sum(match))
In [207]:
plt.hist(dist[match]*3600, bins=np.arange(0, 0.5,0.01))
plt.xlabel('Distance')
plt.show()
The distribution of the distance has a peak at 0.1 arcsec. Changes the cross-matching radius to 0.3 arcsec.
In [208]:
from astroML.crossmatch import crossmatch_angular
max_radius = 0.3 / 3600 # 0.3 arcsec
dist, ind = crossmatch_angular(hstX, deiX, max_radius)
match = ~np.isinf(dist)
In [209]:
print("The number of sources cross-matched is %i"%np.sum(match))
In [210]:
plt.hist(dist[match]*3600, bins=np.arange(0, 0.5,0.01))
plt.xlabel('Distance')
plt.show()
In [211]:
hst_match = hst[match]
deimos_match = deimos.loc[ind[match]]
Remove duplicated sources.
In [212]:
dupl_mask = deimos_match.duplicated('ra')
In [213]:
deimos_match_uniq = deimos_match[~dupl_mask.values]
hst_match_uniq = hst_match[~dupl_mask.values]
Remove the sources which are not able to be classified to star or galaxy by the DEIMOS catalog.
In [214]:
good_mask = deimos_match_uniq["class"] != -999
In [215]:
deimos_match_uniq_good = deimos_match_uniq[good_mask.values]
hst_match_uniq_good = hst_match_uniq[good_mask.values]
In [216]:
print("The number of sources used to verify the classification accuracy is %i"%len(deimos_match_uniq))
In [217]:
xlims = [12, 29]
ylims = [12, 29]
plt.hexbin(hst_match_uniq["MAG_BEST"], deimos_match_uniq["imag"],
extent=[xlims[0], xlims[1], ylims[0], ylims[1]],
bins='log', cmap='viridis')
plt.xlim(xlims); plt.ylim(ylims)
plt.xlabel('MAG_BEST(HST)')
plt.ylabel('imag(DEIMOS)')
Out[217]:
In [218]:
from sklearn.metrics import accuracy_score
In [219]:
print("The overall accuracy od the crassification of the HST catalog is %0.4f"\
%accuracy_score(deimos_match_uniq_good["class"], hst_match_uniq_good["MU_CLASS"]-1))
In [220]:
dei_class = np.array(deimos_match_uniq_good["class"], dtype=int)
hst_class = np.array(hst_match_uniq_good["MU_CLASS"]-1, dtype=int)
kron_mag = np.array(hst_match_uniq_good["MAG_BEST"])
binwidth = 1
Nboot = 100
mag_array = np.arange(14 , 26+binwidth, binwidth)
ml_acc_arr = np.zeros_like(mag_array, dtype=float)
ml_boot_scatt = np.vstack((np.zeros_like(mag_array, dtype=float), np.zeros_like(mag_array, dtype=float)))
for bin_num, binedge in enumerate(mag_array):
bin_sources = np.where((kron_mag >= binedge) & (kron_mag < binedge + binwidth))
ml_acc_arr[bin_num] = accuracy_score(dei_class[bin_sources],
hst_class[bin_sources])
ml_boot_acc = np.empty(Nboot)
for i in range(Nboot):
boot_sources = np.random.choice(bin_sources[0], len(bin_sources[0]),
replace=True)
ml_boot_acc[i] = accuracy_score(dei_class[boot_sources],
hst_class[boot_sources])
ml_boot_scatt[:,bin_num] = np.percentile(ml_boot_acc, [16, 84])
In [221]:
from sklearn.neighbors import KernelDensity
kde_grid = np.linspace(10,29,200)
deimos_stars = np.where(dei_class == 1)
deimos_gal = np.where(dei_class == 0)
deimos_kde_gal_norm = len(deimos_gal[0])/len(dei_class)
deimos_kde_star_norm = 1 - deimos_kde_gal_norm
kde_deimos = KernelDensity(bandwidth=1.059*np.std(kron_mag, ddof=1)*len(kron_mag)**(-0.2),
rtol=1E-4)
kde_deimos.fit(kron_mag[:, np.newaxis])
kde_deimos_stars = KernelDensity(bandwidth=1.059*np.std(kron_mag[deimos_stars], ddof=1)*len(kron_mag[deimos_stars])**(-0.2),
rtol=1E-4)
kde_deimos_stars.fit(kron_mag[deimos_stars[0], np.newaxis])
kde_deimos_gal = KernelDensity(bandwidth=1.059*np.std(kron_mag[deimos_gal], ddof=1)*len(kron_mag[deimos_gal])**(-0.2),
rtol=1E-4)
kde_deimos_gal.fit(kron_mag[deimos_gal[0], np.newaxis])
pdf_deimos = np.exp(kde_deimos.score_samples(kde_grid[:, np.newaxis]))
pdf_deimos_stars = np.exp(kde_deimos_stars.score_samples(kde_grid[:, np.newaxis]))
pdf_deimos_gal = np.exp(kde_deimos_gal.score_samples(kde_grid[:, np.newaxis]))
In [222]:
from matplotlib.ticker import MultipleLocator
#import seaborn as sns
color_dict = {'ml': "black"}
mag_bin_centers = mag_array + binwidth/2
#cmap_star = sns.cubehelix_palette(rot=0.5, light=0.7,dark=0.3,as_cmap=True)
#cmap_gal = sns.cubehelix_palette(start=0.3,rot=-0.5,light=0.7,dark=0.3,as_cmap=True)
fig, ax = plt.subplots(figsize=(8, 5))
ax.grid(linestyle='dotted', zorder=1)
ax.errorbar(mag_bin_centers, ml_acc_arr,
yerr=np.abs(ml_boot_scatt - ml_acc_arr),
ls='-', lw=.75, fmt='o',
color=color_dict['ml'], label="ML model",
linewidth=1.5, markersize=7.5, zorder=5)
# add KDE plots
ax.fill(kde_grid, pdf_deimos + 0.5, alpha=0.4, color="0.7", zorder=2)
ax.fill(kde_grid, pdf_deimos_gal*deimos_kde_gal_norm + 0.5, alpha=0.7, zorder=3)#, color=cmap_gal(0.25))
ax.fill(kde_grid, pdf_deimos_stars*deimos_kde_star_norm + 0.5, alpha=0.7, zorder=4)#, color=cmap_star(0.25))
ax.set_ylim(0.5,1.01)
ax.set_xlim(14, 27)
ax.tick_params(which="both", top=True, right=True, labelsize=15)
ax.set_xlabel('MAG_BEST', fontsize=15)
ax.set_ylabel('Accuracy', fontsize=15)
ax.yaxis.set_minor_locator(MultipleLocator(0.025))
ax.xaxis.set_major_locator(MultipleLocator(2))
ax.xaxis.set_minor_locator(MultipleLocator(0.5))
#ax.legend(bbox_to_anchor=(0.01, 0.3, 1., 0.102), loc=3, fontsize=13)
fig.subplots_adjust(top=0.98,right=0.98,left=0.1,bottom=0.12)
In [ ]: