This notebook demonstrates every step of my candidate selection up to, but not including, the visual inspection.
In [1]:
import numpy as np
import splat
import wisps.data_analysis as wispd
from wisps.data_analysis import selection_criteria as sel_crt
import shapey
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import wisps
import matplotlib as mpl
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# pandas is already imported in the first cell, so this re-import is redundant.
import pandas as pd
# Pickled table read from the OUTPUT_FILES directory; `df` is not referenced again
# in the visible part of this notebook.
df=pd.read_pickle(wisps.OUTPUT_FILES+'/stdspex_sample.pkl')
In [3]:
df2=pd.read_pickle(wisps.LIBRARIES+'/spex_data_set_table.pkl')
In [4]:
df2.columns
Out[4]:
In [5]:
# Master table, stored under the 'new' key of the HDF5 file in the LIBRARIES directory.
master_path = wisps.LIBRARIES + '/master_dataset.hdf'
alldata = pd.read_hdf(master_path, key='new')
In [6]:
sf=pd.read_pickle(wisps.OUTPUT_FILES+'/selection_function.pkl') #the simulated spectral data
In [7]:
simulated_data=pd.DataFrame.from_records(pd.DataFrame(sf).values.flatten())
In [8]:
simulated_data=wisps.Annotator.reformat_table(simulated_data)
In [9]:
star_ids=alldata[alldata['class_star'] !=0]
In [10]:
stars=wisps.Annotator.reformat_table(star_ids).reset_index(drop=True)
In [11]:
# SpeX table in the reformatted layout; reused by the comparison scatter plot further down.
spex=wisps.Annotator.reformat_table(wisps.datasets['spex'])

# One figure per spectral index: distribution of log10(index) for two samples.
# NOTE(review): `comb` and `st` are not defined anywhere in the visible notebook, so this
# cell only runs when they are left over in the kernel. Define them above (or replace them
# with the intended tables) so the notebook survives Restart & Run All.
for k in wisps.INDEX_NAMES:
    fig, ax=plt.subplots()
    # log10 of zero or negative indices gives +/-inf or NaN; those entries are dropped.
    comb_log=comb[k].apply(np.log10).replace((np.inf, -np.inf), np.nan).dropna()
    st_log=st[k].apply(np.log10).replace((np.inf, -np.inf), np.nan).dropna()
    # `density=True` replaces `normed=True`, which Matplotlib removed from hist().
    ax.hist(comb_log, bins=32, histtype='step', density=True)
    ax.hist(st_log, bins=32, histtype='step', density=True, color='k')
    ax.set_title(r'$'+str(k.split(' ')[0])+'$', fontsize=18)
In [12]:
cands=pd.read_hdf(wisps.OUTPUT_FILES+'/true_spectra_cands.hdf')
In [13]:
cands=wisps.Annotator.reformat_table(cands)
In [14]:
# The cut was written as `(spt >= 17) | (spt >= 16)`, which is exactly `spt >= 16`.
# NOTE(review): if the stricter `>= 17` threshold was intended, change it here.
cands = cands[cands.spt >= 16]
In [15]:
# Rows of `stars` with f_test < 0.4 and snr2 >= 1, re-indexed from 0. `dt` is the table passed
# to save_criteria(conts=...) below and the grey background population of the index-index plots.
dt=stars[(stars.f_test< 0.4) &(stars.snr2>=1.)].reset_index(drop=True)
In [16]:
dt['spt']=(dt['spt']).apply(wisps.make_spt_number).apply(float)
In [17]:
# Build the selection criteria with `dt` as the `conts` table; the next cell reads them back
# with crts_from_file(). The return value `gbhio` is not used again in the visible notebook.
gbhio=sel_crt.save_criteria(conts=dt)
In [18]:
crts=sel_crt.crts_from_file()
In [19]:
contamns=pd.DataFrame([ x.contamination for x in crts.values()])
compls=pd.DataFrame([ x.completeness for x in crts.values()])
In [20]:
compls.min()
Out[20]:
In [21]:
contamns.index=[x for x in crts.keys()]
compls.index=[x for x in crts.keys()]
In [22]:
1/100
Out[22]:
In [23]:
# Highlight, row by row, every contamination value strictly between 0 and 0.1.
contamns.style.apply(
    lambda row: ["background-color: #7FDBFF" if (v > 0. and v < 0.1) else ""
                 for v in row],
    axis = 1)
Out[23]:
In [24]:
crts['H_2O-1/J-Cont H_2O-2/H_2O-1']
Out[24]:
In [25]:
#
In [26]:
# Index pair chosen for each selection box.
# Key: name of an entry in `crts`, written as '<x index> <y index>' (the two halves are used
# as the axis labels of the index-index plots).
# Value: `shape_name` of the box to use for that pair. 'Y dwarfs' and 'subdwarfs' are
# special-cased (case-insensitively) in plot_index_box.
to_use={'H_2O-1/J-Cont H_2O-1+CH_4/H-Cont':'L0-L5',
'H_2O-1+H_2O-2/J-Cont H_2O-1+H_2O-2/H-Cont':'L5-T0' ,
'H_2O-1/J-Cont H_2O-2/J-Cont':'M7-L0',
'H_2O-1+H_2O-2/J-Cont H_2O-2+CH_4/J-Cont': 'T0-T5',
'H_2O-2/J-Cont H_2O-1+CH_4/H-Cont': 'T5-T9',
'H_2O-1+CH_4/J-Cont H_2O-2+CH_4/J-Cont': 'Y dwarfs',
'H_2O-1/J-Cont H-cont/J-Cont': 'subdwarfs'}
In [ ]:
In [27]:
len(to_use)
Out[27]:
In [28]:
def box_parameters(idx, spt_range):
    """Print the coefficients, scatter and selection statistics of one box.

    idx: object exposing `shapes` (each with `shape_name`, `coeffs`, `scatter`) and the
        `completeness` / `contamination` mappings keyed by box name.
    spt_range: name of the box to report; IndexError is raised when no shape has that name.

    Returns None; the summary line is written to stdout.
    """
    matching = [shape for shape in idx.shapes if shape.shape_name == spt_range]
    box = matching[0]
    coeff_m = round(box.coeffs[0], 2)
    coeff_b = round(box.coeffs[1], 2)
    scatter = round(box.scatter, 2)
    completeness = round(idx.completeness[spt_range], 2)
    contamination = round(idx.contamination[spt_range], 3)
    template = '{} {} m: {} b: {} s:{}, comp : {}, cont: {}'
    print(template.format(spt_range, idx, coeff_m, coeff_b, scatter,
                          completeness, contamination))
In [29]:
spex_df=wisps.Annotator.reformat_table(wisps.datasets['spex']).reset_index(drop=True)
manj=wisps.Annotator.reformat_table(wisps.datasets['manjavacas']).reset_index(drop=True)
schn=wisps.Annotator.reformat_table(wisps.datasets['schneider']).reset_index(drop=True)
In [30]:
# Comparison sample used for the 'Y dwarfs' box: the rows of `manj` whose numeric spectral
# type is above 38, followed by every row of `schn`, re-indexed from 0.
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
ydwarfs=pd.concat([manj[manj['spt'].apply(wisps.make_spt_number)>38], schn]).reset_index(drop=True)
In [31]:
spex_df['spt']=np.vstack(spex_df['spt'].values)[:,0]
manj['spt']=np.vstack(manj['spt'].values)[:,0]
schn['spt']=np.vstack(schn['spt'].values)[:,0]
In [32]:
cands_dff=cands.reset_index(drop=True)
In [33]:
import pickle

# Write the index-pair -> box-name mapping (`to_use`) to disk as a pickle.
output_file = wisps.OUTPUT_FILES + '/best_indices_to_use.pkl'
with open(output_file, 'wb') as handle:
    pickle.dump(to_use, handle)
In [34]:
#dt
In [35]:
fp={}
In [36]:
cands_dff.grism_id=cands_dff.grism_id.apply(lambda x: x.lower())
In [37]:
stars.grism_id=stars.grism_id.apply(lambda x: x.lower())
In [38]:
cx=cands_dff[cands_dff.grism_id.isin(stars.grism_id)]
In [39]:
cy=stars[stars.grism_id.isin(cx.grism_id)]
In [40]:
idx=crts['H_2O-1/J-Cont H_2O-2/H_2O-1']
In [41]:
cands_dff['spt']=np.vstack(cands_dff['spt'].values)
In [42]:
cands_indices=pd.DataFrame.from_records([x.indices for x in cands_dff.spectra])
In [43]:
# Copy every index column of `cands_indices` onto the candidate table under the same name.
for index_name in map(str, wisps.INDEX_NAMES):
    cands_dff[index_name] = cands_indices[index_name]
In [44]:
cands_dff=wisps.Annotator.reformat_table(cands_dff)
In [45]:
simulated_data=(simulated_data[simulated_data.snr1>3.]).reset_index(drop=True)
In [46]:
simulated_data['spt']=np.vstack(simulated_data.spt_new.values)[:,0]
In [47]:
def plot_index_box(index_name, box_name, ax):
    """Draw one index-index selection box and the samples around it on `ax`.

    Parameters
    ----------
    index_name : key into the module-level `crts` mapping identifying the index pair.
    box_name : `shape_name` of the box to draw. It is also the key under which the
        false-positive rate is stored in the module-level `fp` dict.
    ax : matplotlib axes to draw on.

    Returns
    -------
    dict
        ``{str(box_name): box}`` where `box` is the matching shape object.

    Raises
    ------
    IndexError
        When the index pair has no shape named `box_name`.

    Side effects: writes ``fp[box_name]``, restyles the shape object in place and calls
    ``plt.tight_layout()``. Reads the module-level tables `spex_df`, `ydwarfs`, `dt`,
    `cands_dff` and `simulated_data`.
    """
    #get the index and the box
    idx=crts[index_name]
    bx=[x for x in idx.shapes if x.shape_name==box_name][0]
    xkey=idx.xkey
    ykey=idx.ykey
    is_subdwarf_box=box_name.lower()=='subdwarfs'

    def in_this_class(spts):
        """Boolean array flagging the spectral types that belong to `box_name`."""
        flags=spts.apply(lambda x: wisps.is_in_that_classification(x, box_name))
        # A plain bool array keeps the mask usable when `spts` is empty: an empty Series
        # presumably keeps its numeric dtype, which NumPy rejects as an index.
        # Assumes is_in_that_classification returns booleans -- TODO confirm.
        return np.asarray(flags, dtype=bool)

    #reference sample drawn in blue: SpeX by default, with dedicated tables for two boxes
    to_use_df=spex_df
    if box_name.lower()=='y dwarfs':
        to_use_df=ydwarfs
    if is_subdwarf_box:
        to_use_df=wisps.Annotator.reformat_table(idx.subdwarfs)
        to_use_df['spt']=17

    #background population
    ax.scatter(dt[xkey].apply(float).values, dt[ykey].apply(float).values, marker='o', s=10., facecolors='none', edgecolors='#AAAAAA')

    #points of each sample falling inside the box (selected coordinates, boolean mask)
    cands_slctd, cands_bools=bx._select(np.array([cands_dff[xkey].values,cands_dff[ykey].values]))
    trash_slctd, trsh_bools=bx._select(np.array([dt[xkey].values, dt[ykey].values]))
    simul_slctd, simul_bools=bx._select(np.array([simulated_data[xkey].values, simulated_data[ykey].values]))

    cands_in_that_class_bool=in_this_class((cands_dff[cands_bools]).spt)
    spexs_slctd_in_that_class_bool=in_this_class(to_use_df.spt)
    simulated_in_that_class_bool=in_this_class((simulated_data[simul_bools]).spt)
    if is_subdwarf_box:
        #every row of the subdwarf table is plotted, whatever its spt
        spexs_slctd_in_that_class_bool=np.ones(len(to_use_df), dtype=bool)

    cands_in_that_class=np.array([cands_slctd[0][cands_in_that_class_bool], cands_slctd[1][cands_in_that_class_bool]])
    #only feeds the disabled 'simulated' scatter below; kept so it can be switched back on
    simulated_in_that_class= np.array([simul_slctd[0][simulated_in_that_class_bool], simul_slctd[1][simulated_in_that_class_bool]])
    spexs_slctd_in_that_class=np.array([to_use_df[xkey].values[spexs_slctd_in_that_class_bool], to_use_df[ykey].values[spexs_slctd_in_that_class_bool]])

    #ax.scatter( simulated_in_that_class[0], simulated_in_that_class[1], facecolors='none', s=10,
    # edgecolors='#001f3f', label='simulated')
    ax.scatter(spexs_slctd_in_that_class[0], spexs_slctd_in_that_class[1], facecolors='#0074D9', label='SpeX', s=10.)
    ax.scatter(cands_dff[xkey], cands_dff[ykey], marker='x', facecolors='#111111', s=10.)
    ax.scatter( cands_in_that_class[0], cands_in_that_class[1], marker='x', facecolors='#FF851B', s=10,
               edgecolors='#2ECC40', label='candidates')

    #draw the box outline only
    bx.color='None'
    bx.alpha=1.
    bx.linewidth=1
    bx.linestyle='-'
    bx.edgecolor='#111111'
    bx.plot(ax=ax, only_shape=True, highlight=False)

    ax.set_xlabel(r'$'+str(idx.name.split(' ')[0])+'$', fontsize=15)
    ax.set_ylabel(r'$'+str(idx.name.split(' ')[1])+'$', fontsize=15)
    ax.set_title(box_name, fontsize=18)

    #pad the view by half of the box extent on every side
    #(the unused `xbuffer` standard-deviation computation was removed)
    xpad=0.5*abs(np.ptp(bx.xrange))
    ypad=0.5*abs(np.ptp(bx.yrange))
    ax.set_xlim([ bx.xrange[0]-xpad, bx.xrange[1]+xpad])
    ax.set_ylim([ bx.yrange[0]-ypad, bx.yrange[1]+ypad])
    ax.minorticks_on()

    #fraction of the `dt` points in the box that is not accounted for by candidates in the box
    if (trash_slctd.shape[1])==0:
        fprate=0.0
    else:
        fprate=(trash_slctd.shape[1]- cands_slctd.shape[1])/trash_slctd.shape[1]
    # NOTE(review): the exported notebook lost its indentation; this override is read as
    # applying to the subdwarf box even when no `dt` point falls inside it -- confirm.
    if is_subdwarf_box:
        fprate=1.
    fp[box_name]= fprate

    plt.tight_layout()
    return {str(box_name): bx}
In [48]:
# One panel per entry of `to_use`, laid out on a two-column grid.
ncols=2
nrows=-(-len(to_use)//ncols)  # ceiling division: 4 rows for the 7 entries of `to_use`
# squeeze=False keeps `ax` two-dimensional whatever the grid size
fig, ax=plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 4*nrows), squeeze=False)
panels=ax.flatten()  # flattened once instead of on every iteration
bxs=[]
for panel, (index_name, box_name) in zip(panels, to_use.items()):
    # each call returns a one-entry {box name: box} dict
    bxs.append(plot_index_box(index_name, box_name, panel))
plt.tight_layout()
# drop every panel that did not receive a box (just the last one for 7 entries)
for unused_panel in panels[len(bxs):]:
    fig.delaxes(unused_panel)
plt.savefig(wisps.OUTPUT_FIGURES+'/index_index_plots.jpeg', bbox_inches='tight')
In [49]:
to_use
Out[49]:
In [ ]:
In [ ]:
In [50]:
# Rows of `dt` whose grism_id also appears in `cands` (exact, case-sensitive match).
common=dt[dt.grism_id.isin(cands.grism_id)]
In [51]:
cands.columns
Out[51]:
In [52]:
# Same index pair on both panels: simulated spectra (left) against SpeX, the rows of `dt`
# shared with the candidates, and the candidates themselves (right).
xcol='H_2O-1+H_2O-2/J-Cont'
ycol='H_2O-1+CH_4/H-Cont'
fig, ax=plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(10, 6))

# left panel, coloured by log10 of snr1
ax[0].scatter(simulated_data[xcol], simulated_data[ycol],
              c=simulated_data.snr1.apply(np.log10), marker='o', alpha=0.1,
              label='simulated', cmap='viridis', s=1.)

# right panel
ax[1].scatter(spex[xcol], spex[ycol], label='original ', s=1.)
ax[1].scatter(common[xcol], common[ycol], facecolors='#FF851B', marker='o', s=20.)
ax[1].scatter(cands[xcol], cands[ycol], facecolors='#111111', marker='x', s=1.)

plt.tight_layout()
# the axes are shared, so limits set on the first panel apply to both
ax[0].set_xlim([-0.1, 2.])
ax[0].set_ylim([-0.1, 2.5])
for a in ax:
    a.legend(fontsize=18)
    a.set_xlabel(r'$'+xcol+'$', fontsize=18)
    a.set_ylabel(r'$'+ycol+'$', fontsize=18)
plt.tight_layout()
In [53]:
# Written to the shared OUTPUT_FILES directory instead of the user-specific absolute path
# '/users/caganze/desktop/', which only exists on one machine.
# NOTE(review): update any downstream reader that still expects the desktop location.
cands.grism_id.to_csv(wisps.OUTPUT_FILES+'/true_brown_dwarfs.csv')
In [54]:
plt.hist(common.spt.values)
plt.hist(cands.spt.values)
Out[54]:
In [55]:
# Merge the one-entry {box name: box} dicts collected in `bxs` into a single lookup table.
bx_dict={name: box for single in bxs for name, box in single.items()}
In [56]:
#invert to use
inv_to_use = {v: k for k, v in to_use.items()}
In [57]:
inv_to_use
Out[57]:
In [58]:
dt.shape
Out[58]:
In [59]:
from tqdm import tqdm
In [60]:
# For every box, keep the rows of `dt` that fall inside it in that box's own index pair.
ncandidates=[]
for spt_range, box in bx_dict.items():
    idx=crts[inv_to_use[spt_range]]
    points=np.array([dt[idx.xkey].values, dt[idx.ykey].values])
    s, bools=box._select(points)
    ncandidates.append(dt[bools])
In [61]:
# Stack the per-box selections and keep a single row per grism_id, so a source picked up
# by several boxes is only counted once.
candsss=(pd.concat(ncandidates).drop_duplicates(subset='grism_id'))
In [62]:
cands_dff.grism_id=cands_dff.grism_id.apply(lambda x: x.lower())
In [63]:
candsss.spt.plot(kind='hist')
Out[63]:
In [64]:
# (number selected by the index boxes, number of those that are known candidates with spt > 16)
# `cands_dff.grism_id` is lower-cased, whereas `candsss` comes from `dt`, which was copied from
# `stars` before that table was lower-cased; compare lower-cased ids so that ids differing
# only in case still match.
len(candsss), len(candsss[candsss.grism_id.str.lower().isin(cands_dff.grism_id) & (candsss.spt.apply(wisps.make_spt_number)>16)])
Out[64]:
In [65]:
candsss.grism_id.values
Out[65]:
In [66]:
# Number of index-selected sources that are known candidates with spt >= 17.
# `cands_dff.grism_id` is lower-cased, whereas `candsss` comes from `dt`, which was copied from
# `stars` before that table was lower-cased; compare lower-cased ids so that ids differing
# only in case still match.
len(candsss[candsss.grism_id.str.lower().isin(cands_dff.grism_id) & (candsss.spt.apply(wisps.make_spt_number)>=17)])
Out[66]:
In [67]:
candsss.to_pickle(wisps.OUTPUT_FILES+'/selected_by_indices.pkl')
In [68]:
def round_tuple(tpl, n=2):
    """Return the first two entries of `tpl`, each rounded to `n` decimal places.

    Any entries beyond the second are ignored; fewer than two raise IndexError.
    """
    first, second = tpl[0], tpl[1]
    return (round(first, n), round(second, n))
In [69]:
inv_to_use
Out[69]:
In [70]:
good_indices=[crts[x] for x in inv_to_use.values()]
In [71]:
# Print one LaTeX table row per selection box:
# box name, x index, y index, four rounded vertices, completeness, contamination and the
# false-positive rate recorded in `fp` by plot_index_box.
for k, spt_range in to_use.items():
    # Look the criterion up by its own key instead of zipping `good_indices` with
    # `to_use.keys()`, which only pairs correctly while both happen to share one ordering.
    sindex=crts[k]
    bs=[x for x in sindex.shapes if x.shape_name==spt_range]
    bx=bs[0]
    # "\\\\ " is the explicit spelling of the two backslashes + space the row ends with
    # (the former "\\\ " relied on an invalid escape sequence).
    print (" {} & {} & {} & {} & {} & {} & {} & {} & {} & {} \\\\ ".format(spt_range,sindex.xkey, sindex.ykey,
           round_tuple(bx.vertices[0]), round_tuple(bx.vertices[1]),
           round_tuple(bx.vertices[2]), round_tuple(bx.vertices[3]),
           round(sindex.completeness[spt_range], 2),
           round(sindex.contamination[spt_range], 3),
           round(fp[spt_range], 2)))
In [72]:
wisps.get_multiple_sources
Out[72]:
In [99]:
s=splat.getSpectrum(lucky=True)[0]
In [100]:
# Same spectrum shown as SPEX-PRISM (black) and as WFC3-G141 (offset by +0.1 in flux).
# NOTE(review): `s` is drawn with lucky=True and noise is added here, so the figure differs
# from run to run.
fig, ax=plt.subplots(figsize=(10, 6))
s.normalize()
s.addNoise(10.)
s.toInstrument('SPEX-PRISM')
# plot bare values in both calls (the first call used to pass the unit-carrying arrays)
ax.step(s.wave.value, s.flux.value, label='spex', c='k')
ax.set_xlim([1.1, 1.7])
s.toInstrument('WFC3-G141')
ax.step(s.wave.value, s.flux.value+0.1, label='WFC3')  # legend label typo 'WCF3' fixed
s.reset()
ax.legend(fontsize=18)
Out[100]:
In [ ]: