In [1]:
import splat
import wisps
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from wisps import datasets
from scipy import stats
%matplotlib inline
In [2]:
# Load the pickled selection-function output, flatten its values into a 1-D
# sequence, and build one table row per record in that sequence.
sf0 = pd.read_pickle(wisps.OUTPUT_FILES + '/selection_function.pkl')
sf_records = pd.DataFrame(sf0).values.flatten()
sf = pd.DataFrame.from_records(sf_records)
In [3]:
# Read the master dataset. The class_star cut is currently disabled (kept as a
# trailing comment), so every row goes through the reformatting step.
master_path = wisps.LIBRARIES + '/master_dataset.hdf'
alldata = pd.read_hdf(master_path, key='new')
star_ids = alldata  # [alldata['class_star'] !=0]
stars = wisps.Annotator.reformat_table(star_ids)
stars = stars.reset_index(drop=True)
In [4]:
# TODO: add a list of galaxies,
# starting with those from Zanella et al. 2016.
In [5]:
# Normalise grism ids to lower-case strings so later isin() lookups compare like with like.
stars['grism_id'] = stars['grism_id'].apply(lambda gid: str(gid).lower())
In [6]:
# Grism ids previously flagged as trash. They are lower-cased here because
# stars.grism_id is lower-cased before the isin() lookup; without this,
# mixed-case ids would silently fail to match. .str.lower() leaves missing
# ids as NaN rather than turning them into the string 'nan'.
# `trash` is bound as well only to preserve the original chained assignment;
# it is rebound to the filtered table in the next cell.
trash_names = trash = (
    pd.read_pickle(wisps.OUTPUT_FILES + '/trash.pkl')
    .grism_id
    .str.lower()
)
In [7]:
# Rows of `stars` whose grism id was flagged as trash.
# .copy() makes this an independent frame: the cells below assign new columns
# to `trash`, which on a bare boolean-mask slice raises SettingWithCopyWarning
# and may not write where expected.
trash = stars[stars.grism_id.isin(trash_names)].copy()
In [8]:
# Provisional numeric label for the trash rows (a later cell replaces it with a string tag).
trash['label'] = 0.0
In [9]:
# Move the SpeX 'spt' column out of the way under 'spt_tupl'; a later cell
# rebuilds 'spt' from its first component.
spex_source = datasets['spex']
spex = spex_source.rename(columns={'spt': 'spt_tupl'})
In [10]:
# SpeX rows are identified by their data file name.
spex['grism_id'] = spex['data_file']
In [11]:
# Both comparison samples store their statistic under 'f'; rename it to the
# shared 'f_test' column name.
f_column_rename = {'f': 'f_test'}
man = datasets['manjavacas'].rename(columns=f_column_rename)
schn = datasets['schneider'].rename(columns=f_column_rename)
In [12]:
# Stack the per-row 'spt_tupl' entries into a 2-D array once, then split it:
# column 0 becomes 'spt', column 1 becomes 'spt_unc'.
spex_spt_pairs = np.vstack(spex['spt_tupl'].values)
spex['spt'] = spex_spt_pairs[:, 0]
spex['spt_unc'] = spex_spt_pairs[:, 1]
In [13]:
# Convert every table's 'spt' column with wisps.make_spt_number, in place.
for table in (spex, man, schn, trash):
    table['spt'] = table['spt'].apply(wisps.make_spt_number)

# SpeX rows use their grism id as the display name.
spex['name'] = spex['grism_id']
In [14]:
# Load the candidate spectra and pull a lower-cased grism id off each spectrum object.
cand_ids = pd.read_pickle(wisps.OUTPUT_FILES + '/true_spectra_cands.pkl')
cand_ids['grism_id'] = cand_ids['spectra'].apply(
    lambda spectrum: spectrum.grism_id.lower()
)
In [15]:
# Rows of `stars` whose grism id appears among the candidates, re-indexed from 0.
is_candidate = stars['grism_id'].isin(cand_ids['grism_id'])
cands = stars[is_candidate].reset_index(drop=True)
In [16]:
# Tag the candidates with label 1 and convert their spectral types to numbers.
cands['label'] = 1
cands['spt'] = cands['spt'].apply(wisps.make_spt_number)
In [17]:
# Columns that make up the training set: every wisps.INDEX_NAMES entry plus
# the fit statistics, signal-to-noise columns, label and spectral type.
# Each name appears exactly once; the original list repeated 'snr1', which
# made every `frame[features]` selection return duplicate columns.
features = np.concatenate([
    wisps.INDEX_NAMES,
    ['spex_chi', 'snr2', 'snr1', 'snr3', 'snr4',
     'line_chi', 'label', 'f_test', 'spt'],
])
In [18]:
def f_test_comp(x):
    """Return the F-distribution CDF at ``x`` with dfn=2, dfd=1.

    Parameters
    ----------
    x : float or array-like
        Value(s) at which to evaluate the CDF.

    Returns
    -------
    float or ndarray
        ``scipy.stats.f.cdf(x, 2, 1)`` with loc=0 and scale=1.
    """
    return stats.f.cdf(x, 2, 1, 0, scale=1)
# For man and schn, keep the chi-square ratio as column 'x' and derive
# 'f_test' from it.
for table in (man, schn):
    table['x'] = table['spex_chi'] / table['line_chi']
    table['f_test'] = table['x'].apply(f_test_comp)

# For spex and sf, only 'f_test' is stored; the ratio is a temporary.
for table in (spex, sf):
    chi_ratio = table['spex_chi'] / table['line_chi']
    table['f_test'] = chi_ratio.apply(f_test_comp)
In [19]:
# Trash rows are named by their grism id.
trash['name'] = trash.grism_id
# Selection-function rows get a synthetic name built from their index label.
# The '{}' placeholder is now actually formatted; the original concatenated
# str(i) onto the unformatted template, producing names like 'spc {}0'.
sf['name'] = ['spc {}'.format(i) for i in sf.index]
In [20]:
# Feature columns with the 'name' column excluded.
feats = [column for column in features if column not in ['name']]
In [21]:
#def add_binary_label(row, label):
# row
In [22]:
# Tag every table with a string naming the sample it came from.
sample_tags = (
    (trash, 'trash'),
    (spex, 'spex'),
    (schn, 'schn'),
    (sf, 'sf'),
    (man, 'man'),
)
for table, tag in sample_tags:
    table['label'] = tag
In [23]:
# Keep only rows with no missing value in any wisps.INDEX_NAMES column.
# dropna(subset=...) selects rows by label, so it is correct for any index.
# The original `.iloc[frame[cols].dropna().index]` passed index *labels* to a
# positional indexer, which picks the wrong rows (or raises IndexError)
# whenever the index is not a pristine 0..n-1 RangeIndex.
index_columns = list(wisps.INDEX_NAMES)
spex = spex.dropna(subset=index_columns)
man = man.dropna(subset=index_columns)
schn = schn.dropna(subset=index_columns)
In [24]:
# Run each table through wisps.Annotator.reformat_table, in the same order as before.
spex, trash, schn, man, sf = [
    wisps.Annotator.reformat_table(table)
    for table in (spex, trash, schn, man, sf)
]
In [25]:
# Stack the per-row 'spt_new' entries and keep their first component as 'spt'.
sf_spt_pairs = np.vstack(sf['spt_new'].values)
sf['spt'] = sf_spt_pairs[:, 0]
In [26]:
# Stack the feature columns of every sample into one table with a fresh 0..n-1 index.
training_parts = [table[features] for table in (spex, trash, man, schn, sf)]
training_set = pd.concat(training_parts).reset_index(drop=True)
In [27]:
# Display the (rows, columns) shape of the combined table.
training_set.shape
Out[27]:
In [28]:
# Re-select the feature columns, drop rows in which every value is missing,
# and renumber the index.
selected_columns = training_set[features]
training_set = selected_columns.dropna(how='all').reset_index(drop=True)
In [29]:
# final_set is a second name for the same DataFrame object, not a copy.
final_set=training_set
In [30]:
#final_set=pd.concat([label0, label1]).reset_index(drop=True)
In [31]:
#final_set=final_set.reset_index(drop=True).applymap(float)
In [32]:
# Keep only the first occurrence of each column name.
is_repeated_column = final_set.columns.duplicated()
final_set = final_set.loc[:, ~is_repeated_column]
In [33]:
# Persist the assembled training set next to the other library files.
training_set_path = wisps.LIBRARIES + '/training_set.pkl'
final_set.to_pickle(training_set_path)
In [34]:
# Scatter of f_test against log10(snr1), coloured by spectral type.
# .apply(np.log10) is kept (rather than np.log10(series)) so the transform
# still works element-wise if the column ended up with object dtype.
log_snr1 = final_set['snr1'].apply(np.log10)
plt.scatter(final_set['f_test'], log_snr1, c=final_set['spt'],
            s=4, marker='+', cmap='viridis')
plt.ylim([-1, 4])
# Label the axes and colour scale so the figure can be read on its own.
plt.xlabel('f_test')
plt.ylabel('log10(snr1)')
plt.colorbar(label='spt')
Out[34]:
In [ ]: