In [1]:
    
import splat
import wisps
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline
    
    
In [2]:
    
from wisps import datasets
    
In [3]:
    
# SpeX sample from the wisps `datasets` mapping; its SNR and F-test columns are
# renamed to the names used in the feature list of this notebook
# (snr1, snr2, f_test).
spex=(datasets['spex'].rename(columns={'l_snr': 'snr1', 'l_t_snr':'snr2', 'f_x':'f_test'}))
# "Trash" sample: read from a pickle in the wisps output directory and given
# label 0 for every row (the same assignment is repeated in a later cell).
trash=pd.read_pickle(wisps.OUTPUT_FILES+'/trash.pkl')
trash['label']=0.
    
In [4]:
    
# Manjavacas and Schneider samples from the wisps `datasets` mapping; their 'f'
# column is renamed to 'f_test' so the naming matches the SpeX table.
man=(datasets['manjavacas']).rename(columns={'f':'f_test'})
schn=(datasets['schneider']).rename(columns={'f':'f_test'})
    
In [5]:
    
# Load the pickled selection function from the wisps output directory.
sf=pd.read_pickle(wisps.OUTPUT_FILES+'/selection_function.pkl')
    
In [6]:
    
# Wrap `sf` in a DataFrame, flatten its values to one dimension and build a
# table from those records (presumably one dict-like record per simulated
# spectrum -- confirm against the code that writes selection_function.pkl).
# 'f' and 'Names' are renamed to match the other samples.
simulated_spectra=pd.DataFrame.from_records(pd.DataFrame(sf).values.flatten()).rename(columns={'f':'f_test', 'Names':'name'})
# Keep only rows with snr1 above 3. The explicit .copy() makes the result an
# independent frame, so the column assignments done on it in later cells do
# not raise SettingWithCopyWarning.
simulated_spectra=simulated_spectra[simulated_spectra.snr1>3.].copy()
    
In [7]:
    
# Run the spectral-type column of every observed sample through
# wisps.make_spt_number, overwriting the column in place.
for sample in (spex, man, schn, trash):
    sample['spt']=sample.spt.apply(wisps.make_spt_number)
    
In [8]:
    
# Number of rows in the trash sample.
len(trash)
    
    Out[8]:
In [9]:
    
# The SpeX sample uses its grism id as the object name.
spex['name']=spex.grism_id
    
In [10]:
    
# Columns kept in the training table. The order here is the column order of the
# concatenated frame, so do not reorder.
features=[
    # spectral index ratios
    'CH_4/H-Cont',
    'CH_4/H_2O-1',
    'CH_4/H_2O-2',
    'CH_4/J-Cont',
    'H-cont/H_2O-1',
    'H-cont/H_2O-2',
    'H-cont/J-Cont',
    'H_2O-1/J-Cont',
    'H_2O-2/H_2O-1',
    'H_2O-2/J-Cont',
    # spectral type, fit statistics, identifier and signal-to-noise columns
    'spt',
    'spex_chi',
    'name',
    'snr2',
    'snr1',
    'line_chi',
    'f_test',
    # classification target
    'label',
]
    
In [11]:
    
from scipy import stats
    
In [12]:
    
def f_test_comp(x):
    """Evaluate the F-distribution CDF at ``x``.

    The distribution is fixed to 2 numerator and 1 denominator degrees of
    freedom, with location 0 and scale 1.

    Parameters
    ----------
    x : float or array-like
        Value(s) at which to evaluate the CDF (here, a ratio of two
        chi-square statistics).

    Returns
    -------
    float or numpy.ndarray
        CDF value(s) in [0, 1].
    """
    return stats.f.cdf(x, dfn=2, dfd=1, loc=0, scale=1)
# For the samples that keep the intermediate ratio: store spex_chi/line_chi in
# an 'x' column and turn it into the F-test statistic.
for sample in (man, schn, simulated_spectra):
    sample['x']=sample.spex_chi/sample.line_chi
    sample['f_test']=sample.x.apply(f_test_comp)
# The SpeX sample gets the same statistic without keeping an 'x' column.
spex_ratio=spex.spex_chi/spex.line_chi
spex['f_test']=spex_ratio.apply(f_test_comp)
    
In [13]:
    
# The simulated spectra take their 'spt' from the spt_new column, converted
# with the same wisps.make_spt_number helper applied to the other samples.
simulated_spectra['spt']=simulated_spectra.spt_new.apply(wisps.make_spt_number)
    
In [14]:
    
# Give each simulated spectrum a sequential placeholder name:
# 'spect 0', 'spect 1', ...
simulated_spectra['name']=['spect {}'.format(position) for position in range(len(simulated_spectra))]
    
In [15]:
    
def add_labels(spt):
    """Return the binary class label for a spectral type.

    Parameters
    ----------
    spt : float
        Spectral type as a number.

    Returns
    -------
    float
        1.0 when ``spt`` is 17 or above, otherwise 0.0 (including NaN, for
        which the comparison is false).
    """
    return 1.0 if spt>=17. else 0.0
    
In [16]:
    
# Samples with a spectral type get their label from add_labels.
for sample in (spex, man, schn, simulated_spectra):
    sample['label']=sample.spt.apply(add_labels)
# Trash rows are labelled 0 regardless of their spectral type.
trash['label']=0.
    
In [17]:
    
#len(trash), len(simulated_spectra)
    
In [18]:
    
# The trash sample uses its grism id as the object name.
trash['name']=trash.grism_id
    
In [19]:
    
#a[features]
    
In [20]:
    
# Stack the trash, SpeX, Manjavacas and Schneider tables, restricted to the
# feature columns, into one training table. Row indices of the inputs are kept
# as-is (no ignore_index), so index labels may repeat across samples.
# NOTE(review): simulated_spectra is given f_test/spt/name/label columns in
# earlier cells but is not part of this concat -- confirm the omission is
# intended.
training_set=pd.concat( [trash[features], spex[features], man[features], schn[features]])
    
In [21]:
    
# Size of the trash sample versus the combined size of the other samples.
# NOTE(review): the second number counts simulated_spectra, which the
# training_set concat in this notebook does not include.
len(trash), len(spex)+len(man)+len(schn)+len(simulated_spectra)
    
    Out[21]:
In [22]:
    
# Histogram of the class labels in the assembled training table.
training_set['label'].plot.hist()
    
    Out[22]:
    
In [23]:
    
# Feature columns to be cast to float: everything except the identifier and the
# target.
feats=[column for column in features if column not in ('name', 'label')]
    
In [24]:
    
# Pass the numeric feature columns through wisps.Annotator.reformat_table, cast
# every cell with float(), and write the result back into the same columns.
# NOTE(review): DataFrame.applymap is deprecated in pandas 2.1+ in favour of
# DataFrame.map -- check the pinned pandas version before switching.
training_set[feats]=wisps.Annotator.reformat_table(training_set[feats]).applymap(float)
# Persist the finished training table to the wisps libraries directory.
training_set.to_pickle(wisps.LIBRARIES+'/training_set.pkl')
    
In [25]:
    
# Label histogram again, after the feature columns were reformatted and saved.
training_set['label'].plot.hist()
    
    Out[25]:
    
In [ ]: