In [1]:
import splat
import wisps
import numpy as np
import matplotlib.pyplot as plt
from wisps import Annotator as an
from wisps import datasets
import pandas as pd
from tqdm import tqdm
%matplotlib inline

In [2]:
#preprocessing (scaling), train/test splitting, and classifier imports
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
train_df=datasets['traing_set']

In [4]:
# Predictor columns: spectral index ratios plus fit statistics and spectral type.
feats = [
    'CH_4/H-Cont', 'CH_4/H_2O-1', 'CH_4/H_2O-2', 'CH_4/J-Cont',
    'H-cont/H_2O-1', 'H-cont/H_2O-2', 'H-cont/J-Cont', 'H_2O-1/J-Cont',
    'H_2O-2/H_2O-1', 'H_2O-2/J-Cont', 'spex_chi', 'snr2', 'snr1',
    'line_chi', 'f_test', 'x', 'spt',
]
# Extra feature: ratio of the template chi-square to the line chi-square.
train_df['x'] = train_df['spex_chi'] / train_df['line_chi']

In [5]:
# Uncertainty columns follow the '<index name>er' naming convention.
unc_columns = [index_name + 'er' for index_name in wisps.INDEX_NAMES]
features = feats

In [6]:
# Heavy-tailed / wide-dynamic-range features: put them on a log10 scale.
prblm_feats = ['line_chi', 'spex_chi', 'H_2O-2/J-Cont', 'H-cont/J-Cont',
               'H_2O-1/J-Cont', 'H-cont/H_2O-1', 'snr2', 'x']
# BUG FIX: `np.float` was removed in NumPy 1.24 (it was a deprecated alias for
# the builtin float) — cast with `.astype(float)` instead. np.log10 is applied
# column-wise (vectorized) rather than elementwise via applymap; log10 of
# non-positive values yields nan/-inf, and every non-finite result is collapsed
# to a large negative sentinel so downstream scaling/fitting never sees NaN.
train_df[prblm_feats] = (
    train_df[prblm_feats]
    .astype(float)
    .apply(np.log10)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(-999999.9)
)

In [7]:
def make_label_binary(labl):
    """Collapse a numeric label into two classes: 1.0 if positive, else 0.0."""
    return 1. if labl > 0. else 0.

In [8]:
train_df['label']=train_df['label'].apply(make_label_binary)

In [9]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [10]:
def compute_accuracy_score(features, data=None):
    """Train a random forest on `features` and return its hold-out accuracy.

    Parameters
    ----------
    features : list of str
        Column names to use as predictors.
    data : pandas.DataFrame, optional
        Frame containing the feature columns and a binary 'label' column.
        Defaults to the module-level `train_df` (kept for backward
        compatibility — the original implementation read the global directly).

    Returns
    -------
    float
        Accuracy score on the 50% hold-out split.
    """
    if data is None:
        data = train_df

    X_train, X_test, y_train, y_test = train_test_split(
        data[features].values, data['label'].values,
        test_size=0.5, random_state=123456)

    # Fit the scaler on the training split only, to avoid test-set leakage.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=1000, oob_score=True,
                                random_state=123456)
    rf.fit(X_train, y_train)

    pred_labels = rf.predict(X_test)
    return accuracy_score(y_test, pred_labels)

In [11]:
# Measure accuracy as features are added one at a time, in listed order:
# scores[i] is the accuracy using the first i+1 features.
scores = []
fets = []
for feat in tqdm(features):
    fets = fets + [feat]
    scores.append(compute_accuracy_score(fets))


100%|██████████| 17/17 [01:53<00:00,  7.74s/it]

In [12]:
from matplotlib.ticker import MultipleLocator

In [43]:
# Compact plot labels: idx1..idxN for the spectral index ratios, then the
# scalar diagnostics in the same order as `features`.
fealables = ['idx%d' % (position + 1)
             for position in range(len(wisps.INDEX_NAMES))]
fealbls = np.append(fealables,
                    ['spexchi', 'snrj', 'snrh', 'linechi', 'ftest', 'x', 'spt'])

In [44]:
fealbls=[x.upper() for x in fealbls ]

In [46]:
# Accuracy vs. cumulative feature count: each y position is the feature added
# at that step, x is the resulting hold-out accuracy.
fig, ax = plt.subplots(figsize=(6, 8))

ax.plot(scores, np.arange(len(features)) + 1)

# Tick the accuracy axis every 0.05.
# (A MultipleLocator(2.) was previously constructed here but never attached to
# any axis — dead local — so it has been removed.)
ax.xaxis.set_major_locator(MultipleLocator(.05))

ax.grid(axis='y')
ax.set_yticks(np.arange(len(features)) + 1)
ax.set_yticklabels(fealbls, rotation='horizontal')

ax.set_ylabel('Features')
ax.set_xlabel('Accuracy')
fig.tight_layout()
fig.savefig(wisps.OUTPUT_FIGURES + '/cv_scores.pdf')



In [25]:
for x, y in zip(features, scores): print (x,y)


CH_4/H-Cont 0.7541425818882467
CH_4/H_2O-1 0.859344894026975
CH_4/H_2O-2 0.8830443159922928
CH_4/J-Cont 0.9290944123314065
H-cont/H_2O-1 0.9175337186897881
H-cont/H_2O-2 0.9221579961464355
H-cont/J-Cont 0.918111753371869
H_2O-1/J-Cont 0.9115606936416185
H_2O-2/H_2O-1 0.9651252408477842
H_2O-2/J-Cont 0.9655105973025048
spex_chi 0.9745664739884393
snr2 0.9801541425818883
snr1 0.9799614643545279
line_chi 0.9807321772639692
f_test 0.9803468208092485
x 0.9816955684007708
spt 0.9930635838150289

In [15]:
train_df.columns


Out[15]:
Index(['H_2O-1/J-Conter', 'H_2O-2/H_2O-1er', 'H-cont/H_2O-1er',
       'CH_4/H_2O-1er', 'H_2O-2/J-Conter', 'H-cont/J-Conter', 'CH_4/J-Conter',
       'H-cont/H_2O-2er', 'CH_4/H_2O-2er', 'CH_4/H-Conter', 'CH_4/H-Cont',
       'CH_4/H_2O-1', 'CH_4/H_2O-2', 'CH_4/J-Cont', 'H-cont/H_2O-1',
       'H-cont/H_2O-2', 'H-cont/J-Cont', 'H_2O-1/J-Cont', 'H_2O-2/H_2O-1',
       'H_2O-2/J-Cont', 'spt', 'spex_chi', 'name', 'snr2', 'snr1', 'line_chi',
       'f_test', 'label', 'x'],
      dtype='object')

In [ ]: