In [1]:
import glob
import os

import pandas as pd 
import numpy as np
import pprint
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import log_loss, accuracy_score

In [2]:
# Point oct2py at the Octave binary to launch. setdefault keeps any
# OCTAVE_EXECUTABLE already present in the environment, so the
# machine-specific Windows path below only acts as a fallback.
os.environ.setdefault('OCTAVE_EXECUTABLE', "C:/Octave/Octave-4.2.1/bin/octave-cli-4.2.1.exe")

In [3]:
# Load the IPython extension shipped in oct2py.ipython.
# NOTE(review): no Octave magic is used in the cells visible here; confirm
# this extension load is still needed.
%load_ext oct2py.ipython

In [4]:
# NOTE(review): this import presumably stays below the cell that sets
# OCTAVE_EXECUTABLE so that oct2py picks the variable up; confirm before
# moving it into the top import cell.
from oct2py import octave
# Add the LOFS Octave sources to the Octave search path. The return value is
# bound to `_` and never read.
_ = octave.addpath('LOFS_Octave/source_codes/')


warning: function LOFS_Octave/source_codes\example.m shadows a core library function

In [5]:
# Directories searched for "*_train.csv" files. Trim this list to run on a
# subset of the dataset collections.
DATASET_DIRS = ["microarray", "uci", "NIPS"]

# glob returns matches in arbitrary, OS-dependent order, so sort within each
# directory to keep the processing order stable from run to run.
all_train = [
    path
    for directory in DATASET_DIRS
    for path in sorted(glob.glob(directory + "/*_train.csv"))
]

In [6]:
def train_label(fname):
    """Load the label file that sits next to a training CSV.

    The label path is derived from ``fname`` by swapping its file extension
    for ``.labels`` (``microarray/colon_train.csv`` becomes
    ``microarray/colon_train.labels``). Only the trailing extension is
    touched, so a ``.csv`` substring elsewhere in the path is left alone.

    Parameters
    ----------
    fname : str
        Path to the training-data CSV file.

    Returns
    -------
    pandas.DataFrame
        The label file as parsed by ``pd.read_csv`` with default settings.
    """
    root, _ext = os.path.splitext(fname)
    return pd.read_csv(root + ".labels")

In [7]:
# Significance level handed to the Octave feature-selection routine.
alpha = 0.05
# Fixed seed so the stochastic SGD fit yields the same metrics on every run.
sgd_seed = 0

results_all = []
for fpath in all_train:
    print(fpath)
    X = np.array(pd.read_csv(fpath))
    y = np.array(train_label(fpath)).flatten()

    # The Octave routine receives features and labels as a single matrix with
    # the labels appended as the last column.
    X_train = np.hstack([X, y.reshape(-1, 1)])
    s_feats = octave.saola_z_test(X_train, alpha)
    # Shift the returned indices down by one (Octave counts from 1) so they
    # can be used as NumPy column indices.
    feats_fix = [int(x - 1) for x in np.array(s_feats).flatten()]
    print(len(feats_fix))

    if not feats_fix:
        # Nothing was selected: a classifier cannot be fit on zero columns, so
        # record the dataset with undefined metrics instead of crashing.
        results = {
            'dataset': fpath,
            'accuracy': np.nan,
            'logloss': np.nan,
            'feat_dim': 0}
    else:
        # Fit on the selected columns and score on the same training data.
        X_sel = X[:, feats_fix]
        # NOTE: newer scikit-learn releases spell this loss 'log_loss'.
        mod = SGDClassifier(loss='log', max_iter=5, random_state=sgd_seed)
        mod.fit(X_sel, y)
        results = {
            'dataset': fpath,
            'accuracy': accuracy_score(y, mod.predict(X_sel)),
            'logloss': log_loss(y, mod.predict_proba(X_sel)),
            'feat_dim': len(feats_fix)}
    print(results)
    print("\n\n")
    results_all.append(results)


microarray\colon_train.csv
6
{'dataset': 'microarray\\colon_train.csv', 'accuracy': 0.85483870967741937, 'logloss': 5.0136933476483243, 'feat_dim': 6}



microarray\leukemia_train.csv
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
21
{'dataset': 'microarray\\leukemia_train.csv', 'accuracy': 0.98611111111111116, 'logloss': 0.47970522770709373, 'feat_dim': 21}



microarray\lung_cancer_train.csv
54
{'dataset': 'microarray\\lung_cancer_train.csv', 'accuracy': 1.0, 'logloss': 9.9920072216264128e-16, 'feat_dim': 54}



microarray\prostate_train.csv
16
{'dataset': 'microarray\\prostate_train.csv', 'accuracy': 0.92156862745098034, 'logloss': 2.708923638816525, 'feat_dim': 16}



uci\Ionosphere_train.csv
4
{'dataset': 'uci\\Ionosphere_train.csv', 'accuracy': 0.89458689458689455, 'logloss': 1.0215881689838706, 'feat_dim': 4}



uci\spambase_train.csv
25
{'dataset': 'uci\\spambase_train.csv', 'accuracy': 0.68767659204520759, 'logloss': 10.754718658437609, 'feat_dim': 25}



uci\spectf_train.csv
1
{'dataset': 'uci\\spectf_train.csv', 'accuracy': 0.79400749063670417, 'logloss': 0.92935358555818504, 'feat_dim': 1}



uci\wdbc_train.csv
2
{'dataset': 'uci\\wdbc_train.csv', 'accuracy': 0.70298769771529002, 'logloss': 8.9675765630070234, 'feat_dim': 2}



NIPS\arcene_train.csv
37
{'dataset': 'NIPS\\arcene_train.csv', 'accuracy': 1.0, 'logloss': 9.9920072216264128e-16, 'feat_dim': 37}



NIPS\dexter_train.csv
24
{'dataset': 'NIPS\\dexter_train.csv', 'accuracy': 0.91333333333333333, 'logloss': 2.8496912440314817, 'feat_dim': 24}



NIPS\gisette_train.csv
warning: division by zero
24
{'dataset': 'NIPS\\gisette_train.csv', 'accuracy': 0.84833333333333338, 'logloss': 5.2383810865614544, 'feat_dim': 24}



NIPS\madelon_train.csv
25
{'dataset': 'NIPS\\madelon_train.csv', 'accuracy': 0.52049999999999996, 'logloss': 16.561343281359672, 'feat_dim': 25}




In [8]:
# Pretty-print the per-dataset metric records gathered in results_all.
summary_printer = pprint.PrettyPrinter()
summary_printer.pprint(results_all)


[{'accuracy': 0.85483870967741937,
  'dataset': 'microarray\\colon_train.csv',
  'feat_dim': 6,
  'logloss': 5.0136933476483243},
 {'accuracy': 0.98611111111111116,
  'dataset': 'microarray\\leukemia_train.csv',
  'feat_dim': 21,
  'logloss': 0.47970522770709373},
 {'accuracy': 1.0,
  'dataset': 'microarray\\lung_cancer_train.csv',
  'feat_dim': 54,
  'logloss': 9.9920072216264128e-16},
 {'accuracy': 0.92156862745098034,
  'dataset': 'microarray\\prostate_train.csv',
  'feat_dim': 16,
  'logloss': 2.708923638816525},
 {'accuracy': 0.89458689458689455,
  'dataset': 'uci\\Ionosphere_train.csv',
  'feat_dim': 4,
  'logloss': 1.0215881689838706},
 {'accuracy': 0.68767659204520759,
  'dataset': 'uci\\spambase_train.csv',
  'feat_dim': 25,
  'logloss': 10.754718658437609},
 {'accuracy': 0.79400749063670417,
  'dataset': 'uci\\spectf_train.csv',
  'feat_dim': 1,
  'logloss': 0.92935358555818504},
 {'accuracy': 0.70298769771529002,
  'dataset': 'uci\\wdbc_train.csv',
  'feat_dim': 2,
  'logloss': 8.9675765630070234},
 {'accuracy': 1.0,
  'dataset': 'NIPS\\arcene_train.csv',
  'feat_dim': 37,
  'logloss': 9.9920072216264128e-16},
 {'accuracy': 0.91333333333333333,
  'dataset': 'NIPS\\dexter_train.csv',
  'feat_dim': 24,
  'logloss': 2.8496912440314817},
 {'accuracy': 0.84833333333333338,
  'dataset': 'NIPS\\gisette_train.csv',
  'feat_dim': 24,
  'logloss': 5.2383810865614544},
 {'accuracy': 0.52049999999999996,
  'dataset': 'NIPS\\madelon_train.csv',
  'feat_dim': 25,
  'logloss': 16.561343281359672}]