In [1]:
import glob
import os

import pandas as pd 
import numpy as np
import pprint
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import log_loss, accuracy_score

In [2]:
# Tell oct2py which Octave binary to launch (oct2py's documentation names
# OCTAVE_EXECUTABLE as the variable it consults). `setdefault` keeps a value
# that is already configured in the environment, so the notebook also runs on
# machines where Octave lives elsewhere; the Windows path below is only the
# fallback used on the original author's machine.
os.environ.setdefault('OCTAVE_EXECUTABLE', "C:/Octave/Octave-4.2.1/bin/octave-cli-4.2.1.exe")

In [3]:
# Load the IPython extension module `oct2py.ipython` into this kernel.
# NOTE(review): none of the cells visible here use an Octave cell/line magic;
# confirm the extension is still needed.
%load_ext oct2py.ipython

In [4]:
from oct2py import octave

# Put the LOFS Octave sources on the Octave search path (presumably where
# `fast_osfs_z`, called further down, is defined - confirm). The return value
# is bound to `_` only so that it does not show up as cell output.
lofs_source_dir = 'LOFS_Octave/source_codes/'
_ = octave.addpath(lofs_source_dir)


warning: function LOFS_Octave/source_codes\example.m shadows a core library function

In [5]:
# Glob patterns of the training files to evaluate. Earlier drafts of this cell
# also assigned the "microarray/*_train.csv" and "uci/*_train.csv" globs, but
# each assignment was overwritten by the next one, so only the NIPS pattern
# ever took effect. Add those patterns to the list to widen the run.
train_patterns = ["NIPS/*_train.csv"]

# glob returns matches in OS-dependent order; sort for a repeatable run order.
all_train = sorted(
    path for pattern in train_patterns for path in glob.glob(pattern)
)

In [6]:
def train_label(fname: str) -> pd.DataFrame:
    """Load the label file that sits next to a feature CSV.

    The label path is derived by swapping the file extension of ``fname`` for
    ``.labels`` (``NIPS/madelon_train.csv`` -> ``NIPS/madelon_train.labels``).
    Only the trailing extension is replaced, so a ``.csv`` that occurs
    elsewhere in the path (for instance in a directory name) is left alone.

    Parameters
    ----------
    fname : str
        Path of the feature file.

    Returns
    -------
    pandas.DataFrame
        The label file as parsed by ``pd.read_csv`` with default options
        (the first line is therefore treated as a header row).
    """
    stem, _ext = os.path.splitext(fname)
    return pd.read_csv(stem + ".labels")

In [7]:
# Explicit selection of datasets for this run (replaces any earlier value of
# `all_train`). Paths are built with os.path.join so they resolve on any OS;
# on Windows this yields the same 'NIPS\\...' strings as before.
all_train = [
    os.path.join('NIPS', 'gisette_train.csv'),
    os.path.join('NIPS', 'madelon_train.csv'),
]

In [8]:
ALPHA = 0.05       # third argument of fast_osfs_z; identical for every dataset
RANDOM_STATE = 0   # fixed seed so the SGD fit, and therefore the metrics, are repeatable


def _zero_based(octave_indices):
    """Flatten the indices returned by Octave and shift them from 1-based to 0-based ints."""
    return [int(idx - 1) for idx in np.array(octave_indices).flatten()]


def _evaluate_selection(fpath, alpha=ALPHA):
    """Select features for one dataset with fast_osfs_z and score a linear model on them.

    Parameters
    ----------
    fpath : str
        Path of the feature CSV; labels are loaded through ``train_label``.
    alpha : float
        Value forwarded to ``octave.fast_osfs_z``.

    Returns
    -------
    dict
        Keys ``dataset``, ``accuracy``, ``logloss`` and ``feat_dim``. The two
        metrics are NaN when no feature was selected. Both metrics are computed
        on the same rows the model was fitted on (no hold-out split).
    """
    X = np.array(pd.read_csv(fpath))
    y = np.array(train_label(fpath)).flatten()

    # The labels are stacked in front of the features and `1` is passed as the
    # second argument (presumably the 1-based column of the target - confirm).
    X_train = np.hstack([y.reshape(-1, 1), X])
    osfs_feats = octave.fast_osfs_z(X_train, 1, alpha)

    # NOTE(review): if fast_osfs_z returns column numbers of the stacked matrix
    # (where column 1 is y), mapping them onto X needs a shift of 2, not 1.
    # The original shift of 1 is kept here; verify against the LOFS source.
    feats_fix = _zero_based(osfs_feats)
    print(len(feats_fix))

    results = {
        'dataset': fpath,
        'accuracy': float('nan'),
        'logloss': float('nan'),
        'feat_dim': len(feats_fix),
    }
    if not feats_fix:
        # Nothing selected: fitting on a zero-column matrix would raise, so
        # report the dataset with NaN metrics instead of aborting the loop.
        return results

    # now fit and return metrics...
    X_sel = X[:, feats_fix]
    # NOTE(review): scikit-learn >= 1.1 renames loss='log' to 'log_loss';
    # update the literal when the environment is upgraded.
    mod = SGDClassifier(loss='log', max_iter=5, random_state=RANDOM_STATE)
    mod.fit(X_sel, y)
    results['accuracy'] = accuracy_score(y, mod.predict(X_sel))
    results['logloss'] = log_loss(y, mod.predict_proba(X_sel))
    return results


results_all = []
for fpath in all_train:
    print(fpath)
    results = _evaluate_selection(fpath)
    print(results)
    print("\n\n")
    results_all.append(results)


NIPS\gisette_train.csv
warning: division by zero
warning: division by zero
warning: division by zero
47
{'dataset': 'NIPS\\gisette_train.csv', 'accuracy': 0.64033333333333331, 'logloss': 12.422446576702876, 'feat_dim': 47}



NIPS\madelon_train.csv
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
13
{'dataset': 'NIPS\\madelon_train.csv', 'accuracy': 0.5, 'logloss': 17.269388197455346, 'feat_dim': 13}



[{'accuracy': 0.35483870967741937,
  'dataset': 'microarray\\colon_train.csv',
  'feat_dim': (1, 2),
  'logloss': 22.283081545103663},
 {'accuracy': 0.3888888888888889,
  'dataset': 'microarray\\leukemia_train.csv',
  'feat_dim': (1, 2),
  'logloss': 21.107030019112088},
 {'accuracy': 0.83425414364640882,
  'dataset': 'microarray\\lung_cancer_train.csv',
  'feat_dim': (1, 2),
  'logloss': 5.7246590709796719},
 {'accuracy': 0.57843137254901966,
  'dataset': 'microarray\\prostate_train.csv',
  'feat_dim': (1, 3),
  'logloss': 14.560464558638815},
 {'accuracy': 1.0,
  'dataset': 'uci\\Ionosphere_train.csv',
  'feat_dim': (1, 2),
  'logloss': 0.00075720760748687404}]

[{'accuracy': 1.0,
  'dataset': 'uci\\Ionosphere_train.csv',
  'feat_dim': 2,
  'logloss': 0.0011243083425130506},
 {'accuracy': 1.0,
  'dataset': 'uci\\spambase_train.csv',
  'feat_dim': 1,
  'logloss': 0.0024249066390211525},
 {'accuracy': 1.0,
  'dataset': 'uci\\spectf_train.csv',
  'feat_dim': 2,
  'logloss': 0.0023242266648043691},
 {'accuracy': 0.97891036906854134,
  'dataset': 'uci\\wdbc_train.csv',
  'feat_dim': 2,
  'logloss': 0.50816603432809349}]

In [9]:
# Pretty-print the per-dataset metric dicts collected in `results_all`.
print(pprint.pformat(results_all))


[{'accuracy': 0.64033333333333331,
  'dataset': 'NIPS\\gisette_train.csv',
  'feat_dim': 47,
  'logloss': 12.422446576702876},
 {'accuracy': 0.5,
  'dataset': 'NIPS\\madelon_train.csv',
  'feat_dim': 13,
  'logloss': 17.269388197455346}]