In [1]:
import glob
import os
import pandas as pd
import numpy as np
import pprint
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import log_loss, accuracy_score
In [2]:
os.environ['OCTAVE_EXECUTABLE'] = "C:/Octave/Octave-4.2.1/bin/octave-cli-4.2.1.exe"
In [3]:
%load_ext oct2py.ipython
In [4]:
from oct2py import octave
_ = octave.addpath('LOFS_Octave/source_codes/')
In [5]:
all_train = glob.glob("microarray/*_train.csv") + glob.glob("uci/*_train.csv") + glob.glob("NIPS/*_train.csv")
all_train = glob.glob("uci/*_train.csv") + glob.glob("NIPS/*_train.csv")
all_train = glob.glob("NIPS/*_train.csv")
In [6]:
def train_label(fname):
targetname = fname.replace(".csv", ".labels")
return pd.read_csv(targetname)
In [7]:
all_train = ['NIPS\\gisette_train.csv',
'NIPS\\madelon_train.csv']
In [8]:
results_all = []
for fpath in all_train:
print(fpath)
X = np.array(pd.read_csv(fpath))
y = np.array(train_label(fpath)).flatten()
alpha = 0.05
X_train = np.hstack([y.reshape(-1, 1), X])
osfs_feats = octave.fast_osfs_z(X_train, 1, alpha)
feats_fix = [int(x-1) for x in list(np.array(osfs_feats).flatten())]
print(len(feats_fix))
# now fit and return metrics...
X_sel = X[:, feats_fix]
mod = SGDClassifier(loss='log', max_iter=5)
mod.fit(X_sel, y)
results = {
'dataset': fpath,
'accuracy': accuracy_score(y, mod.predict(X_sel)),
'logloss': log_loss(y, mod.predict_proba(X_sel)),
'feat_dim': len(feats_fix)}
print(results)
print("\n\n")
results_all.append(results)
[{'accuracy': 0.35483870967741937,
'dataset': 'microarray\\colon_train.csv',
'feat_dim': (1, 2),
'logloss': 22.283081545103663},
{'accuracy': 0.3888888888888889,
'dataset': 'microarray\\leukemia_train.csv',
'feat_dim': (1, 2),
'logloss': 21.107030019112088},
{'accuracy': 0.83425414364640882,
'dataset': 'microarray\\lung_cancer_train.csv',
'feat_dim': (1, 2),
'logloss': 5.7246590709796719},
{'accuracy': 0.57843137254901966,
'dataset': 'microarray\\prostate_train.csv',
'feat_dim': (1, 3),
'logloss': 14.560464558638815},
{'accuracy': 1.0,
'dataset': 'uci\\Ionosphere_train.csv',
'feat_dim': (1, 2),
'logloss': 0.00075720760748687404}]
[{'accuracy': 1.0,
'dataset': 'uci\\Ionosphere_train.csv',
'feat_dim': 2,
'logloss': 0.0011243083425130506},
{'accuracy': 1.0,
'dataset': 'uci\\spambase_train.csv',
'feat_dim': 1,
'logloss': 0.0024249066390211525},
{'accuracy': 1.0,
'dataset': 'uci\\spectf_train.csv',
'feat_dim': 2,
'logloss': 0.0023242266648043691},
{'accuracy': 0.97891036906854134,
'dataset': 'uci\\wdbc_train.csv',
'feat_dim': 2,
'logloss': 0.50816603432809349}]
In [9]:
pprint.pprint(results_all)