In [1]:
import glob
import os
import pandas as pd
import numpy as np
import pprint
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score
In [2]:
# Set OCTAVE_EXECUTABLE (presumably read by oct2py to locate the Octave binary
# -- confirm against the installed oct2py version).
# setdefault keeps a value that is already present in the environment, so the
# Windows path below only acts as a fallback for the original machine.
os.environ.setdefault('OCTAVE_EXECUTABLE', "C:/Octave/Octave-4.2.1/bin/octave-cli-4.2.1.exe")
In [3]:
# Load the IPython extension shipped as `oct2py.ipython` via the %load_ext line magic.
%load_ext oct2py.ipython
In [4]:
# Import oct2py's `octave` object and add the LOFS source directory to its
# search path (presumably where the Alpha_Investing routine lives -- confirm).
# The return value is bound to `_` so the cell does not echo it as output.
from oct2py import octave
_ = octave.addpath('LOFS_Octave/source_codes/')
In [5]:
# Collect the training CSVs to process. Only the NIPS folder is active; the
# microarray/ and uci/ folders use the same "*_train.csv" naming and can be
# included by concatenating further glob.glob(...) results before sorting.
# sorted() pins the processing order, which glob itself does not guarantee.
all_train = sorted(glob.glob("NIPS/*_train.csv"))
print(all_train)
In [6]:
def train_label(fname):
    """Load the labels that accompany a training data file.

    The labels are looked up next to the data file, under the same name with
    the trailing extension swapped for ``.labels``
    (``foo_train.csv`` -> ``foo_train.labels``).

    Parameters
    ----------
    fname : str
        Path of the training data file.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` parses from the labels file.
    """
    # Swap only the trailing extension; a plain str.replace would also rewrite
    # a ".csv" that happens to occur earlier in the path (e.g. a folder name).
    stem, _ext = os.path.splitext(fname)
    # NOTE(review): read_csv's defaults treat the first line as a header --
    # confirm that the .labels files actually start with a header row.
    return pd.read_csv(stem + ".labels")
In [8]:
# Restrict the run to the two NIPS datasets (replaces whatever `all_train`
# held before this cell). os.path.join yields the same 'NIPS\\...' strings on
# Windows and valid paths on other platforms, unlike literal backslashes.
all_train = [os.path.join('NIPS', name)
             for name in ('gisette_train.csv', 'madelon_train.csv')]
In [ ]:
# For every training file in `all_train`: run the Octave Alpha_Investing
# feature selection through oct2py, fit a logistic-loss SGD classifier on the
# selected columns, and record its metrics in `results_all`.
# Accuracy and log-loss are computed on the same data the model was fitted on,
# so they are training-set metrics, not held-out estimates.
results_all = []
for fpath in all_train:
    print(fpath)
    X = np.array(pd.read_csv(fpath))
    y = np.array(train_label(fpath)).flatten()

    # Labels are handed to Octave as an (n, 1) column.
    ai_feats = octave.Alpha_Investing(X, y.reshape(-1, 1))
    # Shift the returned indices down by one so they can index NumPy columns
    # (presumably Octave returns 1-based indices -- confirm).
    feats_fix = [int(x - 1) for x in np.array(ai_feats).flatten()]
    print(len(feats_fix))

    # now fit and return metrics...
    X_sel = X[:, feats_fix]
    # random_state pins SGD's sample shuffling so re-running the cell
    # reproduces the same numbers.
    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # 'log' is kept for the version this notebook was written against.
    mod = SGDClassifier(loss='log', max_iter=5, random_state=0)
    mod.fit(X_sel, y)

    results = {
        'dataset': fpath,
        'accuracy': accuracy_score(y, mod.predict(X_sel)),
        'logloss': log_loss(y, mod.predict_proba(X_sel)),
        'feat_dim': len(feats_fix)}
    print(results)
    print("\n\n")
    results_all.append(results)
Results (metrics computed on the training data, from a run over the uci and microarray datasets):
[{'accuracy': 0.87179487179487181,
'dataset': 'uci\\Ionosphere_train.csv',
'feat_dim': 10,
'logloss': 1.1571746528803923},
{'accuracy': 0.77178874157791788,
'dataset': 'uci\\spambase_train.csv',
'feat_dim': 45,
'logloss': 7.8664959137554975},
{'accuracy': 0.79400749063670417,
'dataset': 'uci\\spectf_train.csv',
'feat_dim': 7,
'logloss': 2.3041623877531987},
{'accuracy': 0.7152899824253075,
'dataset': 'uci\\wdbc_train.csv',
'feat_dim': 21,
'logloss': 9.8335356344033933},
{'accuracy': 0.35483870967741937,
'dataset': 'microarray\\colon_train.csv',
'feat_dim': 4,
'logloss': 22.283081545103663},
{'accuracy': 0.34722222222222221,
'dataset': 'microarray\\leukemia_train.csv',
'feat_dim': 16,
'logloss': 22.546145702233364},
{'accuracy': 1.0,
'dataset': 'microarray\\lung_cancer_train.csv',
'feat_dim': 69,
'logloss': 9.9920072216264128e-16},
{'accuracy': 0.57843137254901966,
'dataset': 'microarray\\prostate_train.csv',
'feat_dim': 25,
'logloss': 14.560464558638817}]
In [ ]:
# Pretty-print the contents of `results_all`.
pprint.pprint(results_all)