In [1]:
import glob
import numpy as np
import pandas as pd
from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_dppsample import DPPClassifier as DPPClassifier0
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs_dppsample import DPPClassifier as DPPClassifier3
from sklearn.metrics import log_loss, accuracy_score
#import dask.dataframe as dd
#import dask.array as da
In [2]:
class_train = glob.glob("uci/*_train.csv")
print(class_train)
In [3]:
def train_label(fname):
    # Labels are expected to live next to each CSV, with a .labels extension.
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname)
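As a quick sanity check (a sketch only, assuming each *_train.csv has a sibling *_train.labels file, which is exactly the convention train_label encodes):

print(train_label(class_train[0]).shape)  # labels for the first discovered dataset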
In [4]:
def get_performance(mod, fpath, mod_name):
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    # Simulate streaming features: split the columns into groups of ~10;
    # if that yields only a single group, fall back to groups of ~5
    # (see the illustration after this cell).
    train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1] / 10.0) + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1] / 5.0) + 1)
    all_cols = []

    # The baseline model is fit on the full feature matrix in one batch.
    if mod_name == 'Base':
        mod.fit(train1, y)
        results = {'accuracy': accuracy_score(y, mod.predict(train1)),
                   'logloss': log_loss(y, mod.predict_proba(train1)),
                   'feat_dim': mod.coef_.flatten().shape}
        return results

    # Normalise the dataset, guarding against zero-variance columns.
    train1 = (train1 - train1.mean()) / np.maximum(train1.std(), 1)

    for idx, collist in enumerate(train1_cols):
        if idx == 0:
            column_list = list(np.array(list(train1.columns))[collist])
            mod.fit(train1[column_list], y)
            all_cols.extend(list(collist))
        else:
            all_cols.extend(list(collist))
            column_list = list(np.array(list(train1.columns))[all_cols])
            mod.partial_fit(train1[column_list], y)
        # Debugging: report progress roughly every 10% of the column groups.
        print_cond = idx % (int(len(train1_cols) / 10) + 1) == 0
        if mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS'] and print_cond:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))

    # Fast-OSFS defers its redundancy analysis until all features have been seen.
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')

    results = {'accuracy': accuracy_score(y, mod.predict(train1)),
               'logloss': log_loss(y, mod.predict_proba(train1)),
               'feat_dim': mod.coef_.flatten().shape}
    return results
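To make the streaming simulation concrete, here is a minimal, self-contained sketch (illustration only; the 23-column frame is hypothetical) of how np.array_split produces the ~10-column groups that get_performance feeds to fit/partial_fit:

# A 23-column dataset is split into int(23/10.0)+1 = 3 groups of 8, 8 and 7 column indices.
demo_cols = np.array_split(range(23), int(23 / 10.0) + 1)
print([list(c) for c in demo_cols])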
In [5]:
def create_models():
    return [
        #('Grafting', GraftingClassifier(max_iter=5, random_state=42)),
        #('DPP', DPPClassifier(max_iter=5, random_state=42)),
        #('DPP0', DPPClassifier0(max_iter=5, random_state=42)),
        #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
        ('DPP3', DPPClassifier3(max_iter=5, random_state=42)),
        #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
        #('OSFS', OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False)),
        #('Fast_OSFS', OSFSClassifier(max_iter=5, random_state=42)),
        ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
    ]
In [6]:
#ex_dat = class_train[0]
#print(ex_dat, pd.read_csv(ex_dat).shape)
#models = create_models()
#for nm, mod in models:
#    print(nm, get_performance(mod, ex_dat, mod_name=nm))
In [7]:
#ex_dat = class_train[1]
#print(ex_dat, pd.read_csv(ex_dat).shape)
#models = create_models()
#for nm, mod in models:
#    print(nm, get_performance(mod, ex_dat, mod_name=nm))
In [8]:
ex_dat = class_train[2]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))
In [9]:
ex_dat = class_train[3]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))
In [ ]:
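A natural follow-up, sketched here under the assumption that the cells above have been run (so class_train, create_models and get_performance are in scope), is to sweep every discovered dataset and collect the scores into one table:

# Hypothetical aggregation: one row per (dataset, model) pair.
rows = []
for fpath in class_train:
    for nm, mod in create_models():
        res = get_performance(mod, fpath, mod_name=nm)
        res.update({'dataset': fpath, 'model': nm})
        rows.append(res)
summary = pd.DataFrame(rows)
print(summary)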