In [8]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [9]:
class_train = glob.glob("NIPS/*_train.csv")
print(class_train)


['NIPS\\arcene_train.csv', 'NIPS\\dexter_train.csv', 'NIPS\\dorothea_train.csv', 'NIPS\\gisette_train.csv', 'NIPS\\madelon_train.csv']

In [10]:
def train_label(fname):
    # labels sit alongside each training CSV as <name>.labels
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname)
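
Note: this assumes each *.labels file carries a header row, matching the CSVs above. The raw NIPS challenge files ship without one, in which case pd.read_csv would silently treat the first label as a header; a hedged variant for that case:

In [ ]:
# Variant for raw, header-less *.labels files.
# (Assumption: only needed if the files were NOT preprocessed to
# include a header row like the training CSVs appear to have been.)
def train_label_raw(fname):
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname, header=None)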

In [11]:
def get_performance(mod, fpath, mod_name):
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    # Simulate streaming features:
    # split the columns into groups of ~10;
    # if that yields only a single group, split into groups of ~5 instead.
    train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1] / 10.0) + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1] / 5.0) + 1)
    all_cols = []

    if mod_name == 'Base':
        # baseline: fit once on the full, unnormalised feature set
        mod.fit(train1, y)
        results = {'accuracy': accuracy_score(y, mod.predict(train1)),
                   'logloss': log_loss(y, mod.predict_proba(train1)),
                   'feat_dim': mod.coef_.flatten().shape}
        return results
    
    # normalise the dataset; clamp the std at 1 to avoid dividing by ~0
    train1 = (train1 - train1.mean()) / np.maximum(train1.std(), 1)
    for idx, collist in enumerate(train1_cols):
        if idx == 0:
            # first batch: fit from scratch on the initial feature group
            column_list = list(np.array(list(train1.columns))[collist])
            mod.fit(train1[column_list], y)
            all_cols.extend(list(collist))
        else:
            # later batches: reveal the new columns and update the model
            all_cols.extend(list(collist))
            column_list = list(np.array(list(train1.columns))[all_cols])
            mod.partial_fit(train1[column_list], y)
        
        # debugging: report progress roughly every 10% of the stream
        print_cond = idx % int((len(train1_cols) / 10) + 1) == 0
        if mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS'] and print_cond:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))
        
    # Fast-OSFS ends with a global redundancy pass over all selected features
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')
    
    results = {'accuracy': accuracy_score(y, mod.predict(train1)), 
               'logloss': log_loss(y, mod.predict_proba(train1)), 
               'feat_dim': mod.coef_.flatten().shape}
    return results
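
To make the streaming simulation concrete: np.array_split chunks the column indices into nearly equal groups, and each loop iteration above reveals one more group to the model via partial_fit. A minimal sketch, using a hypothetical toy width of 23 columns:

In [ ]:
# 23 columns split into int(23 / 10.0) + 1 = 3 nearly equal groups
demo_cols = np.array_split(range(23), int(23 / 10.0) + 1)
print([list(c) for c in demo_cols])
# -> three groups of sizes 8, 8 and 7: indices 0-7, 8-15, 16-22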

In [12]:
def create_models():
    return [
        ('Grafting', GraftingClassifier(max_iter=5, random_state=42)),
        #('DPP', DPPClassifier(max_iter=5, random_state=42)),
        #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
        #('DPP3', DPPClassifier3(max_iter=5, random_state=42)),
        #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
        #('OSFS', OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False)),
        ('Fast_OSFS', OSFSClassifier(max_iter=5, random_state=42)),
        ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42)),
    ]
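
Since create_models returns fresh estimator instances on each call, a sweep over every training set could look like the sketch below. It is shown unexecuted: dorothea and gisette are large, so expect long runtimes at this max_iter.

In [ ]:
# Hedged sketch: evaluate every model on every training set.
all_results = {}
for fpath in class_train:
    for nm, mod in create_models():
        all_results[(fpath, nm)] = get_performance(mod, fpath, mod_name=nm)
pd.DataFrame(all_results).T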

In [7]:
ex_dat = class_train[4]  # madelon
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))


NIPS\madelon_train.csv (2000, 500)
Grafting {'accuracy': 0.56850000000000001, 'logloss': 2.1218714045988234, 'feat_dim': (92,)}
	model: Fast_OSFS, iter: 0
	model: Fast_OSFS, iter: 6
	model: Fast_OSFS, iter: 12
	model: Fast_OSFS, iter: 18
	model: Fast_OSFS, iter: 24
	model: Fast_OSFS, iter: 30
	model: Fast_OSFS, iter: 36
	model: Fast_OSFS, iter: 42
	model: Fast_OSFS, iter: 48
		(2000, 52)
		(2000, 52)
		...
		(2000, 27)
		(2000, 26)
Fast_OSFS {'accuracy': 0.4965, 'logloss': 1.2189715854570837, 'feat_dim': (43,)}
Base {'accuracy': 0.5, 'logloss': 17.269388197455346, 'feat_dim': (500,)}
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
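
The RuntimeWarning above comes from the 'Base' branch: it fits SGDClassifier on the raw, unscaled features (the normalisation step in get_performance is skipped for it), so the decision function saturates np.exp inside predict_proba. A hedged fix, assuming standard scaling is acceptable for the baseline:

In [ ]:
# Scale inputs before the baseline SGD to keep predict_proba's exp() stable.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
base = make_pipeline(StandardScaler(),
                     SGDClassifier(loss='log', max_iter=5, random_state=42))
# Note: get_performance reads mod.coef_ directly; with a pipeline that would
# become base.named_steps['sgdclassifier'].coef_ instead.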