In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Gather the per-dataset training CSVs under NIPS/ (arcene, dexter, dorothea,
# gisette, madelon per the printed output below).
# NOTE(review): glob.glob returns files in OS-dependent order; a later cell
# indexes class_train[3] expecting gisette — consider sorted(glob.glob(...))
# to make that index stable across platforms.
class_train = glob.glob("NIPS/*_train.csv")
print(class_train)


['NIPS\\arcene_train.csv', 'NIPS\\dexter_train.csv', 'NIPS\\dorothea_train.csv', 'NIPS\\gisette_train.csv', 'NIPS\\madelon_train.csv']

In [3]:
def train_label(fname):
    """Load the label file that accompanies a training CSV.

    The labels are expected to live next to `fname` with the same name but a
    ``.labels`` extension (e.g. ``foo_train.csv`` -> ``foo_train.labels``).

    Parameters
    ----------
    fname : str
        Path to the ``*_train.csv`` data file.

    Returns
    -------
    pandas.DataFrame
        The parsed label file (read with pandas' default header handling).
    """
    return pd.read_csv(fname.replace(".csv", ".labels"))

In [4]:
def _score(mod, X, y):
    """Build the metrics dict for a fitted model: accuracy, log-loss and the
    shape of the (flattened) coefficient vector as a proxy for feature count."""
    return {'accuracy': accuracy_score(y, mod.predict(X)),
            'logloss': log_loss(y, mod.predict_proba(X)),
            'feat_dim': mod.coef_.flatten().shape}


def get_performance(mod, fpath, mod_name):
    """Fit `mod` on the dataset at `fpath` and report its training metrics.

    For mod_name == 'Base' the model is fit once on the raw data. For every
    other model the columns are fed in groups to simulate streaming feature
    arrival: `fit` on the first group, then `partial_fit` on the cumulative
    column set for each subsequent group.

    Parameters
    ----------
    mod : estimator
        A classifier exposing fit / predict / predict_proba (and partial_fit
        plus, for 'Fast_OSFS', `_redundancy` in the streaming branch).
    fpath : str
        Path to the training CSV; labels are loaded via `train_label`.
    mod_name : str
        Name controlling the fitting strategy and progress printing.

    Returns
    -------
    dict
        {'accuracy': float, 'logloss': float, 'feat_dim': tuple}
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    if mod_name == 'Base':
        # Baseline: one batch fit on the raw data.
        # NOTE(review): unlike the streaming branch below, the baseline sees
        # un-normalised features — confirm this asymmetry is intentional.
        mod.fit(train1, y)
        return _score(mod, train1, y)

    # Simulate streaming: split the columns into groups of ~10; if that
    # produces a single group (few features), retry with groups of ~5.
    # (Computed only for the streaming branch — it was wasted work for 'Base'.)
    n_feats = train1.shape[1]
    train1_cols = np.array_split(range(n_feats), n_feats // 10 + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(n_feats), n_feats // 5 + 1)

    # Normalise; the max(std, 1) guard avoids dividing by ~0 on constant columns.
    train1 = (train1 - train1.mean()) / (np.maximum(train1.std(), 1))

    all_cols = []
    # Print progress roughly 10 times over the whole stream.
    print_every = len(train1_cols) // 10 + 1
    for idx, collist in enumerate(train1_cols):
        # Grow the cumulative column set, then (re)fit on everything seen so far.
        all_cols.extend(list(collist))
        column_list = list(np.array(list(train1.columns))[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

        if mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS'] and idx % print_every == 0:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))

    # Fast-OSFS performs its global redundancy elimination as a final pass.
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')

    return _score(mod, train1, y)

In [ ]:
def create_models():
    """Return fresh (name, estimator) pairs for the benchmark loop.

    Building new instances on every call keeps runs independent — a previously
    fitted model is never reused. Disabled candidates are kept below for
    reference:
        ('Grafting', GraftingClassifier(max_iter=5, random_state=42))
        ('DPP',      DPPClassifier(max_iter=5, random_state=42))
        ('DPP2',     DPPClassifier2(max_iter=5, random_state=42))
        ('DPP3',     DPPClassifier3(max_iter=5, random_state=42))
        ('OGFS',     OGFSClassifier(max_iter=5, random_state=42))
        ('OSFS',     OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False))
    """
    models = [
        ('Fast_OSFS', OSFSClassifier(max_iter=5, random_state=42)),
        ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42)),
    ]
    return models

In [ ]:
# Benchmark every enabled model on one dataset.
# NOTE(review): class_train[3] is gisette per the printed glob output, but
# glob order is OS-dependent — pick the file by name to make this robust.
ex_dat = class_train[3]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    # Each `mod` is a fresh instance from create_models(), so fitting here
    # does not leak state between runs.
    print(nm, get_performance(mod, ex_dat, mod_name=nm))


NIPS\gisette_train.csv (6000, 5000)
	model: Fast_OSFS, iter: 0
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [1 9] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:114: RuntimeWarning: invalid value encountered in true_divide
  f = msb / msw
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [6] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [1] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [9] are constant.
  UserWarning)
	model: Fast_OSFS, iter: 51
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [3] are constant.
  UserWarning)
	model: Fast_OSFS, iter: 102
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [7] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [0] are constant.
  UserWarning)
	model: Fast_OSFS, iter: 153
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [2] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [5] are constant.
  UserWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [4] are constant.
  UserWarning)
	model: Fast_OSFS, iter: 204
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\feature_selection\univariate_selection.py:113: UserWarning: Features [8] are constant.
  UserWarning)
	model: Fast_OSFS, iter: 255
	model: Fast_OSFS, iter: 306
	model: Fast_OSFS, iter: 357
	model: Fast_OSFS, iter: 408
	model: Fast_OSFS, iter: 459
		(6000, 2061)
		(6000, 2060)
		(6000, 2059)
		(6000, 2058)
		(6000, 2057)
		(6000, 2056)
		(6000, 2055)
		(6000, 2054)
		(6000, 2053)
		(6000, 2052)
		(6000, 2051)
		(6000, 2050)
		(6000, 2049)
		(6000, 2048)
		(6000, 2047)
		(6000, 2046)
		(6000, 2045)
		(6000, 2044)
		(6000, 2044)
		(6000, 2043)
		(6000, 2042)
		(6000, 2041)
		(6000, 2040)
		(6000, 2039)
		(6000, 2038)
		(6000, 2037)
		(6000, 2036)
		(6000, 2035)
		(6000, 2034)
		(6000, 2033)
		(6000, 2032)
		(6000, 2031)
		(6000, 2030)
		(6000, 2029)
		(6000, 2028)
		(6000, 2027)
		(6000, 2026)