In [1]:
import glob

import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier

# streaming feature-selection classifiers (local modules from this repo)
from grafting_classifier import GraftingClassifier
from ogfs_classifier import OGFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_ogfs2 import DPPClassifier as DPPClassifier0

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
class_train = glob.glob("NIPS/*_train.csv")
print(class_train)


['NIPS\\arcene_train.csv', 'NIPS\\dexter_train.csv', 'NIPS\\gisette_train.csv', 'NIPS\\madelon_train.csv']

In [3]:
def train_label(fname):
    # labels are stored alongside the features as <name>.labels
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname)
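
train_label assumes every *_train.csv has a companion .labels file with one label per feature row. A quick sanity check of that assumption (hypothetical, not part of the original run):

In [ ]:
# hypothetical sanity check: labels should line up 1:1 with the feature rows
for f in class_train:
    X = pd.read_csv(f)
    y = np.array(train_label(f)).flatten()
    assert X.shape[0] == y.shape[0], (f, X.shape, y.shape)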

In [4]:
def get_performance(mod, fpath, base=False):
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    if base:
        # non-streaming baseline: fit once on the full feature set
        mod.fit(train1, y)
    else:
        # simulate streaming: split the columns into up to ~10 batches;
        # if that yields a single batch, re-split with a finer divisor (~5 columns per batch)
        train1_cols = np.array_split(range(train1.shape[1]), min(10, int(train1.shape[1] / 10.0) + 1))
        if len(train1_cols) == 1:
            train1_cols = np.array_split(range(train1.shape[1]), min(10, int(train1.shape[1] / 5.0) + 1))

        all_cols = []
        for idx, collist in enumerate(train1_cols):
            all_cols.extend(list(collist))
            column_list = list(np.array(list(train1.columns))[all_cols])
            if idx == 0:
                mod.fit(train1[column_list], y)
            else:
                # each subsequent call sees the previously streamed columns plus the new batch
                mod.partial_fit(train1[column_list], y)

    return {'accuracy': accuracy_score(y, mod.predict(train1)),
            'logloss': log_loss(y, mod.predict_proba(train1)),
            'feat_dim': mod.coef_.flatten().shape}
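
To make the batching concrete, this is what the split produces for a hypothetical 23-column frame (a sketch, not part of the benchmark):

In [ ]:
# sketch: column-index batches for a hypothetical 23-column frame
n_cols = 23
batches = np.array_split(range(n_cols), min(10, int(n_cols / 10.0) + 1))
print([list(b) for b in batches])  # 3 batches of sizes 8, 8, 7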

In [5]:
models = [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    #('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    # loss='log' so the baseline exposes predict_proba for the log-loss metric
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]
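
This list is re-created before each dataset below: fit and partial_fit leave state on the estimator instances, so reusing them would leak fitted state from one benchmark into the next.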

In [6]:
#ex_dat = class_train[0]
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [7]:
#ex_dat = class_train[1]
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [ ]:
# fresh estimator instances for the next dataset (no fitted state carried over)
models = [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    #('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]

In [ ]:
ex_dat = class_train[2]
print(ex_dat)
print(pd.read_csv(ex_dat).shape)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


NIPS\gisette_train.csv
(6000, 5000)
(6000, 1000)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\utils\validation.py:444: DataConversionWarning: Data with input dtype int64 was converted to float64 by the scale function.
  warnings.warn(msg, DataConversionWarning)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:97: RuntimeWarning: divide by zero encountered in true_divide
  X = s_b/s_w
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:116: RuntimeWarning: divide by zero encountered in double_scalars
  prev_score = np.sum(curr_u1)/np.sum(curr_u2)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:111: RuntimeWarning: divide by zero encountered in double_scalars
  score = ((np.sum(test_u1)/np.sum(test_u2)) - prev_score)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:111: RuntimeWarning: invalid value encountered in double_scalars
  score = ((np.sum(test_u1)/np.sum(test_u2)) - prev_score)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:161: RuntimeWarning: divide by zero encountered in true_divide
  eval2 = s_b/s_w
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:163: RuntimeWarning: divide by zero encountered in true_divide
  eval2 = np.diag(s_b)/np.diag(s_w)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\numpy\core\_methods.py:112: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming\dpp_classifier_ogfs2.py:150: RuntimeWarning: invalid value encountered in double_scalars
  t_stat = (mu - X[idx])/(sigma/np.sqrt(U))
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:876: RuntimeWarning: invalid value encountered in greater_equal
  return (self.a <= x) & (x <= self.b)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:876: RuntimeWarning: invalid value encountered in less_equal
  return (self.a <= x) & (x <= self.b)
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(6000, 1500)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2422: RuntimeWarning: invalid value encountered in double_scalars
  z = (T - mn - correction) / se
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\_distn_infrastructure.py:1818: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)
	Wilcoxon stats Done!
(6000, 2000)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(6000, 2500)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(6000, 3000)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(6000, 3500)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
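
The RuntimeWarnings above all point at the separability computation in dpp_classifier_ogfs2.py: the ratios s_b/s_w divide by zero, presumably when a feature's within-class scatter term is zero, and the resulting NaNs then propagate into the t-statistics and the Wilcoxon test. A minimal sketch of one way the ratio could be guarded (hypothetical, not the repo's actual fix):

In [ ]:
# hypothetical guard for the Fisher-style separability ratio s_b/s_w
# (assumes s_b and s_w are per-feature scatter arrays)
def safe_separability(s_b, s_w, eps=1e-12):
    s_w = np.where(np.abs(s_w) < eps, eps, s_w)  # clamp zero scatter before dividing
    return s_b / s_w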

In [ ]:
# fresh estimator instances for the next dataset (no fitted state carried over)
models = [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    #('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]

In [ ]:
ex_dat = class_train[3]
print(ex_dat)
print(pd.read_csv(ex_dat).shape)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))
