In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from dpp_classifier_dpp_only import DPPClassifier as DPPClassifier0

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Collect the NIPS feature-selection challenge training matrices; each
# ``*_train.csv`` has a sibling ``*_train.labels`` file (see train_label).
# NOTE(review): glob order is filesystem-dependent, and later cells index
# into this list positionally (class_train[2], class_train[3]) — consider
# wrapping in sorted() to make the ordering explicit.
class_train = glob.glob("NIPS/*_train.csv")
print(class_train)


['NIPS\\arcene_train.csv', 'NIPS\\dexter_train.csv', 'NIPS\\gisette_train.csv', 'NIPS\\madelon_train.csv']

In [3]:
def train_label(fname):
    """Load the label file that sits next to a training CSV.

    The labels are expected at the same path as ``fname`` but with the
    ``.csv`` suffix swapped for ``.labels``.

    NOTE(review): pd.read_csv treats the first line as a header row here —
    confirm the ``.labels`` files really contain a header, otherwise the
    first label is silently consumed.
    """
    label_path = fname.replace(".csv", ".labels")
    labels = pd.read_csv(label_path)
    return labels

In [4]:
def get_performance(mod, fpath, base=False):
    """Fit ``mod`` on the dataset at ``fpath`` and report training metrics.

    Parameters
    ----------
    mod : estimator exposing fit/predict/predict_proba (and partial_fit
        when ``base`` is False).
    fpath : str
        Path to a ``*_train.csv`` file; labels are loaded from the
        matching ``.labels`` file via ``train_label``.
    base : bool, default False
        When True, fit once on the full feature matrix (baseline).
        When False, simulate a feature stream: split the columns into
        groups and feed the growing column set through ``partial_fit``.

    Returns
    -------
    dict
        Training-set ``accuracy``, ``logloss`` and the fitted
        coefficient dimension ``feat_dim``.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    if base:
        mod.fit(train1, y)
        return _score(mod, train1, y)

    # Simulate streaming: split the columns into up to 10 groups of ~10;
    # if that yields a single group, retry with smaller groups of ~5.
    train1_cols = np.array_split(range(train1.shape[1]),
                                 min(10, int(train1.shape[1] / 10.0) + 1))
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(train1.shape[1]),
                                     min(10, int(train1.shape[1] / 5.0) + 1))

    all_cols = []
    columns = np.array(list(train1.columns))
    for idx, collist in enumerate(train1_cols):
        # Grow the visible column set, then (re)fit on everything seen so far.
        all_cols.extend(list(collist))
        column_list = list(columns[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

    return _score(mod, train1, y)


def _score(mod, X, y):
    """Training-set metrics for a fitted linear model (shared by both paths)."""
    return {'accuracy': accuracy_score(y, mod.predict(X)),
            'logloss': log_loss(y, mod.predict_proba(X)),
            'feat_dim': mod.coef_.flatten().shape}

In [5]:
# Candidate models: the streaming DPP feature selector vs. a plain SGD
# baseline trained on all features at once. (Grafting / DPP2 / OGFS
# variants were tried in earlier iterations and are omitted here.)
models = [
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42)),
]

In [6]:
#ex_dat = class_train[0]
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [7]:
#ex_dat = class_train[1]
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [8]:
# Re-instantiate the estimators — presumably so state fitted on the
# previous dataset does not leak into the next run (TODO confirm the
# classifiers are not safely re-fittable).
models = [
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42)),
]

In [9]:
ex_dat = class_train[2]
print(ex_dat)
print(pd.read_csv(ex_dat).shape)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


NIPS\gisette_train.csv
(6000, 5000)
(6000, 1000)
(6000, 1471)
(6000, 1492)
(6000, 1517)
(6000, 1548)
(6000, 1576)
(6000, 1590)
(6000, 1606)
(6000, 1612)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
DPP {'accuracy': 0.93283333333333329, 'logloss': 2.3198544811915016, 'feat_dim': (628,)}
Base {'accuracy': 0.9986666666666667, 'logloss': 0.046051701859881909, 'feat_dim': (5000,)}

In [10]:
# Fresh estimator instances again before the next dataset — presumably to
# avoid carrying fitted state across runs (TODO confirm).
models = [
    ('DPP', DPPClassifier0(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42)),
]

In [11]:
ex_dat = class_train[3]
print(ex_dat)
print(pd.read_csv(ex_dat).shape)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


NIPS\madelon_train.csv
(2000, 500)
(2000, 100)
(2000, 150)
(2000, 170)
(2000, 187)
(2000, 205)
(2000, 218)
(2000, 227)
(2000, 229)
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming_take2\dpp.py:283: RuntimeWarning: invalid value encountered in true_divide
  V[:, a] = V[:, a]/np.linalg.norm(V[:, a])
C:\Users\chapm\Documents\GitHub\sklearn-recipes\streaming_take2\dpp.py:252: RuntimeWarning: invalid value encountered in less
  choose_item = np.random.choice(range(len(P_list)), 1, p=P_norm.flatten())
(2000, 233)
DPP {'accuracy': 0.5, 'logloss': 17.269388197455346, 'feat_dim': (136,)}
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
Base {'accuracy': 0.5, 'logloss': 17.269388197455346, 'feat_dim': (500,)}

In [ ]: