In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_dppsample import DPPClassifier as DPPClassifier0
from dpp_classifier_ogfs2 import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Discover the microarray training sets shipped alongside the notebook.
train_pattern = "microarray/*_train.csv"
class_train = glob.glob(train_pattern)
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [3]:
def train_label(fname):
    """Load the label frame paired with a training CSV.

    Labels are stored next to the data file under the same name with a
    ``.labels`` extension (e.g. ``colon_train.csv`` -> ``colon_train.labels``).
    """
    label_path = fname.replace(".csv", ".labels")
    labels = pd.read_csv(label_path)
    return labels

In [4]:
def _score(mod, X, y):
    """Training-set metrics for a fitted linear model: accuracy, log loss,
    and the flattened coefficient shape (effective feature dimension)."""
    return {'accuracy': accuracy_score(y, mod.predict(X)),
            'logloss': log_loss(y, mod.predict_proba(X)),
            'feat_dim': mod.coef_.flatten().shape}


def get_performance(mod, fpath, base=False):
    """Fit `mod` on the dataset at `fpath` and report training-set metrics.

    Parameters
    ----------
    mod : estimator with fit/partial_fit/predict/predict_proba and coef_
        The classifier under test.
    fpath : str
        Path to a ``*_train.csv`` file; labels come from the paired
        ``*.labels`` file via ``train_label``.
    base : bool, default False
        If True, fit once on the full feature matrix (batch baseline).
        Otherwise simulate a feature stream: columns arrive in groups and
        the model is refit on the growing prefix of features.

    Returns
    -------
    dict with keys 'accuracy', 'logloss', 'feat_dim'.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    if base:
        # Batch baseline: one fit over every column.
        mod.fit(train1, y)
        return _score(mod, train1, y)

    # Simulate streaming features: split columns into at most 10 groups of
    # ~10 columns; if that yields a single group, retry with ~5-column groups.
    train1_cols = np.array_split(range(train1.shape[1]),
                                 min(10, int(train1.shape[1] / 10.0) + 1))
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(train1.shape[1]),
                                     min(10, int(train1.shape[1] / 5.0) + 1))

    # Grow the visible feature set one group at a time: fit on the first
    # group, then partial_fit on each extended prefix of columns.
    all_cols = []
    column_names = np.array(list(train1.columns))
    for idx, collist in enumerate(train1_cols):
        all_cols.extend(list(collist))
        column_list = list(column_names[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

    # Final evaluation on the full frame (last prefix covers every column).
    return _score(mod, train1, y)

In [5]:
# Model line-up for the streaming feature-selection comparison; commented
# entries are alternative selectors kept for quick swapping during experiments.
models = [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    #('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier3(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    # NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn >= 1.1
    # — confirm the pinned sklearn version still accepts 'log'.
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]

In [6]:
# NOTE(review): duplicates the earlier glob cell (In [2]); harmless on a
# fresh run but could be removed to keep the notebook linear.
class_train = glob.glob("microarray/*_train.csv")
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [7]:
#ex_dat = class_train[0]
#print(ex_dat)
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [ ]:
ex_dat = class_train[1]
print(ex_dat)
print(pd.read_csv(ex_dat).shape)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


microarray\leukemia_train.csv
(72, 7129)
(72, 1426)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
	Wilcoxon stats Done!
(72, 2139)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 2852)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 3565)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 4278)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 4991)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 5704)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 6417)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(72, 7129)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
DPP {'accuracy': 0.34722222222222221, 'logloss': 22.546145702233364, 'feat_dim': (4069,)}
Base {'accuracy': 1.0, 'logloss': 9.9920072216264148e-16, 'feat_dim': (7129,)}
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)

In [ ]:
ex_dat = class_train[2]
print(ex_dat)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


microarray\lung_cancer_train.csv
(181, 2508)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\scipy\stats\morestats.py:2397: UserWarning: Warning: sample size too small for normal approximation.
  warnings.warn("Warning: sample size too small for normal approximation.")
	Wilcoxon stats Done!
(181, 3762)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(181, 5015)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...
	Calculating separability Done!
	Wilcoxon stats...
	Wilcoxon stats Done!
(181, 6268)
	Sampling DPP...
	Sampling DPP Done!
	Calculating separability (covariance matrix)...

In [ ]:
ex_dat = class_train[3]
print(ex_dat)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))