In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from dpp_classifier_dpp_only import DPPClassifier

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Discover the per-dataset training feature matrices.
# NOTE(review): the printed output below shows backslash separators — this run
# was on Windows; glob patterns with "/" still work there.
class_train = glob.glob("microarray/*_train.csv")
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [3]:
def train_label(fname):
    """Load the label file paired with a feature CSV.

    Parameters
    ----------
    fname : str
        Path to a ``*_train.csv`` feature file; the matching label file is
        derived by swapping the ``.csv`` suffix for ``.labels``.

    Returns
    -------
    pandas.DataFrame
        The parsed label file.
        NOTE(review): ``pd.read_csv`` consumes the first line as a header —
        confirm the ``.labels`` files actually ship a header row.
    """
    label_path = fname.replace(".csv", ".labels")
    return pd.read_csv(label_path)

In [4]:
def get_performance(mod, fpath, base=False):
    """Fit ``mod`` on the dataset at ``fpath`` and report training-set metrics.

    Parameters
    ----------
    mod : estimator
        A classifier exposing ``fit`` / ``predict`` / ``predict_proba`` /
        ``coef_``; the streaming path additionally requires ``partial_fit``.
    fpath : str
        Path to the feature CSV; labels come from the sibling ``.labels``
        file via ``train_label``.
    base : bool
        When True, fit once on the full feature matrix (baseline). When
        False, simulate streaming feature arrival by fitting on a growing
        prefix of column groups.

    Returns
    -------
    dict
        ``accuracy``, ``logloss`` and ``feat_dim`` (shape of the flattened
        coefficient vector), all measured on the TRAINING data — there is
        no held-out split here.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    def _score(fitted):
        # Single source of truth for the metrics dict (was duplicated
        # verbatim in both the base and streaming paths).
        return {'accuracy': accuracy_score(y, fitted.predict(train1)),
                'logloss': log_loss(y, fitted.predict_proba(train1)),
                'feat_dim': fitted.coef_.flatten().shape}

    if base:
        # Baseline: one batch fit on all features. Returning early also
        # skips the streaming-split computation, which was dead work on
        # this path in the original.
        mod.fit(train1, y)
        return _score(mod)

    # simulate streaming...
    # split the columns into groups of ~10; if that yields a single group
    # (fewer than ~10 columns), retry with groups of ~5.
    n_cols = train1.shape[1]
    train1_cols = np.array_split(range(n_cols), min(10, int(n_cols / 10.0) + 1))
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(n_cols), min(10, int(n_cols / 5.0) + 1))

    all_cols = []
    for idx, collist in enumerate(train1_cols):
        # Grow the visible-feature prefix, then (re)fit on it. At idx == 0
        # all_cols == collist, so this unifies the two branches that
        # previously duplicated the column-selection logic.
        all_cols.extend(list(collist))
        column_list = list(np.array(list(train1.columns))[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

    return _score(mod)

In [5]:
# Competing estimators: the streaming DPP feature selector vs. a plain SGD
# baseline fit on all features at once (get_performance takes the base=True
# path for the 'Base' entry).
models = [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    #('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    # NOTE(review): loss='log' is the pre-1.1 sklearn spelling (renamed to
    # 'log_loss' later) — matches the environment shown in the outputs;
    # revisit if the sklearn version is upgraded.
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]

In [6]:
# NOTE(review): redundant re-run of the glob from cell 2 — harmless
# (idempotent), but one of the two cells could be deleted.
class_train = glob.glob("microarray/*_train.csv")
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [7]:
"""
ex_dat = class_train[0]
print(ex_dat)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))
"""


Out[7]:
"\nex_dat = class_train[0]\nprint(ex_dat)\nfor nm, mod in models:\n    if nm != 'Base':\n        print(nm, get_performance(mod, ex_dat))\n    else:\n        print(nm, get_performance(mod, ex_dat, base=True))\n"

In [8]:
#ex_dat = class_train[1]
#print(ex_dat)
#print(pd.read_csv(ex_dat).shape)
#for nm, mod in models:
#    if nm != 'Base':
#        print(nm, get_performance(mod, ex_dat))
#    else:
#        print(nm, get_performance(mod, ex_dat, base=True))

In [9]:
# Evaluate every configured model on the lung-cancer dataset (class_train[2]).
ex_dat = class_train[2]
print(ex_dat)
for name, model in models:
    # Only the SGD baseline takes the single-batch (base=True) path; all
    # other models go through the simulated-streaming loop.
    is_base = name == 'Base'
    print(name, get_performance(model, ex_dat, base=is_base))


microarray\lung_cancer_train.csv
(181, 2508)
(181, 3762)
(181, 3766)
(181, 3776)
(181, 3793)
(181, 3807)
(181, 3823)
(181, 3840)
(181, 3855)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\base.py:340: RuntimeWarning: overflow encountered in exp
  np.exp(prob, prob)
DPP {'accuracy': 0.94475138121546964, 'logloss': 1.908219690326558, 'feat_dim': (1366,)}
Base {'accuracy': 1.0, 'logloss': 9.9920072216264128e-16, 'feat_dim': (12533,)}

In [10]:
"""
ex_dat = class_train[3]
print(ex_dat)
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))
"""


Out[10]:
"\nex_dat = class_train[3]\nprint(ex_dat)\nfor nm, mod in models:\n    if nm != 'Base':\n        print(nm, get_performance(mod, ex_dat))\n    else:\n        print(nm, get_performance(mod, ex_dat, base=True))\n"

In [ ]: