In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier_dpp_only import DPPClassifier
#from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
#from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
class_train = glob.glob("microarray/*_train.csv")
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [5]:
def train_label(fname):
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname)

In [6]:
def get_performance(mod, fpath, mod_name):
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()
    
    # simulate streaming...
    # try splitting into groups of ~10,
    # if there is no splits, try ~5.
    train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1]/10.0) + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1]/5.0) + 1)
    all_cols = []

    #mod = GraftingClassifier(max_iter=5)
    if mod_name == 'Base':
        mod.fit(train1, y)
        results = {'accuracy': accuracy_score(y, mod.predict(train1)), 
               'logloss': log_loss(y, mod.predict_proba(train1)), 
               'feat_dim': mod.coef_.flatten().shape}
        return results
    
    # lets normalise the dataset...
    train1 = (train1 - train1.mean())/(np.maximum(train1.std(), 1))
    for idx, collist in enumerate(train1_cols):
        if idx == 0:
            column_list = list(np.array(list(train1.columns))[collist])
            mod.fit(train1[column_list], y)
            all_cols.extend(list(collist))
        else:
            all_cols.extend(list(collist))
            column_list = list(np.array(list(train1.columns))[all_cols])
            mod.partial_fit(train1[column_list], y)
        
        # debugging
        print_cond = True if idx % int((len(train1_cols)/10)+1) == 0 else False
        if mod_name in ['Fast_OSFS', 'DPP', 'DPP3', 'OGFS'] and print_cond:
            print("\tmodel: {}, iter: {}".format(mod_name, idx))
        
        # for fast osfs
    if mod_name == 'Fast_OSFS':
        mod._redundancy(train1, y, mode='all')
    
    results = {'accuracy': accuracy_score(y, mod.predict(train1)), 
               'logloss': log_loss(y, mod.predict_proba(train1)), 
               'feat_dim': mod.coef_.flatten().shape}
    return results

In [10]:
def create_models():
    return [
    #('Grafting', GraftingClassifier(max_iter=5, random_state=42)), 
    ('DPP', DPPClassifier(max_iter=5, random_state=42)), 
    #('DPP2', DPPClassifier2(max_iter=5, random_state=42)),
    #('DPP3', DPPClassifier3(max_iter=5, random_state=42)),
    #('OGFS', OGFSClassifier(max_iter=5, random_state=42)),
    #('OSFS', OSFSClassifier(max_iter=5, random_state=42, fast_osfs=False)),
    #('Fast_OSFS', OSFSClassifier(max_iter=5, random_state=42)),
    ('Base', SGDClassifier(loss='log', max_iter=5, random_state=42))
]

In [11]:
ex_dat = class_train[0]
test = pd.read_csv(ex_dat)

In [12]:
ex_dat = class_train[0]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    if nm != 'Base':
        print(nm, get_performance(mod, ex_dat))
    else:
        print(nm, get_performance(mod, ex_dat, base=True))


microarray\colon_train.csv (62, 2000)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-51eaac3ebd89> in <module>()
      4 for nm, mod in models:
      5     if nm != 'Base':
----> 6         print(nm, get_performance(mod, ex_dat))
      7     else:
      8         print(nm, get_performance(mod, ex_dat, base=True))

TypeError: get_performance() missing 1 required positional argument: 'mod_name'

In [ ]:
ex_dat = class_train[1]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))

In [ ]:
ex_dat = class_train[2]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))


microarray\lung_cancer_train.csv (181, 12533)
Grafting {'accuracy': 1.0, 'logloss': 0.00058025328079361707, 'feat_dim': (79,)}
	model: Fast_OSFS, iter: 0
	model: Fast_OSFS, iter: 126
	model: Fast_OSFS, iter: 252
	model: Fast_OSFS, iter: 378
	model: Fast_OSFS, iter: 504
	model: Fast_OSFS, iter: 630
	model: Fast_OSFS, iter: 756
	model: Fast_OSFS, iter: 882
	model: Fast_OSFS, iter: 1008
	model: Fast_OSFS, iter: 1134
		(181, 6126)
		(181, 6125)
		(181, 6124)
		(181, 6123)
		(181, 6122)
		(181, 6121)
		(181, 6120)
		(181, 6119)
		(181, 6118)
		(181, 6117)
		(181, 6116)
		(181, 6115)
		(181, 6114)
		(181, 6113)
		(181, 6112)
		(181, 6111)
		(181, 6110)
		(181, 6109)
		(181, 6108)
		(181, 6107)
		(181, 6106)
		(181, 6105)
		(181, 6104)
		(181, 6103)
		(181, 6102)
		(181, 6101)
		(181, 6100)
		(181, 6099)
		(181, 6098)
		(181, 6097)
		(181, 6096)
		(181, 6095)
		(181, 6094)
		(181, 6093)
		(181, 6092)
		(181, 6091)
		(181, 6090)
		(181, 6089)
		(181, 6088)
		(181, 6087)
		(181, 6086)
		(181, 6085)
		(181, 6084)
		(181, 6083)
		(181, 6082)
		(181, 6081)
		(181, 6080)
		(181, 6079)
		(181, 6078)
		(181, 6077)
		(181, 6076)
		(181, 6075)
		(181, 6074)
		(181, 6073)
		(181, 6072)
		(181, 6071)
		(181, 6070)
		(181, 6069)
		(181, 6068)
		(181, 6067)
		(181, 6066)
		(181, 6065)
		(181, 6064)
		(181, 6063)
		(181, 6062)
		(181, 6061)
		(181, 6060)
		(181, 6059)
		(181, 6058)
		(181, 6057)
		(181, 6056)
		(181, 6055)
		(181, 6054)
		(181, 6053)
		(181, 6052)
		(181, 6051)
		(181, 6050)
		(181, 6049)
		(181, 6048)
		(181, 6047)
		(181, 6046)
		(181, 6045)
		(181, 6044)
		(181, 6043)
		(181, 6042)
		(181, 6041)
		(181, 6040)
		(181, 6039)

In [ ]:
ex_dat = class_train[3]
print(ex_dat, pd.read_csv(ex_dat).shape)
models = create_models()
for nm, mod in models:
    print(nm, get_performance(mod, ex_dat, mod_name=nm))

In [ ]:


In [ ]: