In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from ogfs_classifier import OGFSClassifier
from osfs_classifier import OSFSClassifier
from dpp_classifier import DPPClassifier
from dpp_classifier_mitra import DPPClassifier as DPPClassifier2
from dpp_classifier_ogfs import DPPClassifier as DPPClassifier3

from sklearn.metrics import log_loss, accuracy_score

#import dask.dataframe as dd
#import dask.array as da

In [2]:
# Collect the training-set CSVs for every microarray dataset.
train_pattern = "microarray/*_train.csv"
class_train = glob.glob(train_pattern)
print(class_train)


['microarray\\colon_train.csv', 'microarray\\leukemia_train.csv', 'microarray\\lung_cancer_train.csv', 'microarray\\prostate_train.csv']

In [3]:
def train_label(fname):
    """Load the label file that sits beside a training CSV.

    Labels for ``<name>.csv`` are stored in ``<name>.labels``; swap the
    extension and read that file into a DataFrame.
    """
    labels_path = fname.replace(".csv", ".labels")
    return pd.read_csv(labels_path)

In [16]:
def get_performance(mod, fpath, base=False):
    """Fit ``mod`` on the dataset at ``fpath`` and report training metrics.

    Parameters
    ----------
    mod : classifier
        Estimator exposing ``fit``, ``partial_fit``, ``predict``,
        ``predict_proba`` and a ``coef_`` attribute (e.g. SGDClassifier
        or the streaming feature-selection classifiers imported above).
    fpath : str
        Path to a ``*_train.csv`` file; labels are read from the matching
        ``.labels`` file via ``train_label``.
    base : bool, default False
        If True, run a batch baseline: a single ``fit`` on all features.
        If False, simulate streaming features: fit on the first column
        group, then ``partial_fit`` on the cumulative set of seen columns.

    Returns
    -------
    dict
        ``'accuracy'``, ``'logloss'`` and ``'feat_dim'`` (shape of the
        flattened coefficient vector), all measured on the training data.
    """
    train1 = pd.read_csv(fpath).fillna(0)
    y = np.array(train_label(fpath)).flatten()

    # Simulate streaming: split the columns into groups of ~10;
    # if that produces a single group, fall back to groups of ~5.
    n_features = train1.shape[1]
    train1_cols = np.array_split(range(n_features), n_features // 10 + 1)
    if len(train1_cols) == 1:
        train1_cols = np.array_split(range(n_features), n_features // 5 + 1)

    if base:
        # Batch baseline: one fit over the full feature matrix.
        mod.fit(train1, y)
        return _training_metrics(mod, train1, y)

    all_cols = []
    columns = np.array(train1.columns)
    for idx, collist in enumerate(train1_cols):
        # Grow the set of seen columns, then (re)fit on everything so far.
        all_cols.extend(list(collist))
        column_list = list(columns[all_cols])
        if idx == 0:
            mod.fit(train1[column_list], y)
        else:
            mod.partial_fit(train1[column_list], y)

    # After the last group, all_cols covers every column, so evaluating on
    # the full frame matches the columns the model was last fit on.
    return _training_metrics(mod, train1, y)


def _training_metrics(mod, X, y):
    """Training-set accuracy, log-loss, and coefficient dimensionality."""
    return {'accuracy': accuracy_score(y, mod.predict(X)),
            'logloss': log_loss(y, mod.predict_proba(X)),
            'feat_dim': mod.coef_.flatten().shape}

In [39]:
# Streaming feature-selection model under test: OSFS with a fixed seed
# so the walkthrough below is reproducible.
mod = OSFSClassifier(max_iter=5, random_state=42)

In [40]:
# Use the first dataset found by the glob (colon_train.csv) for this run.
fpath = class_train[0]

In [41]:
# Load the feature matrix (missing values -> 0) and the label vector,
# flattened from the single-column labels DataFrame.
train1 = pd.read_csv(fpath).fillna(0)
y = np.array(train_label(fpath)).flatten()

In [42]:
# Standardise each column; the divisor is clipped at 1 so near-constant
# features do not blow up from division by a tiny std.
train1 = (train1 - train1.mean())/(np.maximum(train1.std(), 1))

In [43]:
# Split the column indices into groups of ~10 to simulate features
# arriving in a stream; fall back to groups of ~5 if everything landed
# in a single group. `all_cols` will accumulate the columns seen so far.
train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1]/10.0) + 1)
if len(train1_cols) == 1:
    train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1]/5.0) + 1)
all_cols = []

In [44]:
# Stream the column groups into the model: `fit` on the first group,
# then `partial_fit` on the cumulative set of seen columns. Every 10
# groups, print training-set metrics on the columns seen so far.
for idx, collist in enumerate(train1_cols):
    if idx == 0:
        column_list = list(np.array(list(train1.columns))[collist])
        mod.fit(train1[column_list], y)
        all_cols.extend(list(collist))
    else:
        all_cols.extend(list(collist))
        column_list = list(np.array(list(train1.columns))[all_cols])
        mod.partial_fit(train1[column_list], y)
    
    if idx % 10 == 0:
        # Metrics are computed on the training data itself, so they track
        # fit quality, not generalisation.
        results = {'accuracy': accuracy_score(y, mod.predict(train1[column_list])), 
               'logloss': log_loss(y, mod.predict_proba(train1[column_list])), 
               'feat_dim': mod.coef_.flatten().shape}
        print(idx, results)


0 {'accuracy': 0.58064516129032262, 'logloss': 3.5147263327881966, 'feat_dim': (3,)}
10 {'accuracy': 0.83870967741935487, 'logloss': 3.2458229029093255, 'feat_dim': (35,)}
20 {'accuracy': 0.69354838709677424, 'logloss': 8.3604802132901295, 'feat_dim': (52,)}
30 {'accuracy': 0.79032258064516125, 'logloss': 5.3817704444111945, 'feat_dim': (75,)}
40 {'accuracy': 0.85483870967741937, 'logloss': 3.3241350850662257, 'feat_dim': (108,)}
50 {'accuracy': 0.67741935483870963, 'logloss': 8.9828191446205228, 'feat_dim': (134,)}
60 {'accuracy': 0.72580645161290325, 'logloss': 8.4194215293829266, 'feat_dim': (159,)}
70 {'accuracy': 0.91935483870967738, 'logloss': 2.26335557436804, 'feat_dim': (185,)}
80 {'accuracy': 0.88709677419354838, 'logloss': 2.4297071780118875, 'feat_dim': (218,)}
90 {'accuracy': 0.90322580645161288, 'logloss': 3.0038900890478537, 'feat_dim': (240,)}
100 {'accuracy': 0.88709677419354838, 'logloss': 2.9995455091195455, 'feat_dim': (257,)}
110 {'accuracy': 0.95161290322580649, 'logloss': 1.3928408007112993, 'feat_dim': (274,)}
120 {'accuracy': 0.90322580645161288, 'logloss': 2.8492427458558387, 'feat_dim': (308,)}
130 {'accuracy': 0.70967741935483875, 'logloss': 8.3306526431434911, 'feat_dim': (335,)}
140 {'accuracy': 0.88709677419354838, 'logloss': 2.9968232009470093, 'feat_dim': (360,)}
150 {'accuracy': 0.74193548387096775, 'logloss': 7.643811439973736, 'feat_dim': (381,)}
160 {'accuracy': 0.85483870967741937, 'logloss': 4.8082803412510664, 'feat_dim': (406,)}
170 {'accuracy': 0.77419354838709675, 'logloss': 7.0372039111129627, 'feat_dim': (427,)}
180 {'accuracy': 0.75806451612903225, 'logloss': 7.7659312396251128, 'feat_dim': (449,)}
190 {'accuracy': 0.61290322580645162, 'logloss': 11.924715965893524, 'feat_dim': (470,)}
200 {'accuracy': 0.80645161290322576, 'logloss': 5.9516957386049656, 'feat_dim': (485,)}

In [45]:
# Features the selector kept as strongly relevant (presumably OSFS's
# strong-dependence set — verify against OSFSClassifier's definition).
len(mod.coef_info['strong_dep'])


Out[45]:
485

In [46]:
# Weakly relevant features retained — zero here, so every kept feature
# was classed as strongly relevant.
len(mod.coef_info['weak_dep'])


Out[46]:
0

In [47]:
# Total columns currently used by the model (matches strong_dep above).
len(mod.coef_info['cols'])


Out[47]:
485

In [48]:
# Columns the selector discarded; kept + excluded = 485 + 1515 = 2000,
# the full feature count of the colon dataset.
len(mod.coef_info['excluded_cols'])


Out[48]:
1515

In [49]:
# Sanity check: the coefficient matrix width matches the 485 retained columns.
mod.coef_.shape


Out[49]:
(1, 485)

In [ ]: