In [1]:
import glob

import numpy as np
import pandas as pd

from grafting_classifier import GraftingClassifier

#import dask.dataframe as dd
#import dask.array as da

In [2]:
class_train = glob.glob("data/*.csv")
print(class_train)


['data\\arcene_train.csv', 'data\\BostonHousing_train.csv', 'data\\BreastCancer_train.csv', 'data\\colon_train.csv', 'data\\dexter_train.csv', 'data\\dorothea_train.csv', 'data\\gisette_train.csv', 'data\\leukemia_train.csv', 'data\\lung_cancer_train.csv', 'data\\madelon_train.csv', 'data\\PimaIndiansDiabetes_train.csv', 'data\\prostate_train.csv']

In [3]:
def train_label(fname):
    targetname = fname.replace(".csv", ".labels")
    return pd.read_csv(targetname)

In [6]:
train1 = pd.read_csv(class_train[8]).fillna(0)
y = train_label(class_train[8])
train1_cols = np.array_split(range(train1.shape[1]), int(train1.shape[1]/10.0) + 1)
all_cols = []

mod = GraftingClassifier()

for idx, collist in enumerate(train1_cols):
    if idx == 0:
        column_list = list(np.array(list(train1.columns))[collist])
        mod.fit(train1[column_list], y)
        all_cols.extend(list(collist))
    else:
        all_cols.extend(list(collist))
        column_list = list(np.array(list(train1.columns))[all_cols])
        mod.partial_fit(train1[column_list], y)


C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\linear_model\stochastic_gradient.py:84: FutureWarning: max_iter and tol parameters have been added in <class 'grafting_classifier.GraftingClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
  "and default tol will be 1e-3." % type(self), FutureWarning)
C:\Users\chapm\Anaconda3\envs\skrecipe\lib\site-packages\sklearn\utils\validation.py:547: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [7]:
mod.coef_.shape


Out[7]:
(1, 160)

In [8]:
train1.shape


Out[8]:
(181, 12533)