In [44]:
import glob
import numpy as np
import pandas as pd
from grafting_classifier import GraftingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, accuracy_score
#import dask.dataframe as dd
#import dask.array as da
In [32]:
# Discover the per-dataset training CSVs.
# sorted() makes the glob order deterministic across platforms/filesystems,
# so an index like class_train[3] below always refers to the same dataset
# (glob.glob itself returns files in arbitrary OS-dependent order).
class_train = sorted(glob.glob("microarray/*_train.csv"))
print(class_train)
In [33]:
def train_label(fname):
    """Load the label file that accompanies a feature CSV.

    Parameters
    ----------
    fname : str
        Path to a ``*_train.csv`` feature file. The matching labels are
        expected at the same path with the ``.csv`` suffix replaced by
        ``.labels``.

    Returns
    -------
    pandas.DataFrame
        The parsed label file.
    """
    label_path = fname.replace(".csv", ".labels")
    return pd.read_csv(label_path)
In [34]:
# Incremental feature grafting: feed the classifier chunks of roughly 10
# columns at a time, growing the active feature set on every round.
fpath = class_train[3]
train1 = pd.read_csv(fpath).fillna(0)
y = np.array(train_label(fpath)).flatten()

# Split the column indices into ~10-wide chunks.
n_chunks = int(train1.shape[1] / 10.0) + 1
train1_cols = np.array_split(range(train1.shape[1]), n_chunks)

all_cols = []
mod = GraftingClassifier(max_iter=5)
for idx, collist in enumerate(train1_cols):
    # Accumulate this chunk's indices, then train on every column seen so far.
    all_cols.extend(list(collist))
    column_list = list(np.array(list(train1.columns))[all_cols])
    if idx == 0:
        # First chunk: initial fit on the starting feature set.
        mod.fit(train1[column_list], y)
    else:
        # Later chunks: warm-start update on the enlarged feature set.
        mod.partial_fit(train1[column_list], y)
In [35]:
# Sanity check: make sure a label column named "target" is not hiding in
# the feature matrix (which would leak the answer into training).
"target" in list(train1.columns)
Out[35]:
In [36]:
# Same leak check for a column literally named "y".
"y" in list(train1.columns)
Out[36]:
In [37]:
# (n_samples, n_features) of the loaded feature matrix.
train1.shape
Out[37]:
In [38]:
# Label vector length -- should match train1.shape[0].
y.shape
Out[38]:
In [39]:
# In-sample (training-set) accuracy of the grafted model -- optimistic,
# since there is no held-out split here.
accuracy_score(y, mod.predict(train1))
Out[39]:
In [40]:
# Fix: log_loss expects probability estimates, not hard class predictions --
# with predict() output every misclassified row contributes the clipped
# maximum penalty and the metric is meaningless.
# NOTE(review): assumes GraftingClassifier exposes sklearn-style
# predict_proba -- confirm against grafting_classifier.py.
log_loss(y, mod.predict_proba(train1))
Out[40]:
In [41]:
# Feature-matrix shape again (repeat of the earlier cell), shown here to
# compare against the coefficient count in the next cell.
train1.shape
Out[41]:
In [42]:
# Number of learned coefficients -- i.e. how many features the grafted
# model actually carries, versus train1.shape[1] above.
mod.coef_.flatten().shape
Out[42]:
In [45]:
# Baseline: plain logistic-regression SGD trained on ALL features at once,
# for comparison against the incrementally grafted model above.
# NOTE: loss='log' was renamed to 'log_loss' in scikit-learn 1.1; keep 'log'
# only if pinned to an older sklearn.
mod_base = SGDClassifier(loss='log', max_iter=5)
mod_base.fit(train1, y)
print("Accuracy: {}".format(accuracy_score(y, mod_base.predict(train1))))
# Fix: log_loss needs probability estimates (predict_proba), not hard 0/1
# predictions; otherwise every wrong row hits the clipped maximum penalty.
print("Logloss: {}".format(log_loss(y, mod_base.predict_proba(train1))))