The motive of this notebook is to give a brief overview of how to use the evolutionary-sampling-powered ensemble models that are part of the EvoML research project. The priority is to showcase the flexible API of the new estimators, which encourages research and tinkering; the notebook will be made more verbose if time permits.
In [1]:
import pandas as pd
df = pd.read_csv('datasets/ozone.csv')
X, y = df.iloc[:,:-1], df['output']
In [2]:
from evoml.subsampling import BasicSegmenter_FEMPO, BasicSegmenter_FEGT, BasicSegmenter_FEMPT
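The three segmenters presumably differ only in how candidate subsamples are scored during evolution (an assumption based on the suffixes); each documents itself, and its docstring can be inspected the same way FEGT's is below:
In [ ]:
print(BasicSegmenter_FEMPO.__doc__)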
In [3]:
df.head(2)
Out[3]:
In [6]:
print(BasicSegmenter_FEGT.__doc__)
In [7]:
from sklearn.tree import DecisionTreeRegressor
clf_dt = DecisionTreeRegressor(max_depth=3)
clf = BasicSegmenter_FEGT(base_estimator=clf_dt, statistics=True)
In [8]:
clf.fit(X, y)
Out[8]:
In [9]:
clf.score(X, y)
Out[9]:
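Scoring on the training data is optimistic; a held-out split gives a fairer estimate. A minimal sketch using only the fit/score API shown above:
In [ ]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.75, test_size=0.25)
clf_ho = BasicSegmenter_FEGT(base_estimator=DecisionTreeRegressor(max_depth=3))
clf_ho.fit(X_tr, y_tr)
clf_ho.score(X_te, y_te)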
In [10]:
EGs = clf.segments_
In [11]:
len(EGs)
Out[11]:
In [12]:
sampled_datasets = [eg.get_data() for eg in EGs]
In [13]:
[sd.shape for sd in sampled_datasets]
Out[13]:
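The segments are subsamples of the original rows and may overlap; a quick sanity check with plain pandas:
In [ ]:
# Total rows drawn across all segments vs. the original frame
# (segments may overlap, so the sum can exceed len(df))
sum(sd.shape[0] for sd in sampled_datasets), df.shape[0]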
In [14]:
from evoml.subspacing import FeatureStackerFEGT, FeatureStackerFEMPO
In [15]:
print(FeatureStackerFEGT.__doc__)
In [16]:
clf = FeatureStackerFEGT(ngen=30)
In [17]:
clf.fit(X, y)
Out[17]:
In [18]:
clf.score(X, y)
Out[18]:
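FeatureStackerFEMPO was imported alongside FEGT; assuming it shares the same constructor signature (an assumption its docstring can confirm), swapping variants is a one-liner:
In [ ]:
clf_fempo = FeatureStackerFEMPO(ngen=30)
clf_fempo.fit(X, y)
clf_fempo.score(X, y)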
In [19]:
## Get the Hall of Fame individual
hof = clf.segment[0]
In [20]:
sampled_datasets = [eg.get_data() for eg in hof]
In [21]:
[data.columns.tolist() for data in sampled_datasets]
Out[21]:
In [22]:
## Original X columns
X.columns
Out[22]:
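To see which original features the evolution favoured, count how often each column turns up across the subspaces (plain Python, no extra evoml API assumed):
In [ ]:
from collections import Counter
# Note: if get_data() includes the output column, it will appear in
# every subspace and top this count
Counter(col for data in sampled_datasets for col in data.columns)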
In [ ]:
# The exploration of the dataset by benchmark algorithms
# (assumes the X_train/X_test/y_train/y_test split created in the
# GAMETES section below)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
clf = DecisionTreeClassifier(random_state=34092)
clf.fit(X_train, y_train)
pred_DTC = clf.predict(X_test)
print('Base DecisionTreeClassifier accuracy: {}'.format(clf.score(X_test, y_test)))
clf = RandomForestClassifier(random_state=34092)
clf.fit(X_train, y_train)
pred_RFC = clf.predict(X_test)
print('Base RandomForestClassifier accuracy: {}'.format(clf.score(X_test, y_test)))
clf = GradientBoostingClassifier(random_state=34092)
clf.fit(X_train, y_train)
pred_GBC = clf.predict(X_test)
print('Base GradientBoostingClassifier accuracy: {}'.format(clf.score(X_test, y_test)))
print('')
In [1]:
import pandas as pd
df = pd.read_csv('datasets/ozone.csv')
X, y = df.iloc[:,:-1], df['output']
from evoml.subspacing import FeatureStackerFEGT, FeatureStackerFEMPO, FeatureStackerFECV
#print(FeatureStackerFECV.__doc__)
clf = FeatureStackerFECV(ngen=3)
clf.fit(X, y)
clf.predict(X)
Out[1]:
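The ozone 'output' is a regression target here, so a quick in-sample error gives a rough quality check; a minimal sketch with sklearn.metrics:
In [ ]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y, clf.predict(X))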
In [8]:
import numpy as np
import pandas as pd
from evoml.subspacing import FeatureStackerFEGT, FeatureStackerFEMPO, FeatureStackerFECV
from sklearn.model_selection import train_test_split
data = pd.read_csv('datasets/GAMETES.csv',sep='\t')
headers_ = list(data.columns)
features = data[headers_[0:-1]]
output = data[headers_[-1]]
X_train, X_test, y_train, y_test = train_test_split(features, output, stratify=output,
                                                    train_size=0.75, test_size=0.25)
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier(max_features=None)
clf = FeatureStackerFECV(ngen=20, model_type='classification', base_estimator=clf_dt, folds_CV=10)
clf.fit(X_train, y_train)
Out[8]:
In [9]:
from sklearn.metrics import accuracy_score
pred = clf.predict(X_test)
accuracy_score(pred, y_test)
Out[9]:
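Assuming FeatureStackerFECV exposes the same score method demonstrated for FEGT above, it should agree with accuracy_score on this split:
In [ ]:
clf.score(X_test, y_test)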
In [6]:
from sklearn.model_selection import StratifiedKFold
StratifiedKFold?
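The folds_CV=10 argument above presumably drives an internal cross-validation loop (an assumption); plain sklearn StratifiedKFold illustrates the splitting it would rest on:
In [ ]:
skf = StratifiedKFold(n_splits=10)
# Sizes of the ten stratified validation folds over the training split
[len(test_idx) for _, test_idx in skf.split(X_train, y_train)]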