In [1]:
import pandas as pd
import numpy as np
from evoml.subspacing import FeatureStackerFEGT, FeatureStackerFEMPO, FeatureStackerFECV
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` lives in `sklearn.model_selection` now.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): `load_mlcomp` was removed from scikit-learn and is unused
# below — confirm it can be dropped on a modern install.
from sklearn.datasets import load_digits, load_breast_cancer, load_iris, load_mlcomp
In [70]:
# Load the scikit-learn breast-cancer Bunch (provides .data and .target).
data = load_breast_cancer()
In [71]:
# Wrap the raw arrays in DataFrames so downstream cells can use pandas
# indexing (column selection, .loc, stratified splitting, etc.).
features = pd.DataFrame(data.data)
output = pd.DataFrame(data.target)
In [6]:
# Quick sanity check: dataset dimensions and the distinct class labels.
print(features.shape)
output[0].unique()
Out[6]:
In [7]:
#data = pd.read_csv('datasets/GAMETES.csv',sep='\t')
# data = pd.read_csv('datasets/GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1.txt',sep='\t')
# headers_ = list(data.columns)
# features = data[headers_[0:-1]]
# output = data[headers_[-1]]
In [8]:
from sklearn.tree import DecisionTreeClassifier
In [25]:
def check_for_benchmarks():
    """Fit three baseline classifiers and report their test accuracies.

    Relies on the module-level ``X_train``, ``X_test``, ``y_train``,
    ``y_test`` produced by the experiment loop below (hidden notebook
    state — the split must exist before calling this).

    Returns
    -------
    tuple of float
        ``(dtc_acc, rfc_acc, gbc_acc)`` — held-out accuracies of a
        DecisionTree, RandomForest and GradientBoosting classifier.
    """
    n_estimators = 30
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

    # Decision-tree baseline. Score once and reuse the value (the original
    # called clf.score twice per model, doubling the evaluation work).
    clf = DecisionTreeClassifier(random_state=34092)
    clf.fit(X_train, y_train)
    dtc_acc = clf.score(X_test, y_test)
    print('Base DecisionTreeClassifier accuracy: {}'.format(dtc_acc))

    # Random-forest baseline.
    clf = RandomForestClassifier(random_state=34092, n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    rfc_acc = clf.score(X_test, y_test)
    print('Base RandomForestClassifier accuracy: {}'.format(rfc_acc))

    # Gradient-boosting baseline.
    clf = GradientBoostingClassifier(random_state=34092, n_estimators=n_estimators)
    clf.fit(X_train, y_train)
    gbc_acc = clf.score(X_test, y_test)
    print('Base GradientBoostingClassifier accuracy: {}'.format(gbc_acc))
    print('')
    return dtc_acc, rfc_acc, gbc_acc
In [18]:
# Run the baseline models against the current global train/test split.
check_for_benchmarks()
Out[18]:
In [34]:
all_acc = []
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
# max_features = int(pd.np.sqrt(X_train.shape[1]))

# Repeat the experiment over fresh stratified splits (currently a single run;
# raise the range bound to average over more splits).
for i in range(1):
    print(i)
    X_train, X_test, y_train, y_test = train_test_split(features, output, stratify=output,
                                                        train_size=0.75, test_size=0.25)
    # Baseline accuracies (DecisionTree / RandomForest / GradientBoosting).
    a, b, c = check_for_benchmarks()

    clf_dt = DecisionTreeClassifier(max_depth=None, random_state=34092)
    # clf_lr = LogisticRegressionCV()
    # clf = FeatureStackerFEMPO(base_estimator=clf_dt, model_type=1, N_individual=5,
    #                           ngen=10, verbose_flag=True, N_population=10, maxOrmin=1)
    clf = FeatureStackerFECV(base_estimator=clf_dt, model_type='classification', N_individual=30,
                             ngen=10, verbose_flag=True, N_population=30, maxOrMin=1,
                             featMax=None, featMin=1, folds_CV=7)
    # y_train is a one-column DataFrame; pass column 0 as the target Series.
    clf.fit(X_train, y_train[0])
    pred = clf.predict(X_test)
    # accuracy_score's documented order is (y_true, y_pred); accuracy itself is
    # symmetric, but keep the API contract so a later metric swap stays correct.
    d = accuracy_score(y_test, pred)
    all_acc.append([a, b, c, d])
In [51]:
# FECV ensemble test accuracy from the last loop iteration.
d
Out[51]:
In [29]:
# Re-evaluate the fitted FECV model on the held-out test split.
from sklearn.metrics import accuracy_score
pred = clf.predict(X_test)
accuracy_score(pred, y_test)
Out[29]:
In [34]:
# Number of feature columns used by each member of the best hall-of-fame ensemble.
shapes = [eg.X.shape[1] for eg in clf.hof[0]]
In [11]:
# One row per run: [DecisionTree, RandomForest, GradientBoosting, FECV] accuracies.
pd.DataFrame(all_acc)
Out[11]:
In [43]:
# Shallow copy of the best ensemble so it can be restored after the bagging
# experiment below mutates clf.hof.
original = clf.hof[0][:]
In [ ]:
In [57]:
from evoml.subspacing.util import EstimatorGene

# Build a bagged variant of the best ensemble: each member is rebuilt on a
# bootstrap resample (rows drawn with replacement) of its own feature subspace.
hof = []
bagged_hof = []
for eg in original:
    xcols = eg.X.columns
    # NOTE(review): `data` shadows the Bunch loaded at the top of the notebook.
    data = eg.X.copy()
    data['class'] = y_train
    data = data.sample(frac=1, replace=True)  # bootstrap resample
    X = data.loc[:, xcols]
    y = data['class']
    bagged_hof.append(EstimatorGene(X, y, [], [], clf_dt))
# BUG FIX: in the flattened source this append sat inside the loop, which would
# leave `hof` holding N references to the same (still growing) list; append the
# finished ensemble exactly once.
hof.append(bagged_hof)
In [58]:
# X.shape
In [59]:
# Replace the evolved ensemble with the bagged one before re-scoring.
clf.hof = hof[:]
In [60]:
# Accuracy of the bagged ensemble on the same held-out test split.
pred = clf.predict(X_test)
accuracy_score(pred, y_test)
Out[60]:
In [56]:
# Feature columns of the last ensemble member processed in the bagging loop.
xcols
Out[56]:
In [113]:
# Shape of the last bootstrap target vector.
y.shape
Out[113]:
In [122]:
from sklearn.svm import SVC, LinearSVC
In [110]:
from sklearn.metrics import roc_curve, recall_score
from sklearn.preprocessing import label_binarize
In [117]:
# `pd.np` was deprecated and removed in pandas 2.0; call numpy directly.
np.linspace(0.1, 10, 100)
Out[117]:
In [123]:
# Sweep the LinearSVC regularisation strength C and record, for each fit,
# the training-set true-positive rate (recall) and false-positive rate.
ts = []  # true-positive rates per C
fs = []  # false-positive rates per C
iris = load_breast_cancer()  # NOTE(review): name is misleading — breast-cancer data
X = iris.data
y = iris.target
# Binarize the output
# y = label_binarize(y, classes=[0, 1, 2])
# n_classes = y.shape[1]

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

for c in np.linspace(0.1, 2, 11):  # pd.np is deprecated; use numpy directly
    clf = LinearSVC(C=c)
    # NOTE(review): these assignments shadow the module-level `output`,
    # `features` and `clf` used by the earlier experiment cells.
    output = y
    features = X
    y_true = output
    # Evaluated on the training data itself — no held-out split here.
    y_pred = clf.fit(features, output).predict(features)
    tpr = recall_score(y_true, y_pred)
    # BUG FIX: `y_true` is a numpy array, so `y_true[0]` is a single scalar that
    # pandas would broadcast down the whole column, making every FPR wrong;
    # use the full label vector.
    frame = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
    fpr = frame.loc[(frame.y_pred == 1) & (frame.y_true == 0)].shape[0] / float((y_true == 0).sum())
    ts.append(tpr)
    fs.append(fpr)
In [ ]:
In [99]:
%matplotlib inline
In [102]:
import matplotlib.pyplot as plt
In [116]:
# TPR (x-axis) vs FPR (y-axis) across the C sweep.
plt.scatter(ts, fs)
Out[116]:
In [104]:
# Recall / true-positive rates recorded per C value.
ts
Out[104]:
In [105]:
# False-positive rates recorded per C value.
fs
Out[105]:
In [ ]: