In [1]:
from tpot import TPOTClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

Load the IRIS data set and explore its contents.


In [2]:
iris = load_iris()
iris.data[0:5], iris.target


Out[2]:
(array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]))

Split the data set in train and test.


In [3]:
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    train_size=0.75, test_size=0.25)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


Out[3]:
((112, 4), (38, 4), (112,), (38,))

In [4]:
tpot = TPOTClassifier(verbosity=2, max_time_mins=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))


Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.
Optimization Progress: 100%|██████████| 200/200 [00:18<00:00,  9.33pipeline/s]
Generation 1 - Current best internal CV score: 0.9825757575757577
Optimization Progress: 100%|██████████| 300/300 [00:28<00:00, 12.80pipeline/s]
Generation 2 - Current best internal CV score: 0.9825757575757577
Optimization Progress: 100%|██████████| 400/400 [00:39<00:00, 11.16pipeline/s]
Generation 3 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 500/500 [00:45<00:00, 17.87pipeline/s]
Generation 4 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 600/600 [00:52<00:00, 16.63pipeline/s]
Generation 5 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 700/700 [00:58<00:00, 17.42pipeline/s]
Generation 6 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 800/800 [01:06<00:00, 14.45pipeline/s]
Generation 7 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 900/900 [01:11<00:00, 12.45pipeline/s]
Generation 8 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1000/1000 [01:17<00:00,  8.87pipeline/s]
Generation 9 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1100/1100 [01:23<00:00, 11.12pipeline/s]
Generation 10 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1200/1200 [01:29<00:00,  9.37pipeline/s]
Generation 11 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1300/1300 [01:34<00:00,  9.55pipeline/s]
Generation 12 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1400/1400 [01:40<00:00, 16.09pipeline/s]
Generation 13 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1500/1500 [01:46<00:00,  9.96pipeline/s]
Generation 14 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1600/1600 [01:52<00:00, 10.02pipeline/s]
Generation 15 - Current best internal CV score: 0.990909090909091
Optimization Progress: 100%|██████████| 1700/1700 [01:59<00:00,  7.28pipeline/s]
Generation 16 - Current best internal CV score: 0.990909090909091
                                                                                
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: DecisionTreeClassifier(RBFSampler(input_matrix, RBFSampler__gamma=0.85), DecisionTreeClassifier__criterion=entropy, DecisionTreeClassifier__max_depth=3, DecisionTreeClassifier__min_samples_leaf=4, DecisionTreeClassifier__min_samples_split=9)
0.973684210526


In [5]:
tpot.export('tpot_iris_pipeline.py')

In [ ]:
# %load tpot_iris_pipeline.py
import numpy as np

from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    RBFSampler(gamma=0.8500000000000001),
    DecisionTreeClassifier(criterion="entropy", max_depth=3, min_samples_leaf=4, min_samples_split=9)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

In [ ]: