In [ ]:
from protoml import Pipeline
from protoml.nodes.data import *
from protoml.nodes.sklearn import *
from protoml.nodes import EstimatorNode,MetricNode,MachineEvaluatorNode
from protoml.viz import *
from protoml.feature import *
from protoml.extras import visualize_pipeline

from sklearn import svm
from sklearn import neighbors
from sklearn import cluster
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score

In [ ]:
# Pipeline object that every node and edge in this notebook is registered on.
P = Pipeline()

# Load scikit-learn's bundled digits dataset.
digits = datasets.load_digits()

# Number of rows reserved for validation: 25% of the samples, truncated to an int.
sample_count = digits.data.shape[0]
test = int(sample_count * .25)

# Show the shape of the data matrix as the cell output.
digits.data.shape

In [ ]:
# Transform spec for a PCA down to 32 components.
# NOTE(review): the tuple layout (name, "", transformer, flag) is defined by
# protoml; confirm what the empty string and the trailing True mean.
pca_ft = ("pca", "", PCA(n_components=32), True)

# Wrap the raw digit data, then register the transforms in order:
# ft_standardscaler() first (presumably z-score standardisation), PCA second.
ft = Feature(digits.data)
feature_transforms = [ft_standardscaler(), pca_ft]
ft.add_transforms(feature_transforms)

# Fit the feature (presumably fits the transforms registered above).
ft.fit()

# Show the feature object as the cell output.
ft

In [ ]:
# Data nodes: the last `test` rows are held out for validation, everything
# before them is used for training. Both nodes read the "pca.*" selection of `ft`.

# Training split: features and labels up to (but excluding) the last `test` rows.
training_node = LabeledTrainingDataNode(ft["pca.*"][:-test],
                                        digits.target[:-test])

# Validation split: features and labels of the last `test` rows.
validation_node = LabeledTestDataNode(ft["pca.*"][-test:],
                                      digits.target[-test:])

P.add_nodes(
    ("Training Data", training_node),
    ("Validation Data", validation_node),
)

In [ ]:
# Support vector classifiers from scikit-learn, one node per value of C.
# C = 10 ** exp for exp in -1..2, i.e. 0.1, 1, 10 and 100.
for exp in range(-1, 3):
    c_value = 10 ** exp
    # One-vs-rest wrapper for multi-class classification. The C value is
    # embedded in the node name so the four nodes get distinct names.
    svm_node = SklearnOneVsRestNode(svm.SVC, C=c_value)
    P.add_node("Machine SVM %s" % c_value, svm_node)

# K-nearest-neighbours classifier, also wrapped one-vs-rest.
# NOTE(review): `warn_on_equidistant` is presumably forwarded to
# KNeighborsClassifier; newer scikit-learn releases may not accept it, so
# confirm against the scikit-learn version this notebook targets.
knn_node = SklearnOneVsRestNode(
    neighbors.KNeighborsClassifier,
    warn_on_equidistant=False,
)
P.add_node("Machine KNN", knn_node)

# K-means estimator configured with two clusters.
kmeans_node = EstimatorNode(cluster.KMeans, n_clusters=2)
P.add_node("Machine K-means", kmeans_node)

# Mass evaluator node.
evaluator_node = MachineEvaluatorNode()
P.add_node("Evaluator", evaluator_node)

In [ ]:
# Cross-validation node scored with accuracy_score.
# NOTE(review): the notebook describes this as 3-fold, but no fold count is
# passed here; presumably that is the node's default, so confirm.
cv_node = SklearnCrossValidationNode(
    accuracy_score,
    score_weight=-1,
    verbose=True,
    top_k=4,
)
P.add_node("Cross Validation", cv_node)

# Visualisation node for inspecting the cross-validation scores.
cv_viz_node = CrossValidationVisualizationNode()
P.add_node("Visualize CV", cv_viz_node)

# Final metric node, again based on accuracy_score.
final_metrics_node = MetricNode(
    [accuracy_score],
    error_weights=[-1],
    verbose=True,
    top_k=1,
)
P.add_node("Metrics", final_metrics_node)

In [ ]:
# Link the nodes together.
# Clear any existing edges first, then add the new ones with `<<`.
P.reset_edges()

# Each `<<` receives a list of node-name patterns; the short names used here
# ("Training", "Cross", "Eval", ...) are prefixes of the names the nodes were
# added under, so they are presumably matched as patterns, not exact names.
P << ["Training", "Cross"]

# Raw string: "\." is not a valid escape sequence in a normal string literal and
# triggers a warning on modern Python. The runtime value is unchanged.
# NOTE(review): read as a regex this is "Machine" followed by zero or more
# literal dots; "Machine.*" may have been intended. Value left as is.
P << [r"Machine\.*", "Cross", "Visualize CV"]

# A nested list groups several source patterns in a single position.
P << [["Cross", "Training", "Validation"], "Eval", "Metrics"]
P << ["Validation", "Metrics"]

# Render the resulting pipeline graph as the cell output.
visualize_pipeline(P)

In [ ]:
# Execute the pipeline with the timer enabled and verbose output disabled.
# The call is the last expression so its return value is shown as the cell output.
P.run(
    timer=True,
    verbose=False,
)

In [ ]: