In [ ]:
from protoml import Pipeline
from protoml.nodes.data import *
from protoml.nodes.sklearn import *
from protoml.nodes import EstimatorNode,MetricNode,MachineEvaluatorNode
from protoml.viz import *
from protoml.feature import *
from protoml.extras import visualize_pipeline
from sklearn import svm
from sklearn import neighbors
from sklearn import cluster
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, accuracy_score
In [ ]:
# Load scikit-learn's bundled digits dataset.
digits = datasets.load_digits()
# Number of samples held out for validation: a quarter of the rows, truncated
# to an int so it can be used as a slice bound further down.
test = int(0.25 * len(digits.data))
# The protoml pipeline that every node below is registered on.
P = Pipeline()
# Notebook output: shape of the raw data array.
digits.data.shape
In [ ]:
# Transform spec for a PCA projection down to 32 components. The name "pca" is
# what the ft["pca.*"] selections below refer to; the "" and True fields follow
# protoml's transform-tuple convention (TODO confirm their exact meaning).
pca_ft = ("pca", "", PCA(n_components=32), True)

# Standardize the raw digit features (z-score, presumably via scikit-learn's
# StandardScaler), then apply the PCA transform defined above.
ft = Feature(digits.data)
ft.add_transforms([ft_standardscaler(), pca_ft])
ft.fit()

# Notebook output: display the fitted Feature.
ft
In [ ]:
# Register the two data nodes. The PCA features and the labels are split at the
# same position: everything except the last `test` samples is used for
# training, and the last `test` samples are held out for validation.
P.add_nodes(
    (
        "Training Data",
        LabeledTrainingDataNode(ft["pca.*"][:-test], digits.target[:-test]),
    ),
    (
        "Validation Data",
        LabeledTestDataNode(ft["pca.*"][-test:], digits.target[-test:]),
    ),
)
In [ ]:
# One support-vector classifier node per value of C. range(-1, 3) yields the
# exponents -1..2, so C takes the values 0.1, 1, 10 and 100.
# NOTE(review): use range(-2, 4) instead if a 0.01-1000 sweep was intended.
for exp in range(-1, 3):
    c_value = 10 ** exp
    # Wrap SVC in a one-vs-rest node for multi-class classification; the C
    # value is embedded in the node name so each node gets a distinct name.
    P.add_node(
        "Machine SVM %s" % c_value,
        SklearnOneVsRestNode(svm.SVC, C=c_value)
    )

# scikit-learn's K-nearest-neighbours and K-means estimators.
P.add_node(
    "Machine KNN",
    SklearnOneVsRestNode(neighbors.KNeighborsClassifier, warn_on_equidistant=False)
)
P.add_node("Machine K-means", EstimatorNode(cluster.KMeans, n_clusters=2))

# Evaluator node; its inputs are wired up when the edges are linked.
P.add_node("Evaluator", MachineEvaluatorNode())
In [ ]:
# Cross-validation node scored with accuracy_score (top_k=4 presumably keeps
# the four best machines - confirm).
# NOTE(review): no fold count is passed here, so whether this is 3-fold depends
# on the node's default - confirm.
P.add_node(
    "Cross Validation",
    SklearnCrossValidationNode(
        accuracy_score,
        score_weight=-1,
        verbose=True,
        top_k=4
    )
)

# Visualization node for inspecting the cross-validation scores.
P.add_node("Visualize CV", CrossValidationVisualizationNode())

# Final metric node: accuracy only, with a single error weight of -1.
P.add_node(
    "Metrics",
    MetricNode(
        [accuracy_score],
        error_weights=[-1],
        verbose=True,
        top_k=1
    )
)
In [ ]:
# Link the nodes. Each list is passed to the pipeline's << operator; entries
# such as "Cross" or "Eval" are shortened node names, so protoml presumably
# resolves them as patterns against the registered names (confirm).
P.reset_edges()
P << ["Training", "Cross"]
# Raw string: "\." is not a valid Python escape, and relying on it being passed
# through unchanged triggers a DeprecationWarning (3.6+) / SyntaxWarning
# (3.12+). The runtime value of the pattern is unchanged.
P << [r"Machine\.*", "Cross", "Visualize CV"]
P << [["Cross", "Training", "Validation"], "Eval", "Metrics"]
P << ["Validation", "Metrics"]
# Notebook output: draw the resulting pipeline graph.
visualize_pipeline(P)
In [ ]:
# EXECUTE: run the assembled pipeline. timer/verbose are protoml run options
# (presumably timing output on and per-node logging off - confirm).
P.run(timer=True, verbose=False)
In [ ]: