In [1]:
using RDatasets
using DecisionTree
In [2]:
iris = dataset("datasets", "iris")
first(iris, 6)
Out[2]:
In [3]:
# extract the feature matrix and label vector as plain arrays
features = Matrix(iris[:, 1:4])
labels = string.(iris[:, 5])
Out[3]:
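As a quick sanity check (an added sketch, not part of the original cells), the converted arrays can be inspected with base Julia; iris has 150 rows, four numeric features, and three species:

size(features)    # (150, 4)
unique(labels)    # ["setosa", "versicolor", "virginica"]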
In [4]:
# train full-tree classifier
model = build_tree(labels, features)
Out[4]:
In [5]:
# prune tree: merge leaves having >= 90% combined purity (default: 100%)
model = prune_tree(model, 0.9)
Out[5]:
In [6]:
# pretty print of the tree, to a depth of 5 nodes (optional)
print_tree(model, 5)
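As an added check (a small sketch; depth is a utility exported by DecisionTree), the pruned tree's depth can also be queried directly:

# depth of the pruned tree
depth(model)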
In [7]:
# apply learned model
apply_tree(model, [5.9,3.0,5.1,1.9])
Out[7]:
In [8]:
# get the probability of each label
apply_tree_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"])
Out[8]:
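To gauge the pruned tree on the full training set (an added sketch; confusion_matrix is a DecisionTree utility, and this measures training fit, not generalization), the model can be applied to every row and compared with the true labels:

# predict every row and summarize agreement with the true labels
preds = apply_tree(model, features)
confusion_matrix(labels, preds)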
In [9]:
# run n-fold cross validation for the pruned tree
# arguments: n_folds = 3, pruning_purity = 0.9
accuracy = nfoldCV_tree(labels, features, 3, 0.9)
Out[9]:
In [10]:
# train random forest classifier
# arguments: n_subfeatures = 2, n_trees = 10, partial_sampling = 0.5 (optional)
model = build_forest(labels, features, 2, 10, 0.5)
Out[10]:
In [11]:
# apply learned model
apply_forest(model, [5.9,3.0,5.1,1.9])
Out[11]:
In [12]:
# get the probability of each label
apply_forest_proba(model, [5.9,3.0,5.1,1.9], ["setosa", "versicolor", "virginica"])
Out[12]:
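The same kind of training-set check works for the forest (an added sketch using only base Julia and the Statistics standard library):

using Statistics
# fraction of training rows the forest labels correctly
mean(apply_forest(model, features) .== labels)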
In [13]:
# run n-fold cross validation for forests
# arguments: n_folds = 3, n_subfeatures = 2, n_trees = 10, partial_sampling = 0.5
accuracy = nfoldCV_forest(labels, features, 3, 2, 10, 0.5)
Out[13]:
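nfoldCV_forest, like nfoldCV_tree, returns one accuracy per fold, so a single summary figure can be taken as the mean (an added sketch using the Statistics standard library):

using Statistics
mean(accuracy)    # average accuracy over the 3 folds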