scikit-learn implements only pre-pruning of decision trees, not post-pruning (a sketch of additional pre-pruning options follows the max_depth example below).
In [1]:
import sklearn
import mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image, display
from sklearn.tree import DecisionTreeClassifier
In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer.keys())
In [3]:
print(cancer['target_names'])
In [4]:
print(cancer['feature_names'])
In [5]:
type(cancer)
Out[5]:
In [6]:
cancer.data.shape
Out[6]:
In [ ]:
# Add target_df to cancer_df, and export to CSV file
#cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
#target_df = pd.DataFrame(cancer.target)
#cancer_df["target"] = target_df[0]
#cancer_df.to_csv("cancer_data.csv", sep=',', encoding='utf-8')
#cancer_df.tail()
In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
In [8]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
Out[8]:
In [9]:
print("Accuracy on the training: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test set: {:.3f}".format(tree.score(X_test, y_test)))
In [10]:
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on the training: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test set: {:.3f}".format(tree.score(X_test, y_test)))
In [11]:
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"],
                feature_names=cancer.feature_names, impurity=False, filled=True)
In [16]:
from IPython.display import display
import graphviz
with open('tree.dot') as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))
Each node in the plot shows samples (the number of samples in that node) and value (the number of samples per class). Feature importances rate how important each feature is for the decision a tree makes, on a scale from 0 = "not used at all" to 1 = "perfectly predicts the target".
In [17]:
print("Feature importances:\n{}".format(tree.feature_importances_))
In [23]:
def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
plot_feature_importances_cancer(tree)
The n_estimators parameter sets the number of trees to build; each tree is trained on a bootstrap sample of n_samples data points. The max_features parameter controls how many features each split may consider: with max_features set to n_features, each split can look at all features in the dataset, so there is no randomness in the feature selection. A high max_features means the trees in a random forest will be quite similar; a low max_features means the trees will be quite different.
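As a rough sketch of this effect (the forest size and feature counts below are illustrative, not tuned), one can fit two small forests at the extremes of max_features and compare them:
In [ ]:
# Sketch: compare the extremes of max_features; settings are illustrative only.
from sklearn.ensemble import RandomForestClassifier
for mf in [1, cancer.data.shape[1]]:   # one feature per split vs. all 30 features
    rf = RandomForestClassifier(n_estimators=25, max_features=mf, random_state=0)
    rf.fit(X_train, y_train)
    print("max_features={:2d}  accuracy on test set: {:.3f}".format(mf, rf.score(X_test, y_test)))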
In [26]:
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
In [27]:
plot_feature_importances_cancer(forest)
The max_features parameter determines how random each tree is; a smaller max_features reduces overfitting. In gradient boosting, the learning_rate parameter controls how strongly each tree tries to correct the mistakes of the previous trees, and a higher n_estimators adds more trees and increases model complexity.
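As a rough sketch (the values below are illustrative, not tuned), the effect of learning_rate can be seen by fitting the same model with different rates and comparing training and test accuracy:
In [ ]:
# Sketch: effect of learning_rate on gradient boosting; values are illustrative only.
from sklearn.ensemble import GradientBoostingClassifier
for lr in [1.0, 0.1, 0.01]:
    gb = GradientBoostingClassifier(random_state=0, learning_rate=lr)
    gb.fit(X_train, y_train)
    print("learning_rate={:<4}  train: {:.3f}  test: {:.3f}".format(
        lr, gb.score(X_train, y_train), gb.score(X_test, y_test)))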
In [30]:
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0, n_estimators=100, max_depth=3, learning_rate=0.01)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
In [31]:
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1)
gbrt.fit(X_train, y_train)
plot_feature_importances_cancer(gbrt)
In [ ]: