by Alejandro Correa Bahnsen and Jesus Solano
version 1.5, February 2019
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks go to Kevin Markham.
Why are we learning about ensembling?
In [1]:
import pandas as pd
import numpy as np
# read in the data
url = 'https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/hitters.csv'
hitters = pd.read_csv(url)
# remove rows with missing values
hitters.dropna(inplace=True)
hitters.head()
Out[1]:
In [2]:
# encode categorical variables as integers
hitters['League'] = pd.factorize(hitters.League)[0]
hitters['Division'] = pd.factorize(hitters.Division)[0]
hitters['NewLeague'] = pd.factorize(hitters.NewLeague)[0]
hitters.head()
Out[2]:
In [3]:
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
In [4]:
# scatter plot of Years versus Hits colored by Salary
hitters.plot(kind='scatter', x='Years', y='Hits', c='Salary', colormap='jet', xlim=(0, 25), ylim=(0, 250))
Out[4]:
In [5]:
# define features: exclude career statistics (which start with "C") and the response (Salary)
feature_cols = hitters.columns[~hitters.columns.str.startswith('C')].drop('Salary')
feature_cols
Out[5]:
In [6]:
hitters.Salary.describe()
Out[6]:
In [7]:
# define X and y
X = hitters[feature_cols]
y = (hitters.Salary > 425).astype(int)
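As a quick check (a small addition reusing the y just defined, not part of the original notebook), we can inspect the class balance; the 425 threshold sits near the median salary shown by describe() above, so the two classes should be roughly balanced.
# class balance of the binary target (illustrative check)
y.value_counts(normalize=True)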
In [8]:
X.columns
Out[8]:
In [9]:
max_depth = None
num_pct = 10
max_features = None
min_gain = 0.001
For feature 1, calculate the possible splitting points
In [10]:
j = 1
print(X.columns[j])
In [11]:
# Split the variable at num_pct percentile points
splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / num_pct).tolist())
In [12]:
# Keep only the unique values, to handle binary features and features with few distinct values
splits = np.unique(splits)
In [13]:
splits
Out[13]:
Split the data using split 5
In [14]:
k = 5
In [15]:
filter_l = X.iloc[:, j] < splits[k]
y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]
Calculate the Gini impurity of each node
In [16]:
def gini(y):
    # Gini impurity of a binary target: 1 - p^2 - (1 - p)^2, where p = y.mean()
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)
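Two quick sanity checks for gini (a small illustrative addition): a pure node has impurity 0 and a perfectly mixed binary node has impurity 0.5.
# sanity checks for gini(): a pure node and a 50/50 node (illustrative only)
print(gini(pd.Series([1, 1, 1, 1])))  # expected 0.0
print(gini(pd.Series([0, 1, 0, 1])))  # expected 0.5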
In [17]:
gini_l = gini(y_l)
gini_l
Out[17]:
In [18]:
gini_r = gini(y_r)
gini_r
Out[18]:
The Gini gain of a split is the parent node's Gini impurity minus the impurity of each child node weighted by the fraction of the parent's samples that fall in that child.
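In symbols (matching the function in the next cell), with $n_l$ and $n_r$ the number of samples in the left and right children:
$$\text{gain} = Gini(y) - \frac{n_l}{n_l + n_r}\,Gini(y_l) - \frac{n_r}{n_l + n_r}\,Gini(y_r), \qquad Gini(y) = 1 - p^2 - (1 - p)^2,$$
where $p$ is the fraction of positive labels in the node.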
In [19]:
def gini_impurity(X_col, y, split):
    "Calculate the gain of a split on one feature: parent impurity minus the weighted impurity of the children"
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]

    n_l = y_l.shape[0]
    n_r = y_r.shape[0]

    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)

    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)

    return gini_impurity_
In [20]:
gini_impurity(X.iloc[:, j], y, splits[k])
Out[20]:
In [ ]:
In [21]:
def best_split(X, y, num_pct=10):
    features = range(X.shape[1])

    best_split = [0, 0, 0]  # j, split, gain

    # For all features
    for j in features:
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct + 1)).tolist())
        splits = np.unique(splits)[1:]  # drop the lowest percentile, which would leave the left node empty

        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
            if gain > best_split[2]:
                best_split = [j, split, gain]

    return best_split
In [22]:
j, split, gain = best_split(X, y, 5)
j, split, gain
Out[22]:
In [23]:
filter_l = X.iloc[:, j] < split
y_l = y.loc[filter_l]
y_r = y.loc[~filter_l]
In [24]:
y.shape[0], y_l.shape[0], y_r.shape[0]
Out[24]:
In [25]:
y.mean(), y_l.mean(), y_r.mean()
Out[25]:
In [26]:
def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    # If there is only one observation, return a leaf
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree

    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)

    # Save the node and estimate its prediction
    y_pred = int(y.mean() >= 0.5)
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction

    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)

    # Check the stopping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree

    # No stopping criterion was met, so continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]

    tree['split'] = [j, split]

    # Grow each side of the split recursively
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)

    return tree
In [27]:
tree_grow(X, y, level=0, min_gain=0.001, max_depth=1, num_pct=10)
Out[27]:
In [28]:
tree = tree_grow(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)
In [29]:
tree
Out[29]:
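The nested dictionary is hard to read, so the small helper below (a convenience addition, not part of the original notebook; it assumes the tree structure returned by tree_grow and uses the global X only for column names) prints one line per node.
# pretty-print the dict-based tree built by tree_grow (illustrative helper)
def print_tree(tree, indent=''):
    if tree['split'] == -1:  # leaf node: no split was stored
        print(indent + 'leaf: pred=%d, prob=%.3f, n=%d' % (tree['y_pred'], tree['y_prob'], tree['n_samples']))
    else:
        j, split = tree['split']
        print(indent + '%s < %.2f (n=%d)' % (X.columns[j], split, tree['n_samples']))
        print_tree(tree['sl'], indent + '    ')
        print_tree(tree['sr'], indent + '    ')

print_tree(tree)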
In [30]:
def tree_predict(X, tree, proba=False):
    predicted = np.ones(X.shape[0])

    # Check if this is a leaf node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
    else:
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If the left node is empty, only continue with the right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If the right node is empty, only continue with the left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted
In [31]:
tree_predict(X, tree)
Out[31]:
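As a quick evaluation of the hand-built tree (a small sketch reusing X, y and tree from above, not part of the original notebook), the in-sample accuracy can be computed by comparing the predictions with the labels.
# in-sample accuracy of the hand-built tree (illustrative check)
(tree_predict(X, tree) == y.values).mean()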
In [32]:
# list of values to try for max_depth
max_depth_range = range(1, 21)

# list to store the average accuracy for each value of max_depth
accuracy_scores = []

# use 10-fold cross-validation with each value of max_depth
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

for depth in max_depth_range:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=10, scoring='accuracy').mean())
In [33]:
# plot max_depth (x-axis) versus accuracy (y-axis)
plt.plot(max_depth_range, accuracy_scores)
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
Out[33]:
In [34]:
# show the best accuracy and the corresponding max_depth
sorted(zip(accuracy_scores, max_depth_range))[::-1][0]
Out[34]:
In [35]:
# fit a tree using the max_depth selected above
clf = DecisionTreeClassifier(max_depth=4, random_state=1)
clf.fit(X, y)
Out[35]:
In [36]:
# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':clf.feature_importances_}).sort_values('importance')
Out[36]:
In [37]:
pd.Series(cross_val_score(clf, X, y, cv=10)).describe()
Out[37]:
Random Forests are a slight variation of bagged trees that often achieve even better performance: exactly as in bagging, an ensemble of decision trees is built on bootstrapped samples of the training set, but each time a split is considered, only a random subset of the features is eligible to be used.
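For comparison, a bagged-trees baseline can be cross-validated with scikit-learn's BaggingClassifier (a minimal sketch, not part of the original notebook; it reuses X and y from above).
# bagged decision trees as a baseline to compare against the Random Forest below
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

bag = BaggingClassifier(DecisionTreeClassifier(random_state=1), n_estimators=100, random_state=1, n_jobs=-1)
pd.Series(cross_val_score(bag, X, y, cv=10, scoring='accuracy')).describe()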
What's the point?
In [38]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf
Out[38]:
In [39]:
pd.Series(cross_val_score(clf, X, y, cv=10)).describe()
Out[39]:
In [40]:
# list of values to try for n_estimators
estimator_range = range(10, 310, 10)
# list to store the average Accuracy for each value of n_estimators
accuracy_scores = []
# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)
for estimator in estimator_range:
    clf = RandomForestClassifier(n_estimators=estimator, random_state=1, n_jobs=-1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())
In [41]:
plt.plot(estimator_range, accuracy_scores)
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')
Out[41]:
In [42]:
# list of values to try for max_features
feature_range = range(1, len(feature_cols)+1)
# list to store the average Accuracy for each value of max_features
accuracy_scores = []
# use 5-fold cross-validation with each value of max_features (WARNING: SLOW!)
for feature in feature_range:
    clf = RandomForestClassifier(n_estimators=200, max_features=feature, random_state=1, n_jobs=-1)
    accuracy_scores.append(cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean())
In [43]:
plt.plot(feature_range, accuracy_scores)
plt.xlabel('max_features')
plt.ylabel('Accuracy')
Out[43]:
In [44]:
# max_features=6 is best and n_estimators=200 is sufficiently large
clf = RandomForestClassifier(n_estimators=200, max_features=6, random_state=1, n_jobs=-1)
clf.fit(X, y)
Out[44]:
In [45]:
# compute feature importances
pd.DataFrame({'feature':feature_cols, 'importance':clf.feature_importances_}).sort_values('importance')
Out[45]:
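Finally, as a sketch of the same evaluation used for the single tree (reusing the tuned clf fitted above, not part of the original notebook), the tuned forest can be cross-validated.
# cross-validated accuracy of the tuned Random Forest (illustrative check)
pd.Series(cross_val_score(clf, X, y, cv=10, scoring='accuracy')).describe()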
Advantages of Random Forests:
- Performance is competitive with the best supervised learning methods.
- They provide a reliable estimate of feature importance.
- Out-of-bag samples give an estimate of test error without a separate test set.
Disadvantages of Random Forests:
- They are less interpretable than a single decision tree.
- Training and prediction are slower than for a single tree.
- The fitted model is harder to visualize and explain.