In [1]:
%matplotlib inline
In [2]:
from sklearn.datasets import make_classification
In [3]:
# Generate a synthetic binary-classification problem: 10k samples,
# 20 features (15 informative), heavy label noise (flip_y=.5 assigns
# half the labels at random) and an 80/20 class imbalance.
# random_state pins the draw so every re-run of the notebook sees the
# same data (the original was unseeded and therefore non-reproducible).
X, y = make_classification(n_samples=10000,
                           n_features=20,
                           n_informative=15,
                           flip_y=.5, weights=[.2, .8],
                           random_state=42)
In [4]:
# Need to:
# 1. Create training/test sets.
# 2. Fit a baseline Random Forest to evaluate naive algorithm.
# 3. Alter parameters to see how fit improves.
In [5]:
import numpy as np

# Boolean row mask for an ~80/20 train/test split: True -> training row,
# False -> held-out row. Seed the global RNG first so the split is the
# same on every re-run (the original draw was unseeded).
np.random.seed(0)
training = np.random.choice([True, False], p=[.8, .2],
                            size=y.shape)
In [6]:
from sklearn.ensemble import RandomForestClassifier
In [7]:
# Baseline: a random forest with all default hyperparameters, to see
# what a naive fit achieves before tuning anything.
rf = RandomForestClassifier()
In [8]:
# Fit the baseline forest on the training rows only.
rf.fit(X[training], y[training])
Out[8]:
In [9]:
# Predict on the held-out rows (~20% of the data).
preds = rf.predict(X[~training])
In [10]:
# Held-out accuracy: fraction of test rows predicted correctly.
# Python 2 print *statement* replaced with a print() call + str.format,
# which produces the same output under both Python 2 and Python 3.
# With flip_y=.5 label noise in the data, values well below 1.0 are expected.
print("Accuracy:\t{}".format((preds == y[~training]).mean()))
In [12]:
# accuracy is a good metric, but a confusion matrix will help
# us understand what's going on.
# Iterate through the recommended choices for max_features and
# see what it does to the fit. Also, try iterating through a
# couple of floats which are the fraction of the features that
# will be used.
In [13]:
from sklearn.metrics import confusion_matrix
In [14]:
# Candidate max_features settings: the three named heuristics plus a
# few explicit fractions of the feature count.
max_feature_params = ['auto', 'sqrt', 'log2', 0.01, 0.5, 0.99]
In [28]:
# Fit one forest per candidate max_features value and record, for each,
# the flattened 2x2 confusion matrix on the held-out rows.
confusion_matricies = {}
for max_feature in max_feature_params:
    rf = RandomForestClassifier(max_features=max_feature)
    rf.fit(X[training], y[training])
    preds = rf.predict(X[~training])
    cm = confusion_matrix(y[~training], preds)
    confusion_matricies[max_feature] = cm.ravel()
In [30]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt

# Bar chart comparing confusion-matrix cells across max_features settings.
# Each x position is one (i, j) cell of the flattened 2x2 confusion matrix.
# confusion_matrix(y_true, y_pred)[i][j] counts rows whose ACTUAL class is i
# and whose PREDICTED class is j — so i is the actual label and j is the
# guess. The original title/labels stated this the wrong way round.
confusion_df = pd.DataFrame(confusion_matricies)
f, ax = plt.subplots(figsize=(7, 5))
confusion_df.plot(kind='bar', ax=ax)
ax.legend(loc='best')
ax.set_title('Actual vs Guessed (i, j) where i is the actual and j is the guess.')
ax.grid()
ax.set_xticklabels([str((i, j)) for i, j in
                    list(itertools.product(range(2), range(2)))])
ax.set_xlabel("(actual, guessed) cell")
ax.set_ylabel('Count')
Out[30]:
In [34]:
# Sweep the number of trees from 1 to 19 and record held-out accuracy
# for each forest size.
n_estimator_params = range(1, 20)
confusion_matricies = {}


def accuracy(cm):
    """Overall accuracy from a confusion matrix: correct / total.

    Replaces the original `accuracy = lambda ...` (PEP 8 discourages
    assigning a lambda to a name); behavior is identical.
    """
    # dtype=float forces true division under Python 2 integer math.
    return np.trace(cm) / np.sum(cm, dtype=float)


for n_estimator in n_estimator_params:
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(X[training], y[training])
    preds = rf.predict(X[~training])
    val = confusion_matrix(y[~training],
                           preds)
    confusion_matricies[n_estimator] = accuracy(val)
accuracy_series = pd.Series(confusion_matricies)
In [35]:
# Held-out accuracy as a function of forest size.
fig, ax = plt.subplots(figsize=(7, 5))
accuracy_series.plot(kind='bar', ax=ax, color='k', alpha=.75)
ax.set_title("Accuracy by Number of Estimators")
ax.set_xlabel('Number of Estimators')
ax.set_ylabel('Accuracy')
ax.set_ylim(0, 1)
ax.grid()
Out[35]:
In [36]:
# Refit with 4 parallel workers; verbose=True logs fitting progress.
# NOTE(review): this trains on ALL rows (train + test), so the
# predictions in the next cell are in-sample, not a held-out evaluation.
rf = RandomForestClassifier(n_jobs=4, verbose=True)
rf.fit(X, y)
Out[36]:
In [37]:
# In-sample predictions from the forest fit on all of X above.
rf.predict(X)
Out[37]:
In [ ]: