notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
from sklearn.datasets import make_classification



In [3]:

    
X, y = make_classification(n_samples=10000,
                           n_features=20,
                           n_informative=15,
                           flip_y=.5, weights=[.2, .8])



In [4]:

    
# Need to:
#  1. Create training/test sets.
#  2. Fit a baseline Random Forest to evaluate naive algorithm.
#  3. Alter parameters to see how fit improves.



In [5]:

    
import numpy as np
training = np.random.choice([True, False], p=[.8, .2],
                            size=y.shape)



In [6]:

    
from sklearn.ensemble import RandomForestClassifier



In [7]:

    
rf = RandomForestClassifier()



In [8]:

    
rf.fit(X[training], y[training])









    Out[8]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [9]:

    
preds = rf.predict(X[~training])



In [10]:

    
print "Accuracy:\t", (preds == y[~training]).mean()









    



Accuracy:	0.664011946242



In [12]:

    
# accuracy is a good metrics but a confusion matrix will help
# understand what's going on.
# Iterate through the recommended choices for max_features and
# see what it does to the fit. Also, try iterating through a
# couple of floats which are the fraction of the features that
# will be used.



In [13]:

    
from sklearn.metrics import confusion_matrix



In [14]:

    
max_feature_params = ['auto', 'sqrt', 'log2', .01, .5, .99]



In [28]:

    
confusion_matricies = {}
for max_feature in max_feature_params:
    rf = RandomForestClassifier(max_features=max_feature)
    rf.fit(X[training], y[training])
    preds = rf.predict(X[~training])
    confusion_matricies[max_feature] = confusion_matrix(y[~training], preds).ravel()



In [30]:

    
import pandas as pd
import itertools
import matplotlib.pyplot as plt

confusion_df = pd.DataFrame(confusion_matricies)

f, ax = plt.subplots(figsize=(7,5))
confusion_df.plot(kind='bar', ax=ax)

ax.legend(loc='best')
ax.set_title('Guessed vs Correct (i, j) where i is the guess and j is the actual.')
ax.grid()
ax.set_xticklabels([str((i,j)) for i, j in
                    list(itertools.product(range(2), range(2)))])

ax.set_xlabel("Guessed vs Correct")
ax.set_ylabel('Correct')









    Out[30]:





<matplotlib.text.Text at 0x110f10cd0>



In [34]:

    
n_estimator_params = range(1, 20)
confusion_matricies = {}

accuracy = lambda x: np.trace(x) / np.sum(x, dtype=float)

for n_estimator in n_estimator_params:
    rf = RandomForestClassifier(n_estimators=n_estimator)
    rf.fit(X[training], y[training])
    preds = rf.predict(X[~training])
    val = confusion_matrix(y[~training],
                           preds)
    
    confusion_matricies[n_estimator] = accuracy(val)
    
accuracy_series = pd.Series(confusion_matricies)



In [35]:

    
f, ax = plt.subplots(figsize=(7,5))
accuracy_series.plot(kind='bar', ax=ax, color='k', alpha=.75)
ax.grid()

ax.set_title("Accuracy by Number of Estimators")
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_xlabel('Number of Estimators')









    Out[35]:





<matplotlib.text.Text at 0x110eee290>



In [36]:

    
rf = RandomForestClassifier(n_jobs=4, verbose=True)
rf.fit(X, y)









    



[Parallel(n_jobs=4)]: Done   1 out of  10 | elapsed:    0.1s remaining:    0.5s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished






    Out[36]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)



In [37]:

    
rf.predict(X)









    



[Parallel(n_jobs=4)]: Done   1 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.0s finished






    Out[37]:





array([0, 1, 1, ..., 1, 1, 1])



In [ ]: