In [1]:
%matplotlib inline

In [2]:
# Using Random Forests for classification. RFs are a good default
# choice because they are fairly robust to overfitting and perform
# well across a wide variety of problems.

In [3]:
# RFs work by constructing many decision trees (each grown deep, on
# a bootstrap sample, with a random subset of features considered at
# each split) and then taking a majority vote of the classes the
# trees predict. This ensemble idea is powerful: if a single
# classifier is right only 60% of the time, but we train many such
# classifiers whose errors are at least partly independent, the
# majority vote is right far more often than any one of them.
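
In [ ]:
# A quick check of that voting intuition (a sketch, not part of the
# original walkthrough): if each of n independent classifiers is
# right with probability 0.6, the majority vote is right whenever
# more than n/2 of them are -- a binomial tail probability that
# grows quickly with n.
from scipy.stats import binom
for n in (1, 11, 101):
    # P(more than n//2 successes out of n trials at p=0.6)
    print(n, binom.sf(n // 2, n, 0.6))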

In [4]:
# Actions:
#   1. Create a sample dataset.
#   2. Train a basic random forest object.
#   3. Take a look at some attributes of a trained object.

In [5]:
from sklearn.datasets import make_classification

In [6]:
X, y = make_classification(1000)
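
In [ ]:
# By default make_classification generates 20 features (only 2 of
# them informative) and 2 classes; a quick shape check:
X.shape, y.shape  # -> ((1000, 20), (1000,))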

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
rf = RandomForestClassifier()
rf.fit(X, y)


Out[9]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
# Print accuracy on the training data:
print("Accuracy:\t", (y == rf.predict(X)).mean())


Accuracy:	0.996

In [11]:
print "Total Correct:\t", (y == rf.predict(X)).sum()


Total Correct:	996
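
In [ ]:
# Both numbers above are measured on the training data, which is
# optimistic -- the forest has already seen every row. A sketch of a
# more honest estimate via cross-validation (on scikit-learn older
# than 0.18 this import lives in sklearn.cross_validation instead):
from sklearn.model_selection import cross_val_score
print(cross_val_score(rf, X, y, cv=5).mean())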

In [12]:
# Looking at useful attributes:
#  -> rf.criterion: the function that measures split quality
#     (default: 'gini').
#  -> rf.bootstrap: bool, whether each tree is trained on a
#     bootstrap sample.
#  -> rf.n_jobs: number of jobs (processors) used to train/predict.
#  -> rf.max_features: number of features considered when
#     determining a split.
#  -> rf.max_depth: how deep each tree is allowed to grow.
# (A construction-time sketch follows in the next cell.)
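
In [ ]:
# All of the above are set at construction time. A sketch of a more
# deliberate configuration -- the values here are illustrative, not
# tuned for this dataset:
rf_tuned = RandomForestClassifier(n_estimators=100, max_features='sqrt',
                                  max_depth=None, n_jobs=-1)
rf_tuned.fit(X, y)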

In [13]:
# predict isn't the only useful method. We can also get the
# probability of each class for individual samples.

In [20]:
probs = rf.predict_proba(X)
probs[:5]


Out[20]:
array([[ 0. ,  1. ],
       [ 1. ,  0. ],
       [ 0. ,  1. ],
       [ 0.2,  0.8],
       [ 0. ,  1. ]])
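
In [ ]:
# Every value above is a multiple of 0.1 for a reason: scikit-learn
# averages the per-tree leaf probabilities, and with the default 10
# fully grown trees the leaves are pure, so the average reduces to
# the fraction of trees voting for each class. A quick check:
import numpy as np
np.unique(probs)  # each value is a multiple of 1/rf.n_estimators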

In [15]:
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
probs_df = pd.DataFrame(probs, columns=['0', '1'])
probs_df['was_correct'] = rf.predict(X) == y
probs_df.head(20)


Out[19]:
      0    1 was_correct
0   0.0  1.0        True
1   1.0  0.0        True
2   0.0  1.0        True
3   0.2  0.8        True
4   0.0  1.0        True
5   1.0  0.0        True
6   0.3  0.7        True
7   0.0  1.0        True
8   1.0  0.0        True
9   0.0  1.0        True
10  1.0  0.0        True
11  0.0  1.0        True
12  0.0  1.0        True
13  0.7  0.3        True
14  0.8  0.2        True
15  0.0  1.0        True
16  0.0  1.0        True
17  0.7  0.3        True
18  0.0  1.0        True
19  0.1  0.9        True

In [17]:
f, ax = plt.subplots(figsize=(7,5))
probs_df.groupby('0').was_correct.mean().plot(kind='bar', ax=ax)
ax.set_title("Accuracy at 0 class probability")
ax.set_ylabel('% correct')
ax.set_xlabel('% trees for 0')


Out[17]:
<matplotlib.text.Text at 0x1029dc350>
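
In [ ]:
# Reading the chart: we would expect accuracy to be highest when the
# vote is lopsided (near 0% or 100% of trees choosing class 0) and
# to dip where the trees split close to 50/50 -- the forest is least
# reliable exactly where its members disagree.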

In [21]:
# We can also use the RF to determine feature importances.

In [31]:
f, ax = plt.subplots(figsize=(7,5))
ax.bar(range(0, len(rf.feature_importances_)), rf.feature_importances_)
ax.set_title('feature importances')


Out[31]:
<matplotlib.text.Text at 0x108a5ffd0>

In [35]:
importances = pd.DataFrame(rf.feature_importances_)
importances.sort_values(0, ascending=False)


Out[35]:
           0
19  0.598794
18  0.088215
 0  0.067985
 3  0.047320
16  0.017112
 5  0.016041
 1  0.015733
15  0.014727
 8  0.014089
12  0.013343
 2  0.013222
13  0.012580
 9  0.012522
11  0.012422
10  0.011421
 4  0.010115
17  0.009518
14  0.008525
 7  0.008396
 6  0.007919
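
In [ ]:
# The importance mass concentrating in a handful of columns is
# expected: make_classification defaults to n_informative=2 (plus
# 2 redundant features), so only a few of the 20 columns carry
# real signal.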
