notebook.community

Edit and run



In [1]:

    
%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import warnings

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier



In [2]:

    
n_features = 20

nonsense_column_names = [''.join(np.random.choice(['ab', 'cd', 'ef', '  ', 'gh'],
                                                  size=np.random.randint(1, 12)))
                                  for _ in range(n_features)]
nonsense_column_names









    Out[2]:





['cd',
 'efefef  gh  ',
 '  efef  cdabcd  gh',
 'ef  efghghcd    ',
 'efababefghef  ',
 'cdcdgh',
 'cd',
 'ghgh',
 'cd',
 '    cdefcd  ghab',
 'ghghgh',
 'cdabefabcdabef',
 'cd',
 'cdcdcd  ghab  abcd',
 'cdabefefefefghgh',
 '    cdgh    cdabef  ',
 'ghefcdcdcd',
 'ef',
 'efghcdef  ef  ghabcd  ',
 'cd']



In [3]:

    
X, y = make_classification(n_features=n_features)
X = pd.DataFrame(data=X, columns=nonsense_column_names)



In [4]:

    
clf = RandomForestClassifier(min_samples_leaf=5)
clf.fit(X, y)









    Out[4]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



In [5]:

    
clf.feature_importances_









    Out[5]:





array([ 0.00106261,  0.01539515,  0.07398669,  0.00974474,  0.01253549,
        0.02184585,  0.00558645,  0.15462275,  0.00463382,  0.0021489 ,
        0.05179587,  0.        ,  0.        ,  0.01428622,  0.52419094,
        0.01502966,  0.        ,  0.02913962,  0.04103748,  0.02295775])



In [6]:

    
rf_importances = pd.DataFrame({'name':X.columns,
                               'importance':clf.feature_importances_
                              }).sort_values(by='importance', 
                                              ascending=False).reset_index(drop=True)



In [7]:

    
rf_importances









    Out[7]:







  
    
      
      importance
      name
    
  
  
    
      0
      0.524191
      cdabefefefefghgh
    
    
      1
      0.154623
      ghgh
    
    
      2
      0.073987
      efef  cdabcd  gh
    
    
      3
      0.051796
      ghghgh
    
    
      4
      0.041037
      efghcdef  ef  ghabcd
    
    
      5
      0.029140
      ef
    
    
      6
      0.022958
      cd
    
    
      7
      0.021846
      cdcdgh
    
    
      8
      0.015395
      efefef  gh
    
    
      9
      0.015030
      cdgh    cdabef
    
    
      10
      0.014286
      cdcdcd  ghab  abcd
    
    
      11
      0.012535
      efababefghef
    
    
      12
      0.009745
      ef  efghghcd
    
    
      13
      0.005586
      cd
    
    
      14
      0.004634
      cd
    
    
      15
      0.002149
      cdefcd  ghab
    
    
      16
      0.001063
      cd
    
    
      17
      0.000000
      cdabefabcdabef
    
    
      18
      0.000000
      cd
    
    
      19
      0.000000
      ghefcdcdcd



In [8]:

    
sns.set_context('poster', font_scale=1.3)



In [9]:

    
fig, ax = plt.subplots(figsize=(12, 8))
rf_importances.set_index('name').sort_values(by='importance').plot.barh(ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.legend(loc='best')
fig.tight_layout();



In [11]:

    
fig, ax = plt.subplots(figsize=(12, 8))
rf_importances.set_index('name').plot.bar(ax=ax)
ax.set_xlabel('Feature')
ax.set_ylabel('Feature Importance')
fig.autofmt_xdate()
fig.tight_layout()



In [ ]:

	importance	name
0	0.524191	cdabefefefefghgh
1	0.154623	ghgh
2	0.073987	efef cdabcd gh
3	0.051796	ghghgh
4	0.041037	efghcdef ef ghabcd
5	0.029140	ef
6	0.022958	cd
7	0.021846	cdcdgh
8	0.015395	efefef gh
9	0.015030	cdgh cdabef
10	0.014286	cdcdcd ghab abcd
11	0.012535	efababefghef
12	0.009745	ef efghghcd
13	0.005586	cd
14	0.004634	cd
15	0.002149	cdefcd ghab
16	0.001063	cd
17	0.000000	cdabefabcdabef
18	0.000000	cd
19	0.000000	ghefcdcdcd