In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import warnings

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

In [2]:
n_features = 20

nonsense_column_names = [''.join(np.random.choice(['ab', 'cd', 'ef', '  ', 'gh'],
                                                  size=np.random.randint(1, 12)))
                                  for _ in range(n_features)]
nonsense_column_names


Out[2]:
['cd',
 'efefef  gh  ',
 '  efef  cdabcd  gh',
 'ef  efghghcd    ',
 'efababefghef  ',
 'cdcdgh',
 'cd',
 'ghgh',
 'cd',
 '    cdefcd  ghab',
 'ghghgh',
 'cdabefabcdabef',
 'cd',
 'cdcdcd  ghab  abcd',
 'cdabefefefefghgh',
 '    cdgh    cdabef  ',
 'ghefcdcdcd',
 'ef',
 'efghcdef  ef  ghabcd  ',
 'cd']

In [3]:
X, y = make_classification(n_features=n_features)
X = pd.DataFrame(data=X, columns=nonsense_column_names)

In [4]:
clf = RandomForestClassifier(min_samples_leaf=5)
clf.fit(X, y)


Out[4]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [5]:
clf.feature_importances_


Out[5]:
array([ 0.00106261,  0.01539515,  0.07398669,  0.00974474,  0.01253549,
        0.02184585,  0.00558645,  0.15462275,  0.00463382,  0.0021489 ,
        0.05179587,  0.        ,  0.        ,  0.01428622,  0.52419094,
        0.01502966,  0.        ,  0.02913962,  0.04103748,  0.02295775])

In [6]:
rf_importances = pd.DataFrame({'name':X.columns,
                               'importance':clf.feature_importances_
                              }).sort_values(by='importance', 
                                              ascending=False).reset_index(drop=True)

In [7]:
rf_importances


Out[7]:
importance name
0 0.524191 cdabefefefefghgh
1 0.154623 ghgh
2 0.073987 efef cdabcd gh
3 0.051796 ghghgh
4 0.041037 efghcdef ef ghabcd
5 0.029140 ef
6 0.022958 cd
7 0.021846 cdcdgh
8 0.015395 efefef gh
9 0.015030 cdgh cdabef
10 0.014286 cdcdcd ghab abcd
11 0.012535 efababefghef
12 0.009745 ef efghghcd
13 0.005586 cd
14 0.004634 cd
15 0.002149 cdefcd ghab
16 0.001063 cd
17 0.000000 cdabefabcdabef
18 0.000000 cd
19 0.000000 ghefcdcdcd

In [8]:
sns.set_context('poster', font_scale=1.3)

In [9]:
fig, ax = plt.subplots(figsize=(12, 8))
rf_importances.set_index('name').sort_values(by='importance').plot.barh(ax=ax)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.legend(loc='best')
fig.tight_layout();



In [11]:
fig, ax = plt.subplots(figsize=(12, 8))
rf_importances.set_index('name').plot.bar(ax=ax)
ax.set_xlabel('Feature')
ax.set_ylabel('Feature Importance')
fig.autofmt_xdate()
fig.tight_layout()



In [ ]: