In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'
from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import warnings
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
In [2]:
# Seeded generator so the "random" column names are identical on every
# Restart & Run All (the global np.random state is left untouched).
SEED = 42
rng = np.random.RandomState(SEED)

n_features = 20
name_tokens = ['ab', 'cd', 'ef', ' ', 'gh']
# Each name concatenates 1-11 tokens drawn with replacement (randint's upper
# bound is exclusive). Names are NOT guaranteed to be unique, and a name may
# consist solely of spaces.
nonsense_column_names = [
    ''.join(rng.choice(name_tokens, size=rng.randint(1, 12)))
    for _ in range(n_features)
]
nonsense_column_names
Out[2]:
In [3]:
# Generate a synthetic classification problem with one column per nonsense name.
# random_state is pinned so the data (and everything fitted on it below) is
# reproducible across kernel restarts.
X_values, y = make_classification(n_features=n_features, random_state=42)
# Wrap the raw array in a DataFrame so the features carry the generated names;
# X is bound once, directly as a DataFrame, instead of being rebound.
X = pd.DataFrame(data=X_values, columns=nonsense_column_names)
In [4]:
# Fit a random forest on the synthetic data. random_state is pinned so the
# feature importances reported below are the same on every run.
clf = RandomForestClassifier(min_samples_leaf=5, random_state=42)
# Left as the last expression so the fitted estimator is shown as cell output.
clf.fit(X, y)
Out[4]:
In [5]:
# Display the fitted classifier's raw feature_importances_ as the cell output.
clf.feature_importances_
Out[5]:
In [6]:
# Pair every column name of X with the classifier's importance for it, rank
# the rows from highest to lowest importance, and renumber the index 0..n-1.
rf_importances = (
    pd.DataFrame({'name': X.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
In [7]:
# Display the ranked name/importance table built in the previous cell.
rf_importances
Out[7]:
In [8]:
# Select seaborn's 'poster' plotting context with fonts scaled by 1.3
# before drawing the figures below.
sns.set_context('poster', font_scale=1.3)
In [9]:
# Horizontal bar chart of feature importances, one bar per feature name.
fig, ax = plt.subplots(figsize=(12, 8))
# Re-sort ascending before plotting (the table itself is stored descending).
(rf_importances
 .set_index('name')
 .sort_values(by='importance')
 .plot.barh(ax=ax))
ax.set(xlabel='Feature Importance', ylabel='Feature')
ax.legend(loc='best')
# Trailing semicolon suppresses any cell output repr.
fig.tight_layout();
In [11]:
# Vertical bar chart of feature importances, in the table's stored
# (descending-importance) order.
fig, ax = plt.subplots(figsize=(12, 8))
(rf_importances
 .set_index('name')
 .plot.bar(ax=ax))
ax.set(xlabel='Feature', ylabel='Feature Importance')
# NOTE(review): autofmt_xdate is a date-axis helper; presumably used here only
# to slant the long feature-name tick labels. Confirm that is the intent.
fig.autofmt_xdate()
fig.tight_layout()
In [ ]: