In [8]:
import randomlogits
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
X, Y = make_classification(n_samples=10000, n_features=50, n_informative=10, n_redundant=10,
                           n_repeated=10, n_clusters_per_class=3, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
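Of the 50 features, only 10 are informative; 10 are redundant (linear combinations of the informative ones) and 10 are exact repeats, so a good importance measure should spread its mass over a correlated subset. A quick sanity check on that structure, just inspecting the correlation matrix of the generated X (this is an illustrative aside, not part of randomlogits):
In [ ]:
# Count strongly correlated feature pairs to confirm the redundant/repeated structure
corr = np.corrcoef(X.T)
n_corr_pairs = (np.abs(np.triu(corr, k=1)) > 0.95).sum()
print("feature pairs with |corr| > 0.95:", n_corr_pairs)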
In [3]:
clfrl = randomlogits.TrainRandomLogits(X=X_train, y=y_train, n_logits=100, n_features=10)
In [4]:
a, mean_importance_rl, std_importance_rl = randomlogits.GetFeatureImportances(clfrl)
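`randomlogits` is a local module, so its internals aren't shown here. As a rough mental model (an assumption about the idea, not the module's actual code): fit `n_logits` logistic regressions, each on a random subset of `n_features` features, and derive per-feature importances from the fitted coefficients. A minimal sketch of that idea with scikit-learn's LogisticRegression:
In [ ]:
# Hypothetical sketch of a random-logits ensemble -- NOT the actual randomlogits code.
# Each logit sees a random feature subset; a feature's importance is the mean
# (and std) of its coefficient across the logits that sampled it.
from sklearn.linear_model import LogisticRegression

def train_random_logits_sketch(X, y, n_logits=100, n_features=10, seed=0):
    rng = np.random.RandomState(seed)
    coef_sums = np.zeros(X.shape[1])
    coef_sq_sums = np.zeros(X.shape[1])
    counts = np.zeros(X.shape[1])
    for _ in range(n_logits):
        cols = rng.choice(X.shape[1], size=n_features, replace=False)
        clf = LogisticRegression().fit(X[:, cols], y)
        coef = clf.coef_.ravel()
        coef_sums[cols] += coef
        coef_sq_sums[cols] += coef ** 2
        counts[cols] += 1
    mean = coef_sums / np.maximum(counts, 1)
    std = np.sqrt(np.maximum(coef_sq_sums / np.maximum(counts, 1) - mean ** 2, 0.0))
    return mean, std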
In [5]:
# let's compare it to a random forest's impurity-based feature importances
clfrf = RandomForestClassifier(n_estimators=100, max_features=10, n_jobs=-1)
clfrf.fit(X_train, y_train)
clfrf.feature_importances_
Out[5]:
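Before comparing importances, it's worth checking that the forest actually fits the data; `score` on the held-out split reports test accuracy:
In [ ]:
# Sanity check: held-out accuracy of the random forest
clfrf.score(X_test, y_test)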
In [6]:
def plot_feature_importance(mean_importance_array, std_importance_array, indices, label):
    n_features = len(mean_importance_array)  # avoid relying on the global X
    # Print the feature ranking
    print("Ranking:")
    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], mean_importance_array[indices[f]]))
    # Plot the per-feature importances with error bars, in ranked order
    plt.figure()
    plt.title(label)
    plt.bar(range(n_features), mean_importance_array[indices],
            color="r", yerr=std_importance_array[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.show()
In [9]:
indices_rl = np.argsort(np.abs(mean_importance_rl))[::-1]  # rank by |coefficient|
# std_importance_rl is buggy but will fix it later
plot_feature_importance(mean_importance_rl, std_importance_rl, indices_rl, "randomlogit")
importance_rf = clfrf.feature_importances_
std_importance_rf = np.std([tree.feature_importances_ for tree in clfrf.estimators_],
axis=0)
indices_rf = np.argsort(importance_rf)[::-1]
plot_feature_importance(importance_rf, std_importance_rf, indices_rf, "randomforest")
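With both rankings in hand, a simple way to compare the two methods is the overlap of their top-10 features (10 being the number of informative features in this synthetic dataset):
In [ ]:
# Overlap between the two methods' top-10 ranked features
top_rl = set(indices_rl[:10])
top_rf = set(indices_rf[:10])
print("shared top-10 features:", sorted(top_rl & top_rf))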