In [1]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
In [4]:
%matplotlib inline
#http://stackoverflow.com/questions/22409855/randomforestclassifier-vs-extratreesclassifier-in-scikit-learn
In [5]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
# Standard deviations for plotting error bars:
# collect each tree's feature_importances_ into a list of arrays, then
# take the standard deviation down the columns (axis=0).
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
# np.argsort() sorts the importances in ascending order;
# [::-1] reverses the result so the ranking runs best to worst.
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(10), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(10), indices)
plt.xlim([-1, 10])
plt.show()
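Once the importances are ranked, the same indices array can drive feature selection directly. A minimal sketch, assuming the fitted forest and X from above; the cutoff k is an arbitrary choice for illustration:
In [ ]:
# Keep only the k highest-ranked columns of X.
k = 3  # arbitrary cutoff, chosen for illustration
top_features = indices[:k]    # column indices of the k best features
X_top = X[:, top_features]    # reduced feature matrix
print(X_top.shape)            # (1000, 3)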
In [6]:
indices
Out[6]:
In [7]:
importances
Out[7]:
In [8]:
np.argsort(importances)
Out[8]:
In [10]:
np.argsort(importances)[::-1]
Out[10]:
In [29]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
print(X.shape)   # (150, 4)
clf = ExtraTreesClassifier()
# fit() trains the model; transform() then reduces X to the features
# whose importance exceeds the threshold (here the mean importance).
X_new = clf.fit(X, y).transform(X, threshold='mean')
print(clf.feature_importances_)
print(X_new.shape)
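Newer scikit-learn releases removed the estimator-level transform(threshold=...) shortcut in favor of SelectFromModel; a minimal equivalent sketch, assuming a recent version:
In [ ]:
from sklearn.feature_selection import SelectFromModel
# Wrap the already-fitted estimator and apply the same mean-importance
# threshold that transform(threshold='mean') used.
selector = SelectFromModel(clf, threshold='mean', prefit=True)
X_new = selector.transform(X)
print(X_new.shape)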
In [142]:
import pickle
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "rb"))
In [143]:
import pandas as pd
df = pd.DataFrame.from_dict(data_dict, orient='index')
In [155]:
df = df.drop(['email_address'], axis=1)
In [156]:
# Index entries without a space are not person names (e.g. the 'TOTAL' row).
[suspect for suspect in df.index if " " not in suspect]
Out[156]:
In [157]:
df = df.drop('TOTAL', axis=0)
In [158]:
labels = df.poi
features = df.drop(['poi'], axis=1)
In [159]:
print(labels.shape)
print(features.shape)
In [206]:
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2)
In [207]:
# Rebuild a DataFrame view of the training features, then count
# the POIs that landed in the training split.
pd.DataFrame(features_train, columns=features.columns)
sum(labels_train)
Out[207]:
In [208]:
from sklearn.cross_validation import StratifiedKFold
In [298]:
# Create a temporary stratified 5-fold list of indices for train/test splitting
k_temp = StratifiedKFold(labels, n_folds=5, shuffle=True)
In [299]:
for train, test in k_temp:
    # Iterate through and overwrite: only the last stratified
    # 80%/20% split is kept, which is all that is needed to
    # create an out-of-sample test set.
    train_ind = train
    test_ind = test
In [338]:
print(test_ind)
print(train_ind)
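Looping over all five folds only to keep the last one works, but a single stratified split can be drawn in one call. A sketch assuming scikit-learn 0.17+, where train_test_split gained a stratify parameter (and model_selection replaces the older cross_validation module):
In [ ]:
from sklearn.model_selection import train_test_split
# One stratified 80%/20% split; stratify=labels preserves the
# POI/non-POI ratio in both pieces.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=0)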
In [395]:
# Use the indices to filter a training and a testing set.
train_df = df.iloc[train_ind]
holdout_df = df.iloc[test_ind]
In [396]:
# Verify the shape and stratified distribution of boolean y-labels.
print(train_df.shape)
print(holdout_df.shape)
print(sum(train_df.poi))
print(sum(holdout_df.poi))
In [399]:
train_df = train_df.replace('NaN', np.nan)
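With the placeholder strings converted to real NaNs, pandas can report how sparse each column is; a quick check, assuming the train_df above and a reasonably recent pandas:
In [ ]:
# Count missing values per column to see which features most need imputation.
print(train_df.isnull().sum().sort_values(ascending=False))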
In [ ]:
# Alternative: fill each column's NaNs with that column's median.
#train_df = train_df.apply(lambda x: x.fillna(x.median()), axis=0)
In [401]:
# Salaries of the POIs in the training set.
train_df[train_df.poi==True].salary
Out[401]:
In [410]:
ax = train_df.plot(kind='scatter', x='shared_receipt_with_poi', y='from_this_person_to_poi',
                   color='DarkGreen', label='not poi')
train_df[train_df.poi==True].plot(kind='scatter', x='shared_receipt_with_poi', y='from_this_person_to_poi',
                                  color='DarkBlue', label='poi', ax=ax)
Out[410]:
In [405]:
train_df.columns
Out[405]:
In [28]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
clf = Pipeline([
    ('feature_selection', LinearSVC(penalty="l1", dual=False)),
    ('classification', RandomForestClassifier())
])
clf.fit(X, y)
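Current scikit-learn no longer accepts a bare estimator as a pipeline transformer, so the selection step has to be wrapped in SelectFromModel; a sketch of the equivalent pipeline:
In [ ]:
from sklearn.feature_selection import SelectFromModel
# SelectFromModel turns the L1-penalized SVM into a transformer step.
clf = Pipeline([
    ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
    ('classification', RandomForestClassifier())
])
clf.fit(X, y)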
In [ ]:
from sklearn.preprocessing import Imputer
# Impute values of an np.array column-by-column (axis=0).
# Fit the Imputer on the training set only, then reuse it on the test set
# so no information leaks from test to train.
imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
imputer.fit(train_data)
train_imputed = imputer.transform(train_data)
test_imputed = imputer.transform(test_data)
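Imputer itself was removed in scikit-learn 0.22; the drop-in replacement is SimpleImputer, sketched below assuming a numeric array with np.nan marking the missing entries (per-column behavior is now the default, so there is no axis argument):
In [ ]:
from sklearn.impute import SimpleImputer
# Same pattern: fit the median imputer on training data only,
# then reuse it for the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
train_imputed = imputer.fit_transform(train_data)
test_imputed = imputer.transform(test_data)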
In [128]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('imp', Imputer(missing_values='NaN', axis=0)),
    ('et', ExtraTreesClassifier())
])
In [129]:
parameters = {'imp__strategy': ('median', 'mean')}
grid_search = GridSearchCV(pipeline, parameters)
In [132]:
# A supervised pipeline needs labels as well; train_labels here is a
# placeholder for the matching target vector.
grid_search.fit(train_data, train_labels)
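After fitting, the winning imputation strategy can be read off the search object via the standard GridSearchCV attributes:
In [ ]:
print(grid_search.best_params_)   # e.g. {'imp__strategy': 'median'}
print(grid_search.best_score_)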
In [39]:
# Get the median of each column.
# WARNING: np.median propagates NaN, so any column containing a NaN
# returns NaN as its median.
a = [[1,2], [10,1], [np.nan, 2]]
np.median(a, axis=0)
Out[39]:
In [40]:
# scipy.stats.nanmedian ignores NaNs, so a median of the observed
# values can be estimated for imputation.
from scipy import stats
stats.nanmedian(a, axis=0)
Out[40]:
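scipy.stats.nanmedian was later deprecated; NumPy 1.9+ provides the same behavior as np.nanmedian:
In [ ]:
np.nanmedian(a, axis=0)   # ignores NaNs, same result as stats.nanmedian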
In [130]:
train_data = [['1', 'NaN', 'NaN', '0.0127034', '0.0435092'],
              ['1', 'NaN', 'NaN', '0.0113187', '0.228205'],
              ['1', '0.648', '0.248', '0.0142176', '0.202707'],
              ['1', '0.357', '0.470', '0.0328121', '0.255039'],
              ['1', 'NaN', 'NaN', '0.00311825', '0.0381745'],
              ['1', 'NaN', 'NaN', '0.0332604', '0.2857']]
train_data
Out[130]:
In [111]:
A = np.array(train_data)
# A has a string dtype, so this actually stores the string 'nan'; Imputer
# later casts the array to float, which turns it into a real NaN anyway.
A[A=='NaN'] = np.nan
In [112]:
# Fit on the training data and impute its NaNs with the column medians.
imputer.fit(train_data)
imputer.transform(train_data).round(2)
Out[112]:
In [131]:
test_data = [['1', 'NaN', 'NaN', '0.0127034', '0.0435092'],
             ['1', 'NaN', 'NaN', '0.0113187', '0.228205'],
             ['1', '0.648', '0.248', '0.0142176', '0.202707'],
             ['1', '0.357', '0.470', '0.0328121', '0.255039'],
             ['1', '10', 'NaN', '0.00311825', '0.0381745'],
             ['1', 'NaN', 'NaN', '0.0332604', '0.2857']]
In [ ]:
B = np.array(test_data)
B[B=='NaN'] = np.nan
B
In [114]:
# Apply the medians learned from the training set to the test set.
B = imputer.transform(B)
B.round(2)
Out[114]: