In [269]:
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from pandas import DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.base import TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [270]:
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )

Holdout

Since the dataset for this project is so small, a hold-out set will not be used, and only k-fold testing and training splits will be used to measure accuracy.

This is because even with a stratified hold-out set of 20%, with only 146 data points, lots of missing data and and 18 poi's, there would be only 3 or so points to do a final test on. This does not give much confidence in the precision of the performance metrics on such a small hold-out set, while also negatively impacting the ability to create the model.

"when the number of samples is not large, a strong case can be made that a test set should be avoided because every sample may be needed for model building. (...) Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgements. "

[1] Kuhn M., Kjell J.(2013). Applied Predictive Modeling. Springer. pp.67

Hawkins et al. (2003) concisely summarize this point:“holdout samples of tolerable size [. . . ] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate.”

[2] Hawkins D, Basak S, Mills D (2003). “Assessing Model Fit by Cross– Validation.” Journal of Chemical Information and Computer Sciences, 43(2), 579–586

This will be addressed with K-fold cross-validation resampling techniques.

Version 2 - Cross Validation Scheme

  1. Define sets of model parameters values to evaluate
  2. for each parameter set in grid search DO
    1. For each k-fold resampling iteration DO
      1. Hold-out 1/k samples/fold
      2. Pre-Process Data (Create functions on training set, apply to test set with same)
        1. Impute data (median)
        2. Scale features (x_i - mean))/std
        3. Perform any univariate feature selection (remove very low variation features)
        4. Modeling feature selection (ExtraTreesClassifier)
      3. Fit the model on the k/K training fold
      4. Predict the hold-out samples/fold
    2. END
    3. Calculate the average performance across hold-out predictions
  3. END
  4. Determine the optimal parameter set
  5. Fit the final model to all training data using the optimal parameter set

Fix 2 out-of-sync records


In [271]:
data_dict['BELFER ROBERT'] = {'bonus': 'NaN',
                              'deferral_payments': 'NaN',
                              'deferred_income': -102500,
                              'director_fees': 102500,
                              'email_address': 'NaN',
                              'exercised_stock_options': 'NaN',
                              'expenses': 3285,
                              'from_messages': 'NaN',
                              'from_poi_to_this_person': 'NaN',
                              'from_this_person_to_poi': 'NaN',
                              'loan_advances': 'NaN',
                              'long_term_incentive': 'NaN',
                              'other': 'NaN',
                              'poi': False,
                              'restricted_stock': -44093,
                              'restricted_stock_deferred': 44093,
                              'salary': 'NaN',
                              'shared_receipt_with_poi': 'NaN',
                              'to_messages': 'NaN',
                              'total_payments': 3285,
                              'total_stock_value': 'NaN'}

data_dict['BHATNAGAR SANJAY'] = {'bonus': 'NaN',
                                 'deferral_payments': 'NaN',
                                 'deferred_income': 'NaN',
                                 'director_fees': 'NaN',
                                 'email_address': 'sanjay.bhatnagar@enron.com',
                                 'exercised_stock_options': 15456290,
                                 'expenses': 137864,
                                 'from_messages': 29,
                                 'from_poi_to_this_person': 0,
                                 'from_this_person_to_poi': 1,
                                 'loan_advances': 'NaN',
                                 'long_term_incentive': 'NaN',
                                 'other': 'NaN',
                                 'poi': False,
                                 'restricted_stock': 2604490,
                                 'restricted_stock_deferred': -2604490,
                                 'salary': 'NaN',
                                 'shared_receipt_with_poi': 463,
                                 'to_messages': 523,
                                 'total_payments': 137864,
                                 'total_stock_value': 15456290}

In [484]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
df = df.drop('TOTAL', axis=0)


Out[484]:
salary                       object
to_messages                  object
deferral_payments            object
total_payments               object
exercised_stock_options      object
bonus                        object
restricted_stock             object
shared_receipt_with_poi      object
restricted_stock_deferred    object
total_stock_value            object
expenses                     object
loan_advances                object
from_messages                object
other                        object
from_this_person_to_poi      object
poi                            bool
director_fees                object
deferred_income              object
long_term_incentive          object
email_address                object
from_poi_to_this_person      object
dtype: object

'NaN' was imported as a string instead of a missing value. We will convert these to NaN type and look how many missing values our data has.


In [485]:
# Replace 'NaN' strings with 0's
df = df.replace('NaN', 0)
# Replace email strings with True/False boolean as to whether an email was present or not
# df['email_address'] = df['email_address'].fillna(0).apply(lambda x: x != 0, 1)
# Remove 'email_address' string as a feature
del df['email_address']


Out[485]:
salary                       int64
to_messages                  int64
deferral_payments            int64
total_payments               int64
exercised_stock_options      int64
bonus                        int64
restricted_stock             int64
shared_receipt_with_poi      int64
restricted_stock_deferred    int64
total_stock_value            int64
expenses                     int64
loan_advances                int64
from_messages                int64
other                        int64
from_this_person_to_poi      int64
poi                           bool
director_fees                int64
deferred_income              int64
long_term_incentive          int64
from_poi_to_this_person      int64
dtype: object

In [464]:


In [427]:
df_original = df.copy()

In [884]:
# Convert features to floats since MinMaxScaler does not like int64's
X_original = df.drop(['poi'], axis=1).astype(float)
y_original = df['poi']

# Drop any row that has only zeros in it, drop from labels first, then from features
y_original = y_original[X_original.abs().sum(axis=1) != 0]
X_original = X_original[X_original.abs().sum(axis=1) != 0]

# Save the names of the features 
X_names = X_original.columns
#X_original = X_original.apply(lambda x: x.fillna(0), axis=0)

# Scale the features
standardized = MinMaxScaler().fit_transform(X_original)

# Score the features using a classification scoring function using 
# the Anova F-value for the provided sample
selection = SelectKBest(k='all', score_func=f_classif).fit(standardized, y_original)

#new_X = selection.transform(standardized)

#KBestNames = X_names[selection.get_support()]

# Create a pd.DataFrame of the names and scores
scores = pd.DataFrame([X_names, selection.scores_])
scores = scores.T
scores.columns = ['Features', 'Scores']
scores = scores.sort(['Scores'], ascending=False).reset_index(drop=True)
scores


Out[884]:
Features Scores
0 exercised_stock_options_squared 25.04327
1 total_stock_value 22.78211
2 exercised_stock_options 22.61053
3 bonus 21.06
4 bonus_total_pay_ratio 20.98877
5 salary 18.5757
6 total_compensation 17.18271
7 long_term_incentive_total_pay_ratio 14.01403
8 salary_squared 13.75712
9 deferred_income 11.56189
10 bonus_squared 10.69425
11 long_term_incentive 10.07245
12 deferred_income_squared 10.01498
13 total_payments 9.380237
14 restricted_stock 8.964964
15 total_poi_interaction 8.773847
16 shared_receipt_with_poi 8.746486
17 loan_advances_squared 7.307514
18 loan_advances 7.24273
19 bonus_total_compensation_ratio 6.731432
20 shared_poi_from_messages_ratio 5.793909
21 expenses 5.550684
22 loan_advances_total_pay_ratio 5.396396
23 from_poi_to_this_person 5.344942
24 from_messages_from_poi_ratio 5.20965
25 shared_receipt_with_poi_squared 4.979852
26 total_active_poi_interaction 4.955198
27 bonus_by_total_stock 4.920968
28 other_squared 4.828893
29 restricted_stock_squared 4.794726
... ... ...
41 director_fees_squared 1.913096
42 deferral_payments_total_pay_ratio 1.779904
43 to_messages 1.698824
44 from_poi_to_this_person_total_poi_int_ratio 1.429218
45 from_this_person_to_poi_total_poi_int_ratio 1.245077
46 expenses_total_compensation_ratio 1.196422
47 restricted_stock_total_stock_ratio 1.128587
48 to_poi_total_active_poi_ratio 1.115302
49 deferral_payments_total_compensation_ratio 1.085361
50 expenses_squared 0.776179
51 restricted_stock_deferred 0.7434934
52 other_total_pay_ratio 0.7191198
53 from_poi_to_this_person_squared 0.6159672
54 restricted_stock_deferred_squared 0.311332
55 director_fees_total_pay_ratio 0.2215513
56 deferral_payments 0.2212145
57 director_fees_total_compensation_ratio 0.2199087
58 deferral_payments_squared 0.1872097
59 from_messages 0.1641645
60 restricted_stock_deferred_total_compensation_r... 0.142
61 expenses_total_pay_ratio 0.1220853
62 exercised_stock_options_total_stock_ratio 0.06276039
63 restricted_stock_total_compensation_ratio 0.04006004
64 exercised_stock_options_total_compensation_ratio 0.03827046
65 other_total_compensation_ratio 0.02193787
66 shared_poi_total_compensation 0.006134506
67 salary_total_compensation_ratio 0.002550889
68 deferred_income_total_pay_ratio NaN
69 restricted_stock_deferred_total_stock_ratio NaN
70 deferred_income_total_compensation_ratio NaN

71 rows × 2 columns


In [885]:
topKBest = list(scores.Features[0:17])
topKBest


Out[885]:
['exercised_stock_options_squared',
 'total_stock_value',
 'exercised_stock_options',
 'bonus',
 'bonus_total_pay_ratio',
 'salary',
 'total_compensation',
 'long_term_incentive_total_pay_ratio',
 'salary_squared',
 'deferred_income',
 'bonus_squared',
 'long_term_incentive',
 'deferred_income_squared',
 'total_payments',
 'restricted_stock',
 'total_poi_interaction',
 'shared_receipt_with_poi']

In [867]:
ET_selection = ExtraTreesClassifier(n_estimators=1000).fit(standardized, y_original)
#print ET_selection.feature_importances_

ET_new_X = selection.transform(standardized)

# Create a pd.DataFrame of the names and importances
scores = pd.DataFrame(ET_selection.feature_importances_, index=X_names)
#scores = scores.T

scores.columns = ['Importance']
scores = scores.sort(['Importance'], ascending=False)
print "TOP10: \n", list(scores.index[0:9])
print scores
scores.sort(['Importance'], ascending=True).plot(kind='barh')


TOP10: 
['exercised_stock_options_squared', 'exercised_stock_options', 'total_stock_value', 'bonus_total_pay_ratio', 'long_term_incentive_total_pay_ratio', 'bonus', 'total_compensation', 'bonus_squared', 'deferred_income']
                                                    Importance
exercised_stock_options_squared                       0.043068
exercised_stock_options                               0.037059
total_stock_value                                     0.033604
bonus_total_pay_ratio                                 0.029329
long_term_incentive_total_pay_ratio                   0.028237
bonus                                                 0.027221
total_compensation                                    0.024189
bonus_squared                                         0.023910
deferred_income                                       0.023385
deferred_income_squared                               0.023150
from_messages_from_poi_ratio                          0.022344
to_messages_to_poi_ratio                              0.020644
other_total_pay_ratio                                 0.020500
expenses                                              0.019571
bonus_by_total_stock                                  0.019283
salary_squared                                        0.019044
other                                                 0.018869
restricted_stock                                      0.018736
other_squared                                         0.018377
salary                                                0.018345
from_this_person_to_poi_squared                       0.018232
other_total_compensation_ratio                        0.018150
restricted_stock_total_stock_ratio                    0.017540
shared_poi_from_messages_ratio                        0.017433
expenses_squared                                      0.017022
restricted_stock_squared                              0.017019
exercised_stock_options_total_stock_ratio             0.016967
shared_receipt_with_poi                               0.016761
from_this_person_to_poi_total_poi_int_ratio           0.016709
bonus_total_compensation_ratio                        0.016191
...                                                        ...
from_poi_to_this_person                               0.013841
total_active_poi_interaction                          0.013830
expenses_total_compensation_ratio                     0.012826
to_poi_total_active_poi_ratio                         0.012313
from_poi_to_this_person_squared                       0.012243
long_term_incentive_total_compensation_ratio          0.012142
from_poi_total_active_poi_ratio                       0.011374
shared_receipt_with_poi_total_poi_int_ratio           0.010444
shared_poi_total_compensation                         0.009914
from_poi_to_this_person_total_poi_int_ratio           0.009603
to_messages                                           0.009144
from_messages                                         0.007447
deferral_payments_total_compensation_ratio            0.006714
deferral_payments                                     0.006280
deferral_payments_total_pay_ratio                     0.006070
deferral_payments_squared                             0.005598
loan_advances                                         0.002572
loan_advances_squared                                 0.002287
loan_advances_total_pay_ratio                         0.001858
loan_advances_total_compensation_ratio                0.001720
restricted_stock_deferred                             0.001679
restricted_stock_deferred_squared                     0.001584
director_fees                                         0.000214
director_fees_squared                                 0.000166
director_fees_total_pay_ratio                         0.000007
restricted_stock_deferred_total_stock_ratio           0.000000
deferred_income_total_pay_ratio                       0.000000
director_fees_total_compensation_ratio                0.000000
deferred_income_total_compensation_ratio              0.000000
restricted_stock_deferred_total_compensation_ratio    0.000000

[71 rows x 1 columns]
Out[867]:
<matplotlib.axes._subplots.AxesSubplot at 0x31489ac8>

In [868]:



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-868-af103a5dbfcd> in <module>()
----> 1 topKBest = list(scores.Features[0:17])
      2 topKBest

c:\Anaconda\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
   1841                 return self[name]
   1842             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1843                                  (type(self).__name__, name))
   1844 
   1845     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'Features'

In [ ]:


In [503]:
for i in range(10): 
    sys.stdout.write('{0}..'.format(i)) 
    sys.stdout.flush() 
    time.sleep(.1)


0..1..2..3..4..5..6..7..8..9..

In [6]:
# Replace with index watcher
# A quick look at the original finanical spreadsheet shows TOTAL at the bottom 
# totaling all entries for everyone. This is obviously an outlier with no 
# meaningful information and can be removed.

# df[df['salary'] > 1000000]
# df[df.index == 'TOTAL']
df = df.drop('TOTAL', axis=0)

In [ ]:

By default, the GridSearchCV uses a 3-fold cross-validation. However, if it detects that a classifier is passed, rather than a regressor, it uses a stratified 3-fold.

http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html

Remove columns with less than 50% of entries present.

Remove rows with no non-NA values


In [7]:
# low_var_remover = VarianceThreshold(threshold=.5)

In [8]:
# ************************
# Encode as 0 instead.
# Remove columns with more than 50% NA's
# df_50 = df.dropna(axis=1, thresh=len(df)/2)
# ************************

# Since email_address and poi are True/False, every record should have at least 2 non-NA.
# We'll next remove any rows that don't have at least 2 non-NA values besides these.
# The criteria is: No more than 11 NA's per row.
# df_50 = df_50.dropna(axis=0, thresh=5)

# 128 records remain.
# df_50.info()

Financial NA's

When looking at the source of the data, the NA entries in the financial data seem values that are reported as zero since all payments/stock values add up to the total payments/stocks values. These NA values should then be set to 0 to add up to the totals reported by the accounting spreadsheet.

Email statistics NA's

The missing values for NA's for email statistics may be a little more subjective.

  1. Some email statistics are features created with prior knowledge of the entire dataset (i.e. emails to/from poi's). This may be data snooping, since if new data/pois were somehow introduced, it would not be possible to generate these features without prior knowledge of which new data were the poi's.

  2. NA's here imply that the person did not have an email account with Enron, or were not involved in emailing by some other way.

This means all email data features ar NA if even one column had missing email data for that person. It is hard to judge any distribution that they could have if they were given an email account since they have no ties to the financial data to infer distributions.

We have no real way to infer a person having sent/recieved 10 emails or 10,000 from completely unrelated financial data from a different dataset with many different people.

For this reason, these NA will also be encoded as 0.


In [9]:
df = df.apply(lambda x: x.fillna(0), axis=0)

In [ ]:

Imputation


In [570]:
import seaborn as sns
sns.set(style='darkgrid')

f, ax = plt.subplots(figsize=(14, 14))
cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.corrplot(df.corr(), annot=True, sig_stars=False,
             diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()



In [587]:
corrs = df.corr()
corrs.sort(['poi'], ascending=False)['poi']


Out[587]:
poi                                                   1.000000
ex_stock_squared                                      0.387501
total_stock_value                                     0.372603
exercised_stock_options                               0.371336
bonus                                                 0.360262
bonus_total_pay_ratio                                 0.359778
salary                                                0.341365
total_compensation                                    0.329207
long_term_incentive_total_pay_ratio                   0.300332
long_term_incentive                                   0.258301
total_payments                                        0.249394
restricted_stock                                      0.244578
total_poi_interaction                                 0.242457
shared_receipt_with_poi                               0.242105
loan_advances                                         0.220405
bonus_total_compensation_ratio                        0.214250
shared_poi_from_messages_ratio                        0.199026
expenses                                              0.195503
from_poi_to_this_person                               0.191549
loan_advances_total_pay_ratio                         0.191546
from_messages_from_poi_ratio                          0.188969
total_active_poi_interaction                          0.184635
bonus_by_total_stock                                  0.183942
loan_advances_total_compensation_ratio                0.177439
from_poi_total_active_poi_ratio                       0.172996
other                                                 0.170443
to_messages_to_poi_ratio                              0.169767
salary_total_pay_ratio                                0.140007
shared_receipt_with_poi_total_poi_int_ratio           0.133709
from_this_person_to_poi                               0.130319
long_term_incentive_total_compensation_ratio          0.120949
to_messages                                           0.110006
from_poi_to_this_person_total_poi_int_ratio           0.101335
from_this_person_to_poi_total_poi_int_ratio           0.094211
restricted_stock_total_stock_ratio                    0.090765
to_poi_total_active_poi_ratio                         0.089748
other_total_pay_ratio                                 0.072171
restricted_stock_deferred                             0.071629
other_total_compensation_ratio                        0.013383
salary_total_compensation_ratio                      -0.002503
shared_poi_total_compensation                        -0.005917
exercised_stock_options_total_compensation_ratio     -0.013675
restricted_stock_total_compensation_ratio            -0.015168
exercised_stock_options_total_stock_ratio            -0.017560
expenses_total_pay_ratio                             -0.028187
restricted_stock_deferred_total_compensation_ratio   -0.031373
from_messages                                        -0.033302
deferral_payments                                    -0.038635
director_fees_total_compensation_ratio               -0.039031
director_fees_total_pay_ratio                        -0.039176
deferral_payments_total_compensation_ratio           -0.085954
expenses_total_compensation_ratio                    -0.090538
deferral_payments_total_pay_ratio                    -0.109995
director_fees                                        -0.120144
deferred_income                                      -0.274998
deferred_income_total_pay_ratio                            NaN
restricted_stock_deferred_total_stock_ratio                NaN
deferred_income_total_compensation_ratio                   NaN
Name: poi, Length: 58, dtype: float64

In [11]:
# Pick a column which we are predicting.
# Find other variables correlated to used KMeansNeighborsRegression to predict/impute
# the missing values.
# df_50.corr().ix[: ,'salary']

In [12]:
# cols1 = ['salary', 'other', 'total_stock_value', 'exercised_stock_options', 
#        'total_payments', 'restricted_stock']
# Bonus and salary values don't seem to be missing at random. Anytime there is a null value
# for salary, there is also one for bonus. So bonus can't be used to predict salary on
# the first pass. Predicted salary values will be used to predict bonus values though 
# on a second pass.
# cols2= ['salary', 'other', 'total_stock_value', 'exercised_stock_options', 
#        'total_payments', 'restricted_stock', 'bonus']
# cols3 = ['to_messages', 'from_this_person_to_poi', 'from_messages', 
# 'shared_receipt_with_poi', 'from_poi_to_this_person']

In [13]:
def kcluster_null(df=None, cols=None, process_all=True):
    '''
    Input: Takes pandas dataframe with values to impute, and a list of columns to impute
        and use for imputing.
    Returns: Pandas dataframe with null values imputed for list of columns passed in.
    
    # Ideally columns should be somewhat correlated since they will be used in KNN to
    # predict each other, one column at a time.
    
    '''
    
    # Create a KNN regression estimator for 
    income_imputer = KNeighborsRegressor(n_neighbors=1)
    # Loops through the columns passed in to impute each one sequentially.
    
    if not process_all:
        to_pred = cols[0]
        predictor_cols = cols[1:]
        
        
    for each in cols:
        # Create a temp list that does not include the column being predicted.
        temp_cols = [col for col in cols if col != each]
        # Create a dataframe that contains no missing values in the columns being predicted.
        # This will be used to train the KNN estimator.
        df_col = df[df[each].isnull()==False]
        
        # Create a dataframe with all of the nulls in the column being predicted.
        df_null_col = df[df[each].isnull()==True]
        
        # Create a temp dataframe filling in the medians for each column being used to
        # predict that is missing values.
        # This step is needed since we have so many missing values distributed through 
        # all of the columns.
        temp_df_medians = df_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        
        # Fit our KNN imputer to this dataframe now that we have values for every column.
        income_imputer.fit(temp_df_medians, df_col[each])
        
        # Fill the df (that has null values being predicted) with medians in the other
        # columns not being predicted.
        # ** This currently uses its own medians and should ideally use the predictor df's
        # ** median values to fill in NA's of columns being used to predict.
        temp_null_medians = df_null_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        
        # Predict the null values for the current 'each' variable.
        new_values = income_imputer.predict(temp_null_medians[temp_cols])

        # Replace the null values of the original null dataframe with the predicted values.
        df_null_col[each] = new_values
        
        # Append the new predicted nulls dataframe to the dataframe which containined
        # no null values.
        # Overwrite the original df with this one containing predicted columns. 
        # Index order will not be preserved since it is rearranging each time by 
        # null values.
        df = df_col.append(df_null_col)
        
    # Returned final dataframe sorted by the index names.
    return df.sort_index(axis=0)

In [ ]:


In [14]:
df.irow(127)


Out[14]:
salary                            0
to_messages                       0
deferral_payments                 0
total_payments               362096
exercised_stock_options           0
bonus                             0
restricted_stock                  0
shared_receipt_with_poi           0
restricted_stock_deferred         0
total_stock_value                 0
expenses                          0
loan_advances                     0
from_messages                     0
other                        362096
from_this_person_to_poi           0
poi                           False
director_fees                     0
deferred_income                   0
long_term_incentive               0
email_address                 False
from_poi_to_this_person           0
Name: THE TRAVEL AGENCY IN THE PARK, dtype: object

In [15]:
#cols = [x for x in df.columns]
#for each in cols:
#    g = sns.FacetGrid(df, col='poi', margin_titles=True, size=6)
#    g.map(plt.hist, each, color='steelblue')

In [16]:
from pandas.tools.plotting import scatter_matrix

In [17]:
list(df.columns)


Out[17]:
['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'email_address',
 'from_poi_to_this_person']

In [18]:
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive'])

email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[financial_cols], df['poi'])


Out[20]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0)

In [21]:
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

In [22]:
padding = np.arange(len(financial_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, financial_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [23]:
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[email_cols], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(email_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, email_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [24]:
all_cols = np.concatenate([email_cols, financial_cols])
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(df[all_cols], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(all_cols)) + 0.5
plt.figure(figsize=(14, 12))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, all_cols[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()



In [25]:
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)


Out[25]:
0     118
1      10
2       6
3       4
8       2
6       2
14      1
13      1
4       1
dtype: int64

In [26]:
df.exercised_stock_options.plot()


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a277b70>

In [27]:
def capValues(x, cap):
    return (cap if x > cap else x)

In [28]:
df.exercised_stock_options = df.exercised_stock_options.apply(lambda x: capValues(x, 5000000))

In [29]:
df['ex_stock_bins'] = pd.cut(df.exercised_stock_options, bins=15, labels=False)
pd.value_counts(df.ex_stock_bins)


Out[29]:
0     60
1     18
14    17
2     13
4     12
6      7
3      5
5      4
12     3
13     2
9      2
7      2
dtype: int64

In [30]:
df[['ex_stock_bins', 'poi']].groupby('ex_stock_bins').mean().plot()


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x187cbda0>

In [31]:
df.columns


Out[31]:
Index([u'salary', u'to_messages', u'deferral_payments', u'total_payments', u'exercised_stock_options', u'bonus', u'restricted_stock', u'shared_receipt_with_poi', u'restricted_stock_deferred', u'total_stock_value', u'expenses', u'loan_advances', u'from_messages', u'other', u'from_this_person_to_poi', u'poi', u'director_fees', u'deferred_income', u'long_term_incentive', u'email_address', u'from_poi_to_this_person', u'ex_stock_bins'], dtype='object')

In [32]:
df[['bonus', 'poi']].groupby('bonus').mean().plot()


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a622438>

In [33]:
df.shared_receipt_with_poi.plot()


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a5990f0>

In [34]:
max(df.shared_receipt_with_poi)


Out[34]:
5521.0

In [35]:
# Create bins for shared receipt with poi
my_bins = [min(df.shared_receipt_with_poi)] + [250] + range(500, 5000, 500) + [max(df.shared_receipt_with_poi)]
df['shared_poi_bins'] = pd.cut(df.shared_receipt_with_poi, bins=my_bins, labels=False, include_lowest=True)
pd.value_counts(df['shared_poi_bins'])


Out[35]:
0     81
2     19
5     11
3      9
1      9
4      6
8      4
6      4
10     2
dtype: int64

In [ ]:


In [36]:
df[['shared_poi_bins', 'poi']].groupby('shared_poi_bins').mean().plot()


Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a80f400>

In [37]:
df.total_stock_value


Out[37]:
ALLEN PHILLIP K          1729541
BADUM JAMES P             257817
BANNANTINE JAMES M       5243487
BAXTER JOHN C           10623258
BAY FRANKLIN R             63014
BAZELIDES PHILIP J       1599641
BECK SALLY W              126027
BELDEN TIMOTHY N         1110705
BELFER ROBERT                  0
BERBERIAN DAVID          2493616
BERGSIEKER RICHARD P      659249
BHATNAGAR SANJAY        15456290
BIBI PHILIPPE A          1843816
BLACHMAN JEREMY M         954354
BLAKE JR. NORMAN P             0
...
UMANOFF ADAM S                  0
URQUHART JOHN A                 0
WAKEHAM JOHN                    0
WALLS JR ROBERT H         5898997
WALTERS GARETH W          1030329
WASAFF GEORGE             2056427
WESTFAHL RICHARD K         384930
WHALEY DAVID A              98718
WHALLEY LAWRENCE G        6079137
WHITE JR THOMAS E        15144123
WINOKUR JR. HERBERT S           0
WODRASKA JOHN                   0
WROBEL BRUCE               139130
YEAGER F SCOTT           11884758
YEAP SOON                  192758
Name: total_stock_value, Length: 145, dtype: float64

In [ ]:


In [38]:
from sklearn.preprocessing import StandardScaler

df['total_stock_scaled'] = StandardScaler().fit_transform(df[['total_stock_value']])
df['bonus_scaled'] = StandardScaler().fit_transform(df[['bonus']])

print df.total_stock_scaled.describe() plt.hist(df.total_stock_scaled)


In [39]:
def dont_neg_log(x):
    if x >=0:
        return np.log1p(x)
    else:
        return 0
    
df['stock_log'] = df['total_stock_value'].apply(lambda x: dont_neg_log(x))

Feature Ratio Creation


In [561]:
financial_cols = np.array(['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive'])

email_cols = np.array(['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address'])

In [562]:
payment_comp = ['salary', 'deferral_payments','bonus', 'expenses', 'loan_advances',
                'other', 'director_fees', 'deferred_income', 'long_term_incentive']
payment_total = ['total_payments']

stock_comp = ['exercised_stock_options', 'restricted_stock','restricted_stock_deferred',]
stock_total = ['total_stock_value']

all_comp = payment_comp + stock_comp

email_comp = ['shared_receipt_with_poi', 'from_this_person_to_poi', 'from_poi_to_this_person' ]
email_totals = ['from_messages', 'to_messages'] # interaction_w_poi = total(from/to/shared poi)

In [ ]:


In [636]:
df['total_compensation'] = df['total_payments'] + df['total_stock_value']

for each in payment_comp:
    df['{0}_{1}_ratio'.format(each, 'total_pay')] = df[each]/df['total_payments']

for each in stock_comp:
    df['{0}_{1}_ratio'.format(each, 'total_stock')] = df[each]/df['total_stock_value']

for each in all_comp:
    df['{0}_{1}_ratio'.format(each, 'total_compensation')] = df[each]/df['total_compensation']

    
df['total_poi_interaction'] = df['shared_receipt_with_poi'] + df['from_this_person_to_poi'] + \
df['from_poi_to_this_person']

for each in email_comp:
    df['{0}_{1}_ratio'.format(each, 'total_poi_int')] = df[each]/df['total_poi_interaction']

df['total_active_poi_interaction'] = df['from_this_person_to_poi'] + df['from_poi_to_this_person']
df['to_poi_total_active_poi_ratio'] = df['from_this_person_to_poi']/df['total_active_poi_interaction']
df['from_poi_total_active_poi_ratio'] = df['from_poi_to_this_person']/df['total_active_poi_interaction']

df['to_messages_to_poi_ratio'] = df['from_this_person_to_poi']/ df['to_messages']
df['from_messages_from_poi_ratio'] = df['from_poi_to_this_person']/df['from_messages']
df['shared_poi_from_messages_ratio'] = df['shared_receipt_with_poi']/df['from_messages']
df['shared_poi_total_compensation'] = df['shared_receipt_with_poi']/df['total_compensation']
df['bonus_by_total_stock'] = df['bonus']/df['total_stock_value']

## Add squared features
for each in all_comp:
    df['{0}_squared'.format(each)] = df[each]**2
    
for each in email_comp:
    df['{0}_squared'.format(each)] = df[each]**2

A good portion of people were paid either only in stock or payments. Another good portion also didn't have email statistics available.

These ratios will need to be set to zero manually due to division by 0 - NaN.


In [640]:
df = df.apply(lambda x: x.fillna(0), axis=0)

In [644]:



<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, ALLEN PHILLIP K to YEAP SOON
Data columns (total 73 columns):
salary                                                145 non-null int64
to_messages                                           145 non-null int64
deferral_payments                                     145 non-null int64
total_payments                                        145 non-null int64
exercised_stock_options                               145 non-null int64
bonus                                                 145 non-null int64
restricted_stock                                      145 non-null int64
shared_receipt_with_poi                               145 non-null int64
restricted_stock_deferred                             145 non-null int64
total_stock_value                                     145 non-null int64
expenses                                              145 non-null int64
loan_advances                                         145 non-null int64
from_messages                                         145 non-null int64
other                                                 145 non-null int64
from_this_person_to_poi                               145 non-null int64
poi                                                   145 non-null bool
director_fees                                         145 non-null int64
deferred_income                                       145 non-null int64
long_term_incentive                                   145 non-null int64
from_poi_to_this_person                               145 non-null int64
total_compensation                                    145 non-null int64
salary_total_pay_ratio                                145 non-null float64
deferral_payments_total_pay_ratio                     145 non-null float64
bonus_total_pay_ratio                                 145 non-null float64
expenses_total_pay_ratio                              145 non-null float64
loan_advances_total_pay_ratio                         145 non-null float64
other_total_pay_ratio                                 145 non-null float64
director_fees_total_pay_ratio                         145 non-null float64
deferred_income_total_pay_ratio                       145 non-null float64
long_term_incentive_total_pay_ratio                   145 non-null float64
exercised_stock_options_total_stock_ratio             145 non-null float64
restricted_stock_total_stock_ratio                    145 non-null float64
restricted_stock_deferred_total_stock_ratio           145 non-null float64
salary_total_compensation_ratio                       145 non-null float64
deferral_payments_total_compensation_ratio            145 non-null float64
bonus_total_compensation_ratio                        145 non-null float64
expenses_total_compensation_ratio                     145 non-null float64
loan_advances_total_compensation_ratio                145 non-null float64
other_total_compensation_ratio                        145 non-null float64
director_fees_total_compensation_ratio                145 non-null float64
deferred_income_total_compensation_ratio              145 non-null float64
long_term_incentive_total_compensation_ratio          145 non-null float64
exercised_stock_options_total_compensation_ratio      145 non-null float64
restricted_stock_total_compensation_ratio             145 non-null float64
restricted_stock_deferred_total_compensation_ratio    145 non-null float64
total_poi_interaction                                 145 non-null int64
shared_receipt_with_poi_total_poi_int_ratio           145 non-null float64
from_this_person_to_poi_total_poi_int_ratio           145 non-null float64
from_poi_to_this_person_total_poi_int_ratio           145 non-null float64
total_active_poi_interaction                          145 non-null int64
to_poi_total_active_poi_ratio                         145 non-null float64
from_poi_total_active_poi_ratio                       145 non-null float64
to_messages_to_poi_ratio                              145 non-null float64
from_messages_from_poi_ratio                          145 non-null float64
shared_poi_from_messages_ratio                        145 non-null float64
shared_poi_total_compensation                         145 non-null float64
bonus_by_total_stock                                  145 non-null float64
ex_stock_squared                                      145 non-null int64
salary_squared                                        145 non-null int64
deferral_payments_squared                             145 non-null int64
bonus_squared                                         145 non-null int64
expenses_squared                                      145 non-null int64
loan_advances_squared                                 145 non-null int64
other_squared                                         145 non-null int64
director_fees_squared                                 145 non-null int64
deferred_income_squared                               145 non-null int64
long_term_incentive_squared                           145 non-null int64
exercised_stock_options_squared                       145 non-null int64
restricted_stock_squared                              145 non-null int64
restricted_stock_deferred_squared                     145 non-null int64
shared_receipt_with_poi_squared                       145 non-null int64
from_this_person_to_poi_squared                       145 non-null int64
from_poi_to_this_person_squared                       145 non-null int64
dtypes: bool(1), float64(34), int64(38)

In [565]:
df[['poi', 'director_fees_total_pay_ratio', 'director_fees', 'total_payments']]


Out[565]:
poi director_fees_total_pay_ratio director_fees total_payments
ALLEN PHILLIP K False 0.000000 0 4484442
BADUM JAMES P False 0.000000 0 182466
BANNANTINE JAMES M False 0.000000 0 916197
BAXTER JOHN C False 0.000000 0 5634343
BAY FRANKLIN R False 0.000000 0 827696
BAZELIDES PHILIP J False 0.000000 0 860136
BECK SALLY W False 0.000000 0 969068
BELDEN TIMOTHY N True 0.000000 0 5501630
BELFER ROBERT False 31.202435 102500 3285
BERBERIAN DAVID False 0.000000 0 228474
BERGSIEKER RICHARD P False 0.000000 0 618850
BHATNAGAR SANJAY False 0.000000 0 137864
BIBI PHILIPPE A False 0.000000 0 2047593
BLACHMAN JEREMY M False 0.000000 0 2014835
BLAKE JR. NORMAN P False 88.963253 113784 1279
BOWEN JR RAYMOND M True 0.000000 0 2669589
BROWN MICHAEL False 0.000000 0 49288
BUCHANAN HAROLD G False 0.000000 0 1054637
BUTTS ROBERT H False 0.000000 0 1271582
BUY RICHARD B False 0.000000 0 2355702
CALGER CHRISTOPHER F True 0.000000 0 1639297
CARTER REBECCA C False 0.000000 0 477557
CAUSEY RICHARD A True 0.000000 0 1868758
CHAN RONNIE False inf 98784 0
CHRISTODOULOU DIOMEDES False inf 0 0
CLINE KENNETH W False inf 0 0
COLWELL WESLEY True 0.000000 0 1490344
CORDES WILLIAM R False inf 0 0
COX DAVID False 0.000000 0 1101393
CUMBERLAND MICHAEL S False 0.000000 0 807956
... ... ... ... ...
SCRIMSHAW MATTHEW False inf 0 0
SHANKMAN JEFFREY A False 0.000000 0 3038702
SHAPIRO RICHARD S False 0.000000 0 1057548
SHARP VICTORIA T False 0.000000 0 1576511
SHELBY REX True 0.000000 0 2003885
SHERRICK JEFFREY B False inf 0 0
SHERRIFF JOHN R False 0.000000 0 4335388
SKILLING JEFFREY K True 0.000000 0 8682716
STABLER FRANK False 0.000000 0 1112087
SULLIVAN-SHAKLOVITZ COLLEEN False 0.000000 0 999356
SUNDE MARTIN False 0.000000 0 1545059
TAYLOR MITCHELL S False 0.000000 0 1092663
THE TRAVEL AGENCY IN THE PARK False 0.000000 0 362096
THORN TERENCE H False 0.000000 0 911453
TILNEY ELIZABETH A False 0.000000 0 399393
UMANOFF ADAM S False 0.000000 0 1130461
URQUHART JOHN A False 0.160354 36666 228656
WAKEHAM JOHN False 0.512965 109298 213071
WALLS JR ROBERT H False 0.000000 0 1798780
WALTERS GARETH W False 0.000000 0 87410
WASAFF GEORGE False 0.000000 0 1034395
WESTFAHL RICHARD K False 0.000000 0 762135
WHALEY DAVID A False inf 0 0
WHALLEY LAWRENCE G False 0.000000 0 4677574
WHITE JR THOMAS E False 0.000000 0 1934359
WINOKUR JR. HERBERT S False 1.277520 108579 84992
WODRASKA JOHN False 0.000000 0 189583
WROBEL BRUCE False inf 0 0
YEAGER F SCOTT True 0.000000 0 360300
YEAP SOON False 0.000000 0 55097

145 rows × 4 columns


In [566]:
df[df['poi']==True]


Out[566]:
salary to_messages deferral_payments total_payments exercised_stock_options bonus restricted_stock shared_receipt_with_poi restricted_stock_deferred total_stock_value ... from_this_person_to_poi_total_poi_int_ratio from_poi_to_this_person_total_poi_int_ratio total_active_poi_interaction to_poi_total_active_poi_ratio from_poi_total_active_poi_ratio to_messages_to_poi_ratio from_messages_from_poi_ratio shared_poi_from_messages_ratio shared_poi_total_compensation bonus_by_total_stock
BELDEN TIMOTHY N 213999 7991 2144013 5501630 953136 5249999 157569 5521 0 1110705 ... 0.018439 0.038928 336 0.321429 0.678571 0.013515 0.471074 11.407025 0.000835 4.726727
BOWEN JR RAYMOND M 278601 1858 0 2669589 0 1350000 252055 1593 0 252055 ... 0.008581 0.080092 155 0.096774 0.903226 0.008073 5.185185 59.000000 0.000545 5.355974
CALGER CHRISTOPHER F 240189 2598 0 1639297 0 1250000 126027 2188 0 126027 ... 0.010365 0.082504 224 0.111607 0.888393 0.009623 1.381944 15.194444 0.001239 9.918510
CAUSEY RICHARD A 415189 1892 0 1868758 0 1000000 2502063 1585 0 2502063 ... 0.007251 0.035045 70 0.171429 0.828571 0.006342 1.183673 32.346939 0.000363 0.399670
COLWELL WESLEY 288542 1758 27610 1490344 0 1200000 698242 1132 0 698242 ... 0.007954 0.173536 251 0.043825 0.956175 0.006257 6.000000 28.300000 0.000517 1.718602
DELAINEY DAVID W 365163 3093 0 4747979 2291113 3000000 1323148 2097 0 3614261 ... 0.219697 0.023810 675 0.902222 0.097778 0.196896 0.021505 0.683284 0.000251 0.830045
FASTOW ANDREW S 440698 0 0 2424083 0 1300000 1794412 0 0 1794412 ... inf inf 0 inf inf inf inf inf 0.000000 0.724471
GLISAN JR BEN F 274975 873 0 1272284 384728 600000 393818 874 0 778546 ... 0.006438 0.055794 58 0.103448 0.896552 0.006873 3.250000 54.625000 0.000426 0.770667
HANNON KEVIN P 243293 1045 0 288682 5538001 1500000 853064 1035 0 6391065 ... 0.019301 0.029412 53 0.396226 0.603774 0.020096 1.000000 32.343750 0.000155 0.234703
HIRKO JOSEPH 0 0 10259 91093 30766064 0 0 0 0 30766064 ... inf inf 0 inf inf inf inf inf 0.000000 0.000000
KOENIG MARK E 309946 2374 0 1587421 671737 700000 1248318 2271 0 1920055 ... 0.006413 0.022659 68 0.220588 0.779412 0.006318 0.868852 37.229508 0.000647 0.364573
KOPPER MICHAEL J 224305 0 0 2652612 0 800000 985032 0 0 985032 ... inf inf 0 inf inf inf inf inf 0.000000 0.812156
LAY KENNETH L 1072321 4273 202911 103559793 34348384 7000000 14761694 2411 0 49110078 ... 0.006275 0.048235 139 0.115108 0.884892 0.003744 3.416667 66.972222 0.000016 0.142537
RICE KENNETH D 420636 905 0 505050 19794175 1750000 2748364 864 0 22542539 ... 0.004396 0.046154 46 0.086957 0.913043 0.004420 2.333333 48.000000 0.000037 0.077631
RIEKER PAULA H 249201 1328 214678 1099100 1635238 700000 283649 1258 0 1918887 ... 0.035794 0.026100 83 0.578313 0.421687 0.036145 0.426829 15.341463 0.000417 0.364795
SHELBY REX 211844 225 0 2003885 1624396 200000 869220 91 0 2493616 ... 0.118644 0.110169 27 0.518519 0.481481 0.062222 0.333333 2.333333 0.000020 0.080205
SKILLING JEFFREY K 1111258 3627 0 8682716 19250000 5600000 6843672 2042 0 26093672 ... 0.013889 0.040741 118 0.254237 0.745763 0.008271 0.814815 18.907407 0.000059 0.214611
YEAGER F SCOTT 158403 0 0 360300 8308552 0 3576206 0 0 11884758 ... inf inf 0 inf inf inf inf inf 0.000000 0.000000

18 rows × 57 columns

director_fees_total_pay_ratio, deferred_income_total_pay_ratio, exercised_stock_options_total_stock_ratio, exercised_stock_options_total_stock_ratio, restricted_stock_deferred_total_stock_ratio, restricted_stock_total_stock_ratio, director_fees_total_compensation_ratio, deferred_income_total_compensation_ratio, restricted_stock_total_compensation_ratio, restricted_stock_deferred_total_compensation_ratio

Replace Inf/-Inf created in pandas from dividing a -/+ number by zero


In [647]:
df = df.replace([np.inf, -np.inf], 0)

In [47]:
#df.ix[20:30, 30:40]

In [48]:
# Column/row slicing by number
# df.ix[11,:]

In [796]:
#all_cols2 = np.concatenate([all_cols, np.array(['shared_poi_bins', 'ex_stock_bins', 
#                                                'total_stock_scaled', 'bonus_scaled',
#                                                'stock_log'])])
# from_messages_from_poi_to_ratio

features = np.array(df.drop('poi', axis=1).columns)

clf = ExtraTreesClassifier(n_estimators=3000)
clf.fit(df[features], df['poi'])

importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
plt.figure(figsize=(16,14))
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, features[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

top10_features_RF = ['bonus', 'total_stock_value', 'other', 'total_compensation', 'expenses',
                 'other_total_pay_ratio', 'from_messages_from_poi_ratio', 'restricted_stock',
                 'shared_poi_from_messages_ratio', 'total_payments']

top10_features_ET = ['exercised_stock_options_squared', 'total_stock_value', 'bonus_total_pay_ratio', 
                     'long_term_incentive_total_pay_ratio', 'bonus', 'deferred_income',
                     'total_compensation', 'to_messages_to_poi_ratio',
                     'from_messages_from_poi_ratio', 'to_messages_to_poi_ratio', 'other_total_pay_ratio',
                     'salary_squared', 'other']



In [ ]:


In [569]:
confusion_matrix(df['poi'], clf.predict(df[features]))


Out[569]:
array([[127,   0],
       [  0,  18]])

In [206]:
#X_df = df.drop('poi', axis=1)
#y_df = df['poi']
#selector = SelectKBest(k=12, score_func=f_classif)
#selector = selector.fit_transform(X_df, y_df)
#selector


Out[206]:
array([[201955.0, 4484442.0, 4175000.0, ..., 6213983.0, 0.9309965431596617,
        0.06796943744617502],
       [0.0, 182466.0, 0.0, ..., 440283.0, 0.0, 0.0],
       [477.0, 916197.0, 0.0, ..., 6159684.0, 0.0, 0.0],
       ..., 
       [0.0, 0.0, 0.0, ..., 139130.0, 0.0, 0.0],
       [158403.0, 360300.0, 0.0, ..., 12245058.0, 0.0, 0.0],
       [0.0, 55097.0, 0.0, ..., 247855.0, 0.0, 0.0]], dtype=object)

Train


In [ ]:
FINANCIAL_FIELDS = ['salary', 'deferral_payments', 'total_payments', 'exercised_stock_options', 
                  'bonus', 'restricted_stock', 'restricted_stock_deferred', 'total_stock_value',
                  'expenses', 'loan_advances', 'other', 'director_fees', 'deferred_income', 
                  'long_term_incentive', 'ex_stock_bins', 'stock_log']

EMAIL_FIELDS = ['from_messages', 'to_messages', 'shared_receipt_with_poi', 
              'from_this_person_to_poi', 'from_poi_to_this_person', 'email_address',
              'shared_poi_bins']

In [222]:
class ColumnExtractor(TransformerMixin):
    '''
    Columns extractor transformer for sklearn pipelines.
    Inherits fit_transform() from TransformerMixin, but this is explicitly
    defined here for clarity.
    
    Methods to extract pandas dataframe columns are defined for this class.
    
    '''
    def __init__(self, columns=[]):
        self.columns = columns
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    
    def transform(self, X, **transform_params):
        '''
        Input: A pandas dataframe and a list of column names to extract.
        Output: A pandas dataframe containing only the columns of the names passed in.
        '''
        return X[self.columns]
    
    def fit(self, X, y=None, **fit_params):
        return self
    
    def get_params(self, deep=True):
        """Get parameters for this estimator.
        Parameters
        ----------
        deep: boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.
        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        return self

In [545]:
top10_features_ET
top10 = ['exercised_stock_options', 'total_stock_value', 'bonus', 'salary', 'deferred_income', 
        'long_term_incentive', 'restricted_stock', 'total_payments', 'loan_advances',
         'shared_receipt_with_poi','total_compensation', 'from_messages_from_poi_ratio']

In [ ]:


In [936]:
#X_df = df[['total_payments', 'total_stock_value', 'shared_receipt_with_poi', 'bonus']].astype(float)
X_df = df.drop('poi', axis=1).astype(float)
#X_df = df[top10_features_ET]

#X_df = df[topKBest].astype(float)
y_df = df['poi']

y_df = y_df[X_df.abs().sum(axis=1) != 0]
X_df = X_df[X_df.abs().sum(axis=1) != 0]


from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.linear_model import Lars
from sklearn.linear_model import SGDClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report


sk_fold = StratifiedShuffleSplit(y_df, n_iter=100, test_size=0.1) 
        
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy="median", verbose=0)),
                           #('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
                           ('minmaxer', MinMaxScaler()),
                           #('low_var_remover', VarianceThreshold()),
                           ('selection', SelectKBest(score_func=f_classif)),
                           ('reducer', PCA()),
                           #('classifier', LinearSVC(penalty='l1', dual=False)),
                           #('KMeans', KMeans(n_clusters=2))
                           ('classifier', LogisticRegression())
                           #('classifier2', SGDClassifier(n_iter=300))
                                                     ]) # ,
                           #('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
                           #                            criterion='gini', n_estimators=1500, n_jobs=1,
                           #                            oob_score=True, random_state=None, verbose=0,
                           #                            max_features='auto', min_samples_split=2,
                           #                            min_samples_leaf=1))])

                    
params = {
          #'ET__n_estimators': [1500],
          #'ET__max_features': ['auto', None, 3, 5, 10, 20],
          #'ET__min_samples_split': [2, 4, 10],
          #'ET__min_samples_leaf': [1, 2, 5],
          'selection__k': [20, 17, 15],
          'classifier__C': [1, 10, 100, 1000],
          #'classifier2__alpha': [0.0001, 0.001],
          #'classifier2__loss': ['hinge', 'log', 'modified_huber'],
          #'classifier2__class_weight': [{True: 4, False: 1}, {True: 10, False: 1}],
          #'classifier__penalty': ['l1', 'l2'],
          'classifier__class_weight': [{True: 12, False: 1}, {True: 10, False: 1}, {True: 8, False: 1}],
          'classifier__tol': [1e-1, 1e-2, 1e-4, 1e-8, 1e-16, 1e-32],
          'reducer__n_components': [1, 2, 3, 4, 5],
          'reducer__whiten': [True, False]
          #'feature_selection__k': [3, 5, 10, 20]
          #'ET__criterion' : ['gini', 'entropy'],
          #'imputer__strategy': ['median', 'mean'],
          #'low_var_remover__threshold': [0, 0.1, .25, .50, .75, .90, .99]
          }
# Scoring: average_precision, roc_auc, f1, recall, precision
grid_search = GridSearchCV(pipeline, param_grid=params, cv=sk_fold, n_jobs = 1, scoring='f1')
grid_search.fit(X_df, y=y_df)
#test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
    #f1_avg.append(f1_score(y_test, test_pred))
#print "F1: ", f1_score(y_test, test_pred)
#print "Confusion Matrix: "
#print confusion_matrix(y_test, test_pred)
#print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_


Best Estimator:  Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=20, score_func=<function f_classif at 0x0000000016AA6C88>)), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=1, class_weight={False: 1, True: 10}, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.1))])
Best Params:  {'reducer__n_components': 1, 'classifier__class_weight': {False: 1, True: 10}, 'classifier__tol': 0.1, 'selection__k': 20, 'reducer__whiten': True, 'classifier__C': 1}

In [937]:
n_iter = 1000
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
f1_avg = []
recall_avg = []
precision_avg = []
for i, all_index in enumerate(sk_fold):
    train_index = all_index[0]
    test_index = all_index[1]
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]

    grid_search.best_estimator_.fit(X_train, y=y_train)
    # pipeline.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #test_pred = pipeline.predict(X_test)

    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    #print "Best Estimator: ", grid_search.best_estimator_
    #print f1_score(y_test, test_pred)
    if i % round(n_iter/10) == 0:
        sys.stdout.write('{0}%..'.format(float(i)/n_iter*100)) 
        sys.stdout.flush()        
    f1_avg.append(f1_score(y_test, test_pred))
    precision_avg.append(precision_score(y_test, test_pred))
    recall_avg.append(recall_score(y_test, test_pred))

print "Done!"
print ""
print "F1 Avg: ", sum(f1_avg)/n_iter
print "Precision Avg: ", sum(precision_avg)/n_iter
print "Recall Avg: ", sum(recall_avg)/n_iter


0.0%..10.0%..20.0%..30.0%..40.0%..50.0%..60.0%..70.0%..80.0%..90.0%..Done!

F1 Avg:  0.415665673216
Precision Avg:  0.299662842713
Recall Avg:  0.755

In [ ]:

F1 Avg: 0.309882173382 Precision Avg: 0.226065462315

Recall Avg: 0.5515

Best Estimator: Pipeline(steps=[('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('low_var_remover', VarianceThreshold(threshold=0.1)), ('classifier', LinearSVC(C=0.1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-07, verbose=0))]) Best Params: {'classifierclass_weight': 'auto', 'low_var_removerthreshold': 0.1, 'classifierC': 0.1, 'classifiertol': 1e-07}

F1 Avg: 0.39108035853 Precision Avg: 0.263075613276

Recall Avg: 0.8335

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=True)), ('classifier', LogisticRegression(C=0.01, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.01))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': 'auto', 'classifierC': 0.01, 'reducern_components': 5, 'classifier__tol': 0.01}

F1 Avg: 0.408565806416 Precision Avg: 0.301739249639

Recall Avg: 0.725

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=5, whiten=False)), ('classifier2', SGDClassifier(alpha=0.0001, class_weight='auto', epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=300, n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'reducern_components': 5, 'classifier2alpha': 0.0001, 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'reducerwhiten': False, 'classifier2penalty': 'elasticnet'}

F1 Avg: 0.293634931735 Precision Avg: 0.219107395382

Recall Avg: 0.5055

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('classifier', LinearSVC(C=1, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l1', random_state=None, tol=1e-08, verbose=0)), ('classifier2', SGDClassifier(a..., penalty='l2', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False))]) Best Params: {'classifier2alpha': 0.001, 'classifierclass_weight': 'auto', 'classifier2class_weight': 'auto', 'classifier2loss': 'hinge', 'classifiertol': 1e-08, 'classifier2penalty': 'l2', 'classifier__C': 1}

F1 Avg: 0.392249062049 Precision Avg: 0.300678174603

Recall Avg: 0.636

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=4, whiten=True)), ('classifier', LogisticRegression(C=10, class_weight='auto', dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001))]) Best Params: {'reducern_components': 4, 'classifierclass_weight': 'auto', 'classifiertol': 0.0001, 'reducerwhiten': True, 'classifierC': 10, 'classifierpenalty': 'l2'}

F1 Avg: 0.461406277056 Precision Avg: 0.364574206349

Recall Avg: 0.7095

Best Estimator: Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=100, class_weight={False: 1, True: 8}, dual=False, fit_intercept=True, intercept_scaling=1, penalty='l2', random_state=None, tol=0.1))]) Best Params: {'reducerwhiten': True, 'classifierclass_weight': {False: 1, True: 8}, 'classifierC': 100, 'reducern_components': 1, 'classifier__tol': 0.1}


In [ ]:
pipeline = Pipeline(steps=[#('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
                           #('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
                           #('low_var_remover', VarianceThreshold(threshold=0.1)), 
                           #('feature_selection', LinearSVC()),
                           ('features', FeatureUnion([
                                ('financial', Pipeline([
                                    ('extract', ColumnExtractor(FINANCIAL_FIELDS)),
                                    ('scale', StandardScaler()),
                                    ('reduce', LinearSVC())
                                ])),

                                ('email', Pipeline([
                                    ('extract2', ColumnExtractor(EMAIL_FIELDS)),
                                    ('scale2', StandardScaler()),
                                    ('reduce2', LinearSVC())
                                ]))

                            ])),
                           ('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
                                                       criterion='gini', n_estimators=1500, n_jobs=1,
                                                       oob_score=True, random_state=None, verbose=0,
                                                       max_features=None, min_samples_split=2,
                                                       min_samples_leaf=1))
                            ])

In [938]:
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"


def test_classifier(clf, dataset, feature_list, folds = 1000):
    #data = featureFormat(dataset, feature_list, sort_keys = True)
    #labels, features = targetFeatureSplit(data)
    labels = y_df
    features = X_df
    cv = StratifiedShuffleSplit(labels, n_iter=folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        #for ii in train_idx:
        #    features_train.append( features[ii] )
        #    labels_train.append( labels[ii] )
        #for jj in test_idx:
        #    features_test.append( features[jj] )
        #    labels_test.append( labels[jj] )
        features_train, features_test = features.irow(train_index), features.irow(test_index)
        labels_train, labels_test = labels[train_index], labels[test_index]
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            else:
                true_positives += 1
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives

        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        
        precision = 1.0*true_positives/(true_positives+false_positives)
        
        recall = 1.0*true_positives/(true_positives+false_negatives)
        
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
       
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)

        print clf
        print ""
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)

        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf

In [939]:
clf = Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), 
                      ('reducer', PCA(copy=True, n_components=4, whiten=True)), 
                      ('classifier', LogisticRegression(C=10, class_weight='auto',
                                                        dual=False, fit_intercept=True,
                                                        intercept_scaling=1, penalty='l2',
                                                        random_state=None, tol=0.0001))])

In [940]:
#test_classifier(clf, None, None, folds=1000)
test_classifier(grid_search.best_estimator_, None, None, folds=1000)


Pipeline(steps=[('minmaxer', MinMaxScaler(copy=True, feature_range=(0, 1))), ('selection', SelectKBest(k=20, score_func=<function f_classif at 0x0000000016AA6C88>)), ('reducer', PCA(copy=True, n_components=1, whiten=True)), ('classifier', LogisticRegression(C=1, class_weight={False: 1, True: 10}, dual=False,
          fit_intercept=True, intercept_scaling=1, penalty='l2',
          random_state=None, tol=0.1))])

	Accuracy: 0.86667	Precision: 0.50000	Recall: 1.00000	F1: 0.66667	F2: 0.83333
	Total predictions: 15000	True positives: 2000	False positives: 2000	False negatives:    0	True negatives: 11000


In [784]:
#test_classifier(clf, None, None, folds=1000)

In [ ]:


In [ ]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier, dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features

### Load the dictionary containing the dataset
data_dict = pickle.load(open("final_project_dataset.pkl", "r") )

### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()    # Provided to give you a starting point. Try a varity of classifiers.

### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

test_classifier(clf, my_dataset, features_list)

### Dump your classifier, dataset, and features_list so 
### anyone can run/check your results.

dump_classifier_and_data(clf, my_dataset, features_list)

In [ ]: