Complex feature selection as a preprocessing step to learning and classification


In [3]:
# Standard
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Dimensionality reduction and Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import manifold, datasets
from itertools import cycle

# Plotting tools and classifiers
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut

# Let's read the data in and clean it

def get_NaNs(df):
    # Return the indices of rows that contain at least one NaN
    row_metrics = df.isnull().sum(axis=1)
    return [i for i, x in enumerate(row_metrics) if x > 0]

def remove_NaNs(df):
    rows_with_na = get_NaNs(df)
    cleansed_df = df.drop(df.index[rows_with_na], inplace=False)     
    return cleansed_df

initial_data = pd.DataFrame.from_csv('Data_Adults_1_reduced_inv4.csv')
cleansed_df = remove_NaNs(initial_data)

# Let's also get rid of nominal data
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = cleansed_df.select_dtypes(include=numerics)

# Let's now drop certain columns (mostly identifiers) that are unlikely to be important to our analysis

cols2drop = ['GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id', 'Baseline_Reading_id',
             'Concentration_Reading_id']
X = X.drop(cols2drop, axis=1, inplace=False)

# Children skew the data for our studies, so it is cleaner to analyse adults only
X = X.loc[X['Age'] >= 18]
Y = X.loc[X['race_id'] == 1]
X  = X.loc[X['Gender_id'] == 1]
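
# Quick sanity check (a sketch added here, not part of the original analysis):
# the cleansed, numeric-only frame should have no NaNs left after remove_NaNs
print X.isnull().sum().sum()   # expected to print 0
print X.shape                  # rows x columns remaining after the filters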

In [4]:
# Let's extract ADHD and Bipolar patients (mutually exclusive groups)

ADHD_men = X.loc[X['ADHD'] == 1]
ADHD_men = ADHD_men.loc[ADHD_men['Bipolar'] == 0]

BP_men = X.loc[X['Bipolar'] == 1]
BP_men = BP_men.loc[BP_men['ADHD'] == 0]

print ADHD_men.shape
print BP_men.shape

# Keep the results as DataFrame objects, since plain numpy arrays don't play well with certain scikit-learn functions
ADHD_men = pd.DataFrame(ADHD_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                      , axis = 1, inplace = False))
BP_men = pd.DataFrame(BP_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                  , axis = 1, inplace = False))


(946, 262)
(223, 262)

Feature Selection

We are now going to explore some feature selection procedures; the output of each will then be sent to a classifier.

  1. Recursive elimination with cross validation
  2. Simple best percentile features
  3. Tree based feature selection

The output from each of these is then sent to the following classifiers

  1. Random Forests - a good ensemble technique
  2. QDA - other experiments with this classifier have been successful
  3. LDA - a good, simple technique
  4. Gaussian Naive Bayes - experiments with this classifier have proven successful in the past

In [16]:
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Build the labels vector: 1 = ADHD, 0 = Bipolar
clabels1 = [1] * ADHD_men.shape[0] + [0] * BP_men.shape[0]

# Concatenate and Scale
combined1 = pd.concat([ADHD_men, BP_men])
combined1 = pd.DataFrame(preprocessing.scale(combined1))
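
Since the label vector is built from the row counts of the two groups, a small assertion guards against a silent mismatch if the upstream filtering ever changes (a hedged sanity check, not part of the original run):

In [ ]:
# The stacked feature matrix and the label vector must line up row for row
assert combined1.shape[0] == len(clabels1)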

In [19]:
# Recursive feature elimination with cross-validation (RFECV)
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(clabels1, 2),
              scoring='accuracy')
rfecv.fit(combined1, clabels1)
combined1_recf = rfecv.transform(combined1)

In [22]:
combined1_recf = pd.DataFrame(combined1_recf)
print combined1_recf.head()


          0
0  0.127190
1  0.755222
2 -0.977009
3 -0.920686
4  1.121645
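
The transformed matrix has a single column, so RFECV appears to have kept only one feature at this cross-validation setting. A quick inspection using RFECV's standard n_features_ and support_ attributes shows how many and which columns were retained:

In [ ]:
# How many features survived recursive elimination, and which ones
print rfecv.n_features_
print np.where(rfecv.support_)[0]  # column indices (relative to combined1) that were kept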

In [39]:
# Percentile-based feature selection: keep the top 5% of features ranked by ANOVA F-value

from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=5)
combined_kpercentile = selector.fit_transform(combined1, clabels1)

In [41]:
combined1_kpercentile = pd.DataFrame(combined_kpercentile)
print combined1_kpercentile.head()


         0         1         2         3         4         5         6   \
0  0.169847 -0.209445 -0.275601 -0.666111 -0.693900 -0.106650 -0.896754   
1  0.528386 -1.450367  0.845388  0.993640  0.548840  0.485510  0.881838   
2 -0.114695  0.092482 -0.311967 -0.144872 -0.236797 -0.141822  0.176657   
3 -0.328627 -0.195417 -0.914532 -0.715598 -0.795049 -0.979397 -0.366009   
4 -1.798501 -0.282389  1.196440  1.232808  1.138202  0.632665  1.329041   

         7         8         9         10        11        12  
0  0.568321 -0.584540 -0.490285  0.244672 -0.656037 -0.125655  
1  1.995632  0.687287  1.090067  0.663607  0.103919  0.242009  
2 -0.270023 -0.271800 -1.089844 -1.227914 -0.745945 -0.586975  
3 -0.295796 -0.403384 -1.374586 -1.591762 -1.334016 -0.257680  
4 -0.222387  1.641005  2.128552  1.318883  1.785227  0.851758  
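
Because the scaled frame was rebuilt with default integer column names, it can be useful to map the selected columns back to the original feature names. A short sketch, assuming combined1's column order still mirrors ADHD_men's columns (concatenation and scaling preserve order):

In [ ]:
# Map the selected column indices back to the original feature names
selected_idx = selector.get_support(indices=True)
print ADHD_men.columns[selected_idx]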

In [43]:
# Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()
clf = clf.fit(combined1, clabels1)
combined1_trees = SelectFromModel(clf, prefit=True).transform(combined1)

In [44]:
combined1_trees = pd.DataFrame(combined1_trees)
print combined1_trees.head()


        0         1         2         3         4         5         6    \
0 -0.985856 -0.230012 -0.696085 -0.679657 -0.437323 -0.818528 -0.378125   
1  1.008900  0.787332  0.587340  0.660063  0.897006  0.463454  0.579172   
2 -0.202951 -0.240874 -0.090505 -0.253443 -0.258227 -0.460434 -0.397930   
3 -1.710840 -0.819808 -0.929914 -1.020154 -1.212833 -1.107059 -1.410461   
4  1.382728  0.950863  0.989392  0.796090  0.863338  1.492834  0.377695   

        7         8         9      ...          113       114       115  \
0 -0.598823 -0.695587  0.169847    ...    -0.137138  0.692224 -0.261565   
1  0.539337  0.362448  0.528386    ...     0.363758  1.061854  0.466651   
2 -0.518928 -0.021714 -0.114695    ...    -0.813576 -0.604356 -1.194178   
3 -1.497622 -0.728785 -0.328627    ...    -0.819188 -0.823223 -1.269110   
4  1.605175 -2.591926 -1.798501    ...     2.538392  1.780245  2.345056   

        116       117       118       119       120       121       122  
0  0.473379 -0.209817 -0.599742 -0.214000  0.260055  0.547382  0.435904  
1  1.353621  0.673496  0.338288  0.330374  0.437566  0.309658  0.098400  
2 -1.264397 -1.477834 -1.054814 -0.535019 -0.998927 -1.003354 -0.738059  
3 -1.216525 -1.466187 -1.149187 -1.933939 -1.208878 -1.562288 -1.587445  
4  2.188733  1.304474  2.002857  2.187624  1.344800  0.644545  1.835503  

[5 rows x 123 columns]
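
The extra-trees model also exposes per-feature importance scores, which can be ranked to see which variables drove the selection. A brief sketch, again assuming combined1's column order matches ADHD_men's columns and a pandas version that provides sort_values:

In [ ]:
# Rank features by the importance scores assigned by the extra-trees model
importances = pd.Series(clf.feature_importances_, index=ADHD_men.columns)
print importances.sort_values(ascending=False).head(10)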

Classifiers


In [46]:
# Leave-one-out cross-validation: return mean accuracy and standard deviation across folds
def leave_one_out(classifier, values, labels):
    leave_one_out_validator = LeaveOneOut(len(values))
    classifier_metrics = cross_validation.cross_val_score(classifier, values, labels, cv=leave_one_out_validator)
    accuracy = classifier_metrics.mean()
    deviation = classifier_metrics.std()
    return accuracy, deviation

In [47]:
rf = RandomForestClassifier(n_estimators = 22) 
qda = QDA()
lda = LDA()
gnb = GaussianNB()
classifier_accuracy_list = []
classifiers = [(rf, "Random Forest"), (lda, "LDA"), (qda, "QDA"), (gnb, "Gaussian NB")]
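# Evaluate each classifier on the RFECV-selected features with leave-one-out cross-validation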
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_recf, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.6638 (+/- 0.472)
LDA accuracy is 0.8092 (+/- 0.393)
QDA accuracy is 0.8092 (+/- 0.393)
Gaussian NB accuracy is 0.8092 (+/- 0.393)

In [48]:
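# Repeat the evaluation on the percentile-selected features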
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_kpercentile, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.8007 (+/- 0.399)
LDA accuracy is 0.8084 (+/- 0.394)
QDA accuracy is 0.7793 (+/- 0.415)
Gaussian NB accuracy is 0.7990 (+/- 0.401)

In [49]:
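# Repeat the evaluation on the tree-selected features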
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_trees, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7981 (+/- 0.401)
LDA accuracy is 0.7819 (+/- 0.413)
QDA accuracy is 0.8075 (+/- 0.394)
Gaussian NB accuracy is 0.5141 (+/- 0.500)
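
classifier_accuracy_list now holds one (classifier, accuracy) pair for every run above, four per feature set in the order they were evaluated. A small summary sketch (the feature-set labels below are added here only for readability):

In [ ]:
# Print every (feature set, classifier, accuracy) triple collected above;
# each run appended four entries, so integer division by 4 recovers the feature set
feature_sets = ['RFECV', 'Percentile', 'Extra Trees']
for i, (name, accuracy) in enumerate(classifier_accuracy_list):
    print '%-12s %-14s %0.4f' % (feature_sets[i // 4], name, accuracy)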

In [ ]: