Complex feature selection as a preprocessing step to learning and classification


In [3]:
# Standard
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Dimensionality reduction and Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import manifold, datasets
from itertools import cycle

# Plotting tools and classifiers
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut

# Let's read the data in and clean it

def get_NaNs(df):
    # Return the indices of rows that contain at least one NaN
    row_metrics = df.isnull().sum(axis=1)
    return [i for i, x in enumerate(row_metrics) if x > 0]

def remove_NaNs(df):
    rows_with_na = get_NaNs(df)
    cleansed_df = df.drop(df.index[rows_with_na], inplace=False)     
    return cleansed_df

initial_data = pd.DataFrame.from_csv('Data_Adults_1_reduced_inv4.csv')
cleansed_df = remove_NaNs(initial_data)

# Let's also get rid of nominal data
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = cleansed_df.select_dtypes(include=numerics)

# Let's now drop certain columns (mostly identifiers) that are unlikely to be important to our analysis

cols2drop = ['GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id', 'Baseline_Reading_id',
             'Concentration_Reading_id']
X = X.drop(cols2drop, axis=1, inplace=False)

# Children skew the data for our studies, so it is cleaner to analyse adults only
X = X.loc[X['Age'] >= 18]
Y = X.loc[X['race_id'] == 1]
X  = X.loc[X['Gender_id'] == 1]
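
# Quick sanity check (a sketch added here, not part of the original analysis):
# the cleansed, numeric-only frame should have no NaNs left after remove_NaNs
print X.isnull().sum().sum()   # expected to print 0
print X.shape                  # rows x columns remaining after the filters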

In [4]:
# Let's extract ADHD and Bipolar patients (mutually exclusive groups)

ADHD_men = X.loc[X['ADHD'] == 1]
ADHD_men = ADHD_men.loc[ADHD_men['Bipolar'] == 0]

BP_men = X.loc[X['Bipolar'] == 1]
BP_men = BP_men.loc[BP_men['ADHD'] == 0]

print ADHD_men.shape
print BP_men.shape

# Keep the results as DataFrame objects, since plain numpy arrays don't play well with certain scikit-learn functions
ADHD_men = pd.DataFrame(ADHD_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                      , axis = 1, inplace = False))
BP_men = pd.DataFrame(BP_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                  , axis = 1, inplace = False))


(946, 262)
(223, 262)

Feature Selection

We are now going to explore some feature selection procedures; the output of each will then be sent to a classifier.

  1. Recursive elimination with cross validation
  2. Simple best percentile features
  3. Tree based feature selection

The output from each of these is then sent to the following classifiers

  1. Random Forests - a good ensemble technique
  2. QDA - other experiments with this classifier have been successful
  3. LDA - a good, simple technique
  4. Gaussian Naive Bayes - experiments with this classifier have proven successful in the past

In [16]:
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Build the labels vector: 1 = ADHD, 0 = Bipolar
clabels1 = [1] * ADHD_men.shape[0] + [0] * BP_men.shape[0]

# Concatenate and Scale
combined1 = pd.concat([ADHD_men, BP_men])
combined1 = pd.DataFrame(preprocessing.scale(combined1))
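
Since the label vector is built from the row counts of the two groups, a small assertion guards against a silent mismatch if the upstream filtering ever changes (a hedged sanity check, not part of the original run):

In [ ]:
# The stacked feature matrix and the label vector must line up row for row
assert combined1.shape[0] == len(clabels1)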

In [19]:
# Recursive feature elimination with cross-validation (RFECV)
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(clabels1, 2),
              scoring='accuracy')
rfecv.fit(combined1, clabels1)
combined1_recf = rfecv.transform(combined1)

In [22]:
combined1_recf = pd.DataFrame(combined1_recf)
print combined1_recf.head()


          0
0  0.127190
1  0.755222
2 -0.977009
3 -0.920686
4  1.121645
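
The transformed matrix has a single column, so RFECV appears to have kept only one feature at this cross-validation setting. A quick inspection using RFECV's standard n_features_ and support_ attributes shows how many and which columns were retained:

In [ ]:
# How many features survived recursive elimination, and which ones
print rfecv.n_features_
print np.where(rfecv.support_)[0]  # column indices (relative to combined1) that were kept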

In [39]:
# Percentile-based feature selection: keep the top 5% of features ranked by ANOVA F-value

from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=5)
combined_kpercentile = selector.fit_transform(combined1, clabels1)

In [41]:
combined1_kpercentile = pd.DataFrame(combined_kpercentile)
print combined1_kpercentile.head()


         0         1         2         3         4         5         6   \
0  0.169847 -0.209445 -0.275601 -0.666111 -0.693900 -0.106650 -0.896754   
1  0.528386 -1.450367  0.845388  0.993640  0.548840  0.485510  0.881838   
2 -0.114695  0.092482 -0.311967 -0.144872 -0.236797 -0.141822  0.176657   
3 -0.328627 -0.195417 -0.914532 -0.715598 -0.795049 -0.979397 -0.366009   
4 -1.798501 -0.282389  1.196440  1.232808  1.138202  0.632665  1.329041   

         7         8         9         10        11        12  
0  0.568321 -0.584540 -0.490285  0.244672 -0.656037 -0.125655  
1  1.995632  0.687287  1.090067  0.663607  0.103919  0.242009  
2 -0.270023 -0.271800 -1.089844 -1.227914 -0.745945 -0.586975  
3 -0.295796 -0.403384 -1.374586 -1.591762 -1.334016 -0.257680  
4 -0.222387  1.641005  2.128552  1.318883  1.785227  0.851758  
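
Because the scaled frame was rebuilt with default integer column names, it can be useful to map the selected columns back to the original feature names. A short sketch, assuming combined1's column order still mirrors ADHD_men's columns (concatenation and scaling preserve order):

In [ ]:
# Map the selected column indices back to the original feature names
selected_idx = selector.get_support(indices=True)
print ADHD_men.columns[selected_idx]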

In [43]:
# Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier

clf = ExtraTreesClassifier()
clf = clf.fit(combined1, clabels1)
combined1_trees = SelectFromModel(clf, prefit=True).transform(combined1)

In [44]:
combined1_trees = pd.DataFrame(combined1_trees)
print combined1_trees.head()


        0         1         2         3         4         5         6    \
0 -0.985856 -0.230012 -0.696085 -0.679657 -0.437323 -0.818528 -0.378125   
1  1.008900  0.787332  0.587340  0.660063  0.897006  0.463454  0.579172   
2 -0.202951 -0.240874 -0.090505 -0.253443 -0.258227 -0.460434 -0.397930   
3 -1.710840 -0.819808 -0.929914 -1.020154 -1.212833 -1.107059 -1.410461   
4  1.382728  0.950863  0.989392  0.796090  0.863338  1.492834  0.377695   

        7         8         9      ...          113       114       115  \
0 -0.598823 -0.695587  0.169847    ...    -0.137138  0.692224 -0.261565   
1  0.539337  0.362448  0.528386    ...     0.363758  1.061854  0.466651   
2 -0.518928 -0.021714 -0.114695    ...    -0.813576 -0.604356 -1.194178   
3 -1.497622 -0.728785 -0.328627    ...    -0.819188 -0.823223 -1.269110   
4  1.605175 -2.591926 -1.798501    ...     2.538392  1.780245  2.345056   

        116       117       118       119       120       121       122  
0  0.473379 -0.209817 -0.599742 -0.214000  0.260055  0.547382  0.435904  
1  1.353621  0.673496  0.338288  0.330374  0.437566  0.309658  0.098400  
2 -1.264397 -1.477834 -1.054814 -0.535019 -0.998927 -1.003354 -0.738059  
3 -1.216525 -1.466187 -1.149187 -1.933939 -1.208878 -1.562288 -1.587445  
4  2.188733  1.304474  2.002857  2.187624  1.344800  0.644545  1.835503  

[5 rows x 123 columns]
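
The extra-trees model also exposes per-feature importance scores, which can be ranked to see which variables drove the selection. A brief sketch, again assuming combined1's column order matches ADHD_men's columns and a pandas version that provides sort_values:

In [ ]:
# Rank features by the importance scores assigned by the extra-trees model
importances = pd.Series(clf.feature_importances_, index=ADHD_men.columns)
print importances.sort_values(ascending=False).head(10)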

Classifiers


In [46]:
# Leave-one-out cross-validation: return mean accuracy and standard deviation across folds
def leave_one_out(classifier, values, labels):
    leave_one_out_validator = LeaveOneOut(len(values))
    classifier_metrics = cross_validation.cross_val_score(classifier, values, labels, cv=leave_one_out_validator)
    accuracy = classifier_metrics.mean()
    deviation = classifier_metrics.std()
    return accuracy, deviation

In [47]:
rf = RandomForestClassifier(n_estimators = 22) 
qda = QDA()
lda = LDA()
gnb = GaussianNB()
classifier_accuracy_list = []
classifiers = [(rf, "Random Forest"), (lda, "LDA"), (qda, "QDA"), (gnb, "Gaussian NB")]
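# Evaluate each classifier on the RFECV-selected features with leave-one-out cross-validation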
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_recf, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.6638 (+/- 0.472)
LDA accuracy is 0.8092 (+/- 0.393)
QDA accuracy is 0.8092 (+/- 0.393)
Gaussian NB accuracy is 0.8092 (+/- 0.393)

In [48]:
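# Repeat the evaluation on the percentile-selected features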
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_kpercentile, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.8007 (+/- 0.399)
LDA accuracy is 0.8084 (+/- 0.394)
QDA accuracy is 0.7793 (+/- 0.415)
Gaussian NB accuracy is 0.7990 (+/- 0.401)

In [49]:
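# Repeat the evaluation on the tree-selected features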
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_trees, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7981 (+/- 0.401)
LDA accuracy is 0.7819 (+/- 0.413)
QDA accuracy is 0.8075 (+/- 0.394)
Gaussian NB accuracy is 0.5141 (+/- 0.500)
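
classifier_accuracy_list now holds one (classifier, accuracy) pair for every run above, four per feature set in the order they were evaluated. A small summary sketch (the feature-set labels below are added here only for readability):

In [ ]:
# Print every (feature set, classifier, accuracy) triple collected above;
# each run appended four entries, so integer division by 4 recovers the feature set
feature_sets = ['RFECV', 'Percentile', 'Extra Trees']
for i, (name, accuracy) in enumerate(classifier_accuracy_list):
    print '%-12s %-14s %0.4f' % (feature_sets[i // 4], name, accuracy)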

In [ ]: