Feature Selection Experiments - Srinivas

In this notebook we examine univariate feature selection (SelectKBest with the ANOVA F-test) and compare several classifiers on the reduced feature sets.
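
Before touching the study data, here is a minimal sketch of the idea on synthetic data (purely illustrative, not part of the analysis): each feature is scored independently against the class labels with the ANOVA F-test, and only the k highest-scoring features are kept.

# Illustration only: univariate selection keeps the k features with the
# highest ANOVA F-scores, computed one feature at a time
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

toy_X, toy_y = make_classification(n_samples=200, n_features=20,
                                   n_informative=3, random_state=0)
selector = SelectKBest(f_classif, k=3).fit(toy_X, toy_y)
print(selector.get_support(indices=True))  # indices of the 3 selected features
print(selector.transform(toy_X).shape)     # (200, 3)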


In [46]:
# Standard
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Dimensionality reduction and Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import manifold, datasets
from itertools import cycle

# Plotting tools and classifiers
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut

# Let's read the data in and clean it

def get_NaNs(df):
    # Return the indices of rows that contain at least one NaN
    row_metrics = df.isnull().sum(axis=1)
    rows_with_na = []
    for i, x in enumerate(row_metrics):
        if x > 0: rows_with_na.append(i)
    return rows_with_na

def remove_NaNs(df):
    # Drop every row flagged by get_NaNs and return the cleansed frame
    rows_with_na = get_NaNs(df)
    cleansed_df = df.drop(df.index[rows_with_na], inplace=False)
    return cleansed_df

initial_data = pd.DataFrame.from_csv('Data_Adults_1_reduced_inv4.csv')
cleansed_df = remove_NaNs(initial_data)

# Let's also get rid of nominal data
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = cleansed_df.select_dtypes(include=numerics)
print X.shape

# Now drop identifier columns that are unlikely to matter for the analysis

cols2drop = ['GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id', 'Baseline_Reading_id',
             'Concentration_Reading_id']
X = X.drop(cols2drop, axis=1, inplace=False)
print X.shape

# Children skew the data for our studies, so restrict the analysis to adults.
# X keeps the adult male patients; Y keeps the adult Caucasian patients.
X = X.loc[X['Age'] >= 18]
Y = X.loc[X['race_id'] == 1]
X = X.loc[X['Gender_id'] == 1]

print X.shape
print Y.shape


(3926, 268)
(3926, 262)
(2341, 262)
(2675, 262)
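
Incidentally, the NaN helpers above amount to dropping every row that contains a missing value; the same cleaning could be written directly with pandas (shown only as a sketch, the notebook keeps the explicit helpers):

# Equivalent one-liner: drop any row with at least one missing value
cleansed_df_alt = initial_data.dropna(axis=0, how='any')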

In [47]:
# Let's extract the ADHD and Bipolar patients (mutually exclusive groups)

ADHD_men = X.loc[X['ADHD'] == 1]
ADHD_men = ADHD_men.loc[ADHD_men['Bipolar'] == 0]

BP_men = X.loc[X['Bipolar'] == 1]
BP_men = BP_men.loc[BP_men['ADHD'] == 0]

ADHD_cauc = Y.loc[Y['ADHD'] == 1]
ADHD_cauc = ADHD_cauc.loc[ADHD_cauc['Bipolar'] == 0]

BP_cauc = Y.loc[Y['Bipolar'] == 1]
BP_cauc = BP_cauc.loc[BP_cauc['ADHD'] == 0]

print ADHD_men.shape
print BP_men.shape

print ADHD_cauc.shape
print BP_cauc.shape

# Drop identifier, demographic, and diagnosis columns, keeping DataFrame objects
# because plain numpy arrays don't play well with certain scikit-learn functions
ADHD_men = pd.DataFrame(ADHD_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                      , axis = 1, inplace = False))
BP_men = pd.DataFrame(BP_men.drop(['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
                                  , axis = 1, inplace = False))

ADHD_cauc = pd.DataFrame(ADHD_cauc.drop(['Patient_ID', 'race_id', 'ADHD', 'Bipolar', 'Age', 'Gender_id']
                                        , axis = 1, inplace = False))
BP_cauc = pd.DataFrame(BP_cauc.drop(['Patient_ID', 'race_id', 'ADHD', 'Bipolar', 'Age', 'Gender_id']
                                    , axis = 1, inplace = False))


(946, 262)
(223, 262)
(992, 262)
(287, 262)

Feature Selection


In [48]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

combined1 = pd.concat([ADHD_men, BP_men])
combined2 = pd.concat([ADHD_cauc, BP_cauc])

print combined1.shape
print combined2.shape

combined1_scaled = pd.DataFrame(preprocessing.scale(combined1))
combined2_scaled = pd.DataFrame(preprocessing.scale(combined2))


(1169, 256)
(1279, 256)

In [49]:
# Class labels follow the concat order above: 1 = ADHD, 0 = Bipolar
clabels1 = [1] * 946 + [0] * 223
clabels2 = [1] * 992 + [0] * 287

# Naming: *_scaled_reduced = scale the full frame first, then select k best;
#         *_reduced_scaled = select k best from the raw frame, then scale
combined1_reduced = pd.DataFrame(SelectKBest(f_classif, k=5).fit_transform(combined1, clabels1))
combined2_reduced = pd.DataFrame(SelectKBest(f_classif, k=5).fit_transform(combined2, clabels2))
combined1_scaled_reduced = pd.DataFrame(SelectKBest(f_classif, k=5).fit_transform(combined1_scaled, clabels1))
combined2_scaled_reduced = pd.DataFrame(SelectKBest(f_classif, k=5).fit_transform(combined2_scaled, clabels2))
combined1_reduced_scaled = pd.DataFrame(preprocessing.scale(combined1_reduced))
combined2_reduced_scaled = pd.DataFrame(preprocessing.scale(combined2_reduced))
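
fit_transform above keeps only the selected values, so the original column names are lost. If the names of the chosen features are of interest, the selector can be fitted on its own and queried with get_support(); a small sketch using combined1 and clabels1 from above:

# Which five columns does the F-test pick for the male cohort?
selector1 = SelectKBest(f_classif, k=5).fit(combined1, clabels1)
print(list(combined1.columns[selector1.get_support()]))

Since the ANOVA F-statistic is unchanged when a feature is rescaled, the scaled and unscaled frames should select the same five columns.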

In [58]:
combined2_reduced.head()


Out[58]:
          0        1        2        3        4
0   331.640  411.586  356.212  402.042  364.597
1   246.828  354.039  294.013  184.752  293.547
2   362.868  438.202  367.171  359.592  379.418
3   288.224  352.568  312.076  307.301  326.313
4   312.770  292.112  279.135  416.521  373.754

Classifiers


In [51]:
# Leave-one-out cross-validation: returns the mean accuracy and its standard deviation
def leave_one_out(classifier, values, labels):
    leave_one_out_validator = LeaveOneOut(len(values))
    classifier_metrics = cross_validation.cross_val_score(classifier, values, labels, cv=leave_one_out_validator)
    accuracy = classifier_metrics.mean()
    deviation = classifier_metrics.std()
    return accuracy, deviation
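
Note that sklearn.cross_validation is the pre-0.18 module; in later scikit-learn releases the cross-validation utilities live in sklearn.model_selection and LeaveOneOut takes no argument. A sketch of the equivalent helper for a newer release, should the notebook need to be rerun there:

# Equivalent for scikit-learn >= 0.18
from sklearn.model_selection import LeaveOneOut, cross_val_score

def leave_one_out_new(classifier, values, labels):
    scores = cross_val_score(classifier, values, labels, cv=LeaveOneOut())
    return scores.mean(), scores.std()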

In [52]:
rf = RandomForestClassifier(n_estimators = 22) 
qda = QDA()
lda = LDA()
gnb = GaussianNB()
classifier_accuracy_list = []
classifiers = [(rf, "Random Forest"), (lda, "LDA"), (qda, "QDA"), (gnb, "Gaussian NB")]
# Male cohort: LOO accuracy on the top-5 features selected from the unscaled data
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_reduced, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7750 (+/- 0.418)
LDA accuracy is 0.8092 (+/- 0.393)
QDA accuracy is 0.8007 (+/- 0.399)
Gaussian NB accuracy is 0.8092 (+/- 0.393)
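
For context, the class balance sets a high baseline here: the male cohort has 946 ADHD and 223 Bipolar patients, so always predicting ADHD already scores about 0.809 (and about 0.776 for the Caucasian cohort), which matches the LDA and Gaussian NB figures just above. A quick check:

# Majority-class baselines implied by the group sizes above
print(946.0 / (946 + 223))  # ~0.8092, male cohort
print(992.0 / (992 + 287))  # ~0.7756, Caucasian cohort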

In [53]:
# Male cohort: features selected after scaling the full frame
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_scaled_reduced, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7836 (+/- 0.412)
LDA accuracy is 0.8092 (+/- 0.393)
QDA accuracy is 0.8007 (+/- 0.399)
Gaussian NB accuracy is 0.8092 (+/- 0.393)

In [54]:
# Male cohort: features selected from the unscaled data, then scaled
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_reduced_scaled, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7896 (+/- 0.408)
LDA accuracy is 0.8092 (+/- 0.393)
QDA accuracy is 0.8007 (+/- 0.399)
Gaussian NB accuracy is 0.8092 (+/- 0.393)

In [55]:
# Caucasian cohort: LOO accuracy on the top-5 features selected from the unscaled data
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined2_reduced, clabels2)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7295 (+/- 0.444)
LDA accuracy is 0.7756 (+/- 0.417)
QDA accuracy is 0.7733 (+/- 0.419)
Gaussian NB accuracy is 0.7185 (+/- 0.450)

In [56]:
# Caucasian cohort: features selected after scaling the full frame
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined2_scaled_reduced, clabels2)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7396 (+/- 0.439)
LDA accuracy is 0.7756 (+/- 0.417)
QDA accuracy is 0.7733 (+/- 0.419)
Gaussian NB accuracy is 0.7185 (+/- 0.450)

In [57]:
# Caucasian cohort: features selected from the unscaled data, then scaled
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined2_reduced_scaled, clabels2)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))


Random Forest accuracy is 0.7287 (+/- 0.445)
LDA accuracy is 0.7756 (+/- 0.417)
QDA accuracy is 0.7733 (+/- 0.419)
Gaussian NB accuracy is 0.7185 (+/- 0.450)

In [ ]: