In [3]:
# Standard
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Dimensionality reduction and Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import manifold, datasets
from itertools import cycle
# Plotting tools and classifiers
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut
# Let's read the data in and clean it (a one-line pandas alternative is sketched after this cell)
def get_NaNs(df):
    # Return the positional indices of rows that contain at least one NaN
    row_metrics = df.isnull().sum(axis=1)
    rows_with_na = []
    for i, x in enumerate(row_metrics):
        if x > 0:
            rows_with_na.append(i)
    return rows_with_na

def remove_NaNs(df):
    rows_with_na = get_NaNs(df)
    cleansed_df = df.drop(df.index[rows_with_na], inplace=False)
    return cleansed_df
initial_data = pd.DataFrame.from_csv('Data_Adults_1_reduced_inv4.csv')
cleansed_df = remove_NaNs(initial_data)
# Let's also get rid of nominal data
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
X = cleansed_df.select_dtypes(include=numerics)
# Drop identifier columns that are not relevant to the analysis
cols2drop = ['GROUP_ID', 'doa', 'Baseline_header_id', 'Concentration_header_id',
             'Baseline_Reading_id', 'Concentration_Reading_id']
X = X.drop(cols2drop, axis=1, inplace=False)
# Children skew the data in our studies, so it is cleaner to analyse adults only
X = X.loc[X['Age'] >= 18]
# Y keeps the race_id == 1 subset (not used below); X is restricted to men
Y = X.loc[X['race_id'] == 1]
X = X.loc[X['Gender_id'] == 1]
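As an aside, the NaN-row removal above can be done with pandas' built-in dropna; a minimal sketch of the equivalent one-liner (cleansed_df_alt is just an illustrative name):
In [ ]:
# Drop every row that contains at least one NaN -- same behaviour as remove_NaNs above
cleansed_df_alt = initial_data.dropna(axis=0)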
In [4]:
# Let's extract ADHD and Bipolar patients (the two groups are mutually exclusive)
ADHD_men = X.loc[X['ADHD'] == 1]
ADHD_men = ADHD_men.loc[ADHD_men['Bipolar'] == 0]
BP_men = X.loc[X['Bipolar'] == 1]
BP_men = BP_men.loc[BP_men['ADHD'] == 0]
print ADHD_men.shape
print BP_men.shape
# drop returns a new DataFrame, so these stay DataFrames (plain numpy arrays don't play well with some scikit-learn functions)
drop_cols = ['Patient_ID', 'Gender_id', 'ADHD', 'Bipolar', 'Age', 'race_id']
ADHD_men = ADHD_men.drop(drop_cols, axis=1, inplace=False)
BP_men = BP_men.drop(drop_cols, axis=1, inplace=False)
We now explore some feature selection procedures; the output of each is then fed to the classifiers below.
In [16]:
from sklearn.svm import SVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
# Make the labels vector (1 = ADHD, 0 = Bipolar); the counts must match the two frames above
clabels1 = [1] * 946 + [0] * 223
# Concatenate and Scale
combined1 = pd.concat([ADHD_men, BP_men])
combined1 = pd.DataFrame(preprocessing.scale(combined1))
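The 946/223 counts above are hardcoded; a slightly safer sketch derives the labels from the frames themselves so the vector cannot drift out of sync with the data (clabels1_alt is just an illustrative name):
In [ ]:
# Build the labels from the frame sizes instead of hardcoding the counts
clabels1_alt = [1] * ADHD_men.shape[0] + [0] * BP_men.shape[0]
assert len(clabels1_alt) == combined1.shape[0]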
In [19]:
# Recursive feature elimination with cross-validation
svc = SVC(kernel="linear")
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(clabels1, 2), scoring='accuracy')
rfecv.fit(combined1, clabels1)
combined1_recf = rfecv.transform(combined1)
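It is worth checking how aggressive the elimination was; a quick sketch using RFECV's fitted attributes (n_features_ and support_):
In [ ]:
# How many features survived, and which column indices they are
print 'Optimal number of features: %d' % rfecv.n_features_
print 'Selected column indices:', list(combined1.columns[rfecv.support_])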
In [22]:
combined1_recf = pd.DataFrame(combined1_recf)
print combined1_recf.head()
In [39]:
# Percentile-based feature selection
from sklearn.feature_selection import SelectPercentile, f_classif
selector = SelectPercentile(f_classif, percentile=5)
combined1_kpercentile = selector.fit_transform(combined1, clabels1)
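SelectPercentile keeps the univariate statistics it ranked on; a small sketch to see which columns made the 5% cut and their ANOVA F-test p-values:
In [ ]:
# Column indices retained by the percentile filter, with their p-values
kept = selector.get_support(indices=True)
print 'Kept columns:', list(kept)
print 'p-values:', selector.pvalues_[kept]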
In [41]:
combined1_kpercentile = pd.DataFrame(combined1_kpercentile)
print combined1_kpercentile.head()
In [43]:
# Tree-based feature selection (ExtraTreesClassifier was already imported above)
clf = ExtraTreesClassifier()
clf = clf.fit(combined1, clabels1)
combined1_trees = SelectFromModel(clf, prefit=True).transform(combined1)
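The tree ensemble also yields per-feature importances, which make the selection interpretable; a sketch printing the ten most important columns via the standard feature_importances_ attribute:
In [ ]:
# Ten most important features according to the extra-trees ensemble
importances = clf.feature_importances_
for idx in np.argsort(importances)[::-1][:10]:
    print 'column %d: importance %0.4f' % (idx, importances[idx])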
In [44]:
combined1_trees = pd.DataFrame(combined1_trees)
print combined1_trees.head()
In [46]:
# Leave-one-out cross-validation
def leave_one_out(classifier, values, labels):
    leave_one_out_validator = LeaveOneOut(len(values))
    classifier_metrics = cross_validation.cross_val_score(classifier, values, labels, cv=leave_one_out_validator)
    accuracy = classifier_metrics.mean()
    deviation = classifier_metrics.std()
    return accuracy, deviation
In [47]:
rf = RandomForestClassifier(n_estimators=22)
qda = QDA()
lda = LDA()
gnb = GaussianNB()
classifier_accuracy_list = []
classifiers = [(rf, "Random Forest"), (lda, "LDA"), (qda, "QDA"), (gnb, "Gaussian NB")]
# Evaluate each classifier on the RFECV-selected features
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_recf, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))
In [48]:
# Evaluate each classifier on the percentile-selected features
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_kpercentile, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))
In [49]:
# Evaluate each classifier on the tree-selected features
for classifier, name in classifiers:
    accuracy, deviation = leave_one_out(classifier, combined1_trees, clabels1)
    print '%s accuracy is %0.4f (+/- %0.3f)' % (name, accuracy, deviation)
    classifier_accuracy_list.append((name, accuracy))
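classifier_accuracy_list now holds results for all three feature-selection routes (names repeat once per route), so a quick matplotlib sketch makes the comparison easier to eyeball:
In [ ]:
# Bar chart of all leave-one-out accuracies collected above
names = [name for name, _ in classifier_accuracy_list]
scores = [acc for _, acc in classifier_accuracy_list]
plt.figure(figsize=(12, 4))
plt.bar(range(len(scores)), scores)
plt.xticks(range(len(scores)), names, rotation=45)
plt.ylabel('LOO accuracy')
plt.show()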