In [1]:
import pandas as pd
import numpy as np
df_adhd = pd.read_csv('ADHD_Gender_rCBF.csv')
df_bipolar = pd.read_csv('Bipolar_Gender_rCBF.csv')
n1, n2 = df_adhd.shape[0], df_bipolar.shape[0]
print 'Number of ADHD patients (without Bipolar) is', n1
print 'Number of Bipolar patients (without ADHD) is', n2
print 'Chance before gender separation is', float(n1) / (n1 + n2)
In [2]:
# Separate the genders
adhd1_id, adhd2_id = list(), list()
bipolar1_id, bipolar2_id = list(), list()
for i, g in df_adhd[['Patient_ID', 'Gender_id']].values:
    if g == 1:
        adhd1_id.append(i)
    elif g == 2:
        adhd2_id.append(i)
for i, g in df_bipolar[['Patient_ID', 'Gender_id']].values:
    if g == 1:
        bipolar1_id.append(i)
    elif g == 2:
        bipolar2_id.append(i)
print 'Number of Gender 1 ADHD patients (without Bipolar) is', len(adhd1_id)
print 'Number of Gender 2 ADHD patients (without Bipolar) is', len(adhd2_id)
print 'Number of Gender 1 Bipolar patients (without ADHD) is', len(bipolar1_id)
print 'Number of Gender 2 Bipolar patients (without ADHD) is', len(bipolar2_id)
In [3]:
# Separate ADHD data gender-wise
df_adhd1 = df_adhd.loc[df_adhd['Patient_ID'].isin(adhd1_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
df_adhd2 = df_adhd.loc[df_adhd['Patient_ID'].isin(adhd2_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
# Separate Bipolar data gender-wise
df_bipolar1 = df_bipolar.loc[df_bipolar['Patient_ID'].isin(bipolar1_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
df_bipolar2 = df_bipolar.loc[df_bipolar['Patient_ID'].isin(bipolar2_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
# Create disorder labels for classification
# ADHD: 0, Bipolar: 1
n1_adhd, n1_bipolar = len(adhd1_id), len(bipolar1_id)
n2_adhd, n2_bipolar = len(adhd2_id), len(bipolar2_id)
# Labels for gender 1
y1 = [0] * n1_adhd + [1] * n1_bipolar
# Labels for gender 2
y2 = [0] * n2_adhd + [1] * n2_bipolar
print 'Shape check:'
print 'ADHD:', df_adhd1.shape, df_adhd2.shape
print 'Bipolar:', df_bipolar1.shape, df_bipolar2.shape
# Gender1 data
df1_all = pd.concat([df_adhd1, df_bipolar1], axis=0)
# Gender2 data
df2_all = pd.concat([df_adhd2, df_bipolar2], axis=0)
print '\nDouble shape check:'
print 'Gender 1:', df1_all.shape, len(y1)
print 'Gender 2:', df2_all.shape, len(y2)
# Compute chances
chance1 = float(n1_adhd)/(n1_adhd + n1_bipolar)
chance2 = float(n2_adhd)/(n2_adhd + n2_bipolar)
print 'Chance for gender 1 is', chance1
print 'Chance for gender 2 is', chance2
In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def kmeans(df, title, k=4):
    # Cluster the rCBF features (columns), so transpose to (n_features, n_patients)
    data = df.values.T
    km = KMeans(n_clusters=k)
    km.fit(data)
    labels = km.labels_
    centroids = km.cluster_centers_
    fig = plt.figure()
    fig.suptitle('K-Means on ' + title + ' Features', fontsize=14, fontweight='bold')
    # Plot clusters (only the first two coordinates of each feature vector are shown)
    for i in range(k):
        # Extract observations within each cluster
        ds = data[np.where(labels == i)]
        # Plot the observations with symbol o
        plt.plot(ds[:, 0], ds[:, 1], 'o')
        # Plot the centroids with symbol x
        lines = plt.plot(centroids[i, 0], centroids[i, 1], 'x')
        plt.setp(lines, ms=8.0)
        plt.setp(lines, mew=2.0)
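To also see how many features end up in each cluster (not shown in the plot above), the cluster labels can be tallied directly. A minimal sketch, using a hypothetical helper name:

# Sketch only: report cluster sizes for the feature clustering above.
def kmeans_cluster_sizes(df, k=4):
    km = KMeans(n_clusters=k).fit(df.values.T)
    return np.bincount(km.labels_)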
In [5]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
# Plot explained variance ratio
def plot_evr(ex_var_ratio):
    plt.title('Explained Variance Ratios by PCA')
    plt.plot(ex_var_ratio)
    plt.ylabel('Explained Variance Ratio')
    plt.xlabel('Principal Component')

def pca(df, n=20):
    '''
    Default number of principal components: 20
    '''
    # Scale
    X = df.values
    X_scaled = preprocessing.scale(X)
    # PCA
    pca_model = PCA(n_components=n)
    pc = pca_model.fit_transform(X_scaled)
    print '\nExplained Variance Ratios:'
    print pca_model.explained_variance_ratio_
    print '\nSum of Explained Variance Ratios of the first', n, 'components is',
    print np.sum(pca_model.explained_variance_ratio_)
    plot_evr(pca_model.explained_variance_ratio_)
    return pc
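The number of components is fixed at 20 above. An alternative, not used in this analysis, is to keep however many components are needed to reach a target cumulative explained variance. A minimal sketch, assuming a 90% threshold and a hypothetical helper name:

# Sketch only: pick n so that a target cumulative explained variance is reached.
# The 0.9 threshold is an illustrative assumption.
def n_components_for_variance(df, threshold=0.9):
    X_scaled = preprocessing.scale(df.values)
    full_pca = PCA()  # keep all components
    full_pca.fit(X_scaled)
    cum_ratio = np.cumsum(full_pca.explained_variance_ratio_)
    # First component index where the cumulative ratio crosses the threshold
    return int(np.argmax(cum_ratio >= threshold)) + 1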
In [6]:
from sklearn.manifold import LocallyLinearEmbedding
# Compute explained variance ratio of transformed data
def compute_explained_variance_ratio(transformed_data):
    explained_variance = np.var(transformed_data, axis=0)
    explained_variance_ratio = explained_variance / np.sum(explained_variance)
    explained_variance_ratio = np.sort(explained_variance_ratio)[::-1]
    return explained_variance_ratio

def lle(X, n=10):
    # Scale
    X_scaled = preprocessing.scale(X)
    # LLE
    lle_model = LocallyLinearEmbedding(n_neighbors=25, n_components=n, method='ltsa')
    pc = lle_model.fit_transform(X_scaled)
    ex_var_ratio = compute_explained_variance_ratio(pc)
    print '\nExplained Variance Ratios:'
    print ex_var_ratio
    # print '\nSum of Explained Variance Ratios of ', n, 'components is',
    # print np.sum(ex_var_ratio)
    return pc
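Note that method='ltsa' selects the LTSA variant of locally linear embedding. If one wanted to compare variants on the same data (not done in this notebook), their reconstruction errors could be checked with a sketch like the following, which assumes the same scaling and neighborhood size:

# Sketch only: compare LLE variants by reconstruction error on the same scaled data.
# ('hessian' is omitted because it requires far more neighbors than components.)
def compare_lle_methods(df, n=10, n_neighbors=25):
    X_scaled = preprocessing.scale(df.values)
    for m in ['standard', 'ltsa', 'modified']:
        emb = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n, method=m)
        emb.fit(X_scaled)
        print m, 'reconstruction error:', emb.reconstruction_error_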
In [7]:
from sklearn import cross_validation
from sklearn.cross_validation import KFold
def train_test_clf(clf, clf_name, X, y, k=10):
    '''
    Train and test a classifier using K-fold cross-validation
    Args:
        clf: sklearn classifier
        clf_name: classifier name (for printing)
        X: training data (2D numpy matrix)
        y: labels (1D vector)
        k: number of folds (default=10)
    '''
    kf = KFold(len(X), n_folds=k)
    scores = cross_validation.cross_val_score(clf, X, y, cv=kf)
    acc, acc_std = scores.mean(), scores.std()
    print clf_name + ' accuracy is %0.4f (+/- %0.3f)' % (acc, acc_std)
    return acc, acc_std
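As a quick self-contained sanity check of this helper, it can be run on random toy data (purely illustrative, not study data), where accuracy should hover around chance:

# Sanity check on synthetic data; expected accuracy is roughly 0.5.
from sklearn.linear_model import LogisticRegression
X_toy = np.random.randn(100, 5)
y_toy = [0] * 50 + [1] * 50
train_test_clf(LogisticRegression(), 'Logistic Regression (toy data)', X_toy, y_toy)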
In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
def classify(X, y, gender, feature_type):
    lg = LogisticRegression(penalty='l2')
    knn = KNeighborsClassifier(n_neighbors=7)
    svc = LinearSVC()
    lda = LDA()
    qda = QDA()
    rf = RandomForestClassifier(n_estimators=30)
    gb = GradientBoostingClassifier(n_estimators=20, max_depth=3)
    et = ExtraTreesClassifier(n_estimators=40, max_depth=5)
    ada = AdaBoostClassifier()
    classifiers = [lg, knn, svc, lda, qda, rf, gb, et, ada]
    clf_names = ['Logistic Regression', 'KNN', 'Linear SVM', 'LDA', 'QDA',
                 'Random Forest', 'Gradient Boosting', 'Extra Trees', 'AdaBoost']
    accuracies = list()
    for clf, name in zip(classifiers, clf_names):
        acc, acc_std = train_test_clf(clf, name, X, y)
        accuracies.append(acc)
    # Visualize classifier performance
    x = range(len(accuracies))
    width = 0.6 / 1.5
    plt.bar(x, accuracies, width)
    # Compute chance (majority-class baseline)
    n0, n1 = y.count(0), y.count(1)
    chance = max(n0, n1) / float(n0 + n1)
    fig_title = gender + ' Classifier Performance on ' + feature_type + ' features'
    plt.title(fig_title)
    plt.xticks(x, clf_names, rotation=50)
    plt.xlabel('Classifier')
    plt.gca().xaxis.set_label_coords(1.1, -0.025)
    plt.ylabel('Accuracy')
    plt.axhline(chance, color='red', linestyle='--', label='chance')  # plot chance
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.85))
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
plot = df1_all.plot(kind='hist', alpha=0.5, title='Gender 1 Data Distribution', legend=None)
In [10]:
# Cluster Gender 1 rCBF features
kmeans(df1_all, 'Gender 1')
In [11]:
# PCA
X1_pca = pca(df1_all, 20)
In [12]:
# LLE
X1_lle = lle(df1_all, 20)
In [13]:
# Classification using PCA features
print 'Using PCA features:'
classify(X1_pca, y1, 'Gender 1', 'PCA')
In [14]:
# Classification using LLE features
print 'Using LLE features:'
classify(X1_lle, y1, 'Gender 1', 'LLE')
In [15]:
plot = df2_all.plot(kind='hist', alpha=0.5, title='Gender 2 Data Distribution', legend=None)
In [16]:
# Cluster Gender 2 rCBF features
kmeans(df2_all, 'Gender 2')
In [17]:
# PCA
X2_pca = pca(df2_all, 20)
In [18]:
# LLE
X2_lle = lle(df2_all, 20)
In [19]:
# Classification using PCA features
print 'Using PCA features:'
classify(X2_pca, y2, 'Gender 2', 'PCA')
In [20]:
# Classification using LLE features
print 'Using LLE features:'
classify(X2_lle, y2, 'Gender 2', 'LLE')
QDA with LLE features finally does much better than chance! \(^o^)/
It seems that non-linear classifiers applied to features extracted by manifold learning could work well for this problem.
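A possible follow-up, not run here, would be to pair the same LLE features with other non-linear classifiers, for example an RBF-kernel SVM evaluated with the same K-fold helper (the hyperparameters below are illustrative guesses):

# Sketch only: another non-linear classifier on the manifold features.
rbf_svc = SVC(kernel='rbf', C=1.0, gamma=0.1)  # C and gamma are arbitrary guesses
train_test_clf(rbf_svc, 'RBF SVM', X2_lle, y2)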