In [1]:
    
import pandas as pd
import numpy as np
df_adhd = pd.read_csv('ADHD_Gender_rCBF.csv')
df_bipolar = pd.read_csv('Bipolar_Gender_rCBF.csv')
n1, n2 = df_adhd.shape[0], df_bipolar.shape[0]
print('Number of ADHD patients (without Bipolar) is', n1)
print('Number of Bipolar patients (without ADHD) is', n2)
# Naive baseline: the proportion of ADHD patients in the combined sample
print('Chance before gender separation is', float(n1) / (n1 + n2))
    
    
In [2]:
    
# Separate the genders
adhd1_id, adhd2_id = list(), list()
bipolar1_id, bipolar2_id = list(), list()
for i, g in df_adhd[['Patient_ID', 'Gender_id']].values:
    if g == 1:
        adhd1_id.append(i)
    elif g == 2:
        adhd2_id.append(i)
        
for i, g in df_bipolar[['Patient_ID', 'Gender_id']].values:
    if g == 1:
        bipolar1_id.append(i)
    elif g == 2:
        bipolar2_id.append(i)
        
print('Number of Gender 1 ADHD patients (without Bipolar) is', len(adhd1_id))
print('Number of Gender 2 ADHD patients (without Bipolar) is', len(adhd2_id))
print('Number of Gender 1 Bipolar patients (without ADHD) is', len(bipolar1_id))
print('Number of Gender 2 Bipolar patients (without ADHD) is', len(bipolar2_id))
    
    
In [3]:
    
# Separate ADHD data gender-wise
df_adhd1 = df_adhd.loc[df_adhd['Patient_ID'].isin(adhd1_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
df_adhd2 = df_adhd.loc[df_adhd['Patient_ID'].isin(adhd2_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
# Separate Bipolar data gender-wise
df_bipolar1 = df_bipolar.loc[df_bipolar['Patient_ID'].isin(bipolar1_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
df_bipolar2 = df_bipolar.loc[df_bipolar['Patient_ID'].isin(bipolar2_id)].drop(['Patient_ID', 'Gender_id'], axis=1)
# Create disorder labels for classification
# ADHD: 0, Bipolar: 1
n1_adhd, n1_bipolar = len(adhd1_id), len(bipolar1_id)
n2_adhd, n2_bipolar = len(adhd2_id), len(bipolar2_id)
# Labels for gender 1
y1 = [0] * n1_adhd + [1] * n1_bipolar  
# Labels for gender 2
y2 = [0] * n2_adhd + [1] * n2_bipolar 
print('Shape check:')
print('ADHD:', df_adhd1.shape, df_adhd2.shape)
print('Bipolar:', df_bipolar1.shape, df_bipolar2.shape)
# Gender1 data
df1_all = pd.concat([df_adhd1, df_bipolar1], axis=0)
# Gender2 data
df2_all = pd.concat([df_adhd2, df_bipolar2], axis=0)
print('\nDouble shape check:')
print('Gender 1:', df1_all.shape, len(y1))
print('Gender 2:', df2_all.shape, len(y2))
# Compute chances 
chance1 = float(n1_adhd)/(n1_adhd + n1_bipolar)
chance2 = float(n2_adhd)/(n2_adhd + n2_bipolar)
print('Chance for gender 1 is', chance1)
print('Chance for gender 2 is', chance2)
    
    
In [4]:
    
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def kmeans(df, title, k=4):
    # Transpose so that each rCBF feature (column) becomes an observation to cluster
    data = df.values.T
    # Use a distinct name for the estimator so it does not shadow this function
    km = KMeans(n_clusters=k)
    km.fit(data)
    labels = km.labels_
    centroids = km.cluster_centers_
    
    fig = plt.figure()
    fig.suptitle('K-Means on '+title+' Features' , fontsize=14, fontweight='bold')
    
    # Plot clusters
    for i in range(k):
        # Extract observations within each cluster
        ds = data[np.where(labels==i)]
        # Plot the first two dimensions of the observations with symbol o
        plt.plot(ds[:,0], ds[:,1], 'o')
        # Plot the centroids with symbol x
        lines = plt.plot(centroids[i,0], centroids[i,1], 'x')
        plt.setp(lines, ms=8.0)
        plt.setp(lines, mew=2.0)
    
In [5]:
    
from sklearn import preprocessing
from sklearn.decomposition import PCA
# Plot explained variance ratio
def plot_evr(ex_var_ratio):
    plt.title('Explained Variance Ratios by PCA')
    plt.plot(ex_var_ratio)
    plt.ylabel('Explained Variance Ratio')
    plt.xlabel('Principal Component')
    
def pca(df, n=20):
    '''
    Default number of principal components: 20
    '''
    # Scale
    X = df.values
    X_scaled = preprocessing.scale(X)
    # PCA (name the estimator pca_model so it does not shadow this function)
    pca_model = PCA(n_components=n)
    pc = pca_model.fit_transform(X_scaled)
    
    print('\nExplained Variance Ratios:')
    print(pca_model.explained_variance_ratio_)
    print('\nSum of Explained Variance Ratios of the first', n, 'components is',
          np.sum(pca_model.explained_variance_ratio_))
    
    plot_evr(pca_model.explained_variance_ratio_)
    return pc
    
In [6]:
    
from sklearn.manifold import LocallyLinearEmbedding
# Approximate "explained variance ratio" of an embedding: the variance of each
# component divided by the total variance, sorted in descending order
def compute_explained_variance_ratio(transformed_data):
    explained_variance = np.var(transformed_data, axis=0)
    explained_variance_ratio = explained_variance / np.sum(explained_variance)
    explained_variance_ratio = np.sort(explained_variance_ratio)[::-1]
    return explained_variance_ratio
def lle(X, n=10):
    # Scale
    X_scaled = preprocessing.scale(X)
    # LLE ('ltsa' selects the Local Tangent Space Alignment variant of LLE)
    lle_model = LocallyLinearEmbedding(n_neighbors=25, n_components=n, method='ltsa')
    pc = lle_model.fit_transform(X_scaled)
    ex_var_ratio = compute_explained_variance_ratio(pc)
    
    print('\nExplained Variance Ratios:')
    print(ex_var_ratio)
    # print('\nSum of Explained Variance Ratios of', n, 'components is',
    #       np.sum(ex_var_ratio))
    
    return pc
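For intuition, here is a quick sanity check of compute_explained_variance_ratio on a small, made-up 3x2 embedding (the values below are illustrative only): the ratios are simply the per-component variances normalized so they sum to 1.

# Hypothetical check: component variances are 8/3 and 2/3, so the ratios are [0.8, 0.2]
toy_embedding = np.array([[0., 0.],
                          [2., 1.],
                          [4., 2.]])
print(compute_explained_variance_ratio(toy_embedding))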
    
In [7]:
    
from sklearn.model_selection import KFold, cross_val_score
def train_test_clf(clf, clf_name, X, y, k=10):
    '''
    Train and test a classifier using k-fold cross validation
    Args:
        clf: sklearn classifier
        clf_name: classifier name (for printing)
        X: training data (2D numpy matrix)
        y: labels (1D vector)
        k: number of folds (default=10)
    '''
    # Note: KFold does not shuffle by default, so samples keep their original order
    kf = KFold(n_splits=k)
    scores = cross_val_score(clf, X, y, cv=kf)
    acc, acc_std = scores.mean(), scores.std()
    print(clf_name + ' accuracy is %0.4f (+/- %0.3f)' % (acc, acc_std))
    return acc, acc_std
    
In [8]:
    
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
def classify(X, y, gender, feature_type):
    lg = LogisticRegression(penalty='l2')
    knn = KNeighborsClassifier(n_neighbors=7)
    svc = LinearSVC()
    lda = LDA()
    qda = QDA()
    rf = RandomForestClassifier(n_estimators=30) 
    gb = GradientBoostingClassifier(n_estimators=20, max_depth=3)
    et = ExtraTreesClassifier(n_estimators=40, max_depth=5)
    ada = AdaBoostClassifier()
    classifiers = [lg, knn, svc, lda, qda, rf, gb, et, ada]
    clf_names = ['Logistic Regression', 'KNN', 'Linear SVM', 'LDA', 'QDA', \
                 'Random Forest', 'Gradient Boosting', 'Extra Trees', 'AdaBoost']
    accuracies = list()
    
    for clf, name in zip(classifiers, clf_names):
        acc, acc_std = train_test_clf(clf, name, X, y)
        accuracies.append(acc)
        
    # Visualize classifier performance
    x = range(len(accuracies))
    width = 0.6/1.5
    plt.bar(x, accuracies, width)
    
    # Compute chance
    n0, n1 = y.count(0), y.count(1)
    chance = max(n0, n1) / float(n0 + n1)
    
    fig_title = gender + ' Classifier Performance on ' + feature_type + ' features'
    plt.title(fig_title)
    plt.xticks(x, clf_names, rotation=50)
    plt.xlabel('Classifier')
    plt.gca().xaxis.set_label_coords(1.1, -0.025)
    plt.ylabel('Accuracy')
    plt.axhline(chance, color='red', linestyle='--', label='chance') # plot chance
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.85))
    
In [9]:
    
%matplotlib inline
import matplotlib.pyplot as plt
plot = df1_all.plot(kind='hist', alpha=0.5, title='Gender 1 Data Distribution', legend=None)
    
    
In [10]:
    
# Cluster Gender 1 rCBF features
kmeans(df1_all, 'Gender 1')
    
    
In [11]:
    
# PCA 
X1_pca = pca(df1_all, 20)
    
    
    
In [12]:
    
# LLE 
X1_lle = lle(df1_all, 20)
    
    
In [13]:
    
# Classification using PCA features
print('Using PCA features:')
classify(X1_pca, y1, 'Gender 1', 'PCA')
    
    
    
In [14]:
    
# Classification using LLE features
print('Using LLE features:')
classify(X1_lle, y1, 'Gender 1', 'LLE')
    
    
    
    
In [15]:
    
plot = df2_all.plot(kind='hist', alpha=0.5, title='Gender 2 Data Distribution', legend=None)
    
    
In [16]:
    
# Cluster Gender 2 rCBF features
kmeans(df2_all, 'Gender 2')
    
    
In [17]:
    
# PCA
X2_pca = pca(df2_all, 20)
    
    
    
In [18]:
    
# LLE
X2_lle = lle(df2_all, 20)
    
    
In [19]:
    
# Classification using PCA features
print('Using PCA features:')
classify(X2_pca, y2, 'Gender 2', 'PCA')
    
    
    
In [20]:
    
# Classification using LLE features
print('Using LLE features:')
classify(X2_lle, y2, 'Gender 2', 'LLE')
    
    
    
QDA with LLE features finally does much better than chance! \(^o^)/
It seems that non-linear classifiers on features extracted with manifold learning can work well for this problem.
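As a rough follow-up sketch (assuming the remark above refers to the Gender 2 run, so X2_lle and y2 are reused; this is not a tuned pipeline), the promising QDA + LLE combination can be re-checked on its own with the same 10-fold CV helper:

# Re-evaluate QDA on the LLE features by itself, using the helper defined earlier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

qda_check = QuadraticDiscriminantAnalysis()
train_test_clf(qda_check, 'QDA (Gender 2, LLE features)', X2_lle, y2)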