Imports and Function Definitions


In [124]:
import numpy as np
import networkx as nx
import scipy as sp
import colorlover as clov
import os
import csv
import pickle

from collections import OrderedDict
from sklearn import cross_validation
from sklearn.preprocessing import normalize
from sklearn.cross_validation import LeaveOneOut, LeaveOneLabelOut
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import HTML
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
from plotly.graph_objs import *
from numpy import array

init_notebook_mode()
np.random.seed(12345678)  # for reproducibility



In [2]:
def loadGraphs(filenames, verb=False):
    """
    Given a list of files, returns a dictionary of graphs

    Required parameters:
        filenames:
            - List of filenames for graphs
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    #  Initializes empty dictionary
    gstruct = OrderedDict()
    for idx, files in enumerate(filenames):
        if verb:
            print "Loading: " + files
        #  Adds graphs to dictionary with key being filename
        fname = os.path.basename(files)
        gstruct[fname] = nx.read_gpickle(files)
    return gstruct

def constructGraphDict(names, fs, verb=False):
    """
    Given a set of files and a directory to put things, loads graphs.

    Required parameters:
        names:
            - List of names of the datasets
        fs:
            - Dictionary of lists of files in each dataset
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    #  Loads graphs into memory for all datasets
    graphs = OrderedDict()
    for idx, name in enumerate(names):
        if verb:
            print "Loading Dataset: " + name
        # The key for the dictionary of graphs is the dataset name
        graphs[name] = loadGraphs(fs[name], verb=verb)
    return graphs

Data Preparation


In [3]:
dataset_names = ['BNU1',
                 'BNU3',
                 'HNU1',
                 'KKI2009',
                 'SWU4',
                 'Templeton114']

xlabs = ['Raw',
         'Ranked',
         'UN',
         '-CM',
         '-PM-CM']

series = dataset_names + ['Pooled', 'LOO-Dataset']

cats = OrderedDict()
xs = range(len(xlabs))

for ser in series:
    cats[ser] = {'x': xs,
                 'y': []}

In [4]:
basepath = '/Users/gkiar/code/neurodata/ndmg-paper/data/cloud/'
dir_names = [basepath + '/' + d for d in dataset_names]

fs = OrderedDict()
for idx, dd in enumerate(dataset_names):
    fs[dd] = [root + "/" + fl for root, dirs, files in os.walk(dir_names[idx])
              for fl in files if fl.endswith(".gpickle")]

ps = {os.path.splitext(os.path.basename(fl))[0] : root + "/" + fl
      for root, dirs, files in os.walk(basepath+'phenotypes')
      for fl in files if fl.endswith(".csv") }

print "Datasets: " + ", ".join([fkey + ' (' + str(len(fs[fkey])) + ')'
                                for fkey in fs])
print "Total Subjects: %d" % (sum([len(fs[key]) for key in fs]))


Datasets: BNU1 (114), BNU3 (47), HNU1 (300), KKI2009 (42), SWU4 (454), Templeton114 (114)
Total Subjects: 1071

In [5]:
graphs = constructGraphDict(dataset_names, fs, verb=False)

In [6]:
phenotypes = OrderedDict()
for dataset in dataset_names:
    tmp = csv.reader(open(ps[dataset]))
    pheno = OrderedDict()
    if dataset == 'KKI2009':
        triple = [[t[1].strip(), t[4], int(t[5] == 'M')] for t in tmp][1:]  # female=F->0, male=M->1
    elif dataset == 'Templeton114':
        triple = [[t[0].strip(), t[3], int(t[2] == '1')] for t in tmp][1:]  # female=0->0, male=1->1
    else:
        triple = [[t[0].strip(), t[2], int(t[3] == '2')] for t in tmp
                  if t[3] != '#' and t[2] != '#'][1:]  # female=1->0, male=2->1
    
    for idx, trip in enumerate(triple):
        pheno[trip[0]] = trip[1:]
    phenotypes[dataset] = pheno

In [36]:
N = nx.number_of_nodes(graphs[graphs.keys()[0]][graphs[graphs.keys()[0]].keys()[0]])
feat = np.empty((0, int(sp.special.binom(N,2))), int)
dats = list(())
ages = np.array(())
sexy = np.array(())
sbjs = list(())

for idx1, dset in enumerate(graphs):
    print(dset),
    for idx2, subj in enumerate(graphs[dset]):
        A = nx.adjacency_matrix(graphs[dset][subj]).todense()
        Au = A[np.triu_indices(A.shape[0], 1)]
        feat = np.append(feat, Au, axis=0)
        dats.append(dset)

        try:
            subj_id = str(int(subj.split('-')[1].split('_')[0]))
        except:
            subj_id = subj.split('-')[1].split('_')[0]

        sbjs.append(subj_id)
        ages = np.append(ages, int(phenotypes[dset][subj_id][0]))
        sexy = np.append(sexy, int(phenotypes[dset][subj_id][1]))
print feat.shape, ages.shape, sexy.shape
print sum(sexy == 0), sum(sexy == 1)


BNU1 BNU3 HNU1 KKI2009 SWU4 Templeton114 (1071, 2415) (1071,) (1071,)
522 549

Classification helpers


In [31]:
def classify_help(feat, labs, loos, verb=False):
    means = np.array(())
    neighbourhoods = (np.arange(10)+2)*2-1
    if verb:
        print("Neighbourhoods complete:"),
    for i in neighbourhoods:
        classif = KNeighborsClassifier(i)
        loo = LeaveOneLabelOut(loos)
        score = cross_validation.cross_val_score(classif, feat, labs, cv=loo)
        means = np.append(means, score.mean())
        if verb:
            print(i),
    if verb:
        print(".")
        print("Max score: {}".format(np.max(means)))
    return np.max(means)

In [66]:
def rank(data):
    ranked = np.copy(data)
    for idx in range(ranked.shape[0]):
        ranked[idx, :] = sp.stats.rankdata(ranked[idx,:])
    return ranked

def cohort(data):
    data_cm = np.copy(data)
    for idx, dset in enumerate(dataset_names):
        ind = np.array([dset == d for d in dats])
        curr = data_cm[ind > 0]
        mean = np.mean(curr, axis=0)
        updated = curr - mean
        data_cm[ind > 0] = updated
    return data_cm

def popcohort(data):
    mean = np.mean(data, axis=0)
    return data - globalmean - cohort(data-globalmean)

def unit(data):
    return normalize(data)

In [67]:
verb = True

cohortmeans = cohort(feat)
popcohortmeans = popcohort(feat)

for ser in series:
    yt = []
    if verb:
        print("Series: {}".format(ser))
    if ser in ["Pooled", "LOO-Dataset"]:
        if ser == "Pooled":
            feet = feat
            labs = sexy
            loos = sbjs
        else:
            feet = feat
            labs = sexy
            loos = dats
        # Raw
        yt += [classify_help(feet, labs, loos, verb=verb)]
        # Rank
        yt += [classify_help(rank(feet), labs, loos, verb=verb)]
        # UN
        yt += [classify_help(unit(feet), labs, loos, verb=verb)]
        # -CM
        yt += [classify_help(cohortmeans, labs, loos, verb=verb)]
        # -PM-CM
        yt += [classify_help(popcohortmeans, labs, loos, verb=verb)]
        
    else:
        inds = [i for i,d in enumerate(dats) if d == ser]
        feet = feat[inds, :]
        labs = sexy[inds] 
        loos = [sbjs[i] for i in inds]
        # Raw
        yt += [classify_help(feet, labs, loos, verb=verb)]
        # Rank
        yt += [classify_help(rank(feet), labs, loos, verb=verb)]
        # UN
        yt += [classify_help(unit(feet), labs, loos, verb=verb)]
        # -CM
        cohorts = cohortmeans[inds]
        yt += [classify_help(cohorts, labs, loos, verb=verb)]
        # -PM-CM
        popcohorts = popcohortmeans[inds]
        yt += [classify_help(popcohorts, labs, loos, verb=verb)]

    cats[ser]['y'] = yt


Series: BNU1
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.798245614035
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.728070175439
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.80701754386
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.798245614035
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.605263157895
Series: BNU3
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.659574468085
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

21 .
Max score: 0.574468085106
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.595744680851
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.659574468085
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.63829787234
Series: HNU1
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.59
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.376666666667
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.62
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.59
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.466666666667
Series: KKI2009
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.690476190476
Neighbourhoods complete: 3 5 7 9 11 13 15 17 
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

19 21 .
Max score: 0.595238095238
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.714285714286
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.690476190476
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.52380952381
Series: SWU4
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.700440528634
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.581497797357
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.693832599119
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.700440528634
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.590308370044
Series: Templeton114
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.587719298246
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

21 .
Max score: 0.631578947368
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.622807017544
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.587719298246
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.59649122807
Series: Pooled
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.62439516129
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.614516129032
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.621975806452
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.630846774194
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.57439516129
Series: LOO-Dataset
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.585293903049
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 
/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py:420: DataConversionWarning:

Data with input dtype int64 was converted to float64 by the normalize function.

21 .
Max score: 0.606054101339
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.599602066998
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.600381154091
Neighbourhoods complete: 3 5 7 9 11 13 15 17 19 21 .
Max score: 0.533708883622

In [136]:
for idx, ser in enumerate(series):
    cats[ser]['x'] = list(np.asarray(xs) + 0.02*(len(series)/2 - idx))

In [101]:
fname = '/Users/gkiar/code/neurodata/ndmg-paper/data/cloud/betterclass.pkl'
with open(fname, 'w') as fhand:
    pickle.dump(cats, fhand)

In [128]:
dat = pickle.load(open(fname, 'r'))
cats = OrderedDict(dat.items())

In [130]:
42+300+47+114+114


Out[130]:
617

In [138]:
nsubs = [42, 300, 47, 114, 114, 454, 1071, 1071]
series_sorted = ['KKI2009', 'HNU1', 'BNU3', 'BNU1', 'Templeton114', 'SWU4', 'Pooled', 'LOO-Dataset']
key_list = series_sorted

cols = {'KKI2009': 'rgb(253,212,158)',
        'HNU1': 'rgb(253,187,132)',
        'BNU3': 'rgb(252,141,89)',
        'BNU1': 'rgb(239,101,72)',
        'Templeton114': 'rgb(215,48,31)',
        'SWU4': 'rgb(127,0,0)',
        'Templeton255': 'rgb(90,0,0)',
        'Pooled': 'rgb(0,200,50)',
        'LOO-Dataset': 'rgb(0,50,200)'}

data = []
for idx, ser in enumerate(key_list):
    shape = 'diamond'if ser in ["Pooled", "LOO-Dataset"] else 'dot' 
    size = 1.5*np.log2(nsubs[idx])
    data += [
        Scatter(x=cats[ser]['x'],
                y=cats[ser]['y'],
                marker=Marker(
                    symbol=shape,
                    size=size,
                    opacity=0.9,
                    color=cols[ser]
                ),
                mode='markers',
                name=ser,
                hoverinfo='y+name',
                legendgroup=ser
        )
    ]

chance = np.max((sum(sexy), len(sexy)-sum(sexy)))/len(sexy)
data += [{'legendgroup': 'Chance',
          'line': {'color': 'rgba(100,100,100, 0.8)',
                   'dash': 'dash'},
          'marker': {'symbol': 'none'},
          'name': 'Chance',
          'type': 'scatter',
          'x': [-1, 5],
          'y': [chance, chance]}]

fig = Figure(data=data)
fig.layout['yaxis']['title'] = 'Classification Accuracy'
# fig.layout['yaxis']['nticks'] = 5

fig.layout['xaxis']['range'] = [-0.5, 4.5]
# fig.layout['xaxis']['title'] = 'Neighbourhood Size (k)'
fig.layout['xaxis']['tickvals'] = xs
fig.layout['xaxis']['ticktext'] = xlabs
fig.layout['xaxis']['zeroline'] = False
# fig.layout['legend']['orientation'] = 'h'

fig.layout['title'] = 'KNN Sex Classification With Various Normalizations Using the Desikan Parcellation'
plot(fig, validate=False, filename='/Users/gkiar/code/neurodata/ndmg-paper/code/class_regress/knn-classif.html')


Out[138]:
'file:///Users/gkiar/code/neurodata/ndmg-paper/code/class_regress/knn-classif.html'

In [ ]: