In [124]:
import numpy as np
import networkx as nx
import scipy as sp
import scipy.special  # sp.special is not loaded by "import scipy" alone; needed for binom below
import scipy.stats    # needed for sp.stats.rankdata below
import colorlover as clov
import os
import csv
import pickle
from collections import OrderedDict
from sklearn import cross_validation
from sklearn.preprocessing import normalize
from sklearn.cross_validation import LeaveOneOut, LeaveOneLabelOut
from sklearn.neighbors import KNeighborsClassifier
from IPython.display import HTML
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot, plot
from plotly.graph_objs import *
from numpy import array
init_notebook_mode()
np.random.seed(12345678) # for reproducibility
In [2]:
def loadGraphs(filenames, verb=False):
"""
Given a list of files, returns a dictionary of graphs
Required parameters:
filenames:
- List of filenames for graphs
Optional parameters:
verb:
- Toggles verbose output statements
"""
# Initializes empty dictionary
gstruct = OrderedDict()
for idx, files in enumerate(filenames):
if verb:
print "Loading: " + files
# Adds graphs to dictionary with key being filename
fname = os.path.basename(files)
gstruct[fname] = nx.read_gpickle(files)
return gstruct
def constructGraphDict(names, fs, verb=False):
"""
Given a set of files and a directory to put things, loads graphs.
Required parameters:
names:
- List of names of the datasets
fs:
- Dictionary of lists of files in each dataset
Optional parameters:
verb:
- Toggles verbose output statements
"""
# Loads graphs into memory for all datasets
graphs = OrderedDict()
for idx, name in enumerate(names):
if verb:
print "Loading Dataset: " + name
# The key for the dictionary of graphs is the dataset name
graphs[name] = loadGraphs(fs[name], verb=verb)
return graphs
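A minimal, self-contained usage sketch of these helpers. The tiny graphs and temp directory below are hypothetical placeholders, not data from this study:
In [ ]:
# Hypothetical demo: write two tiny graphs to a temp directory, then load them back.
import tempfile
tmpdir = tempfile.mkdtemp()
for name in ['sub-01', 'sub-02']:
    nx.write_gpickle(nx.complete_graph(3), os.path.join(tmpdir, name + '.gpickle'))
demo_fs = {'DemoSet': [os.path.join(tmpdir, f) for f in sorted(os.listdir(tmpdir))]}
demo_graphs = constructGraphDict(['DemoSet'], demo_fs, verb=True)
print demo_graphs['DemoSet'].keys()  # -> ['sub-01.gpickle', 'sub-02.gpickle']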
In [3]:
dataset_names = ['BNU1',
'BNU3',
'HNU1',
'KKI2009',
'SWU4',
'Templeton114']
xlabs = ['Raw',
'Ranked',
'UN',
'-CM',
'-PM-CM']
series = dataset_names + ['Pooled', 'LOO-Dataset']
cats = OrderedDict()
xs = range(len(xlabs))
for ser in series:
cats[ser] = {'x': xs,
'y': []}
In [4]:
basepath = '/Users/gkiar/code/neurodata/ndmg-paper/data/cloud/'
dir_names = [basepath + d for d in dataset_names]
fs = OrderedDict()
for idx, dd in enumerate(dataset_names):
fs[dd] = [root + "/" + fl for root, dirs, files in os.walk(dir_names[idx])
for fl in files if fl.endswith(".gpickle")]
ps = {os.path.splitext(os.path.basename(fl))[0] : root + "/" + fl
for root, dirs, files in os.walk(basepath+'phenotypes')
for fl in files if fl.endswith(".csv") }
print "Datasets: " + ", ".join([fkey + ' (' + str(len(fs[fkey])) + ')'
for fkey in fs])
print "Total Subjects: %d" % (sum([len(fs[key]) for key in fs]))
In [5]:
graphs = constructGraphDict(dataset_names, fs, verb=False)
In [6]:
phenotypes = OrderedDict()
for dataset in dataset_names:
tmp = csv.reader(open(ps[dataset]))
pheno = OrderedDict()
if dataset == 'KKI2009':
triple = [[t[1].strip(), t[4], int(t[5] == 'M')] for t in tmp][1:] # female=F->0, male=M->1
elif dataset == 'Templeton114':
triple = [[t[0].strip(), t[3], int(t[2] == '1')] for t in tmp][1:] # female=0->0, male=1->1
else:
triple = [[t[0].strip(), t[2], int(t[3] == '2')] for t in tmp
if t[3] != '#' and t[2] != '#'][1:] # female=1->0, male=2->1
for idx, trip in enumerate(triple):
pheno[trip[0]] = trip[1:]
phenotypes[dataset] = pheno
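After this cell, phenotypes maps dataset name -> subject ID -> [age, sex], with sex recoded to 0 (female) / 1 (male) regardless of each study's original coding. A quick sanity check:
In [ ]:
# Print one [age, sex] record per dataset to confirm the recoding.
for dataset in dataset_names:
    first = phenotypes[dataset].keys()[0]
    print dataset, first, phenotypes[dataset][first]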
In [36]:
# All graphs share a parcellation, so take N from the first graph of the first dataset
first_dset = graphs.keys()[0]
N = nx.number_of_nodes(graphs[first_dset][graphs[first_dset].keys()[0]])
feat = np.empty((0, int(sp.special.binom(N,2))), int)
dats = []
ages = np.array(())
sexy = np.array(())
sbjs = []
for idx1, dset in enumerate(graphs):
print(dset),
for idx2, subj in enumerate(graphs[dset]):
A = nx.adjacency_matrix(graphs[dset][subj]).todense()
Au = A[np.triu_indices(A.shape[0], 1)]
feat = np.append(feat, Au, axis=0)
dats.append(dset)
        try:
            # Strip leading zeros from numeric subject IDs to match the phenotype keys
            subj_id = str(int(subj.split('-')[1].split('_')[0]))
        except ValueError:
            subj_id = subj.split('-')[1].split('_')[0]
sbjs.append(subj_id)
ages = np.append(ages, int(phenotypes[dset][subj_id][0]))
sexy = np.append(sexy, int(phenotypes[dset][subj_id][1]))
print feat.shape, ages.shape, sexy.shape
print sum(sexy == 0), sum(sexy == 1)
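For reference, a toy illustration of the upper-triangle vectorization above: an N-node undirected graph yields binom(N, 2) edge-weight features per subject.
In [ ]:
# A 3-node adjacency matrix flattens to binom(3, 2) = 3 off-diagonal features.
toy = np.array([[0, 1, 2],
                [1, 0, 3],
                [2, 3, 0]])
print toy[np.triu_indices(3, 1)]  # -> [1 2 3]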
In [31]:
def classify_help(feat, labs, loos, verb=False):
    means = np.array(())
    # Odd neighbourhood sizes k = 3, 5, ..., 21
    neighbourhoods = (np.arange(10) + 2) * 2 - 1
if verb:
print("Neighbourhoods complete:"),
for i in neighbourhoods:
classif = KNeighborsClassifier(i)
loo = LeaveOneLabelOut(loos)
score = cross_validation.cross_val_score(classif, feat, labs, cv=loo)
means = np.append(means, score.mean())
if verb:
print(i),
if verb:
print(".")
print("Max score: {}".format(np.max(means)))
return np.max(means)
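LeaveOneLabelOut builds one fold per unique label and holds out every row sharing that label, so with subject IDs as labels no subject's scans straddle train and test, and with dataset names as labels an entire dataset is held out at once. A toy sketch:
In [ ]:
# Two labels -> two folds; every row with the held-out label lands in the test set.
for train, test in LeaveOneLabelOut(['a', 'a', 'b', 'b']):
    print train, test
# -> [2 3] [0 1]
#    [0 1] [2 3]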
In [66]:
def rank(data):
ranked = np.copy(data)
for idx in range(ranked.shape[0]):
ranked[idx, :] = sp.stats.rankdata(ranked[idx,:])
return ranked
def cohort(data):
    # Subtract each dataset's (cohort's) mean connectome from its members (-CM)
    data_cm = np.copy(data)
for idx, dset in enumerate(dataset_names):
ind = np.array([dset == d for d in dats])
curr = data_cm[ind > 0]
mean = np.mean(curr, axis=0)
updated = curr - mean
data_cm[ind > 0] = updated
return data_cm
def popcohort(data):
    # Subtract the population mean, then each cohort's mean (-PM-CM)
    globalmean = np.mean(data, axis=0)
    return cohort(data - globalmean)
def unit(data):
return normalize(data)
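A quick check of the rank normalization: each subject's edge weights are replaced by their within-subject ranks, discarding overall scale.
In [ ]:
print rank(np.array([[10., 0., 5.]]))  # -> [[ 3.  1.  2.]]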
In [67]:
verb = True
cohortmeans = cohort(feat)        # features with cohort means removed (-CM)
popcohortmeans = popcohort(feat)  # features with population and cohort means removed (-PM-CM)
for ser in series:
yt = []
if verb:
print("Series: {}".format(ser))
if ser in ["Pooled", "LOO-Dataset"]:
if ser == "Pooled":
feet = feat
labs = sexy
loos = sbjs
else:
feet = feat
labs = sexy
loos = dats
# Raw
yt += [classify_help(feet, labs, loos, verb=verb)]
# Rank
yt += [classify_help(rank(feet), labs, loos, verb=verb)]
# UN
yt += [classify_help(unit(feet), labs, loos, verb=verb)]
# -CM
yt += [classify_help(cohortmeans, labs, loos, verb=verb)]
# -PM-CM
yt += [classify_help(popcohortmeans, labs, loos, verb=verb)]
else:
inds = [i for i,d in enumerate(dats) if d == ser]
feet = feat[inds, :]
labs = sexy[inds]
loos = [sbjs[i] for i in inds]
# Raw
yt += [classify_help(feet, labs, loos, verb=verb)]
# Rank
yt += [classify_help(rank(feet), labs, loos, verb=verb)]
# UN
yt += [classify_help(unit(feet), labs, loos, verb=verb)]
# -CM
cohorts = cohortmeans[inds]
yt += [classify_help(cohorts, labs, loos, verb=verb)]
# -PM-CM
popcohorts = popcohortmeans[inds]
yt += [classify_help(popcohorts, labs, loos, verb=verb)]
cats[ser]['y'] = yt
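Each series now holds one best-over-k accuracy per normalization, in the order given by xlabs; a quick look at the pooled results:
In [ ]:
print xlabs
print cats['Pooled']['y']  # five accuracies: Raw, Ranked, UN, -CM, -PM-CM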
In [136]:
for idx, ser in enumerate(series):
    # Jitter each series' x positions slightly so overlapping markers stay visible
    cats[ser]['x'] = list(np.asarray(xs) + 0.02 * (len(series) / 2 - idx))
In [101]:
fname = '/Users/gkiar/code/neurodata/ndmg-paper/data/cloud/betterclass.pkl'
with open(fname, 'wb') as fhand:
    pickle.dump(cats, fhand)
In [128]:
dat = pickle.load(open(fname, 'rb'))
cats = OrderedDict(dat.items())
In [130]:
42+300+47+114+114
Out[130]:
617
In [138]:
nsubs = [42, 300, 47, 114, 114, 454, 1071, 1071]
series_sorted = ['KKI2009', 'HNU1', 'BNU3', 'BNU1', 'Templeton114', 'SWU4', 'Pooled', 'LOO-Dataset']
key_list = series_sorted
cols = {'KKI2009': 'rgb(253,212,158)',
'HNU1': 'rgb(253,187,132)',
'BNU3': 'rgb(252,141,89)',
'BNU1': 'rgb(239,101,72)',
'Templeton114': 'rgb(215,48,31)',
'SWU4': 'rgb(127,0,0)',
'Templeton255': 'rgb(90,0,0)',
'Pooled': 'rgb(0,200,50)',
'LOO-Dataset': 'rgb(0,50,200)'}
data = []
for idx, ser in enumerate(key_list):
    shape = 'diamond' if ser in ["Pooled", "LOO-Dataset"] else 'dot'
size = 1.5*np.log2(nsubs[idx])
data += [
Scatter(x=cats[ser]['x'],
y=cats[ser]['y'],
marker=Marker(
symbol=shape,
size=size,
opacity=0.9,
color=cols[ser]
),
mode='markers',
name=ser,
hoverinfo='y+name',
legendgroup=ser
)
]
# Chance accuracy is the majority-class proportion
chance = np.max((sum(sexy), len(sexy) - sum(sexy))) / len(sexy)
data += [{'legendgroup': 'Chance',
'line': {'color': 'rgba(100,100,100, 0.8)',
'dash': 'dash'},
'marker': {'symbol': 'none'},
'name': 'Chance',
'type': 'scatter',
'x': [-1, 5],
'y': [chance, chance]}]
fig = Figure(data=data)
fig.layout['yaxis']['title'] = 'Classification Accuracy'
# fig.layout['yaxis']['nticks'] = 5
fig.layout['xaxis']['range'] = [-0.5, 4.5]
# fig.layout['xaxis']['title'] = 'Neighbourhood Size (k)'
fig.layout['xaxis']['tickvals'] = xs
fig.layout['xaxis']['ticktext'] = xlabs
fig.layout['xaxis']['zeroline'] = False
# fig.layout['legend']['orientation'] = 'h'
fig.layout['title'] = 'KNN Sex Classification With Various Normalizations Using the Desikan Parcellation'
plot(fig, validate=False, filename='/Users/gkiar/code/neurodata/ndmg-paper/code/class_regress/knn-classif.html')