In [15]:
import networkx as nx
import numpy as np
import os
from copy import deepcopy
from collections import OrderedDict
from scipy.linalg import norm

np.random.seed(12345678)  # for reproducibility, set random seed
In [5]:
# Location of the derived connectomes and the datasets to evaluate
path = '/Users/gkiar/code/ocp/ndmg-paper/data/cloud/'
dsets = ['BNU1', 'KKI2009', 'NKI1', 'SWU4', 'HNU1']
dir_names = [os.path.join(path, d) for d in dsets]
N = 70  # number of nodes in each graph

# Collect the .gpickle connectome files for each dataset
fs = OrderedDict()
for idx, dd in enumerate(dsets):
    fs[dd] = [os.path.join(root, fl)
              for root, dirs, files in os.walk(dir_names[idx])
              for fl in files if fl.endswith(".gpickle")]

print("Datasets: " + ", ".join([fkey + ' (' + str(len(fs[fkey])) + ')'
                                for fkey in fs]))
S = sum([len(fs[key]) for key in fs])
print("Total Subjects: %d" % S)
In [19]:
def loadGraphs(filenames, verb=False):
    """
    Given a list of files, returns a dictionary of graphs keyed by filename.

    Required parameters:
        filenames:
            - List of filenames for graphs
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    gstruct = OrderedDict()
    for fl in filenames:
        if verb:
            print("Loading: " + fl)
        # Add each graph to the dictionary with its base filename as the key
        gstruct[os.path.basename(fl)] = nx.read_gpickle(fl)
    return gstruct
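
# Minimal round-trip check for loadGraphs (a sketch: assumes networkx < 3.0,
# where the gpickle read/write helpers still exist; 'toy.gpickle' is written
# to the working directory purely for illustration)
_toy = nx.erdos_renyi_graph(10, 0.3, seed=1)
nx.write_gpickle(_toy, 'toy.gpickle')
assert list(loadGraphs(['toy.gpickle']).values())[0].number_of_nodes() == 10
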
def constructGraphDict(names, fs, verb=False):
    """
    Given dataset names and a dictionary of their files, loads all graphs.

    Required parameters:
        names:
            - List of names of the datasets
        fs:
            - Dictionary of lists of files in each dataset
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    # Loads graphs into memory for all datasets
    graphs = OrderedDict()
    for name in names:
        if verb:
            print("Loading Dataset: " + name)
        # The key for the dictionary of graphs is the dataset name
        graphs[name] = loadGraphs(fs[name], verb=verb)
    return graphs
def rdf(dist, ids):
    """
    Computes the reliability density: for each ordered pair of same-subject
    scans, the fraction of across-subject distances at least as large as the
    within-pair distance (ties counted as half).

    Required parameters:
        dist:
            - N x N matrix of pairwise distances between graphs
        ids:
            - Length-N list of subject ids, one per graph
    """
    N = dist.shape[0]
    assert N == len(ids)
    rdf_vals = []
    for i in range(N):
        # All scans belonging to the same subject as scan i
        ind = [idx for idx, x in enumerate(ids) if x == ids[i]]
        for j in ind:
            if i != j:
                di = deepcopy(dist[i, :])
                di[ind] = np.inf              # mask within-subject distances
                d = dist[i, j]                # distance to the matched rescan
                diff = di[np.where(~np.isinf(di))]
                # Fraction of across-subject distances larger than d,
                # counting ties as half
                rdf_vals += [1.0 - ((np.sum(diff < d) + 0.5 * np.sum(diff == d)) /
                                    (1.0 * (N - len(ind))))]
    return rdf_vals
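
# Sanity check on a hand-built example: two subjects ('a', 'b') with two
# scans each; every within-subject distance (1) is smaller than every
# across-subject distance (5), so each RDF value should be exactly 1.0
_toy_dist = np.array([[0., 1., 5., 5.],
                      [1., 0., 5., 5.],
                      [5., 5., 0., 1.],
                      [5., 5., 1., 0.]])
_toy_ids = ['a', 'a', 'b', 'b']
assert rdf(_toy_dist, _toy_ids) == [1.0, 1.0, 1.0, 1.0]
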
def partial_disc(D, labels, subject, trial1, trial2):
    """
    For one subject's pair of scans (trial1, trial2), returns the fraction
    of that subject's across-subject distances exceeding the within-pair
    distance.
    """
    enum = np.arange(D.shape[0])
    # Indices of this subject's scans, and the within-pair distance
    idx1 = [i for i, x in enumerate(labels) if x == subject]
    t1 = enum[idx1][trial1]
    t2 = enum[idx1][trial2]
    d_t1_t2 = D[t1][t2]
    # Distances from trial1 to every other subject's scans
    idx2 = [i for i, x in enumerate(labels) if x != subject]
    d_ra = [D[t1][x] for x in enum[idx2]]
    return np.mean(d_t1_t2 < d_ra)
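
# On the same toy matrix, subject 'a''s scan pair (distance 1) beats both of
# its across-subject distances (5), so the partial discriminability is 1.0
assert partial_disc(_toy_dist, _toy_ids, 'a', 0, 1) == 1.0
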
def distance_matrix(data, metric, symmetric=True):
    """
    Computes the n x n matrix of pairwise distances, under the given metric,
    between the n graphs stacked along the last axis of data.
    """
    n = data.shape[2]
    dist_matrix = np.zeros((n, n))
    if symmetric:
        # Compute the lower triangle only, then mirror it
        for i in range(n):
            for j in range(i):
                tmp = metric(data[:, :, i] - data[:, :, j])
                dist_matrix[i][j] = tmp
                dist_matrix[j][i] = tmp
    else:
        for i in range(n):
            for j in range(n):
                dist_matrix[i][j] = metric(data[:, :, i] - data[:, :, j])
    return dist_matrix
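
# Quick check: three stacked 2 x 2 'graphs', the first two identical; under
# the Frobenius norm the distance is zero between the identical pair and
# positive elsewhere
_stack = np.zeros((2, 2, 3))
_stack[:, :, 0] = np.eye(2)
_stack[:, :, 1] = np.eye(2)
_stack[:, :, 2] = np.ones((2, 2))
assert distance_matrix(_stack, norm)[0, 1] == 0.0
assert distance_matrix(_stack, norm)[0, 2] > 0.0
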
def discriminability(data, labels, metric):
    """
    Returns the distance matrix and the mean of partial_disc over every
    ordered pair of same-subject scans.
    """
    dist_matrix = distance_matrix(data, metric)
    partials = []
    for s in set(labels):
        num = labels.count(s)
        # Every ordered pair of this subject's scans
        for t in range(num):
            for tt in range(num):
                if tt != t:
                    partials.append(partial_disc(dist_matrix, labels, s, t, tt))
    return dist_matrix, np.mean(partials)
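As a sanity check before running on real data, the pipeline can be exercised end-to-end on synthetic scans. The labels and perturbations below are hypothetical, chosen so the two subjects are perfectly separable and discriminability should come out to exactly 1.0.
In [ ]:
# Two subjects with two near-identical synthetic scans each
rng = np.random.RandomState(0)
base_a = rng.rand(3, 3)
base_b = base_a + 10.0  # second subject, far from the first
scans = np.stack([base_a, base_a + 0.01, base_b, base_b + 0.01], axis=2)
_, disc = discriminability(scans, ['a', 'a', 'b', 'b'], norm)
print(disc)  # 1.0 for perfectly separable subjects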
In [7]:
graphs = constructGraphDict(dsets, fs, verb=False)
In [23]:
bigmat = np.zeros((N, N, 0))
bigids = []
for dset in graphs:
    mat = np.zeros((N, N, len(graphs[dset])))
    ids = []
    for c, subj in enumerate(graphs[dset]):
        # Subject id is parsed from filenames of the form '<prefix>-<id>_...'
        ids += [subj.split("_")[0].split("-")[1]]
        mat[:, :, c] = np.array(nx.adjacency_matrix(graphs[dset][subj]).todense())
    print("Shape: {}, {}".format(mat.shape, len(ids)))

    # Per-dataset discriminability: mean of the RDF over the distance matrix
    dist = distance_matrix(mat, norm)
    disc = np.mean(rdf(dist, ids))
    print('{} ({}): {}'.format(dset, len(graphs[dset]), disc))

    with open('ids_{}.csv'.format(dset), 'w') as f:
        f.write("\n".join(ids))
    np.savetxt('dist_{}.csv'.format(dset), dist, delimiter=",")

    bigmat = np.append(bigmat, mat, axis=2)
    bigids += ids

# Pooled discriminability across all datasets
print("Shape: {}, {}".format(bigmat.shape, len(bigids)))
bigdist = distance_matrix(bigmat, norm)
disc = np.mean(rdf(bigdist, bigids))
print('Total ({}): {}'.format(S, disc))

with open('ids_all.csv', 'w') as f:
    f.write("\n".join(bigids))
np.savetxt('dist_all.csv', bigdist, delimiter=",")