In [15]:
import networkx as nx
import numpy as np
import os.path as op
import scipy as sp

import os
import csv

from copy import deepcopy
from collections import OrderedDict
from sklearn.preprocessing import normalize
from sklearn.neighbors import DistanceMetric
from scipy.linalg import svd
from scipy.linalg import norm

np.random.seed(12345678)  # for reproducibility, set random seed

In [5]:
path = '/Users/gkiar/code/ocp/ndmg-paper/data/cloud/'
# Subsets of the following candidate datasets were also explored:
# ['BNU1', 'BNU3', 'HNU1', 'Jung2015', 'KKI2009', 'MRN114', 'MRN1313', 'NKI1', 'NKIENH', 'SWU4']
dsets = ['BNU1', 'KKI2009', 'NKI1', 'SWU4', 'HNU1']

dir_names = [op.join(path, d) for d in dsets]

N = 70  # number of nodes in each graph

fs = OrderedDict()
for idx, dd in enumerate(dsets):
    fs[dd] = [root + "/" + fl for root, dirs, files in os.walk(dir_names[idx])
              for fl in files if fl.endswith(".gpickle")]

print "Datasets: " + ", ".join([fkey + ' (' + str(len(fs[fkey])) + ')'
                                for fkey in fs])
S = sum([len(fs[key]) for key in fs])
print "Total Subjects: %d" % (S)


Datasets: BNU1 (114), KKI2009 (42), NKI1 (40), SWU4 (454), HNU1 (300)
Total Subjects: 950

In [19]:
def loadGraphs(filenames, verb=False):
    """
    Given a list of files, returns a dictionary of graphs

    Required parameters:
        filenames:
            - List of filenames for graphs
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    #  Initializes empty dictionary
    gstruct = OrderedDict()
    for fl in filenames:
        if verb:
            print "Loading: " + fl
        #  Adds graphs to dictionary with key being filename
        fname = os.path.basename(fl)
        gstruct[fname] = nx.read_gpickle(fl)
    return gstruct

def constructGraphDict(names, fs, verb=False):
    """
    Given dataset names and a dictionary of their files, loads all graphs.

    Required parameters:
        names:
            - List of names of the datasets
        fs:
            - Dictionary of lists of files in each dataset
    Optional parameters:
        verb:
            - Toggles verbose output statements
    """
    #  Loads graphs into memory for all datasets
    graphs = OrderedDict()
    for name in names:
        if verb:
            print "Loading Dataset: " + name
        # The key for the dictionary of graphs is the dataset name
        graphs[name] = loadGraphs(fs[name], verb=verb)
    return graphs

def rdf(dist, ids):
    """
    Computes reliability: for each pair of scans from the same subject,
    the fraction of between-subject distances larger than the
    within-subject distance (ties count as half).

    Required parameters:
        dist:
            - N x N matrix of distances between all pairs of scans
        ids:
            - List of N subject ids, one per scan
    """
    N = dist.shape[0]
    assert(N == len(ids))
    rdf_vals = []

    for i in np.arange(0, N):
        # All scans belonging to the same subject as scan i
        ind = [idx for idx, x in enumerate(ids) if x == ids[i]]
        for j in ind:
            if i != j:
                # Mask out within-subject distances, keep the rest
                di = deepcopy(dist[i, :])
                di[ind] = np.inf
                d = dist[i, j]
                diff = di[np.where(~np.isinf(di))]
                # Fraction of between-subject distances exceeding the
                # within-subject one, with ties counted as half
                rdf_vals += [1.0 - ((np.sum(diff < d) + 0.5*np.sum(diff == d)) /
                                    (1.0*(N - len(ind))))]
    return rdf_vals

def partial_disc(D, labels, subject, trial1, trial2):
    """
    For one within-subject pair of trials, computes the fraction of
    between-subject distances exceeding the within-subject distance.
    """
    enum = np.arange(D.shape[0])
    # Distance between the two trials of this subject
    idx1 = [i for i, x in enumerate(labels) if x == subject]
    t1 = enum[idx1][trial1]
    t2 = enum[idx1][trial2]
    d_t1_t2 = D[t1][t2]

    # Distances from trial1 to every scan of every other subject
    idx2 = [i for i, x in enumerate(labels) if x != subject]
    d_ra = [D[t1][x] for x in enum[idx2]]

    return np.mean(d_t1_t2 < d_ra)

def distance_matrix(data, metric, symmetric=True):
    """
    Computes pairwise distances between the n graphs stacked along the
    third axis of data, applying the metric to matrix differences.
    """
    n = data.shape[2]
    dist_matrix = np.zeros((n, n))
    if symmetric:
        # Only compute the lower triangle and mirror it
        for i in range(n):
            for j in range(i):
                tmp = metric(data[:, :, i] - data[:, :, j])
                dist_matrix[i][j] = tmp
                dist_matrix[j][i] = tmp
    else:
        for i in range(n):
            for j in range(n):
                dist_matrix[i][j] = metric(data[:, :, i] - data[:, :, j])
    return dist_matrix

def discriminability(data, labels, metric):
    dist_matrix = distance_matrix(data, metric)
    partials = []
    for s in list(set(labels)):
        num = labels.count(s)
        for t in range(num):
            for tt in range(num):
                if tt != t:
                    p = partial_disc(dist_matrix, labels, s, t, tt)
                    partials.append(p)
    return dist_matrix, np.mean(partials)
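
A quick toy check of rdf() (handcrafted values, not study data): each entry of the returned list is, for one within-subject scan pair, the fraction of between-subject distances larger than the within-subject distance, with ties counted as half.

In [ ]:
toy_ids = ['a', 'a', 'b', 'b']
# 4 x 4 toy distance matrix: subject 'a' owns scans 0-1, subject 'b' owns
# scans 2-3; the 0.5 entries make one cross-subject distance smaller than
# the corresponding within-subject distance.
toy_dist = np.array([[0.0, 1.0, 2.0, 3.0],
                     [1.0, 0.0, 0.5, 5.0],
                     [2.0, 0.5, 0.0, 1.0],
                     [3.0, 5.0, 1.0, 0.0]])
toy_rdf = rdf(toy_dist, toy_ids)
print toy_rdf           # [1.0, 0.5, 0.5, 1.0]
print np.mean(toy_rdf)  # 0.75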

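An end-to-end check of discriminability() on synthetic graphs (a hypothetical sketch, not study data): scans of the same subject share a base matrix plus small noise, so well-separated subjects should score 1.0.

In [ ]:
# Two toy subjects, two noisy scans each; within-subject distances stay
# small relative to between-subject distances, so every partial is 1.0.
n_nodes = 16
bases = {'s1': np.random.rand(n_nodes, n_nodes),
         's2': np.random.rand(n_nodes, n_nodes)}
toy_labels = ['s1', 's1', 's2', 's2']
toy_data = np.zeros((n_nodes, n_nodes, len(toy_labels)))
for c, lab in enumerate(toy_labels):
    toy_data[:, :, c] = bases[lab] + 0.01 * np.random.rand(n_nodes, n_nodes)
D_toy, d_hat = discriminability(toy_data, toy_labels, norm)
print d_hat  # 1.0
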
In [7]:
graphs = constructGraphDict(dsets, fs, verb=False)

All subjects pooled together; each graph is labeled by its subject ID.


In [23]:
bigmat = np.zeros((N, N, 0))
bigids = []

for dset in graphs.keys():
    mat = np.zeros((N, N, len(graphs[dset].keys())))
    ids = []
    c = 0
    for subj in graphs[dset].keys():
        # Filenames look like 'sub-<id>_...'; keep just the subject id
        ids += [subj.split("_")[0].split("-")[1]]
        
        tmpg = np.array(nx.adj_matrix(graphs[dset][subj]).todense())
        mat[:, :, c] = tmpg
        c += 1
    print "Shape: {}, {}".format(mat.shape, len(ids))
    
    dist = distance_matrix(mat, norm)
    myrdf = rdf(dist, ids)
    disc = np.mean(myrdf)
#     dist, disc = discriminability(mat, ids, norm)
    print '{} ({}): {}'.format(dset, len(graphs[dset].keys()), disc)

    f = open('ids_{}.csv'.format(dset), 'wb')
    f.write("\n".join(ids))
    f.close()
    np.savetxt('dist_{}.csv'.format(dset), dist, delimiter=",")
    
    bigmat = np.append(bigmat, mat, axis=2)
    bigids += ids

print "Shape: {}, {}".format(bigmat.shape, len(bigids))
bigdist = distance_matrix(bigmat, norm)
myrdf = rdf(bigdist, bigids)
disc = np.mean(myrdf)
print 'Total ({}): {}'.format(S, disc)

f = open('ids_all.csv', 'wb')
f.write("\n".join(bigids))
f.close()
np.savetxt('dist_all.csv', bigdist, delimiter=",")


Shape: (70, 70, 114), 114
BNU1 (114): 0.984257518797
Shape: (70, 70, 42), 42
KKI2009 (42): 1.0
Shape: (70, 70, 40), 40
NKI1 (40): 0.983552631579
Shape: (70, 70, 454), 454
SWU4 (454): 0.883630267826
Shape: (70, 70, 300), 300
HNU1 (300): 0.993232439336
Shape: (70, 70, 950), 950
Total (950): 0.979084429723
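
As a sanity check, the pooled discriminability can be recomputed from the files written above (assumes the previous cell was run in this directory):

In [ ]:
# Reload the saved pooled distance matrix and subject ids and recompute;
# this should reproduce the Total value printed above (~0.979).
saved_dist = np.genfromtxt('dist_all.csv', delimiter=',')
with open('ids_all.csv') as fid:
    saved_ids = fid.read().splitlines()
print np.mean(rdf(saved_dist, saved_ids))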

In [ ]: