Text, materials, and formatting heavily borrowed from grelliam.
For independent graphs, check that the off-diagonal covariance is approximately 0.
$x_i \stackrel{iid}{\sim} F$
$(x_1, x_2, \ldots, x_n) \sim F = \prod_{i=1}^n F_i$
$F_i = F_j, \forall i,j$
For identically distributed graphs, check that the optimal number of clusters is 1.
$F = \prod_{j=1}^J F_j, \quad J < n$
$\hat{F} = \sum_{j=1}^J w_j F_j(\theta)$
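As a preview, below is a minimal sketch of both checks, assuming the graphs are loaded into an array X of shape (rois, rois, n) as in the cells that follow; the helper name check_assumptions is illustrative only, and the GMM call matches the older scikit-learn API used later in this notebook.
In [ ]:
# Minimal sketch of both assumption checks (illustrative helper; X is loaded below).
import numpy as np
import sklearn.mixture

def check_assumptions(X, max_clusters=15):
    # Flatten each graph into a row vector: rows = graphs, columns = edges.
    rois = X.shape[0]
    vecs = np.reshape(X, (rois**2, X.shape[2])).T
    # Independence: off-diagonal entries of the graph-by-graph covariance
    # should be approximately 0.
    covar = np.cov(vecs)
    hollow = covar - covar.diagonal() * np.eye(covar.shape[0])
    print "Mean magnitude of hollowed (zero-diagonal) covariance: " + str(np.abs(hollow).mean())
    # Identical distribution: the BIC-optimal number of mixture components
    # should be 1.
    bics = []
    for k in range(1, max_clusters + 1):
        gmm = sklearn.mixture.GMM(n_components=k, n_iter=1000, covariance_type='diag')
        gmm.fit(vecs)
        bics.append(gmm.bic(vecs))
    print "BIC-optimal number of clusters: " + str(np.argmin(bics) + 1)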
In [3]:
# change working dir
path = "/Users/albertlee/claritycontrol/code/scripts" # use your own path
import os
os.chdir(path)
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import igraph as ig
%matplotlib inline
# Initializing dataset names
dnames = list(['../data/hist'])
print "Dataset: " + ", ".join(dnames)
# Getting graph names
fs = list()
for dd in dnames:
    fs.extend([root + '/' + f for root, dirs, files in os.walk(dd) for f in files])
fs = fs[1:]  # drop the first entry in the listing (not a graph file)
def loadGraphs(filenames, rois, printer=False):
    """Load GraphML files into a rois x rois x n array of adjacency matrices."""
    A = np.zeros((rois, rois, len(filenames)))
    for idx, fname in enumerate(filenames):
        if printer:
            print "Loading: " + fname
        g = ig.Graph.Read_GraphML(fname)
        tempg = g.get_adjacency(attribute='weight')
        A[:, :, idx] = np.asarray(tempg.data)
    return A
# Load X
X = loadGraphs(fs, 70)
print X.shape
# Load Y
ys = csv.reader(open('../data/points/Fear199.csv'))
y = [row[5] for row in ys]
y = [1 if x == 'F' else 0 for x in y[1:]]  # drop the first entry (header row)
In [ ]:
vectorized = np.reshape(X, (X.shape[0]**2, X.shape[2])).T
covar = np.cov(vectorized)
plt.figure(figsize=(7,7))
plt.imshow(covar)
plt.title('Covariance between graphs')
plt.colorbar()
plt.show()
diag = covar.diagonal()*np.eye(covar.shape[0])
hollow = covar-diag
d_det = np.linalg.det(diag)
h_det = np.linalg.det(hollow)
plt.figure(figsize=(11,8))
plt.subplot(121)
plt.imshow(diag)
plt.clim([0, np.max(covar)])
plt.title('Determinant of on-diagonal: ' + str(d_det))
plt.subplot(122)
plt.imshow(hollow)
plt.clim([0, np.max(covar)])
plt.title('Determinant of off-diagonal: ' + str(h_det))
plt.show()
print "Ratio of on- and off-diagonal determinants: " + str(d_det/h_det)
From the above, we conclude that the assumption that the graphs are independent is false: the off-diagonal entries of the cross-graph covariance matrix are substantial relative to the on-diagonal entries, rather than approximately zero.
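Since determinants of large covariance matrices can easily under- or overflow, a complementary summary is to compare average magnitudes directly. This is a sketch reusing the covar and hollow arrays from the cell above.
In [ ]:
# Rough magnitude comparison of on- vs off-diagonal covariance.
n = covar.shape[0]
on_mean = np.trace(covar) / n                     # average variance (diagonal)
off_mean = np.abs(hollow).sum() / (n * (n - 1))   # average |off-diagonal| entry
print "Mean on-diagonal magnitude:  " + str(on_mean)
print "Mean off-diagonal magnitude: " + str(off_mean)
print "Ratio: " + str(on_mean / off_mean)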
In [ ]:
import sklearn.mixture
i = np.linspace(1,15,15,dtype='int')
print i
bic = np.array(())
for idx in i:
    print "Fitting and evaluating model with " + str(idx) + " clusters."
    gmm = sklearn.mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag')
    gmm.fit(vectorized)
    bic = np.append(bic, gmm.bic(vectorized))
plt.figure(figsize=(7,7))
plt.plot(i, 1.0/bic)
plt.title('BIC')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.show()
print bic
From the above we observe that the elbow of the BIC curve lies at 6 clusters rather than 1, suggesting that our graphs were not sampled identically from a single distribution. Based on the evidence provided, this assumption is also false.
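Rather than reading the elbow off the plot, the BIC-minimizing component count can be taken directly. This is a short sketch reusing the i and bic arrays from the cell above.
In [ ]:
# The model with the lowest BIC is preferred; under the assumption this would be 1.
best_k = i[np.argmin(bic)]
print "BIC-optimal number of clusters: " + str(best_k)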
In [ ]:
vect = np.reshape(X, (X.shape[0]**2, X.shape[2]))
covar = np.cov(vect)
plt.figure(figsize=(7,7))
plt.imshow(covar)
plt.title('Covariance between edges')
plt.colorbar()
plt.show()
diag = covar.diagonal()*np.eye(covar.shape[0])
hollow = covar-diag
d_sum = np.sum(diag)
h_sum = np.sum(hollow)
plt.figure(figsize=(11,8))
plt.subplot(121)
plt.imshow(diag)
plt.clim([0, np.max(covar)])
plt.title('Sum of on-diagonal: ' + str(d_sum))
plt.subplot(122)
plt.imshow(hollow)
plt.clim([0, np.max(covar)])
plt.title('Sum of off-diagonal: ' + str(h_sum))
plt.show()
print "Ratio of on- and off-diagonal covariance sums: " + str(d_sum/h_sum)
From the above, we conclude that the edges are not independent of one another, as the ratio of the on- to off-diagonal covariance sums is very small. This assumption is false.
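Because the covariance sums depend on the scale of the edge weights, a scale-free complement is the average absolute correlation between distinct edges. This is a sketch reusing the covar array (the edge-by-edge covariance) from the cell above; edges that are constant across graphs produce NaNs and are ignored.
In [ ]:
# Convert the edge covariance into a correlation matrix and summarize its
# off-diagonal magnitude.
sd = np.sqrt(covar.diagonal())
with np.errstate(divide='ignore', invalid='ignore'):
    corr = covar / np.outer(sd, sd)   # NaN where an edge has zero variance
off = corr[~np.eye(corr.shape[0], dtype=bool)]
print "Mean |off-diagonal| edge correlation: " + str(np.nanmean(np.abs(off)))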
In [ ]:
import sklearn.mixture
i = np.linspace(1,15,15,dtype='int')
print i
bic2 = np.array(())
for idx in i:
    print "Fitting and evaluating model with " + str(idx) + " clusters."
    gmm = sklearn.mixture.GMM(n_components=idx, n_iter=1000, covariance_type='diag')
    gmm.fit(vect.T)
    bic2 = np.append(bic2, gmm.bic(vect.T))
plt.figure(figsize=(7,7))
plt.plot(i, 1.0 / bic2)
plt.title('BIC')
plt.ylabel('score')
plt.xlabel('number of clusters')
plt.show()
print bic2
Looking at the BIC curve, we observe that the optimal number of clusters is 5, indicating that the edges are not in fact identically distributed. This assumption appears to be false.
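Note that sklearn.mixture.GMM has since been removed from scikit-learn; on versions 0.18 and later the same sweep could be written with GaussianMixture, as in the sketch below (max_iter replaces n_iter).
In [ ]:
# Equivalent BIC sweep on newer scikit-learn (>= 0.18).
from sklearn.mixture import GaussianMixture

bic_new = []
for k in i:
    gmm = GaussianMixture(n_components=k, max_iter=1000, covariance_type='diag')
    gmm.fit(vect.T)
    bic_new.append(gmm.bic(vect.T))
print "BIC-optimal number of clusters: " + str(i[np.argmin(bic_new)])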
In [ ]:
import scipy.stats as ss
ya = np.array(y)
# fraction of nonzero edges per graph (70*70 = 4900 possible edges)
edgeprob = 1.0 * np.sum(1.0 * (vectorized > 0), 1) / 4900
vals = ss.linregress(edgeprob, ya)
m = vals[0]  # slope
c = vals[1]  # intercept
def comp_value(m, c, data):
    return m * data + c
resi = np.array(())
for idx, subj in enumerate(ya):
    temp = comp_value(m, c, edgeprob[idx])
    resi = np.append(resi, subj - temp)
plt.figure(figsize=(7,7))
plt.scatter(edgeprob, resi)
plt.title('Residual assignment error')
plt.xlabel('edge probability')
plt.ylabel('error')
plt.show()
From the above we can see quite plainly that our simple linear model fails to separate subjects based on their edge probability. Thus, this assumption is also false.
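As a complementary check, we can compare the class-conditional mean edge probabilities with a two-sample t-test; a large p-value is consistent with edge probability carrying no class signal. This is a sketch reusing the edgeprob and ya arrays from the cell above.
In [ ]:
# Compare edge probability between the two classes.
import scipy.stats as ss
group0 = edgeprob[ya == 0]
group1 = edgeprob[ya == 1]
t, p = ss.ttest_ind(group0, group1)
print "Class means: " + str(group0.mean()) + ", " + str(group1.mean())
print "t = " + str(t) + ", p = " + str(p)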