In [ ]:
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import networkx as nx
%matplotlib inline
We will start by node label prediction. Download this network. It contains protein communications in Baker’s yeast. Each node (protein) has a special binary attribute ICSC (intracellular signaling cascade).
In [ ]:
g = nx.read_gml('./data/ppi.CC.gml.txt')
cc = list(nx.connected_components(g))
g = nx.subgraph(g,cc[0])
g = nx.relabel.convert_node_labels_to_integers(g)
In [ ]:
labels = np.array(nx.get_node_attributes(g, 'ICSC').values(), dtype=float)
In [ ]:
nx.draw_spring(g, node_color = labels)
It might not be clear from the picture above but the level of homogeneity is quite high. For each node we are able to compute the average value of label
In [ ]:
nnICSC = np.asarray(map(lambda(v): np.mean(labels[g.neighbors(v)]), g.nodes_iter()))
nnICSC
plt.figure(figsize=(10,5))
plt.hist(nnICSC[np.where(labels == 1)], bins=6,)
ICM is kind of NN-classifier. Our prediction is based on the largest ratio of nearest neighbours of unlabeled nodes.
In [ ]:
lablNN = labels[:]
idx = np.random.randint(0,len(lablNN), size=40)
lablNN[idx] = np.nan
In [ ]:
# Your code here
## Get the adjacency matrix
A = nx.adjacency_matrix( g )
## Find the unclassified nodes
unlabelled = np.isnan( lablNN )
## Slice the adjacency matrix
# B = A[unlabelled].tocsc()[:,~unlabelled].tocsr()
B = A.tocsc()[:,~unlabelled].tocsr()
## Compute the mean label of the labelled neighbours of each unlabelled node.
new_labels = B.dot( lablNN[~unlabelled] ) / B.sum( axis = 1 ).getA1( )
## Update the labels
lablNN[unlabelled] = new_labels[unlabelled]
Now instead of looking at neigbours we are switching random walks between all the nodes
Just to recall the Label Propagation method:
In [ ]:
# It is better to initialize like that
fixedLabels = labels[:]+1
curLabels = labels[:]+1
# And indicate labeled nodes instead of unlabeled
idxLabeled = np.zeros((g.number_of_nodes(),), dtype=bool)
idxLabeled[np.random.randint(0,len(labels), size=90)] = True
curLabels[~idxLabeled] = 0
In [ ]:
A = nx.adj_matrix( g )
D = sp.sparse.diags( 1.0 / A.sum( axis = 1 ).getA1( ), offsets = 0 )
P = D.dot( A )
In [ ]:
In [ ]:
def LabelPropagation(G, idxLabeled, curLabels, fixedLabels, iters = 1000):
A = nx.adj_matrix( g )
D = sp.sparse.diags( 1.0 / A.sum( axis = 1 ).getA1( ), offsets = 0 )
P = D.dot( A )
# Your code here
return np.round(resultLabels)
In this section we will implement some scoring functions for link prediction and compare the values for adjacent and non-adjacent nodes.
Load french blog network and compute the following scores:
In [ ]:
g = nx.read_gml('./data/fblog.gml.txt')
vNum = g.number_of_nodes()
In [ ]:
def matrixPlot(A):
plt.figure(1, figsize=(6, 6))
plt.imshow(A,
interpolation="none"
)
In [ ]:
# Your code here
spath = nx.floyd_warshall_numpy( g )
matrixPlot( spath )
In [ ]:
# Your code here
A = nx.adjacency_matrix( g )
common_neighbour = A.dot( A ).todense()
matrixPlot( common_neighbour )
In [ ]:
# Your code here
jaccard_score = np.asarray( [ ( len( np.intersect1d( g[v].keys(), g[u].keys() ) ) + 0.0 ) / len( np.union1d( g[v].keys(), g[u].keys() ) )
for v in g.nodes_iter( ) for u in g.nodes_iter( ) ], dtype = np.float ).reshape( 2*[g.number_of_nodes()] )
In [ ]:
matrixPlot( jaccard_score )
$Score(a,b) = \sum\limits_{v \in \text{NN}(a) \cap \text{NN}(b)} \frac{1}{\log |\text{NN}(v)|}$
In [ ]:
# Your code here
adar_score = np.asarray( [ np.sum( [ 1.0 / np.log( len( g[w].keys() ) ) for w in np.intersect1d( g[v].keys(), g[u].keys() ) ] )
for v in g.nodes_iter( ) for u in g.nodes_iter( ) ], dtype = np.float ).reshape( 2*[g.number_of_nodes()] )
In [ ]:
matrixPlot( adar_score )
$Score(a,b) = |\text{NN}(a)| \times |\text{NN}(b)|$
In [ ]:
# Your code here
pref_score = np.asarray( [ len( g[v].keys() ) * len( g[u].keys() )
for v in g.nodes_iter( ) for u in g.nodes_iter( ) ], dtype = np.float ).reshape( 2*[g.number_of_nodes()] )
In [ ]:
matrixPlot( pref_score )
$Score(a,b) = \sum\limits_{\text{Paths}_{x,y}} \beta^{|p_{a,b}|}\times|p_{a,b}|$
In [ ]:
# Your code here
A = nx.adjacency_matrix( g ).tocsc()
beta = 0.5
In [ ]:
I = sp.sparse.eye(*A.shape)
katzScore = ( sp.sparse.linalg.inv( I - beta * A ) - I ).todense()
matrixPlot( katzScore )
Let's compare the scores behavious for pairs of nodes with or without edge in-between
In [ ]:
A = np.asarray(nx.adj_matrix(g).todense())
xyTriu = np.vstack(np.triu_indices_from(A, k=1)).T
wEdge = [katzScore[xy[0],xy[1]] for xy in xyTriu if A[xy[0],xy[1]]]
woEdge = [katzScore[xy[0],xy[1]] for xy in xyTriu if ~A[xy[0],xy[1]]]
data = [wEdge, woEdge]
In [ ]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10,5))
axes.violinplot(data, showmeans=True)
axes.set_xticklabels(['', 'With Edges', '', 'W/o Edges'])
In [ ]: