In [1]:
from bigbang import plot
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
In [2]:
%matplotlib inline
In [3]:
from bigbang.archive import Archive
arx = Archive("ipython-dev",archive_dir="../archives")
Get the activity of the mailing list.
In [4]:
act = arx.get_activity(resolved=True)
In [5]:
#act.shape
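`act` should be a DataFrame of daily message counts: one row per date, one column per participant (an assumption based on bigbang's `Archive.get_activity`; with `resolved=True`, multiple addresses belonging to the same sender are merged). A quick sanity check, as a sketch:

# peek at the most active participants by total message count
act.sum().sort_values(ascending=False)[:5]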
Since we are going to compute correlations between $N$ different time series, and that is an $O(N^2)$ operation, let's limit $N$.
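To make the quadratic cost concrete (a rough check added for illustration): with $N$ series, the loop below computes $N(N+1)/2$ correlations, including the diagonal.

# rough cost estimate: pairwise correlations needed for N time series
N = len(act.columns)
print(N, "participants,", N * (N + 1) // 2, "correlations without filtering")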
In [6]:
cutoff = 5

def filtered_participants(cutoff):
    xc = act.sum() > cutoff
    return act.columns[xc]

filtered_participants(cutoff)[:10]
Out[6]:
In [7]:
from scipy.stats import pearsonr

fc = filtered_participants(cutoff)
n = len(fc)
pc = np.zeros([n, n])

for i in range(0, n):
    for j in range(i, n):
        # Pearson correlation assumes normally distributed data,
        # and we believe activity to be log-normally distributed (see Shalizi...),
        # so we take the log of values here
        si = np.log1p(act[fc[i]])
        sj = np.log1p(act[fc[j]])
        c = pearsonr(si, sj)[0]
        pc[i, j] = c
        pc[j, i] = c
pc
Out[7]:
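The `np.log1p` transform maps counts $0, 1, 2, \dots$ to $0, \log 2, \log 3, \dots$, compressing the heavy right tail of activity toward something closer to normal, which is the rationale in the comment above. A tiny illustration:

np.log1p([0, 1, 10, 100])  # array([0., 0.693..., 2.398..., 4.615...])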
In [8]:
# build a weighted graph from the correlation matrix, labeling nodes by participant
G = nx.Graph(pc)
labels = dict(enumerate(fc))
G = nx.relabel_nodes(G, labels)
In [9]:
# order nodes by the number of messages sent by each participant
o = list(act[fc].sum().sort_values(ascending=False).keys())
plot.draw_adjacency_matrix(G, node_order=o)
In [10]:
from sklearn import cluster
n_clusters = 2
sc = cluster.SpectralClustering(n_clusters=n_clusters)
partition = sc.fit_predict(pc)
partition_order = [fc[x] for x in np.argsort(partition)]
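Sorting by cluster label with `np.argsort` groups participants cluster by cluster: all members of cluster 0 come first, then cluster 1, and so on. A toy illustration with made-up labels:

labels_toy = np.array([1, 0, 2, 0, 1])
np.argsort(labels_toy)  # array([1, 3, 0, 4, 2]): cluster 0 first, then 1, then 2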
Modularity metric from:
Zanetti, M. and Schweitzer, F. 2012. "A Network Perspective on Software Modularity." ARCS Workshops 2012, pp. 175-186.
http://arxiv.org/pdf/1201.3771v6.pdf
Here we adapt this metric to edges with weights in $(0,1]$.
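For intuition, here is a minimal sketch of what a weighted modularity of this kind might compute; this is an illustrative assumption, not the actual implementation of `bigbang.process.modularity`: compare the average edge weight within groups to the average edge weight between groups.

def weighted_modularity_sketch(W, groups):
    # hypothetical helper, not bigbang's implementation:
    # mean within-group weight minus mean between-group weight,
    # for a symmetric weight matrix W with weights in (0, 1]
    n = W.shape[0]
    member = np.zeros(n, dtype=int)
    for g, nodes in enumerate(groups):
        member[nodes] = g
    same = member[:, None] == member[None, :]
    off_diag = ~np.eye(n, dtype=bool)
    return W[same & off_diag].mean() - W[~same].mean()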
In [11]:
from bigbang.process import modularity
In [12]:
def get_colors(num):
    return plt.cm.Set1(np.linspace(0, 1, num=num))

def ksc(m, k=3):
    """
    Perform spectral clustering on matrix m, partitioning it into k clusters.
    """
    sc = cluster.SpectralClustering(n_clusters=k)
    partition = sc.fit_predict(m)
    return partition
In [13]:
partition = ksc(pc,k=3)
In [14]:
partition
Out[14]:
In [15]:
partition_order = [fc[x] for x in np.argsort(partition)]
np.sort(partition)
Out[15]:
In [16]:
from collections import defaultdict

p = {}
parts = {}
p_n = {}
mods = []

for k in range(2, 20):
    p[k] = ksc(pc, k=k)
    parts[k] = defaultdict(list)
    p_n[k] = defaultdict(list)
    for i in range(len(p[k])):
        parts[k][p[k][i]].append(fc[i])
        p_n[k][p[k][i]].append(i)
    #print([len(x) for x in p_n[k].values()])
    mod = modularity(pc, list(p_n[k].values()))
    #print(mod)
    mods.append(mod)
In [17]:
print(mods)
plt.plot(list(range(2,20)),mods)
Out[17]:
In [18]:
# mods[0] corresponds to k=2, so shift the argmax by 2
k = np.argmax(mods) + 2
partition_order = [fc[x] for x in np.argsort(p[k])]
In [19]:
print(k)
len(list(parts[k].values()))
Out[19]:
In [20]:
plot.draw_adjacency_matrix(G,
                           node_order=partition_order,
                           partitions=list(parts[k].values()),
                           colors=get_colors(k))
In [21]:
# print the size of each cluster
for x, v in parts[k].items():
    print(str(x) + ": " + str(len(v)))
In [22]:
# map each participant to its cluster id
node_parts = {}
for x, v in parts[k].items():
    for n in v:
        node_parts[n] = int(x)
In [23]:
#FG = nx.Graph(pc > .01)
# keep only positive correlations as edge weights
FG = nx.from_numpy_array(pc * (pc > 0))
labels = dict(enumerate(fc))
FG = nx.relabel_nodes(FG, labels)
nx.set_node_attributes(FG, node_parts, 'corr-cluster')
pos = nx.spring_layout(FG, k=0.6, iterations=1000)
nx.draw(FG, pos)
nx.write_gexf(FG, 'corr.gexf')
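The GEXF file carries the corr-cluster attribute, so the clustering can drive node coloring in a tool like Gephi. A quick round-trip check (a sketch; it just re-reads the file we wrote):

FG2 = nx.read_gexf('corr.gexf')
list(nx.get_node_attributes(FG2, 'corr-cluster').items())[:3]  # labels survive the export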
In [24]:
from bigbang import plot
plot.stack(act,partition=parts[k],smooth=7)
In [25]:
plot.stack(act,partition=parts[3],smooth=7)
In [26]:
plot.stack(act,partition=parts[4],smooth=7)
In [27]:
plot.stack(act,partition=parts[5],smooth=7)
In [ ]: