In [1]:
from py2cytoscape.data.cynetwork import CyNetwork
from py2cytoscape.data.cyrest_client import CyRestClient
from py2cytoscape.data.style import StyleUtil
import sand
import matplotlib.pyplot as plt
%matplotlib notebook
In [2]:
network_name = "5b2a6e3"
network_collection = "lein-topology"

# All artifacts (edge list, positions csv, svg) share one naming scheme
# derived from the collection name and the commit-like network name.
data_path = "./data/{}-{}".format(network_collection, network_name)
edgelist_file = "{}.csv".format(data_path)

# Load edge records as dicts keyed by source/target/weight.
edgelist_data = sand.csv_to_dicts(edgelist_file, header=['source', 'target', 'weight'])
edgelist_data[:5]
Out[2]:
In [3]:
# Build a graph from the weighted edge list and show a structural summary.
g = sand.from_edges(edgelist_data)
g.summary()
Out[3]:
In [4]:
# A simple graph has no self-loops or duplicate edges.
g.is_simple()
Out[4]:
In [5]:
# Peek at the default group assignment for the first five vertices.
g.vs['group'][:5]
Out[5]:
The vertices in the lein topology
data set contain fully-qualified namespaces for functions. Grouping by name isn't particularly useful here:
In [6]:
# Number of distinct groups under the default (name-based) grouping.
len(set(g.vs['group']))
Out[6]:
In [7]:
# Total number of vertices, for comparison with the distinct group count.
len(g.vs)
Out[7]:
Because sandbook
was built specifically for analyzing software and system networks, a fqn_to_groups
grouping function is built in:
In [8]:
# Re-group vertices using the fully-qualified-namespace grouping built into sand.
g.vs['group'] = sand.fqn_to_groups(g.vs['label'])
In [9]:
# Number of distinct groups under the namespace-based grouping.
len(set(g.vs['group']))
Out[9]:
This is a much more manageable number of groups.
Namespaces may also be useful as a separate attribute on the vertices:
In [10]:
# Store each vertex's namespace as its own attribute (used later for styling).
g.vs['namespace'] = sand.namespaces(g.vs['label'])
Outdegree represents the vertices that have the most dependencies, i.e. call the most number of functions. These functions could potentially be split into smaller, more cohesive functions.
Indegree represents the vertices that are depended on the most — changing them will have the most impact on other parts of the system.
In [13]:
# Compute the (degree, count) pairs for the degree distribution plot.
# NOTE(review): this import would ideally live in the top imports cell so a
# fresh-kernel run surfaces missing dependencies early.
from sand.degree import degree_distribution_count
degree, count = degree_distribution_count(g)
In [14]:
# Log-log plot of the degree distribution; a roughly straight line here
# suggests a heavy-tailed (power-law-like) distribution.
fig, ax = plt.subplots()
ax.set_title("Degree Distribution")
ax.set_xlabel("degree")
ax.set_ylabel("count")
lines = ax.loglog(degree, count, 'b-', marker='o')
In [15]:
# First five outdegree values (how many functions each vertex calls).
g.vs['outdegree'][:5]
Out[15]:
In [16]:
# First five indegree values (how many functions call each vertex).
g.vs['indegree'][:5]
Out[16]:
Which vertices have a degree of more than some majority percentage of the maxdegree?
In [17]:
# Largest indegree in the graph.
g.maxdegree(mode='IN')
Out[17]:
In [18]:
# Largest outdegree in the graph.
g.maxdegree(mode='OUT')
Out[18]:
In [19]:
# Threshold at 80% of the overall maximum degree.
# NOTE(review): `score` is never referenced by the following cells, which use
# hard-coded thresholds (10 and 4) instead — confirm which was intended.
score = g.maxdegree() * .80
In [20]:
# Functions that call at least 10 others (hard-coded threshold).
[v['name'] for v in g.vs.select(lambda vertex: vertex['outdegree'] >= 10)]
Out[20]:
In [21]:
# Functions that are called by at least 4 others (hard-coded threshold).
[v['name'] for v in g.vs.select(lambda vertex: vertex['indegree'] >= 4)]
Out[21]:
In this case, clojure.core
and clojure.test
namespaces have the most dependencies...unsurprising, given these are the foundational libraries of the language! These are good candidates to filter out of a visualization, since they often don't add deep insight to the aspects of the design specific to the program.
There are some analyses where it will be useful to see all the vertices. For the high-level architecture diagram, we can focus on the functions local to the library's namespaces. We'll also keep functions that have side-effects to see if these are isolated to only a few key parts of the program:
In [22]:
# Substring patterns identifying vertices to keep: the library's own
# namespaces plus side-effecting / IO-related functions of interest.
names_to_keep = (
    'topology',
    'clojure.core/*err*',
    'clojure.core/println',
    'clojure.zip',
    'clojure.java.io',
    'clojure.tools.namespace',
    'leiningen.core.eval',
    'clojure.repl',
)

def is_local(v):
    """True when the vertex label contains any of the kept patterns."""
    label = v['label']
    return any(pattern in label for pattern in names_to_keep)

lg = g.subgraph(g.vs(is_local))

# Degree attributes are stale after taking a subgraph, so recompute them.
lg.vs['indegree'] = lg.degree(mode="in")
lg.vs['outdegree'] = lg.degree(mode="out")
lg.summary()
Out[22]:
In [24]:
# Cytoscape helpers from sand; presumably prints client/server versions —
# requires a running Cytoscape instance with the REST API enabled.
import sand.cytoscape as sc
sc.print_version()
In [25]:
# Create py2cytoscape client
# Create py2cytoscape client (connects to Cytoscape's local REST API).
cy = CyRestClient()
In [26]:
# Optional: delete existing Cytoscape sessions.
# Optional: delete existing Cytoscape sessions.
# NOTE(review): destructive — wipes any unsaved work in the running Cytoscape.
cy.session.delete()
In [27]:
# Load the network
# Load the network: send the local igraph subgraph to Cytoscape.
network = cy.network.create_from_igraph(lg, name=network_name, collection=network_collection)
In [28]:
# Apply default layout
# Apply default force-directed layout as a starting point.
cy.layout.apply(name='force-directed', network=network)
In [30]:
# Create a visual style named 'Ops' seeded with sand's default properties.
style = cy.style.create('Ops')
style.update_defaults(sc.ops)
# Map the label property in the igraph data to Cytoscape's NODE_LABEL visual property
style.create_passthrough_mapping(column='label', vp='NODE_LABEL', col_type='String')
In [32]:
from sand.cytoscape import colors

# Map each namespace to a single base color name. Borders use the BRIGHT_*
# variant and fills the DARK_* variant of the same color, so deriving both
# dicts from one table keeps them from drifting out of sync (the original
# duplicated every key across two hand-maintained dicts).
base_colors = {
    'topology.finder': 'YELLOW',
    'topology.dependencies': 'ORANGE',
    'topology.dependencies-test': 'ORANGE',
    'topology.qualifier': 'PURPLE',
    'topology.symbols': 'BLUE',
    'clojure.core': 'RED',
    'clojure.java.io': 'RED',
    'topology.printer': 'RED',
    'leiningen.topology': 'WHITE',
}
border_colors = {ns: getattr(colors, 'BRIGHT_' + c) for ns, c in base_colors.items()}
fill_colors = {ns: getattr(colors, 'DARK_' + c) for ns, c in base_colors.items()}

# Color nodes by namespace: dark fill, bright border of the same hue.
style.create_discrete_mapping(column='namespace', col_type='String', vp='NODE_FILL_COLOR', mappings=fill_colors)
style.create_discrete_mapping(column='namespace', col_type='String', vp='NODE_BORDER_PAINT', mappings=border_colors)
In [33]:
# Apply the populated style to the network view.
cy.style.apply(style, network)
In [34]:
# Apply previously saved vertex positions from the positions CSV as a layout.
positions_file = data_path + "-positions.csv"
sc.layout_from_positions_csv(network, positions_file, cy)
In [35]:
# Save the current vertex positions (e.g. after manual adjustment in Cytoscape).
sc.positions_to_csv(network=network, path=positions_file)
In [36]:
# Hide all panels in the UI
# Hide all panels in the UI so exports show only the network canvas.
sc.hide_panels()
Out[36]:
In [37]:
# Fit to the window:
# Fit to the window:
cy.layout.fit(network=network)
In [38]:
# Grab the first view of the network for view-level adjustments below.
view_id = network.get_views()[0]
view = network.get_view(view_id=view_id, format='view')
In [39]:
# Zoom out slightly:
# Zoom out slightly (0.65 chosen by eye for this network — tune as needed):
view.update_network_view('NETWORK_SCALE_FACTOR', 0.65)
In [40]:
# Shift the network to the left:
# Shift the network to the left (offset chosen by eye for this layout):
view.update_network_view('NETWORK_CENTER_X_LOCATION', 150.0)
In [41]:
# Render the current Cytoscape view as SVG inline in the notebook.
from IPython.display import SVG, display
svg_data = network.get_svg()
display(SVG(svg_data))
In [50]:
# Write the svg to a file if everything looks good:
# Write the svg to a file if everything looks good:
# NOTE(review): opened in binary mode — assumes get_svg() returns bytes; confirm.
with open(data_path + '.svg', 'wb') as f:
f.write(svg_data)
In [42]:
# Bokeh powers the adjacency-matrix visualizations; render output inline.
from bokeh.plotting import show, output_notebook
from bokeh.palettes import all_palettes
output_notebook()
In [43]:
# NOTE(review): this is the maximum group id, not the count of distinct
# groups — the two agree only if ids are contiguous from 0/1; confirm
# against the output of fqn_to_groups.
num_groups = max(set(lg.vs['group']))
num_groups
Out[43]:
Now we need to choose an appropriate palette that accommodates this number of groups and achieves the desired visual separation.
As a general rule, you'll need one of the large palettes for > 20 groups.
Pass the name of the palette you choose to the all_palettes
function:
In [44]:
# Pick a Category20 palette with one color per group.
# NOTE(review): all_palettes['Category20'] is keyed by palette size and only
# defines a limited range of sizes — this raises KeyError if num_groups + 1
# falls outside it.
palette = all_palettes['Category20'][num_groups + 1]
In [45]:
# List vertex attributes available for ordering the matrix plots below.
lg.vs.attributes()
Out[45]:
In [46]:
# Adjacency matrix of the local subgraph, vertices ordered by indegree.
p = sand.matrix(lg, 'indegree', "{}-{} by indegree".format(network_collection, network_name), 900, palette)
show(p)
In [56]:
# Adjacency matrix ordered by outdegree.
# Bug fix: the original called bare `matrix(...)`, which is a NameError on a
# fresh kernel — the sibling cells call it via the `sand` module, so do the same.
p = sand.matrix(lg, 'outdegree', "{}-{} by outdegree".format(network_collection, network_name), 900, palette)
show(p)
In [47]:
# Adjacency matrix ordered by group, showing module coupling/cohesion visually.
p = sand.matrix(lg, 'group', "{}-{} by group".format(network_collection, network_name), 900, palette)
show(p)
Ordering by group / modules gives us a visual indication of how well the system accomplishes the design goal of loosely coupled and highly cohesive modules. We can quantify this idea.
Clustering is a type of assignment problem seeking the optimal allocation of N components to M clusters. One of the prominent heuristics of system architecting is to choose modules such that they are as independent as possible...low coupling and high cohesion.
We can objectively score these clustering algorithms using an objective function that considers both the size of the clusters ($C_i$) and the number of interactions outside the clusters ($I_0$) according to the following equation, where $\alpha = 10$, $\beta = 100$ or $\alpha = 1$, $\beta = 10$, and $M$ is the number of clusters:
$Obj = \alpha \sum_{i=1}^{M}C_i^2 + \beta I_0$
See page 25 of Design Structure Matrix Methods and Applications for more information.
Clustering objectives balance two competing forces: the penalty for large clusters (the $C_i^2$ term) and the penalty for interactions that cross cluster boundaries (the $I_0$ term).
The objective function could be evaluated for any number of potential designs that were manually or automatically created. This essentially provides a real-time feedback loop about the potential quality of a design. The range of the function is immediately bound by the two extremes. Your job as an architect and designer is to minimize this function.
For us to apply this scoring methodology meaningfully, we'll make a couple of simplifying assumptions:
clojure.core
functions aren't moving to a different namespace. With these assumptions, we can apply the filtering from above a bit more strictly to get an even smaller subgraph of the function call network:
In [48]:
# Keep only lein-topology's own functions, excluding test namespaces.
tg = g.subgraph(g.vs(lambda v: 'topology' in v['label'] and 'test' not in v['label']))

# Degree attributes are stale after taking a subgraph, so recompute them.
tg.vs['indegree'] = tg.degree(mode="in")
tg.vs['outdegree'] = tg.degree(mode="out")
tg.summary()
Out[48]:
The baseline modularity score of lein-topology
's core function dependency graph is:
In [49]:
# Score the current namespace assignment with the DSM clustering objective.
from sand.modularity import objective
objective(tg, tg.vs['group'])
Out[49]:
Where is this on the range of possibilities?
Suppose all functions were in the same namespace. We'll simulate this by setting the group membership vector to all 1's:
In [50]:
# Degenerate case M=1: every vertex in the same group (membership all 1's).
objective(tg, [1] * len(tg.vs))
Out[50]:
This is the degenerate case of M=1, so the objective function simply returns the square of the number of vertices:
In [51]:
# With a single cluster the objective reduces to N^2 (N = vertex count).
len(tg.vs) ** 2
Out[51]:
The other extreme occurs at M=N, with every function in its own namespace. We can simulate this by providing a unique group membership id for each vertex:
In [52]:
# Degenerate case M=N: each vertex in its own group (unique ids 0..N-1).
objective(tg, range(len(tg.vs)))
Out[52]:
Finally, we can compare our actual modularity score to a computational result. We can use Girvan-Newman edge-betweenness community detection to generate a modular design based on the network structure alone:
In [53]:
# Girvan-Newman edge-betweenness community detection on the directed graph.
eb_membership = sand.edge_betweenness(tg, directed=True)
In [54]:
# Number of communities found by edge betweenness.
len(set(eb_membership))
Out[54]:
In [55]:
# Number of actual namespace-based groups in the subgraph, for comparison.
len(set(tg.vs['group']))
Out[55]:
So the edge betweenness algorithm comes up with fewer communities, i.e. namespaces in this context. Let's see how the modularity score compares:
In [56]:
# Objective score for the computed community structure.
objective(tg, eb_membership)
Out[56]:
If this score is lower than our actual baseline, then the computational community structure may represent an improvement over the current structure. Which namespaces have changed groups? We may wish to refactor the code to reflect this structure.
If the edge betweenness modularity score is higher than our baseline, this fact acts as a quantitative defense of our design.
In this case, our current score of 121 is less than the algorithmic optimum of 133, so we can conclude we have a structure with acceptable coupling and cohesion.