In [ ]:
import pandas as pd
import networkx as nx  # pip install --user networkx

Load data


In [ ]:
nodes = pd.read_pickle("cachenodes.pkl")
edges = pd.read_pickle("edges.pkl")
comp_nodes = pd.read_pickle("comp_nodes.pkl")

Build topology


In [ ]:
def build_topology(nodes, edges):
    topology = nx.Graph()
    
    # add all nodes
    for index, row in nodes.iterrows():
        node_name = row["name"]
        node_attributes = row.drop(["name"]).to_dict()
        topology.add_node(node_name, **node_attributes)
    
    # add all edges
    for index, row in edges.iterrows():
        node1_name = row["node1"]
        node2_name = row["node2"]
        edge_attributes = row.drop(["node1", "node2"]).to_dict()
        topology.add_edge(node1_name, node2_name, **edge_attributes)
    
    return topology

In [ ]:
topology = build_topology(nodes, edges)

Calculate shortest path for every pair of computational nodes

Helper function for caching of results

It allows the program to save calculated tables or other objects or load them from disk if they are already there


In [ ]:
from libcrap.core import calcsave_or_load
from functools import partial

In [ ]:
pd_diskcache = partial(calcsave_or_load, load_func=pd.read_pickle, save_func=pd.to_pickle)

Actually do the work


In [ ]:
import itertools

In [ ]:
@pd_diskcache("paths.pkl")
def find_comp_to_comp_shortest_paths(topology, comp_nodes):
    paths_ugly = dict(nx.all_pairs_shortest_path(topology))
    # calculates shortest paths and stores them in a dict of dicts
    
    # build a table with all computational node pairs
    # they are not duplicated
    # if there is ("n48001", "n49419") then there is no ("n49419", "n48001") pair
    comp_node_pairs = pd.DataFrame.from_records(
        itertools.chain.from_iterable(
            [(node1, node2) for node2 in comp_nodes.iloc[index:]]
            for (index, node1) in comp_nodes.iteritems()
        ),
        columns=["node1", "node2"]
    )

    # write shortest paths to this table
    comp_node_pairs["shortest_path"] = comp_node_pairs.apply(
        lambda row: paths_ugly[row.loc["node1"]][row.loc["node2"]],
        axis=1
    )
    return comp_node_pairs

In [ ]:
# shortest paths between all computational nodes
paths = find_comp_to_comp_shortest_paths(topology, comp_nodes)

Calculate feature lists of these paths

We also add a new column to paths table here

Helper functions


In [ ]:
def interleave(it1, it2):
    """
    >>> list(interleave([1, 2, 3, 4], ["a", "b", "c"]))
    [1, 'a', 2, 'b', 3, 'c', 4]
    """
    return (
        item for item
        in itertools.chain.from_iterable(itertools.zip_longest(it1, it2))
        if item is not None)

In [ ]:
def get_node_features(topology, node):
    """Returns node features as a tuple of tuples.
    
    >>> topology = nx.Graph()
    >>> topology.add_node("kek", a=1, b="lol")
    >>> sorted(get_node_features(topology, "kek"), key=lambda pair: pair[0])
    [('a', 1), ('b', 'lol')]
    """
    return tuple(topology.node[node].items())

In [ ]:
def get_edge_features(topology, node1, node2):
    """Returns features of an edge as tuple of tuples.
    
    >>> topology = nx.Graph()
    >>> topology.add_node("a1")
    >>> topology.add_node("b1")
    >>> topology.add_edge("a1", "b1", foo="bar", shim="sham")
    >>> get_edge_features(topology, "a1", "b1")
    (('foo', 'bar'), ('shim', 'sham'))
    """
    return tuple(topology.edges[node1, node2].items())

In [ ]:
def maybe_reverse(l):
    """
    Takes list or tuple and reverses it, or not.
    Using maybe_reverse on some list and on its reversed version will
    yield the same result.
    
    >>> maybe_reverse([1, 2, 3])
    [1, 2, 3]
    >>> maybe_reverse([3, 2, 1])
    [1, 2, 3]
    >>> maybe_reverse(('a', 'b', 'c'))
    ('a', 'b', 'c')
    >>> maybe_reverse(('c', 'b', 'a'))
    ('a', 'b', 'c')
    """
    if type(l) == list:
        constructor = list
    elif type(l) == tuple:
        constructor = tuple
    else:
        raise TypeError("can only take list or tuple arguments")
        
    reversed_l = constructor(reversed(l))
    if str(l) <= str(reversed_l):
        return l
    return reversed_l

In [ ]:
def get_features_of_path(topology, path):
    """Returns features of path as a tuple of tuples of tuples.
    The list of features will be normalized, so that
    this function returns the same features in the same order for
    path (A, B, C, D) and for path (D, C, B, A)"""
    nodes_features = (get_node_features(topology, node) for node in path)
    
    edges_features = (get_edge_features(topology, node1, node2)
                        for (node1, node2) in zip(path[:-1], path[1:]))
    return maybe_reverse(tuple(interleave(nodes_features, edges_features)))

In [ ]:
def df_loc_by_sequence(df, sequence):
    """
    Use this instead of `df.loc[sequence]`.
    
    Pandas df gets confused by tuples and possibly by other
    sequences. If you do `df.loc[(1, 2)]`, it will look for 1
    or 2 in df's index instead of looking for the tuple itself.
    You can use df.xs to overcome this problem. Or use this
    function which hides the ugliness.
    
    Also see
    [stackoverflow question](https://goo.gl/emtjB8)
    for better description of the problem."""
    
    return df.xs(sequence)

Test helper functions


In [ ]:
import doctest

In [ ]:
def test_get_node_features():
    doctest.run_docstring_examples(get_node_features, globals())
    assert get_node_features(topology, "КГК.48.0.3") == (("type_", "switch"),)

In [ ]:
def test_get_edge_features():
    doctest.run_docstring_examples(get_edge_features, globals())
    
    correct_result = (("connection_type", "backplane"),)
    result1 = get_edge_features(topology, "КГК.48.0.3", "n48022")
    result2 = get_edge_features(topology, "n48022", "КГК.48.0.3")
    assert result1 == correct_result == result2

In [ ]:
doctest.run_docstring_examples(interleave, globals())
test_get_node_features()
test_get_edge_features()
doctest.run_docstring_examples(maybe_reverse, globals())

Do the work


In [ ]:
@pd_diskcache("classes.pkl")
def list_path_classes(topology, paths):
    unique_features_classes = frozenset(
        get_features_of_path(topology, path)
        for path in paths["shortest_path"]
    )
    
    return pd.DataFrame.from_records(
        ([features] for features in sorted(unique_features_classes)),
        columns=["features"]
    )

In [ ]:
@pd_diskcache("paths_with_classes.pkl")
def add_class_id_col(paths, classes):
    """Adds class_id column to paths table"""
    
    # create pandas table for quick getting index by value of features list
    classes_reverse_lookup = classes.reset_index().set_index("features", verify_integrity=True)
    
    def get_class_id_by_path(path):
        return df_loc_by_sequence(classes_reverse_lookup, get_features_of_path(topology, path))["index"]
    
    return paths.assign(class_=paths["shortest_path"].apply(get_class_id_by_path))

In [ ]:
classes = list_path_classes(topology, paths)
paths_with_classes = add_class_id_col(paths, classes)