In [ ]:
import openpyxl  # pip install --user openpyxl
import re
import itertools
import pickle

Code for Extracting Data From Openpyxl Workbook

Helper functions


In [ ]:
def get_column_name(column):
    """Takes column as tuple of cells and returns its
    letter name (e.g. 'A') as a string.

    In openpyxl >= 2.6 ``Cell.column`` is the numeric index and the
    letter moved to ``Cell.column_letter``; in older versions only
    ``Cell.column`` exists and holds the letter.  Prefer
    ``column_letter`` when available so the letter-based lookups in
    extract_columns keep working on both API generations.
    """
    first_cell = column[0]
    return getattr(first_cell, 'column_letter', first_cell.column)

In [ ]:
def extract_columns(worksheet, column_names):
    """
    parameters:
        worksheet -- openpyxl worksheet
        column_names -- list of column letters, for example
            ['A', 'C', 'E']
    returns:
        list of the matching columns, where every column is
        represented as a tuple of cells"""
    wanted = set(column_names)
    matches = []
    for column in worksheet.columns:
        if get_column_name(column) in wanted:
            matches.append(column)
    # every requested column letter must actually be present in the sheet
    assert len(matches) == len(column_names)
    return matches

In [ ]:
def columns_to_tuples(columns):
    """Transpose columns of cells into rows of plain values.

    parameters:
        columns -- columns as a tuple/list of tuples of cells
    returns:
        list of lists, each one represents a row of cell values"""
    rows = []
    for row_cells in zip(*columns):
        rows.append([cell.value for cell in row_cells])
    return rows

Main function


In [ ]:
def parse_switch_pairs(workbook):
    """Parse openpyxl workbook and extract a list of
    pairs of switches. Pair (A, B) means that switches A
    and B are connected.
    Returns list of pairs of strings."""
    # columns C and K of every worksheet hold the two connection endpoints
    pairs_per_sheet = (
        columns_to_tuples(extract_columns(sheet, ['C', 'K']))
        for sheet in workbook
    )
    return list(itertools.chain.from_iterable(pairs_per_sheet))

Now we actually do the work


In [ ]:
# path to "cable journal" excel file
# it's a MS Excel spreadsheet with a list of node connections
# in Lomonosov 2 cluster
# I am not allowed to share it.
# (relative path: the file is expected next to this notebook)
SPREADSHEET_FILENAME = r'wire_journal_48_53.xlsx'

In [ ]:
def parse_xslx():
    """Load the cable-journal spreadsheet and return the list
    of connected switch pairs it describes."""
    # data_only=True: read cached cell values instead of formulas
    workbook = openpyxl.load_workbook(SPREADSHEET_FILENAME, data_only=True)
    return parse_switch_pairs(workbook)

In [ ]:
# list of (switch_name, switch_name) connection pairs from the journal
switch_pairs = parse_xslx()

Build 2 big tables: all nodes and all edges

Deconstruct switch names into numbers


In [ ]:
import pandas as pd

Helper functions


In [ ]:
# regex for parsing rack number and other numbers from cells
# with switch names in the spreadsheet
# Switch names look like "КГК.<rack>.<second>.<last>" (Cyrillic prefix).
switch_regex = re.compile(
    r"""
    КГК\.       # literally match what is written here
    (?P<rack>\d+)\.        # rack number is one or more digits, followed by dot
    (?P<second_number>\d+)\.            # then goes another non-negative integer followed by dot
    (?P<last_number>\d+)            # and another integer of the same form
    """,
    re.VERBOSE)

# sanity check: the three numeric groups are captured in order
assert switch_regex.match("КГК.63.2.4").groups() == ("63", "2", "4")

In [ ]:
def get_rack(switch_name):
    """Determines rack number from switch name.
    Returns it as an int."""
    match = switch_regex.match(switch_name)
    return int(match.group('rack'))

In [ ]:
def get_second_number(switch_name):
    """Returns the middle number of the switch name.
    NOTE: unlike get_rack, the value is returned as a string."""
    return switch_regex.match(switch_name).group('second_number')

In [ ]:
def get_third_number(switch_name):
    """Returns the last number of the switch name (as a string)."""
    return switch_regex.match(switch_name).group('last_number')

Test helper functions


In [ ]:
# smoke test: the rack number comes back as an int
assert get_rack('КГК.48.0.1') == 48

Do the work


In [ ]:
def deconstruct_switches(switch_pairs):
    """Builds pandas table with all switches.
    Columns:
      * name -- e.g. 'КГК.48.0.1'
      * rack_number -- e.g. 48 (int)
      * second_number -- e.g. '0' (string, see get_second_number)
      * third_number -- e.g. '1' (string)
    """
    # flatten the pairs, deduplicate, and sort for a reproducible order
    unique_names = sorted(frozenset(itertools.chain(*switch_pairs)))
    return (
        pd.DataFrame({"name": unique_names})
        .assign(
            rack_number=lambda df: df["name"].apply(get_rack),
            second_number=lambda df: df["name"].apply(get_second_number),
            third_number=lambda df: df["name"].apply(get_third_number),
        )
    )

In [ ]:
# one row per distinct switch mentioned in the journal
switches = deconstruct_switches(switch_pairs)

Build table with switch-to-switch connections

Helper functions


In [ ]:
# Lomonosov2's racks are grouped into pairs
# Switches in the same rack or pair of racks are connected with copper wires
# Switches in different pairs of racks are connected with optic cable
RACK_PAIRS = ((48, 49), (50, 51), (52, 53))

In [ ]:
def determine_material_between_switches(rack1, rack2):
    """Switches have different material between them.
    See comment about RACK_PAIRS.

    This function determines cable material between two switches
    by using their rack numbers and returns it as string"""
    for rack_pair in RACK_PAIRS:
        if rack1 in rack_pair and rack2 in rack_pair:
            # both racks belong to the same pair -> short copper link
            return 'copper'
    # racks sit in different pairs -> long optic link
    return 'optic'

Do work


In [ ]:
def build_switch_to_switch_table(switch_pairs, switches):
    """Build a table of switch-to-switch links.
    Columns: switch1, switch2, cable_type."""
    # convert list of pairs to DataFrame
    pairs = pd.DataFrame.from_records(
        switch_pairs, columns=["switch1", "switch2"])

    # add rack numbers for switch1 column
    with_rack1 = (
        pairs
        .merge(switches, left_on=["switch1"], right_on=["name"])
        .rename(columns={"rack_number": "switch1_rack"})
        [["switch1", "switch2", "switch1_rack"]]
    )

    # add rack numbers for switch2 column
    with_racks = (
        with_rack1
        .merge(switches, left_on=["switch2"], right_on=["name"])
        .rename(columns={"rack_number": "switch2_rack"})
        [["switch1", "switch2", "switch1_rack", "switch2_rack"]]
    )

    # classify each link by its endpoints' racks, then drop the helpers
    cable_types = with_racks.apply(
        lambda row: determine_material_between_switches(
            row["switch1_rack"], row["switch2_rack"]),
        axis=1)
    return (
        with_racks
        .assign(cable_type=cable_types)
        .drop(["switch1_rack", "switch2_rack"], axis=1)
    )

In [ ]:
# switch-to-switch edges annotated with their cable material
switch_to_switch = build_switch_to_switch_table(switch_pairs, switches)

Build table with computational-node-to-switch connections

Helper code


In [ ]:
def get_matching_computational_nodes(switch):
    """params:
        switch -- string, name of the switch
    returns:
        list of strings which are names of computational
        nodes connected to this switch
    """
    parts = switch_regex.match(switch).groupdict()
    # each switch serves 8 consecutive nodes; the switch's last number
    # selects which block of 8 (1-based)
    first_index = (int(parts['last_number']) - 1) * 8 + 1
    return [
        'n{0}{1}{2:02d}'.format(
            parts['rack'], parts['second_number'], node_index)
        for node_index in range(first_index, first_index + 8)
    ]

Test helper code


In [ ]:
# switch 3 of block 2 in rack 48 serves nodes (3-1)*8+1 .. (3-1)*8+8 = 17..24
assert get_matching_computational_nodes("КГК.48.2.3") == [
    'n48217',
    'n48218',
    'n48219',
    'n48220',
    'n48221',
    'n48222',
    'n48223',
    'n48224'
]

Do work


In [ ]:
def build_comp_to_switch_table(switches):
    """Build a table mapping each computational node to its switch.
    Columns: computational_node, switch."""
    frames = []
    for switch_name in switches["name"]:
        # one small frame per switch: its 8 nodes, switch name broadcast
        frames.append(pd.DataFrame.from_dict({
            "computational_node": get_matching_computational_nodes(switch_name),
            "switch": switch_name}))
    return pd.concat(frames, ignore_index=True)

In [ ]:
comp_to_switch = build_comp_to_switch_table(switches)
# 8 nodes per switch, so the journal must describe 1536 / 8 = 192 switches
assert len(comp_to_switch) == 1536

Build one of the two main tables: the Nodes table


In [ ]:
def build_nodes_table(switches, comp_to_switch):
    """Stack switches and computational nodes into one Nodes table.
    Columns:
      * name  -- node name
      * type_ -- category: 'switch' or 'computational'
    """
    switch_rows = switches[["name"]].assign(type_="switch")
    comp_rows = (
        comp_to_switch[["computational_node"]]
        .rename(columns={"computational_node": "name"})
        .assign(type_="computational")
    )
    table = pd.concat([switch_rows, comp_rows])
    # category dtype is compact for a two-valued column
    table["type_"] = table["type_"].astype("category")
    return table

In [ ]:
# all graph vertices: switches plus computational nodes
nodes = build_nodes_table(switches, comp_to_switch)

Build the other main table: the Edges table


In [ ]:
def build_edges_table(switch_to_switch, comp_to_switch):
    """Stack switch-to-switch and node-to-switch links into one
    Edges table.
    Columns:
      * node1, node2    -- endpoint names
      * connection_type -- category: 'copper', 'optic' or 'backplane'
    """
    switch_links = switch_to_switch.rename(columns={
        "switch1": "node1",
        "switch2": "node2",
        "cable_type": "connection_type"
    })
    node_links = (
        comp_to_switch
        .rename(columns={"computational_node": "node1", "switch": "node2"})
        .assign(connection_type="backplane")
    )
    table = pd.concat([switch_links, node_links])
    table["connection_type"] = table["connection_type"].astype("category")
    return table

In [ ]:
# all graph edges: switch-switch cables plus node-switch backplane links
edges = build_edges_table(switch_to_switch, comp_to_switch)

Build table with only computational nodes

We will write it to disk too, so that our Lomonosov-2-agnostic tool knows which nodes are computational: it should only build pairs of computational nodes.


In [ ]:
# this is a pandas Series of all computational node names
# (.loc avoids chained indexing, same result as nodes[mask]["name"])
comp_nodes = nodes.loc[nodes["type_"] == "computational", "name"]

Write 3 resulting tables to disk


In [ ]:
# persist the three tables for downstream tooling
# NOTE(review): the nodes table goes to "cachenodes.pkl" while the other two
# use plain names -- looks like a possible typo for "nodes.pkl"; confirm
# against whatever consumes these files before changing it.
for (obj, filename) in (
        (nodes, "cachenodes.pkl"),
        (edges, "edges.pkl"),
        (comp_nodes, "comp_nodes.pkl")):
    pd.to_pickle(obj, filename)