In [ ]:
import openpyxl # pip install --user openpyxl
import re
import itertools
import pickle
In [ ]:
def get_column_name(column):
"""Takes column as tuple as argument and returns
its name as string"""
return column[0].column
In [ ]:
def extract_columns(worksheet, column_names):
"""
parameters:
worksheet -- worksheet
column_names -- list of strings, for example
['A', 'C', 'E']
returns:
list of columns, where every column is represented
as a tuple"""
all_columns = worksheet.columns
extracted_columns = [col for col in all_columns
if get_column_name(col) in column_names]
assert len(extracted_columns) == len(column_names)
return extracted_columns
In [ ]:
def columns_to_tuples(columns):
"""parameters:
columns -- columns as a tuple/list of tuples
returns:
list of lists/tuples, each one represents a row"""
return [[cell.value for cell in row] for row in zip(*columns)]
In [ ]:
def parse_switch_pairs(workbook):
"""Parse openpyxl workbook and extract a list of
pairs of switches. Pair (A, B) means that swithes A
and B are connected.
Returns list of pairs of strings."""
return list(itertools.chain(*[columns_to_tuples(extract_columns(worksheet, ['C', 'K']))
for worksheet in workbook]))
In [ ]:
# path to "cable journal" excel file
# it's a MS Excel spreadsheet with a list of node connections
# in Lomonosov 2 cluster
# I am not allowed to share it.
SPREADSHEET_FILENAME = r'wire_journal_48_53.xlsx'
In [ ]:
def parse_xslx():
workbook = openpyxl.load_workbook(SPREADSHEET_FILENAME, data_only=True)
switch_pairs = parse_switch_pairs(workbook)
return switch_pairs
In [ ]:
switch_pairs = parse_xslx()
In [ ]:
import pandas as pd
In [ ]:
# regex for parsing rack number and other numbers from cells
# with switch names in the spreadsheet
switch_regex = re.compile(
r"""
КГК\. # literally match what is written here
(?P<rack>\d+)\. # rack number is one or more digits, followed by dot
(?P<second_number>\d+)\. # then goes another non-negative integer followed by dot
(?P<last_number>\d+) # and another integer of the same form
""",
re.VERBOSE)
assert switch_regex.match("КГК.63.2.4").groups() == ("63", "2", "4")
In [ ]:
def get_rack(switch_name):
"""Determines rack number from switch name"""
return int(switch_regex.match(switch_name).group('rack'))
In [ ]:
def get_second_number(switch_name):
return switch_regex.match(switch_name).groups()[1]
In [ ]:
def get_third_number(switch_name):
return switch_regex.match(switch_name).groups()[2]
In [ ]:
assert get_rack('КГК.48.0.1') == 48
In [ ]:
def deconstruct_switches(switch_pairs):
"""Builds pandas table with all switches.
Columns:
* name -- e.g. 'КГК.48.0.1'
* rack_number -- e.g. 48
* second_number -- e.g. 0
* third_number -- e.g. 1
"""
switches = pd.DataFrame({
"name": sorted(list(frozenset(itertools.chain(*switch_pairs))))
})
switches["rack_number"] = switches["name"].apply(get_rack)
switches["second_number"] = switches["name"].apply(get_second_number)
switches["third_number"] = switches["name"].apply(get_third_number)
return switches
In [ ]:
switches = deconstruct_switches(switch_pairs)
In [ ]:
# Lomonosov2's racks are grouped into pairs
# Switches in the same rack or pair of racks are connected with copper wires
# Switches in different pairs of racks are connected with optic cable
RACK_PAIRS = ((48, 49), (50, 51), (52, 53))
In [ ]:
def determine_material_between_switches(rack1, rack2):
"""Switches have different material between them.
See comment about RACK_PAIRS.
This function determines cable material between two switches
by using their rack numbers and returns it as string"""
racks = (rack1, rack2)
if any(
all(rack in rack_pair for rack in racks)
for rack_pair in RACK_PAIRS):
# they are in the same pair of racks
return 'copper'
return 'optic'
In [ ]:
def build_switch_to_switch_table(switch_pairs, switches):
return (
# add rack numbers for switch1 column
pd.merge(
# convert list of pairs to DataFrame
pd.DataFrame.from_records(switch_pairs, columns=["switch1", "switch2"]),
switches,
left_on=["switch1"], right_on=["name"])
.rename(columns={"rack_number": "switch1_rack"})
[["switch1", "switch2", "switch1_rack"]]
# add rack numbers for switch2 column
.merge(
switches,
left_on=["switch2"], right_on=["name"])
.rename(columns={"rack_number": "switch2_rack"})
[["switch1", "switch2", "switch1_rack", "switch2_rack"]]
# add cable type
.assign(cable_type=lambda df: df.apply(
lambda row: determine_material_between_switches(row["switch1_rack"], row["switch2_rack"]),
axis=1
))
.drop(["switch1_rack", "switch2_rack"], axis=1)
)
In [ ]:
switch_to_switch = build_switch_to_switch_table(switch_pairs, switches)
In [ ]:
def get_matching_computational_nodes(switch):
"""params:
switch -- string, name of the switch
returns:
list of strings which are names of computational
nodes connected to this switch
"""
get_thingie = switch_regex.match(switch).group
return [
'n{0}{1}{2:02d}'.format(
get_thingie('rack'),
get_thingie('second_number'),
(int(get_thingie('last_number')) - 1) * 8 + i
)
for i in range(1, 9)
]
In [ ]:
assert get_matching_computational_nodes("КГК.48.2.3") == [
'n48217',
'n48218',
'n48219',
'n48220',
'n48221',
'n48222',
'n48223',
'n48224'
]
In [ ]:
def build_comp_to_switch_table(switches):
return pd.concat(
(pd.DataFrame.from_dict({
"computational_node": get_matching_computational_nodes(switch),
"switch": switch})
for switch in switches["name"]),
ignore_index=True
)
In [ ]:
comp_to_switch = build_comp_to_switch_table(switches)
assert len(comp_to_switch) == 1536
In [ ]:
def build_nodes_table(switches, comp_to_switch):
table = pd.concat([
switches[["name"]]
.assign(type_="switch"),
comp_to_switch[["computational_node"]]
.rename(columns={"computational_node": "name"})
.assign(type_="computational")
])
table["type_"] = table["type_"].astype("category")
return table
In [ ]:
nodes = build_nodes_table(switches, comp_to_switch)
In [ ]:
def build_edges_table(switch_to_switch, comp_to_switch):
table = pd.concat([
switch_to_switch
.rename(columns={
"switch1": "node1",
"switch2": "node2",
"cable_type": "connection_type"
}),
comp_to_switch
.rename(columns={
"computational_node": "node1",
"switch": "node2"
})
.assign(connection_type="backplane")
])
table["connection_type"] = table["connection_type"].astype("category")
return table
In [ ]:
edges = build_edges_table(switch_to_switch, comp_to_switch)
In [ ]:
# this is a pandas Series of all computational nodes
comp_nodes = nodes[nodes["type_"] =="computational"]["name"]
In [ ]:
for (obj, filename) in (
(nodes, "cachenodes.pkl"),
(edges, "edges.pkl"),
(comp_nodes, "comp_nodes.pkl")):
pd.to_pickle(obj, filename)