In [1]:
import pandas as pd
import numpy as np
from path import Path  # pip install --user path.py
import re
from IPython.display import display
from pprint import pprint
import netCDF4
from IPython.core.debugger import Pdb
from collections import namedtuple

In [2]:
# Immediate sub-directories of the results folder; each of them is later
# handed to import_data().
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
TEST_RESULT_DIRECTORIES = (
    Path("/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results")
    .dirs()
)

In [4]:
# Show which result directories were discovered.
display(TEST_RESULT_DIRECTORIES)


[Path('/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2016-11-04-lom2_100_nodes'),
 Path('/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2016-02-10__110_nodes'),
 Path('/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2017-02-12__118_nodes'),
 Path('/home/shibbiry/Dropbox/documents/msu/bachelors_thesis_cluster_topology/test_results/2017-02-10__110_nodes')]

In [5]:
def read_benchmark_hostnames(path_to_file):
    """Extract the short node names from a benchmark hosts file.

    Every non-blank line must start with a node name of the form
    ``n`` + five digits, immediately followed by a dot (for example
    ``n48001.``); only the part before the dot is returned.

    Parameters
    ----------
    path_to_file : object with a ``lines()`` method
        In this notebook a ``path.Path``; ``lines()`` is called once, eagerly.

    Returns
    -------
    iterator of str
        Node names in file order. The result is lazy, so a malformed line
        is reported only when the iterator reaches it.

    Raises
    ------
    ValueError
        If a non-blank line does not start with the expected node name.
    """
    hostname_pattern = re.compile(r"^(n\d{5})\.")

    def _hostnames(lines):
        for line_number, line in enumerate(lines, start=1):
            # Blank lines (e.g. a trailing empty line) carry no hostname.
            if not line.strip():
                continue
            match = hostname_pattern.match(line)
            if match is None:
                # Without this check the failure would be an opaque
                # AttributeError on None.
                raise ValueError(
                    "Line {0} of the hosts file does not start with a node name: {1!r}"
                    .format(line_number, line)
                )
            yield match.group(1)

    # Read the file now (as before) and parse lazily.
    return _hostnames(path_to_file.lines())

In [6]:
# Result of import_data(): the host names, the per-message-length median
# matrices, and the list of message lengths.
TestResults = namedtuple("TestResults", "hostnames medians msg_lengths")

In [7]:
def import_data(directory):
    """Load one benchmark run from ``directory``.

    Reads the host names from ``network_hosts.txt`` and the matrices from
    ``network_median.nc``. Each slice of the file's ``data`` variable is
    wrapped in a DataFrame labelled by host name on both axes and stored
    under the key ``"message_len_<length>"``.

    The file is expected to describe message lengths 0, 100, ..., 9900
    with ``test_type`` 1; anything else trips an assertion.

    NOTE(review): ``pd.Panel`` is not available in recent pandas releases —
    confirm the pandas version this notebook is meant to run with.

    Returns
    -------
    TestResults
        ``hostnames`` (tuple), ``medians`` (panel of the matrices) and
        ``msg_lengths`` (list of the message lengths).
    """
    hosts = tuple(read_benchmark_hostnames(directory.joinpath("network_hosts.txt")))
    with netCDF4.Dataset(directory.joinpath("network_median.nc"), "r") as nc_file:
        step = nc_file["step_length"][0]
        first = nc_file["begin_mes_length"][0]
        last = nc_file["end_mes_length"][0]

        # Sanity checks on the file's metadata.
        assert len(hosts) == nc_file["proc_num"][0]
        assert nc_file["test_type"][0] == 1
        assert first == 0
        assert last == 10000  # last message length should be 9900
        assert step == 100
        n_steps = (last - first) // step - 1
        # The range must be an exact multiple of the step.
        assert first + (n_steps + 1) * step == last

        message_lengths = range(first, last, step)

        # Slices must be read while the dataset is still open.
        matrices = {}
        for position, message_length in enumerate(message_lengths):
            matrices["message_len_{0}".format(message_length)] = pd.DataFrame(
                nc_file["data"][position], index=hosts, columns=hosts
            )
        medians_panel = pd.Panel(matrices)
    return TestResults(
        hostnames=hosts,
        medians=medians_panel,
        msg_lengths=list(message_lengths),
    )

In [8]:
def uniques_in_matrix(matrix):
    """Return the set of distinct cell values of a DataFrame.

    The values are taken straight from the frame's underlying array rather
    than looked up cell by cell through the labels. This avoids one
    label-based lookup per cell and also works when row or column labels
    are duplicated (a label lookup then returns a Series/DataFrame, which
    cannot be put into a set).

    Parameters
    ----------
    matrix : pandas.DataFrame

    Returns
    -------
    frozenset
        Distinct values found in ``matrix``; empty for an empty frame.
        For frames whose columns have different numeric dtypes the values
        are those of the common dtype pandas converts them to.
    """
    return frozenset(matrix.values.ravel())

In [9]:
# Sanity checks for uniques_in_matrix()
def test_uniques_in_matrix():
    """Check uniques_in_matrix() on a 3x3 frame holding the values 1..9.

    The same cells are checked twice: once with different row and column
    labels, and once with identical labels on both axes.
    """
    cells = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    row_labels = ["i1", "i2", "i3"]
    expected = frozenset(range(1, 10))

    distinct_labels = pd.DataFrame(cells, columns=["c1", "c2", "c3"], index=row_labels)
    found_with_distinct_labels = uniques_in_matrix(distinct_labels)
    assert expected == found_with_distinct_labels

    shared_labels = pd.DataFrame(cells, columns=row_labels, index=row_labels)
    found_with_shared_labels = uniques_in_matrix(shared_labels)
    assert found_with_shared_labels == found_with_distinct_labels

In [10]:
# Run the self-check right away; an AssertionError here means
# uniques_in_matrix() does not return the expected set.
test_uniques_in_matrix()

In [11]:
def count_unique_medians(medians):
    """Print the smallest and largest number of distinct values per matrix.

    For every matrix ``medians.iloc[i]`` the number of distinct cell values
    is counted with uniques_in_matrix(). Two lines are printed: one for
    the matrix with the fewest distinct values and one for the matrix with
    the most, each with the corresponding entry of ``medians.keys()``.
    On ties the first such matrix is reported.

    Parameters
    ----------
    medians : container of matrices supporting ``len()``, ``.iloc[i]`` and
        ``.keys()[i]`` (in this notebook the panel built by import_data()).

    Returns
    -------
    None
    """
    distinct_per_matrix = []
    for position in range(len(medians)):
        distinct_per_matrix.append(len(uniques_in_matrix(medians.iloc[position])))

    fewest = min(distinct_per_matrix)
    most = max(distinct_per_matrix)
    # list.index() gives the first occurrence, i.e. the earliest matrix on ties.
    fewest_at = distinct_per_matrix.index(fewest)
    most_at = distinct_per_matrix.index(most)

    minimum_report = "Minimum number of unique values in matrix is {0}. Message length = {1}."
    print(minimum_report.format(fewest, medians.keys()[fewest_at]))

    maximum_report = "Maximum number of unique values in matrix is {0}. Message length = {1}."
    print(maximum_report.format(most, medians.keys()[most_at]))

In [12]:
# For every result directory: load its median matrices, print the directory
# name, then print the minimum/maximum number of distinct values per matrix.
for directory in TEST_RESULT_DIRECTORIES:
    medians = import_data(directory).medians
    print(directory.basename())  # heading for the two summary lines below
    count_unique_medians(medians)


2016-11-04-lom2_100_nodes
Minimum number of unique values in matrix is 5. Message length = message_len_600.
Maximum number of unique values in matrix is 38. Message length = message_len_6200.
2016-02-10__110_nodes
Minimum number of unique values in matrix is 15. Message length = message_len_8000.
Maximum number of unique values in matrix is 31. Message length = message_len_4100.
2017-02-12__118_nodes
Minimum number of unique values in matrix is 8. Message length = message_len_100.
Maximum number of unique values in matrix is 18. Message length = message_len_5000.
2017-02-10__110_nodes
Minimum number of unique values in matrix is 15. Message length = message_len_8000.
Maximum number of unique values in matrix is 31. Message length = message_len_4100.