In [1]:
import networkx as nx

DATA_FILENAME = '/home/sami/py-graph/data/lesmis.gml'

print("Loading graph data...")
U = nx.read_gml(DATA_FILENAME, label='id')
G = U.to_directed()

print("Nodes: {}".format(G.number_of_nodes()))
print("Edges: {}".format(G.number_of_edges()))


Loading graph data...
Nodes: 77
Edges: 508

In [2]:
%load_ext Cython
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
%%cython
import numpy as np
cimport cython
from shared import fixed_width_print

cdef int UNMAPPED = -1

def linear_deterministic_greedy(int[:,::] edges,
                                float[::] edge_weights,
                                float[::] node_weights,
                                int num_partitions,
                                int[::] partition,
                                int[::] fixed,
                                int debug):
    """
    This algorithm favors a cluster if it has many neighbors of a node, but
    penalizes the cluster if it is close to capacity.
    
    edges: An [:,2] array of edges.
    edge_weights: An [:,2] array of edge weights. Length should match number of edges.
    node_weights: An [:,2] array of node weights. Length should match number of nodes.
    num_partitions: How many partitions we are breaking the graph into.
    partition: The partition from a previous run. Used for restreaming.
    fixed: To denote which nodes in the partition have been locked in place.
    debug: Prints helpful debug information.

    Returns: A new partition.
    """

    cdef int num_nodes = len(node_weights)

    # The output partition
    if partition is None:
        partition = np.repeat(np.int32(UNMAPPED), num_nodes)
    if fixed is None:
        fixed = np.repeat(np.int32(UNMAPPED), num_nodes)

    cdef float[::] partition_sizes = np.zeros(num_partitions, dtype=np.float32)
         
    cdef float[::] partition_votes = np.zeros(num_partitions, dtype=np.float32)
    
    # Fine to be a little off, to stay integers
    cdef float partition_capacity = sum(node_weights) / num_partitions
    
    cdef int last_left = edges[0,0]
    cdef int i = 0
    cdef int left = 0
    cdef int right = 0
    cdef int arg = 0
    cdef int max_arg = 0
    cdef float max_val = 0
    cdef float val = 0
    cdef int len_edges = len(edges)

    for i in range(len_edges):
        left = edges[i,0]
        right = edges[i,1]

        if last_left != left:
            if fixed[last_left] != UNMAPPED:
                if debug:
                    print("Skipping node {}".format(last_left))
                partition_votes[:] = 0
                partition_sizes[partition[last_left]] += node_weights[last_left]
                last_left = left

            else:
                # We have found a new node so assign last_left to a partition

                # Calculate available space in each partition, multiply that by partition_votes to get max_val
                #max_arg: most likely partition asssignment, ie. the partition with enough space available
                #         and the highest number of edges (relationships to other nodes)
                #max_val: current highest value of votes against remaining capacity
                #partition_votes: the number of right nodes in a partition related to current left node

                if debug:
                    print("Assigning node {}".format(last_left))

                max_arg = 0
                max_val = (partition_votes[0]) * (
                           partition_capacity - partition_sizes[0] - node_weights[last_left])
                if debug:
                    print("\tP{} = {} x ({} - {} - {}) = {}".format(0,
                                                               partition_votes[0],
                                                               partition_capacity,
                                                               partition_sizes[0],
                                                               node_weights[last_left],
                                                               max_val))

                for arg in range(1, num_partitions):
                    val = (partition_votes[arg]) * (
                           partition_capacity - partition_sizes[arg] - node_weights[last_left])

                    if debug:
                        print("\tP{} = {} x ({} - {} - {}) = {}".format(arg,
                                                                   partition_votes[arg],
                                                                   partition_capacity,
                                                                   partition_sizes[arg],
                                                                   node_weights[last_left],
                                                                   val))
                    if val > max_val:
                        max_arg = arg
                        max_val = val

                if max_val <= 0:
                    max_arg = arg
                    # No neighbors (or multiple maxed out) so "randomly" select
                    # the smallest partition
                    for arg in range(i % num_partitions, num_partitions):
                        if partition_sizes[arg] + node_weights[last_left] < partition_capacity:
                            max_arg = arg
                            max_val = 1
                            break
                    if max_val <= 0:
                        for arg in range(0, i % num_partitions):
                            if partition_sizes[arg] + node_weights[last_left] < partition_capacity:
                                max_arg = arg
                                break

                partition_sizes[max_arg] += node_weights[last_left]
                partition[last_left] = max_arg
                partition_votes[:] = 0

                if debug:
                    print("\tassigned to P{}".format(partition[last_left]))
                    fixed_width_print(np.asarray(partition))
                    fixed_width_print(np.asarray(fixed))

                last_left = left
            
        if partition[right] != UNMAPPED:
            partition_votes[partition[right]] += edge_weights[i]

    # Clean up the last assignment
    if fixed[left] == UNMAPPED:
        if debug:
            print("Assigning last node {}".format(left))

        max_arg = 0
        max_val = 0
        for arg in range(0, num_partitions):
            if partition_sizes[arg] < partition_capacity:
                # XXX: possibly use calculation from above if we introduce target variable partition sizes
                val = (partition_votes[arg]) * (
                        1 - partition_sizes[arg] / partition_capacity)

                if debug:
                    print("\tP{} = {} x (1 - {} / {}) = {}".format(arg,
                                                                   partition_votes[arg],
                                                                   partition_sizes[arg],
                                                                   partition_capacity,
                                                                   val))

                if val > max_val:
                    max_arg = arg
                    max_val = val

        partition[left] = max_arg
        if debug:
            print("\tassigned to P{}".format(partition[left]))

    return (np.asarray(partition), np.asarray(fixed))

In [4]:
import shared

# the number of iterations for the prediction model
num_iterations = 20
num_partitions = 4

edges = np.array(G.edges(), dtype=np.int32)
edge_weights = np.array([x[2]['weight'] for x in G.edges(data=True)], dtype=np.float32)
node_weights = np.array([x[1]['weight'] for x in G.nodes(data=True)], dtype=np.float32)

# Order of people arriving
arrivals = list(range(0, G.number_of_nodes()))
#random.shuffle(arrivals)
#arrivals = arrivals[0:38]

# run first pass - this is our prediction model
assignments = None
fixed = None
print("PREDICTION MODEL")
print("----------------\n")
print("WASTE\t\t\tCUT RATIO\t\tMISMATCH")
for i in range(num_iterations):
    assignments, fixed = linear_deterministic_greedy(edges, edge_weights, node_weights,
                                                     num_partitions, assignments, fixed, 0)
    x = shared.score(assignments, edges)
    print("{}\t{}\t{}".format(x[0], x[1], x[2]))

print("\nAssignments:")
fixed_width_print(assignments)
fixed_width_print(fixed)
print("\n")

print("Re-streaming as nodes arrive")
print("----------------------------\n")
print("WASTE\t\t\tCUT RATIO\t\tMISMATCH")
for a in arrivals:
    fixed[a] = 1
    
    # restream non-fixed assignments
    assignments, fixed = linear_deterministic_greedy(edges, edge_weights, node_weights,
                                                     num_partitions, assignments, fixed, 0)
    x = shared.score(assignments, edges)
    print("{}\t{}\t{}".format(x[0], x[1], x[2]))

# remove nodes not fixed
for i in range(0, len(assignments)):
    if fixed[i] == -1:
        assignments[i] = -1
print("\nAssignments:")
fixed_width_print(assignments)
fixed_width_print(fixed)

print("\nPartitions - nodes (weight):")
parts = [0, 0, 0, 0]
for i in range(0, len(assignments)):
    if assignments[i] >= 0:
        parts[assignments[i]] += 1
for p in range(0, len(parts)):
    print("P{}: {}".format(p, parts[p]))

#print("\nPartitions - nodes (weight):")
#partition_size_nodes = np.bincount(assignments, minlength=num_partitions).astype(np.float32)
#partition_size_weights = np.bincount(assignments, weights=node_weights, minlength=num_partitions).astype(np.float32)
#for p in range(0, num_partitions):
#    print("P{}: {} ({})".format(p, partition_size_nodes[p], partition_size_weights[p]))

for i in range(0, len(assignments)):
    G.add_nodes_from([i], partition=str(assignments[i]))

nx.write_gml(G, "test.gml")


PREDICTION MODEL
----------------

WASTE			CUT RATIO		MISMATCH
0.03896103896103889	0.4094488188976378	208
0.03896103896103889	0.3425196850393701	174
0.03896103896103889	0.41338582677165353	210
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.47244094488188976	240
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.38188976377952755	194
0.03896103896103889	0.4448818897637795	226
0.03896103896103889	0.44881889763779526	228
0.03896103896103889	0.4330708661417323	220
0.03896103896103889	0.4645669291338583	236
0.03896103896103889	0.41338582677165353	210
0.03896103896103889	0.46062992125984253	234
0.03896103896103889	0.4330708661417323	220
0.03896103896103889	0.44881889763779526	228
0.03896103896103889	0.46062992125984253	234
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4645669291338583	236
0.03896103896103889	0.4448818897637795	226
0.03896103896103889	0.4409448818897638	224

Assignments:
[ 2  2  2  2  2  2  2  2  2  2  0  0  3  0  0  0  3  3  3  3  3  3  3  3  1  1  1  0  1  0  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  2  2  2  3  3  3  1  3  3  1  1  2  2  1  2  2  1  1  2  1  2  0  0  3  3  3  0  3  3  1  2]
[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


Re-streaming as nodes arrive
----------------------------

WASTE			CUT RATIO		MISMATCH
0.03896103896103889	0.44881889763779526	228
0.03896103896103889	0.38976377952755903	198
0.03896103896103889	0.41338582677165353	210
0.03896103896103889	0.4330708661417323	220
0.03896103896103889	0.41338582677165353	210
0.03896103896103889	0.41732283464566927	212
0.03896103896103889	0.42913385826771655	218
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4094488188976378	208
0.03896103896103889	0.42913385826771655	218
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4094488188976378	208
0.03896103896103889	0.42913385826771655	218
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4094488188976378	208
0.03896103896103889	0.42913385826771655	218
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4094488188976378	208
0.03896103896103889	0.42913385826771655	218
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.452755905511811	230
0.03896103896103889	0.4448818897637795	226
0.03896103896103889	0.41732283464566927	212
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4251968503937008	216
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.421259842519685	214
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4251968503937008	216
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.421259842519685	214
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4251968503937008	216
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.421259842519685	214
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4251968503937008	216
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.421259842519685	214
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4251968503937008	216
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.43700787401574803	222
0.03896103896103889	0.4566929133858268	232
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.44881889763779526	228
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224
0.03896103896103889	0.4409448818897638	224

Assignments:
[ 2  2  2  2  2  2  2  2  2  2  1  1  3  1  1  1  3  3  3  3  3  3  3  3  1  0  1  1  1  0  1  1  1  1  0  0  0  0  0  0  0  0  0  1  1  1  2  2  2  3  3  3  0  3  3  0  0  2  2  2  0  2  0  2  0  0  2  0  3  3  1  1  1  3  3  3  2]
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1]

Partitions - nodes (weight):
P0: 19
P1: 19
P2: 20
P3: 19

In [ ]: