In [1]:
import os
import csv
import time
import platform
import datetime
import pandas as pd
import networkx as nx
from graph_partitioning import GraphPartitioning, utils

cols = ["WASTE", "CUT RATIO", "EDGES CUT", "TOTAL COMM VOLUME", "MODULARITY", "LONELINESS", "NETWORK PERMANENCE", "NORM. MUTUAL INFO", "EDGE CUT WEIGHT", "FSCORE", "FSCORE RELABEL IMPROVEMENT"]
pwd = %pwd


# [] send: experiment plots, ask what the expected output/behaviour was + new experiments list
# [] test size & density of networks: test whether FENNEL is always better or only for certain network setups
# [] edge expansion off - is it off for PaToH as well?

# [] NetworKit PermanenceCentrality - check that maxperm works with sorted nodes: permanence gives the same score
# make maxperm work
# community detection -> permanence score

# [] randomize the order of people; everyone arrives if no arrival file is present
# [] set up the computer so it is ready


# [] STATS: FENNEL reordering by centrality = minimal population size
# [] centrality: impact on SCOTCH + PATOH
# [] centrality: everyone people batch, 1.0 node and edge weight, everyone arrives


# [] (scale virtual node weight * percentage of highest partition score at the moment and update the virtual node weight)
# [] stability?
# [] automatic detection of edgelist/metis


# [x] edge weights for people + new metric for cost of cutting valuable edges
# [x] generate edge weights
# [x] hyperedge expansion for patoh
# [x] NDA/IP
# [x] add f-score before relabeling + fscore after relabeling metric (delta between the two)
# [x] add Random generated dataset


# 18.45 - 19.45 start

'''
1. edge weights
2. new metrics + randomly generated datasets
3. experiments
4. centrality stuff - the stats functions/equations currently used to compute centrality scores

'''


# parametrized config
parametrized_config = {
    "DATA_FILENAME": os.path.join(pwd, "data", "predition_model_tests", "network", "network_#networkID#.txt"),
    "OUTPUT_DIRECTORY": os.path.join(pwd, "output"),

    # Set which algorithm is run for the PREDICTION MODEL.
    # Either: 'FENNEL' or 'SCOTCH'
    "PREDICTION_MODEL_ALGORITHM": "",

    # Alternatively, read an input file for the prediction model.
    # Set to empty to generate the prediction model using the algorithm value above.
    "PREDICTION_MODEL": "",

    
    "PARTITIONER_ALGORITHM": "",

    # File containing simulated arrivals. This is used to simulate nodes
    # arriving at the shelter. Nodes are represented by line number; a value
    # of 1 means the node has arrived; a value of 0 means the node has not
    # arrived or does not need a shelter.
    "SIMULATED_ARRIVAL_FILE": os.path.join(pwd,
                                           "data",
                                           "predition_model_tests",
                                           "dataset_1_shift_rotate",
                                           "simulated_arrival_list",
                                           "percentage_of_prediction_correct_#correctedness#",
                                           "arrival_#correctedness#_#networkID#.txt"
                                          ),
    
    # File containing the prediction of a node arriving. Unlike the simulated
    # arrivals, the values in this file are known before the disaster.
    "PREDICTION_LIST_FILE": os.path.join(pwd,
                                         "data",
                                         "predition_model_tests",
                                         "dataset_1_shift_rotate",
                                         "prediction_list",
                                         "prediction_#networkID#.txt"
                                        ),

    # File containing the geographic location of each node, in "x,y" format.
    "POPULATION_LOCATION_FILE": os.path.join(pwd,
                                             "data",
                                             "predition_model_tests",
                                             "coordinates",
                                             "coordinates_#networkID#.txt"
                                            ),

    # Number of shelters
    "num_partitions": 6,

    # The number of iterations when making prediction model
    "num_iterations": 1,

    # Fraction of the prediction model to use before discarding it.
    # When set to 0, the prediction model is discarded immediately (useful for one-shot).
    "prediction_model_cut_off": 0.10,

    # Alpha value used in one-shot (when restream_batches set to 1)
    "one_shot_alpha": 0.5,

    # Number of arrivals to batch before recalculating alpha and restreaming.
    # When set to 1, one-shot is used with alpha value from above
    "restream_batches": 10,

    # When the batch size is reached: if set to True, each node is assigned
    # individually, first in first out. If set to False, the entire batch is
    # processed and emptied before working on the next batch.
    "sliding_window": False,

    # Create virtual nodes based on prediction model
    "use_virtual_nodes": False,

    # Virtual nodes: edge weight
    "virtual_edge_weight": 1.0,

    # Loneliness score parameter. Used when scoring a partition by how many
    # lonely nodes exist.
    "loneliness_score_param": 1.2,

    ####
    # GRAPH MODIFICATION FUNCTIONS

    # Also enables the edge calculation function.
    "graph_modification_functions": True,

    # If set, the node weight is set to 100 if the node arrives at the shelter,
    # otherwise the node is removed from the graph.
    "alter_arrived_node_weight_to_100": False,

    # Uses generalized additive models (GAMs) from R to predict nodes that have
    # not arrived. This sets the node weight on unarrived nodes to the prediction
    # given by a GAM.
    # Needs POPULATION_LOCATION_FILE to be set.
    "alter_node_weight_to_gam_prediction": False,

    # Enables edge expansion when graph_modification_functions is set to true
    "edge_expansion_enabled": True,
    
    # The value of 'k' used in the GAM is the number of nodes that have arrived,
    # capped at this maximum value.
    "gam_k_value": 100,

    # Alter the edge weight for nodes that haven't arrived. This is a way to
    # de-emphasise the prediction model for the unknown nodes.
    "prediction_model_emphasis": 1.0,
    
    # Applies the prediction_list_file node weights to the nodes in the graph
    # while the prediction model is being computed, then removes those weights
    # for the cut-off and batch-arrival phases.
    "apply_prediction_model_weights": True,
    
    # Path to the scotch shared library
    "SCOTCH_LIB_PATH": os.path.join(pwd, "libs/scotch/macOS/libscotch.dylib")
    if 'Darwin' in platform.system()
    else "/usr/local/lib/libscotch.so",
    
    # Path to the PaToH shared library
    "PATOH_LIB_PATH": os.path.join(pwd, "libs/patoh/lib/macOS/libpatoh.dylib")
    if 'Darwin' in platform.system()
    else os.path.join(pwd, "libs/patoh/lib/linux/libpatoh.so"),
    
    "PATOH_ITERATIONS": 5,

    # Expansion modes: 'avg_node_weight', 'total_node_weight', 'smallest_node_weight',
    # 'largest_node_weight'
    # Append '_squared' or '_sqrt' to any of the above for weight^2 or sqrt(weight),
    # e.g. 'avg_node_weight_squared'
    "PATOH_HYPEREDGE_EXPANSION_MODE": 'avg_node_weight',


    # Controls how much information is printed.
    # 0 - prints nothing, useful for batch operations.
    # 1 - prints basic information on assignments and operations.
    # 2 - prints more information as it batches arrivals.
    "verbose": 0
}
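
# The path entries above contain placeholder tokens (#networkID#, #correctedness#) that are
# substituted with concrete values before each run; expUtils.getConfig (used further below)
# takes care of this. As a purely illustrative sketch (fill_placeholders is a hypothetical
# helper, not part of the repo), the substitution amounts to:
def fill_placeholders(config, network_id, correctedness):
    filled = dict(config)
    for key, value in filled.items():
        if isinstance(value, str):
            value = value.replace("#networkID#", str(network_id))
            value = value.replace("#correctedness#", str(correctedness))
            filled[key] = value
    return filled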

#gp = GraphPartitioning(config)

# Optional: shuffle the order of nodes arriving
# Arrival order should not be shuffled if using GAM to alter node weights
#random.shuffle(gp.arrival_order)

%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [6]:
import networkx as nx
import operator
import random

gg = nx.Graph()
gg.add_edge(1,2)
gg.add_edge(1,3)
gg.add_edge(1,4)
gg.add_edge(2,4)
gg.add_edge(2,3)
gg.add_edge(5,4)
gg.add_edge(5,6)
gg.add_edge(1,5)
gg.add_edge(1,6)
gg.add_edge(1,7)
gg.add_node(8)
gg.add_node(9)
gg.add_node(10)
gg.add_node(11)


print(gg.nodes())
print(gg.degree())
lev = utils.leverage_centrality(gg)
print(lev)

wanted = [1,3,5]
orderedNodes = []
for l in lev:
    if l[0] in wanted:
        orderedNodes.append(l[0])
        print(l[0], 'found with leverage=', l[1])
print(orderedNodes)


print(utils.reorder_nodes_based_on_leverage_centrality(lev, random.sample(list(gg.nodes()), len(gg.nodes()))))
#nx.write_edgelist(gg,'/Users/voreno/Desktop/g.txt')
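
# The leverage scores printed below can be reproduced by hand. Leverage centrality of a
# node v appears to be the mean of (deg(v) - deg(u)) / (deg(v) + deg(u)) over the
# neighbours u of v, with isolated nodes given a -1000.0 sentinel (an observation read
# off the output below, not taken from the definition in utils). Quick check for node 1:
deg = dict(gg.degree())
v = 1
check = sum((deg[v] - deg[u]) / (deg[v] + deg[u]) for u in gg.neighbors(v)) / deg[v]
print(check)  # 0.45238..., matching utils.leverage_centrality(gg) for node 1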


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
{1: 6, 2: 3, 3: 2, 4: 3, 5: 3, 6: 2, 7: 1, 8: 0, 9: 0, 10: 0, 11: 0}
[(1, 0.45238095238095233), (2, -0.04444444444444443), (5, -0.04444444444444443), (4, -0.1111111111111111), (3, -0.35), (6, -0.35), (7, -0.7142857142857143), (8, -1000.0), (9, -1000.0), (10, -1000.0), (11, -1000.0)]
1 found with leverage= 0.45238095238095233
5 found with leverage= -0.04444444444444443
3 found with leverage= -0.35
[1, 5, 3]
[1, 2, 5, 4, 3, 6, 7, 11, 9, 8, 10]

In [ ]:
import experiments.io as expIO
import experiments.utils as expUtils

def runPartitioning(config, outFilePath, numPartitions, networkID):
    start_time = time.time()
    
    gp = GraphPartitioning(config)
    
    outFile = expIO.OutFile(outFilePath)
    expIO.writeConfig(outFile, config, numPartitions, networkID)
    
    gp.load_network()
    gp.init_partitioner()
    
    m = gp.prediction_model()
    expIO.writePartitionStats(outFile, "prediction_model", gp, m)
        
    m = gp.assign_cut_off()
    expIO.writePartitionStats(outFile, "cutoff", gp, m)

    m = gp.batch_arrival()
    expIO.writePartitionStats(outFile, "batch_arrivals", gp, m)
    
    outFile.write("Experiment duration = " + str(time.time() - start_time))
    outFile.save()
    
    del m
    del gp
    del outFile
    
def runPartitioningNoStorage(config, numPartitions, networkID):
    start_time = time.time()
    
    gp = GraphPartitioning(config)    
    gp.load_network()
    gp.init_partitioner()
    
    m = gp.prediction_model()
    m = gp.assign_cut_off()
    m = gp.batch_arrival()
        
    del m
    del gp
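
# Example usage (the getConfig signature matches how it is called later in this notebook);
# commented out so this cell stays side-effect free:
#   config = expUtils.getConfig(parametrized_config, "FENNEL", 4, 100, 0.0, 1, False, False, False)
#   runPartitioningNoStorage(config, 4, 1)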

In [ ]:
from sklearn.metrics import f1_score
from scipy.optimize import linear_sum_assignment
from sklearn.metrics.cluster import normalized_mutual_info_score

import time 

def relabel(array, v1, v2):
    # swap the labels v1 and v2 in array, leaving all other values unchanged
    newArr = []
    for val in array:
        if(val == v1):
            newArr.append(v2)
        elif (val == v2):
            newArr.append(v1)
        else:
            newArr.append(val)
    return newArr

#test = [0,0,1,1,2,2]
#print(relabel(test, 0, 0))
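
# A more telling sanity check: swapping labels 0 and 1 leaves other labels untouched.
print(relabel([0, 0, 1, 1, 2, 2], 0, 1))  # -> [1, 1, 0, 0, 2, 2]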

testNMIRelabeling = True
if testNMIRelabeling:
    num_partitions = 4
    config = expUtils.getConfig(parametrized_config, "PATOH", num_partitions, 100, 0.0, 1, False, False, False)
    
    gp = GraphPartitioning(config)    
    gp.load_network()
    gp.init_partitioner()

    m = gp.prediction_model()

    predictionModel = np.array(gp.assignments, copy=True)

    m = gp.assign_cut_off()
    m = gp.batch_arrival()
    
    batchModel = np.array(gp.assignments, copy=True)
    
    pm = []
    batch = []
    for i, partition in enumerate(batchModel):
        if partition >= 0:
            pm.append(predictionModel[i])
            batch.append(partition)
            
    prediction = np.array(pm)
    batch = np.array(batch)
    
    startt = time.time()
    
    print("Prediction Model", prediction)
    print("Batch Arrivals", batch)
    
    print("NMI:", normalized_mutual_info_score(prediction, batch))
    print("fscore:",f1_score(prediction, batch, average='weighted'))

    fscores = []
    fscores_correct = []

    for i in range(0, num_partitions):
        fi = []
        fi_correct = []
        for j in range(0, num_partitions):
            batch_ij = relabel(batch, i, j)
            fi.append(1.0 - f1_score(prediction, batch_ij, average='weighted'))
            fi_correct.append(f1_score(prediction, batch_ij, average='weighted'))
        fscores.append(fi)
        fscores_correct.append(fi_correct)
        
    
    print('fscore Matrix:', fscores_correct)

    cost = np.array(fscores) 
    row_ind, col_ind = linear_sum_assignment(cost)
    print('Hungarian rows', row_ind)
    print('Hungarian cols', col_ind)
    
    relabelled_batch = batch
    
    relabel_done = {}
    for i, row in enumerate(row_ind):
        # check if done already        
        col = col_ind[i]
        if col == row:
            continue

        if(col < row):
            if col in relabel_done:
                #if row in relabel_done[col]:
                continue
            relabel_done[col] = row
        else:
            if row in relabel_done:
                continue
            relabel_done[row] = col
        
            
        print(row, '->', col)
        relabelled_batch = relabel(relabelled_batch, row, col_ind[i])
        print("nmi:", normalized_mutual_info_score(prediction, relabelled_batch))
        print("relabelled batch:", np.array(relabelled_batch))
        print("fscore relabelled:",f1_score(prediction, relabelled_batch, average='weighted'))
    print("elapsed = ", time.time() - startt)

        
    print("NMI relabelled:", normalized_mutual_info_score(prediction, relabelled_batch))
    print("fscore relabelled:",f1_score(prediction, relabelled_batch, average='weighted'))

    del gp
    del m
    del predictionModel
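
# A minimal, self-contained illustration of the relabelling idea above: build a cost
# matrix where cost[i][j] = 1 - F1 after swapping labels i and j, then let the Hungarian
# algorithm (linear_sum_assignment) pick the swaps that maximise the F-score. The toy
# arrays here are made up for illustration; relabel() is the helper defined above.
reference = [0, 0, 1, 1, 2, 2]
predicted = [1, 1, 0, 0, 2, 2]  # same clustering, but labels 0 and 1 are swapped

k = 3
toy_cost = np.zeros((k, k))
for i in range(k):
    for j in range(k):
        swapped = relabel(predicted, i, j)
        toy_cost[i][j] = 1.0 - f1_score(reference, swapped, average='weighted')

toy_rows, toy_cols = linear_sum_assignment(toy_cost)
print(toy_rows, toy_cols)  # expect label 0 to be paired with 1 (and 2 with itself)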

In [ ]:
# All possible ways to run the experiments
all_algorithms = ["FENNEL", "SCOTCH", "PATOH"]

all_correctedness = expUtils.fillRange(0, 11, expUtils.times10)
all_partitions = expUtils.fillRange(4, 13)
all_networks = expUtils.fillRange(1, 1001)
all_predictionModelCutoff = expUtils.fillRange(0, 21, expUtils.cutoffValueFunc)

storeResults = True

all_slidingWindow = [True, False]
all_virtualNodes = [True, False]
all_GAMModel = [True, False]

In [ ]:
# EXPERIMENT 1: how does changing the prediction model cut-off alter how well partitions can be made?

# TODO: avoid duplicate experiments (cut-off, batch size)
# TODO: check edges cut = subset of population

# pick a subset of settings, if needed
algorithms = all_algorithms
#partitions = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
partitions = [4]
correctedness = [100]
predictionModelCutoff = [0.0]
#predictionModelCutoff = expUtils.fillRange(0, 21, expUtils.cutoffValueFunc)
networks = [1]
slidingWindows = [False]
virtualNodes = [True, False]
GAMModels = [False]  # required by the loop below; set to [True, False] to also vary the GAM model

outputDir = expUtils.experimentDir(parametrized_config)
if not os.path.exists(outputDir):
    os.makedirs(outputDir)

experimentCount = 0
experimentTimes = []

dontRunExperiments = False # safety net to avoid overwriting experiments
for algorithm in algorithms:
    # run the experiment separately for each algorithm
    if dontRunExperiments == True:
        break
        
    for partition in partitions:
        # run the experiment with different numbers of partitions
        for correct in correctedness:
            # run the experiment for different correctedness predictions
            for cutoff in predictionModelCutoff:
                # this is the real experiment - different cutoff values
                for network in networks:
                    # sliding window
                    for sliding in slidingWindows:
                        # virtual nodes
                        for virtualNode in virtualNodes:
                            # GAM
                            for GAMModel in GAMModels:
                                start_time = time.time()

                                # setup the experiment
                                config = expUtils.getConfig(parametrized_config, algorithm, partition, correct, cutoff, network, sliding, virtualNode, GAMModel)

                                outFN = expUtils.getOutFileName(algorithm, partition, correct, cutoff, network, sliding, virtualNode, GAMModel)
                                outfPath = os.path.join(outputDir, outFN)
                                #with open(outfPath, 'w+') as experimentFile:
                                # run experiment and store results
                                runPartitioning(config, outfPath, partition, network)
                                
                                elapsed_time = time.time() - start_time
                                print("Experiment", experimentCount, elapsed_time, outFN)
                                experimentTimes.append(elapsed_time)
                                experimentCount += 1

expUtils.purgeEmptyDir(outputDir)

In [ ]:
# experiment for prediction model cutoff

algorithms = all_algorithms
correctedness = [100]
networks = [1]
slidingWindows = [False]
virtualNodes = [False]
GAMModels = [False]


'''
resultsDir = os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "17_04_27", "10_56_16")
partitions = [4]
predictionModelCutoff = expUtils.fillRange(0, 21, expUtils.cutoffValueFunc)
xlabel = "Prediction Model Cut-off"
x = []
for p in predictionModelCutoff:
    x.append(p * 10)

# experiment for partition size - no prediction model
resultsDir = os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "17_04_27", "11_04_09")
partitions = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
predictionModelCutoff = [0.0]
xlabel = "Number of Partitions"
x=partitions
'''


# experiment for virtual nodes
resultsDir = os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "17_04_27", "14_49_14")
partitions = [4]
predictionModelCutoff = [0.0]
xlabel = "Virtual Nodes on/off"
virtualNodes = [False, True]
x=[0, 1]


'''
# experiment for virtual nodes
resultsDir = os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "17_04_27", "11_28_07")
partitions = [4]
predictionModelCutoff = [0.5]
xlabel = "Virtual Nodes on/off"
virtualNodes = [False, True]
x=[0, 1]

# experiment for GAM model
resultsDir = os.path.join(parametrized_config['OUTPUT_DIRECTORY'], "17_04_27", "11_31_20")
partitions = [4]
predictionModelCutoff = [0.5]
xlabel = "GAM Model on/off"
GAMModels = [False, True]
x=[0, 1]
'''



class ExperimentData:
    def __init__(self, name, path, algorithm, partitions, correctedness, cutoff, networkID, slidingWindow, virtualNodes, GAMModel):
        self.name = name
        self.path = path
        
        self.algorithm = algorithm
        self.partitions = partitions
        self.correctedness = correctedness
        self.cutoff = cutoff
        self.networkID = networkID
        self.slidingWindow = slidingWindow
        self.virtualNodes = virtualNodes
        self.gamModel = GAMModel
        
        self.predictionModelMetrics = []
        self.batchMetrics = []


def aggregatePredictionData(experiments, metricID):
    data = []
    for experiment in experiments:
        if(metricID >= 0 and metricID < len(experiment.predictionModelMetrics)):
            data.append(experiment.predictionModelMetrics[metricID])
    return data

def aggregateBatchData(experiments, metricID):
    data = []
    for experiment in experiments:
        # get the last row
        if(len(experiment.batchMetrics)):
            rowData = experiment.batchMetrics[len(experiment.batchMetrics) - 1]
            if(metricID >= 0 and metricID < len(rowData)):
                data.append(rowData[metricID])
    return data

def plotData(x, dataset, xlabel, ylabel):
    plt.plot(x , dataset[0], 'r--', label="FENNEL")
    plt.plot(x , dataset[1], 'b--', label="SCOTCH")
    plt.plot(x , dataset[2], 'g--', label="PATOH")
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()

edgesCut = []
tcv = []
modularity = []
loneliness = []
nmi = []
# load the expected experiments
for algorithm in algorithms:
    # load the results separately for each algorithm
    experiments = []
    for partition in partitions:
        # run the experiment with different numbers of partitions
        for correct in correctedness:
            # run the experiment for different correctedness predictions
            for cutoff in predictionModelCutoff:
                # this is the real experiment - different cutoff values
                for network in networks:
                    # sliding window
                    for sliding in slidingWindows:
                        # virtual nodes
                        for virtualNode in virtualNodes:
                            # GAM
                            for GAMModel in GAMModels:
                                outFN = expUtils.getOutFileName(algorithm, partition, correct, cutoff, network, sliding, virtualNode, GAMModel)
                                outfPath = os.path.join(resultsDir, outFN)

                                experiment = ExperimentData(outFN, outfPath, algorithm, partition, correct, cutoff, network, sliding, virtualNode, GAMModel)
                                
                                outFile = expIO.OutFile(outfPath)
                                data = outFile.load()
                                
                                experiment.predictionModelMetrics = data[0]
                                experiment.batchMetrics = data[1]
                                
                                experiments.append(experiment)

    # metric indices follow the `cols` list defined at the top of the notebook
    edgesCut.append(aggregateBatchData(experiments, 2))    # EDGES CUT
    tcv.append(aggregateBatchData(experiments, 3))         # TOTAL COMM VOLUME
    modularity.append(aggregateBatchData(experiments, 4))  # MODULARITY
    loneliness.append(aggregateBatchData(experiments, 5))  # LONELINESS
    nmi.append(aggregateBatchData(experiments, 7))         # NORM. MUTUAL INFO



plotData(x, edgesCut, xlabel, "Edges Cut")
plotData(x, tcv, xlabel, "Total Comm. Volume")
plotData(x, modularity, xlabel, "Modularity")
plotData(x, loneliness, xlabel, "Loneliness")
plotData(x, nmi, xlabel, "NMI")

In [ ]:


In [ ]: