Link Prediction in NetworKit

In this notebook, the usage of the link prediction module will be demonstrated.

1) Load NetworKit module as well as external modules (e.g. for supervised approaches)



In [1]:

    
cd ../../
from networkit import linkprediction as lp, readGraph, Format
# Needed to draw plots
%matplotlib inline
import matplotlib.pyplot as plt 
# Machine learning
from sklearn import *









    



  File "<ipython-input-1-f46d96c0d377>", line 1
    cd ../../
        ^
SyntaxError: invalid syntax

2) Helper function to draw ROC curve in the notebook



In [2]:

    
def drawROC(indexName, fpr, tpr):
    plt.suptitle(indexName, fontsize=18, fontweight="bold")
    plt.xticks([z / 10 for z in range(11)])
    plt.yticks([z / 10 for z in range(11)])
    plt.xlabel("False positive rate", fontsize=18)
    plt.ylabel("True positive rate", fontsize=18)
    plt.axis([0, 1, 0, 1])
    curve, = plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], 'grey', ls='--')
def drawUnsupervisedROC(indexName, predictions):
    fpr, tpr = roc.getCurve(predictions)
    drawROC(indexName, fpr, tpr)
def drawSupervisedROC(indexName, predictions, testLabels):
    predictionLabels = [p[1] for p in predictions]
    fpr, tpr, thresholds = metrics.roc_curve(testLabels, predictionLabels)
    drawROC(indexName, fpr, tpr)

3) Load an actual network that should be analyzed



In [3]:

    
testGraph = readGraph("input/celegans_metabolic.graph", Format.METIS)
print("Loaded graph with", testGraph.numberOfEdges(), "edges.")









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-ea0025a838ff> in <module>()
----> 1 testGraph = readGraph("input/celegans_metabolic.graph", Format.METIS)
      2 print("Loaded graph with", testGraph.numberOfEdges(), "edges.")

NameError: name 'readGraph' is not defined

4) Initialize ROC/PR metrics for testGraph



In [ ]:

    
roc = lp.ROCMetric(testGraph)
pr = lp.PrecisionRecallMetric(testGraph)

5) Generate training and feature graphs



In [ ]:

    
trainingGraph = lp.RandomLinkSampler.byPercentage(testGraph, 0.9)
print("Left out", testGraph.numberOfEdges() - trainingGraph.numberOfEdges(), "edges from test graph")
featureGraph = lp.RandomLinkSampler.byPercentage(trainingGraph, 0.7)
print("Left out", trainingGraph.numberOfEdges() - featureGraph.numberOfEdges(), "edges from training graph")

6) Initialize similarity indices



In [ ]:

    
featureCommonNeighborsIndex = lp.CommonNeighborsIndex(featureGraph)
featureJaccardIndex = lp.JaccardIndex(featureGraph)
featurePreferentialAttachmentIndex = lp.PreferentialAttachmentIndex(featureGraph)
featureAdamicAdarIndex = lp.AdamicAdarIndex(featureGraph)
featureUDegreeIndex = lp.UDegreeIndex(featureGraph)
featureVDegreeIndex = lp.VDegreeIndex(featureGraph)
featureAlgebraicDistanceIndex = lp.AlgebraicDistanceIndex(featureGraph, 5, 15)
featureAlgebraicDistanceIndex.preprocess()
featureNeighborhoodDistanceIndex = lp.NeighborhoodDistanceIndex(featureGraph)
featureTotalNeighborsIndex = lp.TotalNeighborsIndex(featureGraph)
featureNeighborsMeasureIndex = lp.NeighborsMeasureIndex(featureGraph)
featureSameCommunityIndex = lp.SameCommunityIndex(featureGraph)
featureAdjustedRandIndex = lp.AdjustedRandIndex(featureGraph)
featureResourceAllocationIndex = lp.ResourceAllocationIndex(featureGraph)

commonNeighborsIndex = lp.CommonNeighborsIndex(trainingGraph)
jaccardIndex = lp.JaccardIndex(trainingGraph)
preferentialAttachmentIndex = lp.PreferentialAttachmentIndex(trainingGraph)
adamicAdarIndex = lp.AdamicAdarIndex(trainingGraph)
uDegreeIndex = lp.UDegreeIndex(trainingGraph)
vDegreeIndex = lp.VDegreeIndex(trainingGraph)
algebraicDistanceIndex = lp.AlgebraicDistanceIndex(trainingGraph, 5, 15)
algebraicDistanceIndex.preprocess()
neighborhoodDistanceIndex = lp.NeighborhoodDistanceIndex(trainingGraph)
totalNeighborsIndex = lp.TotalNeighborsIndex(trainingGraph)
neighborsMeasureIndex = lp.NeighborsMeasureIndex(trainingGraph)
sameCommunityIndex = lp.SameCommunityIndex(trainingGraph)
adjustedRandIndex = lp.AdjustedRandIndex(trainingGraph)
resourceAllocationIndex = lp.ResourceAllocationIndex(trainingGraph)

featureLinkPredictors = (featureCommonNeighborsIndex, featureJaccardIndex, featureAdamicAdarIndex, featurePreferentialAttachmentIndex, featureUDegreeIndex, featureVDegreeIndex, featureAlgebraicDistanceIndex, featureNeighborhoodDistanceIndex, featureTotalNeighborsIndex, featureNeighborsMeasureIndex, featureSameCommunityIndex, featureResourceAllocationIndex, featureAdjustedRandIndex)
linkPredictors = (commonNeighborsIndex, jaccardIndex, adamicAdarIndex, preferentialAttachmentIndex, uDegreeIndex, vDegreeIndex, algebraicDistanceIndex, neighborhoodDistanceIndex, totalNeighborsIndex, neighborsMeasureIndex, sameCommunityIndex, resourceAllocationIndex, adjustedRandIndex)

7) Generate testing and training sets



In [ ]:

    
trainingSet = lp.MissingLinksFinder(featureGraph).findAtDistance(2)
testingSet = lp.MissingLinksFinder(trainingGraph).findAtDistance(2)

labelTrain = lp.getLabels(trainingSet, trainingGraph)
featureTrain = lp.getFeatures(trainingSet, *featureLinkPredictors)

labelTest = lp.getLabels(testingSet, testGraph)
featureTest = lp.getFeatures(testingSet, *linkPredictors)

print("Training set size:", len(trainingSet))
print("Testing set size:", len(testingSet))



In [ ]:

    
# Create and train classifier
dtc = ensemble.BaggingClassifier(n_estimators=25, max_features=0.2, n_jobs=-1)
dtc.fit(featureTrain, labelTrain)
# Actual prediction
predictions = dtc.predict_proba(featureTest)
drawSupervisedROC("DT Bagging", predictions, labelTest)



In [ ]:

    
cnPreds = commonNeighborsIndex.runOn(testingSet)
drawUnsupervisedROC("Common Neighbors", cnPreds)



In [ ]:

    
lp.LinkThresholder.byCount(cnPreds, 10)



In [ ]:



In [ ]: