In [1]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
%matplotlib inline

Use the citation network for testing. The dataset contains papers published between 1992 and 2002. Papers published between 1992 and 1995 will be taken separately as a new training graph.

Papers published between 1996 and 1997 will be used for testing: edges belonging to papers published in those years are removed at random and stored separately. This held-out set will serve as the test set for evaluating the link-prediction performance of node2vec and GloVe.
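As a preview of the evaluation step, held-out node pairs can be scored by the similarity of their endpoint embeddings. The sketch below is illustrative only: it assumes a trained gensim Word2Vec model named model (e.g. fit on node2vec random walks) whose vocabulary contains the node IDs as strings, and the test_set label dictionary built later in this notebook.

# Illustrative sketch (not run here): score a held-out pair "u-v" with a
# trained embedding model; higher scores should correspond to label 1.
def edge_score(model, pair):
    u, v = pair.split('-')
    # Cosine similarity between the two node vectors (gensim KeyedVectors)
    return model.wv.similarity(u, v)

# scores = {pair: edge_score(model, pair) for pair in test_set}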

Part 1: Build the training set


In [2]:
# Import the text file containing the edge list and write it to a dictionary.
# Header lines (starting with '#') are skipped.
dict_edges = defaultdict(int)

with open('graph/ca-GrQc.txt') as f:
    for line in f:
        cols = line.split()
        if line.startswith('#') or len(cols) > 2:
            continue
        dict_edges[cols[0] + "-" + cols[1]] = 1

In [3]:
# Sanity check: header lines should not have been added as edges
print('#-FromNodeId' in dict_edges)
print('#-Directed' in dict_edges)
print('9504145-9309097' in dict_edges)


False
False
False

In [4]:
# The training set will contain 90% of the observed edges

len_training_set = int(0.9 * len(dict_edges))
print(len_training_set)


26082

In [5]:
training_set = random.sample(list(dict_edges), len_training_set)

In [6]:
# Positive training examples: an observed edge gets label 1
training_dict = {}
for edge in training_set:
    training_dict[edge] = 1

In [7]:
# The remaining 10% of observed edges form the positive test examples
test_set = {}

for edge in dict_edges:
    if edge not in training_dict:
        test_set[edge] = 1

In [8]:
# Sanity check: training and test sets partition the observed edges
len_test_set = len(test_set)
print(len(dict_edges) - len_training_set - len_test_set)


0

In [9]:
# Build the full graph so we can sample pairs of nodes without an edge
# between them (header lines are skipped, as above)
G = nx.DiGraph()
with open('graph/ca-GrQc.txt') as filehandle:
    for line in filehandle:
        if line.startswith('#'):
            continue
        cols = line.split()
        G.add_edge(cols[0], cols[1])

node_list = list(G.nodes())

In [10]:
# Keep sampling pairs of nodes until we have half as many non-edges as
# training edges (rejection sampling: discard pairs that are already edges
# or already sampled)
dict_node_no_edge = defaultdict(int)
counter = 0
loop_index = 0
while loop_index <= len_training_set:
    node_pair = random.sample(node_list, 2)
    key = str(node_pair[0]) + '-' + str(node_pair[1])
    if key in dict_edges or key in dict_node_no_edge:
        continue
    dict_node_no_edge[key] = 0
    counter += 1
    if counter >= len_training_set // 2:
        break
    loop_index += 1

In [11]:
loop_index


Out[11]:
13040

In [12]:
# Verify that none of the sampled "non-edges" actually exist in the graph
for node_pair in dict_node_no_edge:
    nodes = node_pair.split('-')
    if G.has_edge(nodes[0], nodes[1]):
        print("Exception! Breaking")
        break

In [13]:
training_set_append = random.sample(list(dict_node_no_edge), int(0.9 * len(dict_node_no_edge)))

In [14]:
# Negative training examples: a sampled non-edge gets label 0
for edge in training_set_append:
    training_dict[edge] = 0

In [15]:
# The remaining 10% of non-edges become negative test examples
for edge in dict_node_no_edge:
    if edge not in training_dict and edge not in test_set:
        test_set[edge] = 0

In [16]:
len(test_set)


Out[16]:
4203

In [17]:
len(training_dict)


Out[17]:
37818

In [18]:
# Persist the train/test split for the embedding experiments
with open('training_data.txt', 'wb') as handle:
    pickle.dump(training_dict, handle)

with open('testing_data.txt', 'wb') as handle_1:
    pickle.dump(test_set, handle_1)
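To reuse the split later (for example, when training and evaluating the embeddings), the pickles can be loaded back the same way; a minimal sketch:

# Reload the persisted train/test split in a later session
with open('training_data.txt', 'rb') as handle:
    training_dict = pickle.load(handle)

with open('testing_data.txt', 'rb') as handle_1:
    test_set = pickle.load(handle_1)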

In [19]:
G.number_of_nodes()


Out[19]:
5242
