In [42]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
%matplotlib inline

In [43]:
# Import text file containing the edge list and record each directed
# pair "from-to" as a key with value 1 ("edge exists").
dict_edges = defaultdict(int)

with open('graph/ca-GrQc.txt') as f:
    for line in f:
        # Skip header/comment lines explicitly, and any row that does not
        # contain exactly two whitespace-separated node ids. The original
        # only skipped rows with MORE than two tokens, so a blank line
        # would raise IndexError on cols[1] and a two-token comment line
        # would be stored as a fake edge.
        if line.startswith('#'):
            continue
        cols = line.split()
        if len(cols) != 2:
            continue
        dict_edges[cols[0]+"-"+cols[1]] = 1

In [44]:
print '#-FromNodeId' in dict_edges.keys()
print '#-Directed' in dict_edges.keys()
print '9504145-9309097' in dict_edges.keys()


False
False
False

In [45]:
# Get 90% of edges and store in training data 

len_training_set = int(0.9*len(dict_edges.keys()))
print len_training_set


26082

In [46]:
# Draw the positive training edges uniformly without replacement.
training_set = random.sample(list(dict_edges), len_training_set)

In [47]:
# Label every sampled training edge as a positive example (value 1).
training_dict = {edge: 1 for edge in training_set}

In [48]:
# Positive test examples: every observed edge not drawn for training.
test_set = {}

for edge in dict_edges:
    # Direct negated membership replaces the awkward continue/else branch,
    # and iterating the dict avoids building the py2 keys() list.
    if edge not in training_dict:
        test_set[edge] = 1

In [49]:
len_test_set = len(test_set)
print len(dict_edges.keys())-len_training_set-len_test_set


0

In [50]:
# Build a directed graph from the raw edge list; node_list is used below
# to sample node pairs without an edge between them.
# The original opened the file in 'rb' mode without ever closing it, and
# used the first two tokens of every line unguarded — a header/comment
# line would inject bogus nodes (e.g. '#', 'FromNodeId') and a blank line
# would raise IndexError.
G = nx.DiGraph()
with open('graph/ca-GrQc.txt') as edge_file:
    for raw_line in edge_file:
        if raw_line.startswith('#'):
            continue
        tokens = raw_line.split()
        if len(tokens) != 2:
            continue
        G.add_edge(tokens[0], tokens[1])

node_list = G.nodes()

In [51]:
# Keep sampling pairs of nodes until we get half the length of training set pairs of nodes without edges between them 
# NOTE(review): despite the comment above, the `counter` break below stops
# after len_training_set accepted pairs, not half of it — confirm intent.
dict_node_no_edge = defaultdict(int)
counter = 0
loop_index = 0
# Rejection-sample node pairs that are not recorded as edges.
# NOTE(review): only the "a-b" direction is tested against dict_edges; if
# the source file stores each unordered pair once, the reverse "b-a" of a
# real edge could be accepted as a "no edge" pair — verify against data.
while(loop_index <= 2*len_training_set):
    node_pair = random.sample(node_list,2)
    if str(node_pair[0])+'-'+str(node_pair[1]) in dict_edges or str(node_pair[0])+'-'+str(node_pair[1]) in dict_node_no_edge:
        # Real edge or duplicate pair: resample. loop_index is NOT
        # incremented here, so the while bound counts accepted pairs,
        # not sampling attempts.
        continue
    else:
        dict_node_no_edge[str(node_pair[0])+'-'+str(node_pair[1])]=0
        counter += 1
        # counter and loop_index advance in lockstep, so this break fires
        # well before the while condition (2*len bound) could — the loop
        # condition is effectively dead code.
        if(counter >= int(len_training_set)):
            break
    loop_index += 1

In [52]:
# Inspect the loop bound counter: the final accepted pair triggers `break`
# before `loop_index += 1`, hence this is one less than len_training_set.
loop_index


Out[52]:
26081

In [53]:
for node_pair in dict_node_no_edge:
    nodes = node_pair.split('-')
    if G.has_edge(nodes[0],nodes[1]):
        print "Exception! Breaking"
        break
    else:
        continue

In [54]:
# Hold out 90% of the sampled non-edges as negative training examples.
# len(dict) replaces len(dict.keys()), which builds a throwaway list in py2.
training_set_append = random.sample(dict_node_no_edge.keys(),int(0.9*len(dict_node_no_edge)))

In [55]:
# Merge the sampled non-edges into the training labels with class 0.
training_dict.update(dict.fromkeys(training_set_append, 0))

In [56]:
# Remaining non-edges become negative test examples (label 0).
# Iterate the dict directly and use a single combined condition instead of
# the continue/else pattern.
for edge in dict_node_no_edge:
    if edge not in training_dict and edge not in test_set:
        test_set[edge] = 0

In [57]:
# Size of the final test split (positives + negatives).
# len(dict) avoids materialising the py2 keys() list just to count it.
len(test_set)


Out[57]:
5507

In [58]:
# Size of the final training split (positives + negatives).
# len(dict) avoids materialising the py2 keys() list just to count it.
len(training_dict)


Out[58]:
49555

In [59]:
import pickle

# Persist both splits so downstream notebooks can reload them.
# NOTE(review): the .txt extension is kept for compatibility with existing
# readers, although the payload is a pickle stream, not plain text.
with open('training_data.txt', 'wb') as train_handle:
    pickle.dump(training_dict, train_handle)

with open('testing_data.txt', 'wb') as test_handle:
    pickle.dump(test_set, test_handle)

In [60]:
# Total number of distinct node ids parsed from the edge list.
G.number_of_nodes()


Out[60]:
5242

In [61]:
# Collect the training labels (1 = edge, 0 = non-edge).
# NOTE(review): the name `test` is easily confused with `test_set` —
# consider renaming to e.g. `train_labels` across the following cells.
test = training_dict.values()

In [62]:
# Convert the label list to a numpy array for counting below.
test = np.asarray(test)

In [63]:
# Number of positive (label 1) training examples — matches len_training_set.
np.count_nonzero(test)


Out[63]:
26082

In [64]:
# Total number of training examples (positives + negatives).
len(test)


Out[64]:
49555

In [ ]: