In [42]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
%matplotlib inline
In [43]:
# Import the text file containing the edge list and record each directed
# edge as key "src-dst" with value 1 (edge present).
dict_edges = defaultdict(int)
with open('graph/ca-GrQc.txt') as f:
    for line in f:
        # Skip SNAP header/comment lines (they start with '#').
        # The original only rejected lines with >2 columns, so a 2-token
        # comment line would have been ingested as a fake edge.
        if line.startswith('#'):
            continue
        cols = line.split()
        # Require exactly two columns: also guards against blank or
        # truncated lines that would raise IndexError below.
        if len(cols) != 2:
            continue
        dict_edges[cols[0] + "-" + cols[1]] = 1
In [44]:
# Sanity checks: SNAP header tokens must NOT appear as edge keys,
# while a known real edge should be present.
# print() call syntax works in both Python 2 and 3; membership on the
# dict itself avoids materialising .keys().
print('#-FromNodeId' in dict_edges)
print('#-Directed' in dict_edges)
print('9504145-9309097' in dict_edges)
In [45]:
# Reserve 90% of the positive edges for the training set.
len_training_set = int(0.9 * len(dict_edges))
print(len_training_set)  # print() form is valid in Python 2 and 3
In [46]:
training_set = random.sample(dict_edges.keys(),len_training_set)
In [47]:
# Label every sampled training edge as a positive example (1).
training_dict = {edge: 1 for edge in training_set}
In [48]:
# Positive edges not drawn for training become positive test examples.
test_set = {edge: 1 for edge in dict_edges if edge not in training_dict}
In [49]:
# The split must be exhaustive: train + test together cover every edge,
# so this difference should print 0.
len_test_set = len(test_set)
print(len(dict_edges) - len_training_set - len_test_set)
In [50]:
# Build a directed graph from the raw edge list so we can later sample
# node pairs that have no edge between them.
G = nx.DiGraph()
# Text mode with a context manager: the original opened in 'rb' and
# str()-converted each line, which under Python 3 yields "b'...'" strings
# and corrupts every node id; it also never closed the file handle.
with open('graph/ca-GrQc.txt') as edge_file:
    for raw_line in edge_file:
        if raw_line.startswith('#'):  # skip SNAP header/comment lines
            continue
        cols = raw_line.split()
        if len(cols) != 2:            # ignore blank/malformed lines
            continue
        G.add_edge(cols[0], cols[1])
# Materialise as a list so random.sample() works in Python 3
# (G.nodes() returns a view in NetworkX 2.x).
node_list = list(G.nodes())
In [51]:
# Keep sampling node pairs until we have as many non-edges as training
# edges; these become the negative examples.
# NOTE(review): the original comment claimed "half the length" but the
# code collects len_training_set pairs — the code's behaviour is kept.
dict_node_no_edge = defaultdict(int)
counter = 0
loop_index = 0
while loop_index <= 2 * len_training_set:
    # Count every attempt, including rejected duplicates. The original
    # incremented loop_index after `continue`, so the attempt cap never
    # bound and a run of collisions could loop forever. With ~5k nodes
    # the collision rate is tiny, so this cap is effectively never hit.
    loop_index += 1
    u, v = random.sample(node_list, 2)
    pair_key = str(u) + '-' + str(v)
    if pair_key in dict_edges or pair_key in dict_node_no_edge:
        continue
    dict_node_no_edge[pair_key] = 0  # label 0 = no edge
    counter += 1
    if counter >= len_training_set:
        break
In [52]:
loop_index
Out[52]:
In [53]:
# Verify the sample: none of the "non-edge" pairs may actually be an
# edge of the graph.
for node_pair in dict_node_no_edge:
    nodes = node_pair.split('-')
    if G.has_edge(nodes[0], nodes[1]):
        print("Exception! Breaking")
        break
In [54]:
training_set_append = random.sample(dict_node_no_edge.keys(),int(0.9*len(dict_node_no_edge.keys())))
In [55]:
# Merge the negative training examples (label 0) into the training dict.
training_dict.update(dict.fromkeys(training_set_append, 0))
In [56]:
# Remaining non-edges (not used for training) become negative test
# examples with label 0.
for edge in dict_node_no_edge:
    if edge not in training_dict and edge not in test_set:
        test_set[edge] = 0
In [57]:
len(test_set.keys())
Out[57]:
In [58]:
len(training_dict.keys())
Out[58]:
In [59]:
# Persist the labelled train/test splits. pickle is already imported in
# the notebook's import cell, so the redundant re-import is dropped.
# NOTE(review): these are binary pickle files despite the .txt extension
# — file names kept so downstream loaders still work.
with open('training_data.txt', 'wb') as handle:
    pickle.dump(training_dict, handle)
with open('testing_data.txt', 'wb') as handle_1:
    pickle.dump(test_set, handle_1)
In [60]:
G.number_of_nodes()
Out[60]:
In [61]:
test = training_dict.values()
In [62]:
test = np.asarray(test)
In [63]:
np.count_nonzero(test)
Out[63]:
In [64]:
len(test)
Out[64]:
In [ ]: