In [1]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
%matplotlib inline
In [2]:
# Load the edge list into a dict keyed by "from-to"; value 1 marks a real
# edge (positive example for link prediction).
dict_edges = defaultdict(int)
with open('graph/ca-GrQc.txt') as f:
    for line in f:
        # Skip SNAP header/comment lines so tokens like '#' never become
        # part of an edge key.
        if line.startswith('#'):
            continue
        cols = line.split()
        # Original only skipped len>2, so a blank or one-token line would
        # raise IndexError below; require exactly two columns.
        if len(cols) != 2:
            continue
        dict_edges[cols[0] + "-" + cols[1]] = 1
In [3]:
# Sanity checks: header tokens must not have been stored as edges (expect
# False, False) and a known real edge must be present (expect True).
# Membership test directly on the dict -- `in dict.keys()` builds and
# linearly scans a list in Python 2.
print('#-FromNodeId' in dict_edges)
print('#-Directed' in dict_edges)
print('9504145-9309097' in dict_edges)
In [4]:
# Hold out 90% of the positive edges for training.
# len(dict) avoids materializing the key list just to count it.
len_training_set = int(0.9 * len(dict_edges))
print(len_training_set)
In [5]:
# Randomly choose the positive training edges (keys are "from-to" strings).
# NOTE(review): no random seed is set anywhere above, so this split is not
# reproducible across runs -- confirm whether that is intended.
training_set = random.sample(dict_edges.keys(),len_training_set)
In [6]:
# Positive training examples: every sampled edge gets label 1.
training_dict = dict.fromkeys(training_set, 1)
In [7]:
# Positive test examples: the 10% of real edges not chosen for training,
# labeled 1.
test_set = {edge: 1 for edge in dict_edges if edge not in training_dict}
In [8]:
# The split must be exact: every positive edge is in train or test, never
# both, so this difference should print 0.
len_test_set = len(test_set)
print(len(dict_edges) - len_training_set - len_test_set)
In [9]:
# Build a directed graph from the raw edge list; its node set is used below
# to sample node pairs that have no edge (negative examples).
G = nx.DiGraph()
# Stream the file inside a context manager: the original opened the handle
# without closing it and read the whole file into memory with readlines().
with open('graph/ca-GrQc.txt') as filehandle:
    for raw_line in filehandle:
        # Skip SNAP header/comment lines; the original inserted '#' and
        # header words as graph nodes.
        if raw_line.startswith('#'):
            continue
        cols = raw_line.split()
        if len(cols) != 2:
            continue
        G.add_edge(cols[0], cols[1])
node_list = G.nodes()
In [10]:
# Keep sampling pairs of nodes until we have as many non-edge pairs as
# positive training edges (the original comment said "half" but the code
# sampled the full length).
dict_node_no_edge = defaultdict(int)
counter = 0
while counter < len_training_set:
    node_a, node_b = random.sample(node_list, 2)
    key = node_a + '-' + node_b
    reverse_key = node_b + '-' + node_a
    # Reject real edges in EITHER direction (collaboration is undirected;
    # the original checked only one orientation) and duplicates.
    if key in dict_edges or reverse_key in dict_edges or key in dict_node_no_edge:
        continue
    dict_node_no_edge[key] = 0
    counter += 1
# The original tracked a redundant loop_index alongside counter; keep the
# name alive for the inspection cell below.
loop_index = counter
In [11]:
# Inspect how many non-edge pairs were collected (rich display of the
# cell's last expression).
loop_index
Out[11]:
In [12]:
# Validation: none of the sampled "no edge" pairs may actually be connected.
# Check both orientations, since the collaboration graph is undirected even
# though G is stored as a DiGraph.
for node_pair in dict_node_no_edge:
    src, dst = node_pair.split('-')
    if G.has_edge(src, dst) or G.has_edge(dst, src):
        print("Exception! Breaking")
        break
In [13]:
# Take 90% of the non-edge pairs for training, mirroring the positive-edge
# split; len(dict) avoids building the key list twice.
training_set_append = random.sample(dict_node_no_edge.keys(),
                                    int(0.9 * len(dict_node_no_edge)))
In [14]:
# Merge the sampled non-edges into the training dict with label 0.
training_dict.update(dict.fromkeys(training_set_append, 0))
In [15]:
# Remaining non-edge pairs become negative test examples (label 0).
for pair in dict_node_no_edge.keys():
    if pair not in training_dict and pair not in test_set:
        test_set[pair] = 0
In [16]:
# Total test examples (positive + negative); len(dict) beats len(dict.keys()),
# which materializes a list in Python 2.
len(test_set)
Out[16]:
In [17]:
# Total training examples (positive + negative); len(dict) avoids the
# needless key-list materialization.
len(training_dict)
Out[17]:
In [18]:
# Persist both splits for downstream notebooks. `pickle` is already imported
# in the first cell, so the original re-import here was redundant.
# NOTE(review): the files hold binary pickle data despite the .txt
# extension -- the names are kept so existing loaders keep working.
with open('training_data.txt', 'wb') as handle:
    pickle.dump(training_dict, handle)
with open('testing_data.txt', 'wb') as handle_1:
    pickle.dump(test_set, handle_1)
In [19]:
# Number of distinct nodes (authors) in the graph built above.
G.number_of_nodes()
Out[19]:
In [ ]: