In [99]:
import numpy as np
import networkx as nx
import gensim
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist, pdist
import pickle
from collections import defaultdict
import random
import os
%matplotlib inline
In [100]:
# Load the pickled training and test sets
training_set = pickle.load(open("training_data.txt", "rb"))
testing_set = pickle.load(open("testing_data.txt","rb"))
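The cells below assume every key in these dictionaries is a "node1-node2" string and every value is a 0/1 link label. A quick inspection cell (an addition, not part of the original run) to confirm the format:
In [ ]:
# Peek at a few (edge, label) pairs to confirm the assumed "node1-node2" -> {0, 1} format
for k in list(training_set.keys())[:5]:
    print((k, training_set[k]))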
In [101]:
# Create the graph from the training set; each key is a "node1-node2" string, each value a 0/1 label
G = nx.Graph()
for edge in training_set.keys():
    nodes = edge.split('-')
    if training_set[edge] == 1:
        # Positive example: add the edge (or bump its weight for repeats)
        if G.has_edge(nodes[0], nodes[1]):
            G[nodes[0]][nodes[1]]['weight'] += 1
        else:
            G.add_edge(nodes[0], nodes[1], weight=1)
    else:
        # Negative example: make sure both endpoints exist as nodes
        G.add_node(nodes[0])
        G.add_node(nodes[1])
In [102]:
G.number_of_nodes()
Out[102]:
In [103]:
# Give isolated nodes a self-loop so node2vec's random walks have somewhere to go
for node in G.nodes():
    if G.degree(node) == 0:
        G.add_edge(node, node, weight=1)
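node2vec's random walks need every node to have at least one neighbor, which the self-loops above guarantee. A small verification cell (added here, not part of the original run):
In [ ]:
# After adding self-loops, no node should be left with degree 0
isolated = [n for n in G.nodes() if G.degree(n) == 0]
print(len(isolated))  # expected: 0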
In [104]:
# Add nodes that appear only in the test set, so every test node is present in the graph
node_list_conn = G.nodes()
for edge in testing_set.keys():
    nodes = edge.split('-')
    for node in nodes:
        if node not in node_list_conn:
            G.add_node(node)
In [105]:
# Write out an edge list that node2vec can use to generate embeddings
nx.write_edgelist(G,'graph/train_n2v.txt')
In [ ]:
CmdStr = "python main.py --p 1 --q 0.5 --iter 200 --input graph/train_n2v.txt \
--output emb/emb_train_n2v.emb --dimensions 64"
os.system(CmdStr)
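os.system only reports failure through its return value, which is discarded above. A minimal alternative sketch using the standard library's subprocess, assuming the same node2vec reference-implementation main.py and flags:
In [ ]:
import subprocess
# check_call raises CalledProcessError if node2vec exits with a non-zero status
subprocess.check_call(["python", "main.py", "--p", "1", "--q", "0.5",
                       "--iter", "200", "--input", "graph/train_n2v.txt",
                       "--output", "emb/emb_train_n2v.emb", "--dimensions", "64"])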
In [107]:
# Sort node ids numerically, then convert back to strings to match the embedding vocabulary
node_list_conn_int = sorted(map(int, node_list_conn))
node_list_conn = list(map(str, node_list_conn_int))
In [108]:
# Sanity check on the node count after adding the test-only nodes
G.number_of_nodes()
Out[108]:
In [123]:
## Read the embeddings file and build features from it
# Note: judging by the filename, these embeddings came from a run with iter=20 and
# 128 dimensions rather than the iter=200 / 64-dimension command shown earlier
model = gensim.models.KeyedVectors.load_word2vec_format('emb/iter_20_dim_128_train_n2v.emb')
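A quick dimensionality check on the loaded vectors (an added cell; 128 is expected if the filename reflects the run's settings):
In [ ]:
# vector_size should match the --dimensions used when the embeddings were trained
print(model.vector_size)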
In [124]:
# Look up an embedding for each node; record any nodes node2vec never saw
embeddings = {}
err_count = 0
missing_node = []
for node in node_list_conn:
    try:
        embeddings[node] = model.word_vec(node)
    except KeyError:
        err_count += 1
        missing_node.append(node)
In [125]:
missing_node
Out[125]:
In [126]:
def combine_embedding(method, n_out, n_in):
    # Combine two node embeddings into a single edge feature vector
    if method == 1:
        # Simple average
        return (n_out + n_in) / 2.0
    elif method == 2:
        # Hadamard (elementwise) product
        return np.multiply(n_in, n_out)
    else:
        print("Invalid method. Enter 1 or 2")
        return
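A toy example of the two combination methods on 3-dimensional vectors (illustrative values only):
In [ ]:
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
print(combine_embedding(1, a, b))  # simple average: [2.5 3.5 4.5]
print(combine_embedding(2, a, b))  # Hadamard product: [ 4. 10. 18.]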
In [127]:
# Try Hadamard first
feature = []
label = []
for edge in training_set.keys():
    nodes = edge.split('-')
    feature.append(combine_embedding(2, embeddings[nodes[0]], embeddings[nodes[1]]))
    label.append(training_set[edge])
In [128]:
len(feature)
Out[128]:
In [129]:
G.number_of_edges()
Out[129]:
In [130]:
feature_np = np.asarray(feature)
print(feature_np.shape)
label_np = np.asarray(label)
print(label_np.shape)
In [131]:
#x,residuals,rank,s = np.linalg.lstsq(feature_np,label_np)
# Use an SVM classifier instead of the least-squares fit above
from sklearn import svm
clf = svm.SVC()
In [132]:
clf.fit(feature_np,label_np)
Out[132]:
In [133]:
def evaluate_perf(data, clf, labels):
    # Fraction of predictions that disagree with the 0/1 labels, i.e. the error rate
    label_pred = clf.predict(data)
    print(label_pred.shape)
    diff = np.abs(np.subtract(label_pred, labels))
    return np.sum(diff) * 1.0 / len(labels)
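For 0/1 labels this error rate is just one minus accuracy; a cross-check (an added cell) using sklearn.metrics.accuracy_score:
In [ ]:
from sklearn.metrics import accuracy_score
# Should agree with evaluate_perf: error rate = 1 - accuracy
def evaluate_perf_check(data, clf, labels):
    return 1.0 - accuracy_score(labels, clf.predict(data))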
In [134]:
# x referred to the solution of the commented-out lstsq call above and is undefined here
#x.shape
In [135]:
# Training-set error rate (optimistic; see the cross-validation sketch below)
error = evaluate_perf(feature_np, clf, label_np)
error
Out[135]:
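Because this error is measured on the same data the SVM was fit to, it is an optimistic estimate. A minimal cross-validation sketch (an addition, assuming a scikit-learn version that provides sklearn.model_selection):
In [ ]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy of a fresh SVC on the Hadamard features
scores = cross_val_score(svm.SVC(), feature_np, label_np, cv=5)
print(1.0 - scores.mean())  # cross-validated error estimate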
In [137]:
# Build test features and evaluate performance on held-out edges
feature_test = []
label_test = []
for edge in testing_set.keys():
    nodes = edge.split('-')
    feature_test.append(combine_embedding(2, embeddings[nodes[0]], embeddings[nodes[1]]))
    label_test.append(testing_set[edge])
In [139]:
error = evaluate_perf(feature_test,clf,label_test)
error
Out[139]:
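Beyond the raw error rate, per-class precision and recall on the test set can be inspected with sklearn's classification_report (an added diagnostic, not part of the original run):
In [ ]:
from sklearn.metrics import classification_report
pred_test = clf.predict(np.asarray(feature_test))
print(classification_report(label_test, pred_test))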