In [1]:
import networkx as nx
from graph_tool.all import *
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import sys
In [3]:
with open('citeseer.data', 'rb') as f:
    citeseer = pickle.load(f)
In [4]:
citeseer.keys()
Out[4]:
In [2]:
from motifwalk.utils.Graph import GraphContainer
In [3]:
from motifwalk.utils import find_meta, set_dataloc, get_metadata
In [4]:
set_dataloc(path_to_data=os.path.abspath('./'))
In [5]:
metadata = get_metadata()
In [9]:
citeseer_meta = find_meta('citeseer')
In [10]:
citeseer_pack = GraphContainer(citeseer_meta, dataloc=os.path.abspath('./'))
In [11]:
citeseer_pack.get_labels()
Out[11]:
In [12]:
citeseer_gt = citeseer_pack.get_gt_graph()
In [43]:
graph_draw(citeseer_gt, vertex_fill_color=label, output="citeseer_with_labels.pdf")
Out[43]:
In [15]:
citeseer_nx = citeseer_pack.get_graph()
In [38]:
label = citeseer_gt.new_vertex_property("int")
In [40]:
classes = np.argmax(citeseer_pack.get_labels(), axis=1)
In [41]:
for i in range(classes.size):
    label[i] = classes[i]
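As an aside, graph-tool property maps expose their backing array through the .a attribute, so the per-vertex copy above can be done in a single assignment. A minimal sketch, assuming citeseer_gt and classes from the cells above:

label = citeseer_gt.new_vertex_property("int")
label.a = classes  # bulk copy into the property map's underlying array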
In [44]:
bc_meta = find_meta('blogcatalog')
In [52]:
bc_pack = GraphContainer(bc_meta, dataloc=os.path.abspath('./'))
In [53]:
bc_gt = bc_pack.get_gt_graph()
In [54]:
label = bc_gt.new_vertex_property("int")
In [59]:
classes = np.argmax(bc_pack.get_labels().toarray(), axis=1)
In [60]:
for i in range(classes.size):
    label[i] = classes[i]
In [62]:
from time import time
t = time()
graph_draw(bc_gt, vertex_fill_color=label, output="blogcatalog_with_labels.pdf", output_size=(1200, 1200))
print(time()-t)
In [65]:
cora_meta = find_meta('cora')
In [66]:
cora_pack = GraphContainer(metadata=cora_meta, dataloc=os.path.abspath('./'))
In [67]:
labels = cora_pack.get_labels()
In [69]:
cora_gt = cora_pack.get_gt_graph()
In [70]:
classes = np.argmax(cora_pack.get_labels(), axis=1)
In [71]:
label = cora_gt.new_vertex_property("int")
In [72]:
for i in range(classes.size):
label[i] = classes[i]
In [73]:
graph_draw(cora_gt, vertex_fill_color=label, output="cora_with_labels.pdf")
Out[73]:
Preprocess Amazon co-purchasing data
In [2]:
aloc = "/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/amazon_copurchasing"
In [3]:
with open(aloc+'/com-amazon.top5000.cmty.txt') as f:
    top5k = f.read()
In [4]:
top5k = top5k.split('\n')
In [5]:
top5klist = [i.split('\t') for i in top5k]
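Note that the file ends with a newline, so the split above leaves an empty string as the last element of top5k; that is why the labelling loop further down iterates over top5klist[:-1]. A filtered variant (a sketch; if you build the list this way, the [:-1] slice below is no longer needed):

top5klist = [line.split('\t') for line in top5k if line]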
In [6]:
len(max(top5klist, key=len))
Out[6]:
In [49]:
top5klist[0]
Out[49]:
In [30]:
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt')
In [8]:
amazon_nx.is_directed()
Out[8]:
In [9]:
amazon_nx.size()
Out[9]:
In [10]:
len(amazon_nx.nodes())
Out[10]:
In [19]:
sorted_nodes_amazon = sorted(amazon_nx.nodes(), key=int)
In [20]:
map_amazon = {}
for i, node_id in enumerate(sorted_nodes_amazon):
    map_amazon[node_id] = i
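The same mapping as a dict comprehension (equivalent, assuming sorted_nodes_amazon from the cell above):

map_amazon = {node_id: i for i, node_id in enumerate(sorted_nodes_amazon)}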
In [21]:
len(map_amazon)
Out[21]:
In [22]:
max(amazon_nx.nodes(), key=int)
Out[22]:
In [23]:
map_amazon['548551']
Out[23]:
In [36]:
amazon_nx
Out[36]:
In [37]:
def amazon_type_map(s):
    # Map a raw node id string from the edge list to its dense integer index.
    return map_amazon[s]
In [38]:
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt', nodetype=amazon_type_map)
In [39]:
amazon_nx[0]
Out[39]:
In [40]:
max(amazon_nx.nodes())
Out[40]:
In [41]:
from scipy.sparse import csr_matrix
In [59]:
label_amazon = np.zeros(shape=(334863, 5000), dtype=np.int8)
In [60]:
for cmty, nodelist in enumerate(top5klist[:-1]):
    for node in nodelist:
        label_amazon[map_amazon[node]][cmty] = 1
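The dense 334863 x 5000 intermediate above costs about 1.7 GB even at int8 before it is converted to CSR. A sketch of an alternative that builds the sparse matrix directly from (row, column) pairs, assuming top5klist and map_amazon from the cells above:

from scipy.sparse import coo_matrix
rows, cols = [], []
for cmty, nodelist in enumerate(top5klist[:-1]):
    for node in nodelist:
        rows.append(map_amazon[node])   # row = dense node index
        cols.append(cmty)               # column = community id
data = np.ones(len(rows), dtype=np.int8)
label_amazon = coo_matrix((data, (rows, cols)), shape=(334863, 5000), dtype=np.int8).tocsr()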
In [63]:
label_amazon = csr_matrix(label_amazon, dtype=np.int8)
In [64]:
label_amazon
Out[64]:
In [70]:
label_amazon[4]
Out[70]:
In [71]:
map_amazon['164985']
Out[71]:
In [72]:
label_amazon[100150]
Out[72]:
In [73]:
np.nonzero(label_amazon[100150])
Out[73]:
In [74]:
amazon = {}
In [75]:
amazon['Labels'] = label_amazon
In [76]:
amazon['NXGraph'] = amazon_nx
In [77]:
with open('amazon.data', 'wb') as f:
    pickle.dump(amazon, f)
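A quick sanity check on the pickle just written (a sketch; the expected shapes follow from the arrays built above):

with open('amazon.data', 'rb') as f:
    check = pickle.load(f)
print(sorted(check.keys()))      # expect ['Labels', 'NXGraph']
print(check['Labels'].shape)     # expect (334863, 5000)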
In [83]:
amazon_meta = find_meta('amazon')
In [84]:
amazon_pack = GraphContainer(metadata=amazon_meta, dataloc=os.path.abspath('./'))
In [85]:
amazon_gt = amazon_pack.get_gt_graph()
In [86]:
amazon_gt
Out[86]:
In [88]:
%time graph_draw(amazon_gt, output="amazon_graph.pdf", output_size=(1200, 1200))
Out[88]:
In [89]:
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab') as f:
    pubmed = f.read()
In [91]:
pubmed[0:100]
Out[91]:
In [92]:
edges = pubmed.split('\n')
In [96]:
edges = edges[2:]
In [98]:
edges[0]
Out[98]:
In [99]:
pubmed_graph = nx.DiGraph()
In [100]:
tuples = []
In [101]:
edges[-1]
Out[101]:
In [105]:
ix, src, _, dst = edges[-2].split('\t')
In [109]:
edges[-2]
Out[109]:
In [106]:
ix
Out[106]:
In [107]:
src
Out[107]:
In [108]:
dst
Out[108]:
In [112]:
src_id = src.split(':')[-1]
In [113]:
src_id
Out[113]:
In [114]:
for e in edges[:-1]:
    idx, src, _, dst = e.split('\t')
    src_id = src.split(':')[-1]
    dst_id = dst.split(':')[-1]
    tuples.append((src_id, dst_id))
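For reference, each citation line follows the pattern index, source, separator, target, with the paper ids prefixed by "paper:". A made-up line in that shape, just to illustrate the parsing (both the id values and the separator token here are illustrative, not taken from the file):

example = "0\tpaper:12345\t|\tpaper:67890"
idx, src, _, dst = example.split('\t')
print(src.split(':')[-1], dst.split(':')[-1])  # -> 12345 67890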
In [115]:
len(tuples)
Out[115]:
In [116]:
pubmed_graph.add_edges_from(tuples)
In [117]:
pubmed_graph.is_directed()
Out[117]:
In [118]:
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.NODE.paper.tab') as f:
    pubmed = f.read()
In [122]:
data = pubmed.split('\n')
In [123]:
data[0]
Out[123]:
In [126]:
data[2]
Out[126]:
In [129]:
template = data[1].split('\t')
In [130]:
template
Out[130]:
In [131]:
kw_id_map = {}
In [132]:
i = 0
for words in template[1:-1]:
    _, word, _ = words.split(':')
    kw_id_map[word] = i
    i += 1
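The same keyword-to-column mapping, written with enumerate instead of a manual counter (assumes template from the cell above):

kw_id_map = {entry.split(':')[1]: i for i, entry in enumerate(template[1:-1])}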
In [134]:
kw_id_map['w-use']
Out[134]:
In [135]:
pubmed_graph.nodes()[:10]
Out[135]:
In [136]:
all_pubmed_nodes = sorted(pubmed_graph.nodes(), key=int)
In [138]:
all_pubmed_nodes[:5]
Out[138]:
In [142]:
all_pubmed_nodes[-1]
Out[142]:
In [143]:
len(all_pubmed_nodes)
Out[143]:
In [139]:
map_pubmed = {}
for i, node_id in enumerate(all_pubmed_nodes):
    map_pubmed[node_id] = i
In [140]:
map_pubmed['29094']
Out[140]:
In [144]:
def pubmed_type(node_id):
    # Map a raw PubMed paper id string to its dense integer index.
    return map_pubmed[node_id]
In [145]:
len(kw_id_map)
Out[145]:
In [146]:
len(map_pubmed)
Out[146]:
In [148]:
pubmed_features = np.zeros(shape=(19717, 500), dtype=np.float32)
In [149]:
pubmed_labels = np.zeros(shape=(19717, 3), dtype=np.uint8)
In [150]:
data[-1]
Out[150]:
In [153]:
test = data[2]
In [156]:
node_id, *features_vec, _ = test.split('\t')
In [157]:
node_id
Out[157]:
In [158]:
features_vec
Out[158]:
In [161]:
for d in data[2:-1]:
    node_id, label, *feature_vec, summary = d.split('\t')
    int_id = map_pubmed[node_id]
    label = int(label.split('=')[-1]) - 1   # "label=1..3" -> 0-based class index
    pubmed_labels[int_id][label] = 1
    for f in feature_vec:
        word, val = f.split('=')            # entries look like "w-use=0.0123"
        feature_id = kw_id_map[word]
        pubmed_features[int_id][feature_id] = float(val)
In [162]:
map_pubmed["12187484"]
Out[162]:
In [166]:
pubmed_labels[11943]
Out[166]:
In [167]:
test_labels = np.sum(pubmed_labels, axis=1)
In [172]:
np.count_nonzero(test_labels)
Out[172]:
In [173]:
len(edges)
Out[173]:
In [180]:
pubmed = nx.DiGraph()
In [181]:
for t in pubmed_graph.edges():
    s, d = map(pubmed_type, t)
    pubmed.add_edge(s, d)
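networkx can also do this remapping in a single call; note that, unlike the edge loop above, this variant keeps isolated nodes if the graph has any. A sketch, assuming pubmed_graph and map_pubmed from the cells above:

pubmed = nx.relabel_nodes(pubmed_graph, map_pubmed)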
In [184]:
pubmed.size()
Out[184]:
In [185]:
pubmed.number_of_nodes()
Out[185]:
In [186]:
pubmed.number_of_edges()
Out[186]:
In [187]:
pubmed.is_directed()
Out[187]:
In [189]:
pubmed.edge[11943]
Out[189]:
In [191]:
pubmed.in_edges(11943)
Out[191]:
In [192]:
pubmed.out_edges(11943)
Out[192]:
In [193]:
all_pubmed = {}
all_pubmed['NXGraph'] = pubmed
all_pubmed['Labels'] = pubmed_labels
all_pubmed['CSRFeatures'] = csr_matrix(pubmed_features)
In [194]:
all_pubmed['CSRFeatures']
Out[194]:
In [196]:
with open('./pubmed.data', 'wb') as f:
    pickle.dump(all_pubmed, f)
In [6]:
pubmed_meta = find_meta('pubmed')
In [7]:
pubmed = GraphContainer(pubmed_meta, dataloc='.')
In [8]:
pubmed_gt = pubmed.get_gt_graph()
In [10]:
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
In [13]:
%time graph_draw(pubmed_gt, vertex_fill_color=node_color, output="pubmed_with_labels.png", output_size=(1200,1200))
Out[13]:
In [9]:
from motifwalk.motifs import all_3
In [18]:
for m in all_3:
    motif = m.gt_motif
    text = motif.new_vertex_property("string")
    for n in motif.vertices():
        text[n] = str(n)
    graph_draw(m.gt_motif, vertex_text=text, output_size=(80,80))
In [10]:
feed_forward = all_3[9]
In [11]:
motif = feed_forward.gt_motif
text = motif.new_vertex_property("string")
for n in motif.vertices():
    text[n] = str(n)
graph_draw(motif, vertex_text=text, output_size=(100,100))
Out[11]:
In [11]:
feed_forward.anchors = {1,2}
In [12]:
from motifwalk.motifs.analysis import construct_motif_graph
In [13]:
ff_pubmed = construct_motif_graph(pubmed, feed_forward)
In [14]:
ff_pubmed
Out[14]:
In [16]:
%time graph_draw(ff_pubmed, output="pubmed_with_labels_feedforward.png", output_size=(1200,1200))
Out[16]:
In [17]:
vfilt = ff_pubmed.new_vertex_property('bool')
for i in ff_pubmed.vertices():
    v = ff_pubmed.vertex(i)
    if v.out_degree() > 0:
        vfilt[i] = True
    else:
        vfilt[i] = False
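The same filter can be built from graph-tool's degree property map without an explicit Python loop (a sketch with the same behaviour for this graph):

deg = ff_pubmed.degree_property_map("out")
vfilt = ff_pubmed.new_vertex_property("bool")
vfilt.a = deg.a > 0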
In [18]:
ff_pubmed_filtered = GraphView(ff_pubmed, vfilt)
In [19]:
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", output_size=(1200,1200))
Out[19]:
In [20]:
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
In [23]:
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", vertex_fill_color=node_color, output_size=(1200,1200))
Out[23]:
In [ ]: