In [1]:
    
import networkx as nx
from graph_tool.all import *
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import sys
    
    
In [3]:
    
with open('citeseer.data', 'rb') as f:
    citeseer = pickle.load(f)
    
In [4]:
    
citeseer.keys()
    
    Out[4]:
In [2]:
    
from motifwalk.utils.Graph import GraphContainer
    
In [3]:
    
from motifwalk.utils import find_meta, set_dataloc, get_metadata
    
In [4]:
    
set_dataloc(path_to_data=os.path.abspath('./'))
    
In [5]:
    
metadata = get_metadata()
    
In [9]:
    
citeseer_meta = find_meta('citeseer')
    
In [10]:
    
citeseer_pack = GraphContainer(citeseer_meta, dataloc=os.path.abspath('./'))
    
In [11]:
    
citeseer_pack.get_labels()
    
    Out[11]:
In [12]:
    
citeseer_gt = citeseer_pack.get_gt_graph()
    
In [43]:
    
# Note: this cell was re-executed (In [43]) after the `label` property map
# was created and filled in the cells below (In [38]-[41]).
graph_draw(citeseer_gt, vertex_fill_color=label, output="citeseer_with_labels.pdf")
    
    
    Out[43]:
In [15]:
    
citeseer_nx = citeseer_pack.get_graph()
    
In [38]:
    
label = citeseer_gt.new_vertex_property("int")
    
In [40]:
    
classes = np.argmax(citeseer_pack.get_labels(), axis=1)
    
In [41]:
    
for i in range(classes.size):
    label[i] = classes[i]
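
As a side note (not part of the original session), a graph-tool property map exposes its underlying array through `.a`, so the per-vertex copy above can be written as one vectorized assignment. This assumes, as the loop does, that vertex index i corresponds to row i of the label matrix:

# Sketch of a vectorized alternative to the loop above: write all class
# ids into the property map's array view at once.
label.a = classes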
    
In [44]:
    
bc_meta = find_meta('blogcatalog')
    
In [52]:
    
bc_pack = GraphContainer(bc_meta, dataloc=os.path.abspath('./'))
    
In [53]:
    
bc_gt = bc_pack.get_gt_graph()
    
In [54]:
    
label = bc_gt.new_vertex_property("int")
    
In [59]:
    
classes = np.argmax(bc_pack.get_labels().toarray(), axis=1)
    
In [60]:
    
for i in range(classes.size):
    label[i] = classes[i]
    
In [62]:
    
from time import time
t = time()
graph_draw(bc_gt, vertex_fill_color=label, output="blogcatalog_with_labels.pdf", output_size=(1200, 1200))
print(time()-t)
    
    
    
In [65]:
    
cora_meta = find_meta('cora')
    
In [66]:
    
cora_pack = GraphContainer(metadata=cora_meta, dataloc=os.path.abspath('./'))
    
In [67]:
    
labels = cora_pack.get_labels()
    
In [69]:
    
cora_gt = cora_pack.get_gt_graph()
    
In [70]:
    
classes = np.argmax(cora_pack.get_labels(), axis=1)
    
In [71]:
    
label = cora_gt.new_vertex_property("int")
    
In [72]:
    
for i in range(classes.size):
    label[i] = classes[i]
    
In [73]:
    
graph_draw(cora_gt, vertex_fill_color=label, output="cora_with_labels.pdf")
    
    
    Out[73]:
Preprocess Amazon co-purchasing data
The cells below read the raw com-amazon files (the top-5000 community list and the undirected edge list), remap node ids to contiguous integers, build a sparse node-by-community label matrix, and pickle the result as amazon.data.
In [2]:
    
aloc = "/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/amazon_copurchasing"
    
In [3]:
    
with open(aloc+'/com-amazon.top5000.cmty.txt') as f:
    top5k = f.read()
    
In [4]:
    
top5k = top5k.split('\n')
    
In [5]:
    
top5klist = [i.split('\t') for i in top5k]
    
In [6]:
    
len(max(top5klist, key=len))
    
    Out[6]:
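
A quick look at the community size distribution can be taken directly from top5klist (a sketch, not part of the original session; the trailing empty entry produced by the final newline is dropped):

# Number of communities plus min / max / mean community size.
sizes = [len(c) for c in top5klist[:-1]]
print(len(sizes), min(sizes), max(sizes), sum(sizes) / len(sizes))
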
In [49]:
    
top5klist[0]
    
    Out[49]:
In [30]:
    
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt')
    
In [8]:
    
amazon_nx.is_directed()
    
    Out[8]:
In [9]:
    
amazon_nx.size()
    
    Out[9]:
In [10]:
    
len(amazon_nx.nodes())
    
    Out[10]:
In [19]:
    
sorted_nodes_amazon = sorted(amazon_nx.nodes(), key=int)
    
In [20]:
    
map_amazon = {}
for i, node_id in enumerate(sorted_nodes_amazon):
    map_amazon[node_id] = i
    
In [21]:
    
len(map_amazon)
    
    Out[21]:
In [22]:
    
max(amazon_nx.nodes(), key=int)
    
    Out[22]:
In [23]:
    
map_amazon['548551']
    
    Out[23]:
In [36]:
    
amazon_nx
    
    Out[36]:
In [37]:
    
def amazon_type_map(s):
    # nodetype callable for read_edgelist: map a raw id string to its 0-based integer index.
    return map_amazon[s]
    
In [38]:
    
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt', nodetype=amazon_type_map)
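
As an aside, the second pass over the edge list could be avoided by relabeling the graph that was already loaded with string ids at In [30]. A sketch, where amazon_str_nx is a hypothetical name standing in for that earlier string-keyed graph (the variable was overwritten here):

# amazon_str_nx: the string-keyed graph from In [30] (name used for illustration).
# relabel_nodes returns a copy whose node ids are looked up in map_amazon.
amazon_int_nx = nx.relabel_nodes(amazon_str_nx, map_amazon)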
    
In [39]:
    
amazon_nx[0]
    
    Out[39]:
In [40]:
    
max(amazon_nx.nodes())
    
    Out[40]:
In [41]:
    
from scipy.sparse import csr_matrix
    
In [59]:
    
label_amazon = np.zeros(shape=(334863, 5000), dtype=np.int8)
    
In [60]:
    
# Mark membership of each node in each of the top-5000 communities.
# The last entry of top5klist is empty (trailing newline), so skip it.
for cmty, nodelist in enumerate(top5klist[:-1]):
    for node in nodelist:
        label_amazon[map_amazon[node]][cmty] = 1
    
In [63]:
    
label_amazon = csr_matrix(label_amazon, dtype=np.int8)
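
The dense 334863 x 5000 int8 buffer above takes roughly 1.6 GB before it is compressed. A lower-memory sketch (assuming top5klist and map_amazon as built above) constructs the sparse matrix directly from coordinate lists:

from scipy.sparse import coo_matrix

# Collect (row, column) coordinates of the 1-entries instead of allocating
# the full dense matrix.
rows, cols = [], []
for cmty, nodelist in enumerate(top5klist[:-1]):  # last entry is empty
    for node in nodelist:
        rows.append(map_amazon[node])
        cols.append(cmty)
ones = np.ones(len(rows), dtype=np.int8)
label_amazon_sparse = coo_matrix((ones, (rows, cols)),
                                 shape=(334863, 5000), dtype=np.int8).tocsr()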
    
In [64]:
    
label_amazon
    
    Out[64]:
In [70]:
    
label_amazon[4]
    
    Out[70]:
In [71]:
    
map_amazon['164985']
    
    Out[71]:
In [72]:
    
label_amazon[100150]
    
    Out[72]:
In [73]:
    
np.nonzero(label_amazon[100150])
    
    Out[73]:
In [74]:
    
amazon = {}
    
In [75]:
    
amazon['Labels'] = label_amazon
    
In [76]:
    
amazon['NXGraph'] = amazon_nx
    
In [77]:
    
with open('amazon.data', 'wb') as f:
    pickle.dump(amazon, f)
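
A quick round-trip check (not part of the original session) confirms that the pickle reloads with the expected shapes:

# Reload the pickle and make sure the label matrix and the graph agree
# on the node count.
with open('amazon.data', 'rb') as f:
    check = pickle.load(f)
assert check['Labels'].shape == (334863, 5000)
assert check['NXGraph'].number_of_nodes() == check['Labels'].shape[0]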
    
In [83]:
    
amazon_meta = find_meta('amazon')
    
In [84]:
    
amazon_pack = GraphContainer(metadata=amazon_meta, dataloc=os.path.abspath('./'))
    
In [85]:
    
amazon_gt = amazon_pack.get_gt_graph()
    
In [86]:
    
amazon_gt
    
    Out[86]:
In [88]:
    
%time graph_draw(amazon_gt, output="amazon_graph.pdf", output_size=(1200, 1200))
    
    
    
    Out[88]:
Preprocess Pubmed citation data
In [89]:
    
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab') as f:
    pubmed = f.read()
    
In [91]:
    
pubmed[0:100]
    
    Out[91]:
In [92]:
    
edges = pubmed.split('\n')
    
In [96]:
    
edges = edges[2:]  # drop the two header lines at the top of the .tab file
    
In [98]:
    
edges[0]
    
    Out[98]:
In [99]:
    
pubmed_graph = nx.DiGraph()
    
In [100]:
    
tuples = []
    
In [101]:
    
edges[-1]
    
    Out[101]:
In [105]:
    
ix, src, _, dst = edges[-2].split('\t')
    
In [109]:
    
edges[-2]
    
    Out[109]:
In [106]:
    
ix
    
    Out[106]:
In [107]:
    
src
    
    Out[107]:
In [108]:
    
dst
    
    Out[108]:
In [112]:
    
src_id = src.split(':')[-1]
    
In [113]:
    
src_id
    
    Out[113]:
In [114]:
    
# Each edge line is tab-separated: an index, "paper:<src_id>", a separator
# column, and "paper:<dst_id>"; keep only the numeric paper ids.
# The final empty entry from the trailing newline is skipped.
for e in edges[:-1]:
    idx, src, _, dst = e.split('\t')
    src_id = src.split(':')[-1]
    dst_id = dst.split(':')[-1]
    tuples.append((src_id, dst_id))
    
In [115]:
    
len(tuples)
    
    Out[115]:
In [116]:
    
pubmed_graph.add_edges_from(tuples)
    
In [117]:
    
pubmed_graph.is_directed()
    
    Out[117]:
In [118]:
    
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.NODE.paper.tab') as f:
    pubmed = f.read()
    
In [122]:
    
data = pubmed.split('\n')
    
In [123]:
    
data[0]
    
    Out[123]:
In [126]:
    
data[2]
    
    Out[126]:
In [129]:
    
template = data[1].split('\t')
    
In [130]:
    
template
    
    Out[130]:
In [131]:
    
kw_id_map = {}
    
In [132]:
    
# Template entries look like "<type>:<word>:<value>"; keep the middle token
# and give each word a consecutive integer id.
for i, words in enumerate(template[1:-1]):
    _, word, _ = words.split(':')
    kw_id_map[word] = i
    
In [134]:
    
kw_id_map['w-use']
    
    Out[134]:
In [135]:
    
pubmed_graph.nodes()[:10]
    
    Out[135]:
In [136]:
    
all_pubmed_nodes = sorted(pubmed_graph.nodes(), key=int)
    
In [138]:
    
all_pubmed_nodes[:5]
    
    Out[138]:
In [142]:
    
all_pubmed_nodes[-1]
    
    Out[142]:
In [143]:
    
len(all_pubmed_nodes)
    
    Out[143]:
In [139]:
    
map_pubmed = {}
for i, node_id in enumerate(all_pubmed_nodes):
    map_pubmed[node_id] = i
    
In [140]:
    
map_pubmed['29094']
    
    Out[140]:
In [144]:
    
def pubmed_type(node_id):
    return map_pubmed[node_id]
    
In [145]:
    
len(kw_id_map)
    
    Out[145]:
In [146]:
    
len(map_pubmed)
    
    Out[146]:
In [148]:
    
pubmed_features = np.zeros(shape=(19717, 500), dtype=np.float32)  # one row per paper, one column per vocabulary word
    
In [149]:
    
pubmed_labels = np.zeros(shape=(19717, 3), dtype=np.uint8)  # one-hot labels over the three classes
    
In [150]:
    
data[-1]
    
    Out[150]:
In [153]:
    
test = data[2]
    
In [156]:
    
node_id, *features_vec, _ = test.split('\t')
    
In [157]:
    
node_id
    
    Out[157]:
In [158]:
    
features_vec
    
    Out[158]:
In [161]:
    
# Each node line is tab-separated: a paper id, a "label=<k>" field, a
# variable number of "<word>=<weight>" feature fields, and a final summary
# field. The header lines and the trailing empty line are skipped.
for d in data[2:-1]:
    node_id, label, *feature_vec, summary = d.split('\t')
    int_id = map_pubmed[node_id]
    # Convert the 1-based class id into a 0-based one-hot column index.
    label = int(label.split('=')[-1]) - 1
    pubmed_labels[int_id][label] = 1
    # Fill in this paper's word-feature values.
    for f in feature_vec:
        word, val = f.split('=')
        feature_id = kw_id_map[word]
        pubmed_features[int_id][feature_id] = float(val)
    
In [162]:
    
map_pubmed["12187484"]
    
    Out[162]:
In [166]:
    
pubmed_labels[11943]
    
    Out[166]:
In [167]:
    
test_labels = np.sum(pubmed_labels, axis=1)  # should be exactly 1 per paper if every node received one label
    
In [172]:
    
np.count_nonzero(test_labels)
    
    Out[172]:
In [173]:
    
len(edges)
    
    Out[173]:
In [180]:
    
pubmed = nx.DiGraph()
    
In [181]:
    
# Rebuild the citation graph on the contiguous integer ids.
for t in pubmed_graph.edges():
    s, d = map(pubmed_type, t)
    pubmed.add_edge(s, d)
    
In [184]:
    
pubmed.size()
    
    Out[184]:
In [185]:
    
pubmed.number_of_nodes()
    
    Out[185]:
In [186]:
    
pubmed.number_of_edges()
    
    Out[186]:
In [187]:
    
pubmed.is_directed()
    
    Out[187]:
In [189]:
    
pubmed.edge[11943]
    
    Out[189]:
In [191]:
    
pubmed.in_edges(11943)
    
    Out[191]:
In [192]:
    
pubmed.out_edges(11943)
    
    Out[192]:
In [193]:
    
all_pubmed = {}
all_pubmed['NXGraph'] = pubmed
all_pubmed['Labels'] = pubmed_labels
all_pubmed['CSRFeatures'] = csr_matrix(pubmed_features)
    
In [194]:
    
all_pubmed['CSRFeatures']
    
    Out[194]:
In [196]:
    
with open('./pubmed.data', 'wb') as f:
    pickle.dump(all_pubmed, f)
    
In [6]:
    
pubmed_meta = find_meta('pubmed')
    
In [7]:
    
pubmed = GraphContainer(pubmed_meta, dataloc='.')
    
In [8]:
    
pubmed_gt = pubmed.get_gt_graph()
    
In [10]:
    
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
    
In [13]:
    
%time graph_draw(pubmed_gt, vertex_fill_color=node_color, output="pubmed_with_labels.png", output_size=(1200,1200))
    
    
    
    Out[13]:
Motif analysis on the Pubmed graph
In [9]:
    
from motifwalk.motifs import all_3
    
In [18]:
    
for m in all_3:
    motif = m.gt_motif
    text = motif.new_vertex_property("string")
    for n in motif.vertices():
        text[n] = str(n)
    graph_draw(m.gt_motif, vertex_text=text, output_size=(80,80))
    
In [10]:
    
feed_forward = all_3[9]
    
In [11]:
    
motif = feed_forward.gt_motif
text = motif.new_vertex_property("string")
for n in motif.vertices():
    text[n] = str(n)
graph_draw(motif, vertex_text=text, output_size=(100,100))
    
    
    Out[11]:
In [11]:
    
feed_forward.anchors = {1,2}
    
In [12]:
    
from motifwalk.motifs.analysis import construct_motif_graph
    
In [13]:
    
ff_pubmed = construct_motif_graph(pubmed, feed_forward)
    
In [14]:
    
ff_pubmed
    
    Out[14]:
In [16]:
    
%time graph_draw(ff_pubmed, output="pubmed_with_labels_feedforward.png", output_size=(1200,1200))
    
    
    
    Out[16]:
In [17]:
    
# Keep only vertices that take part in at least one motif edge.
vfilt = ff_pubmed.new_vertex_property('bool')
for v in ff_pubmed.vertices():
    vfilt[v] = v.out_degree() > 0
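
A vectorized version of the same filter, assuming a graph-tool release that provides Graph.get_vertices() and Graph.get_out_degrees():

# Mark, in one shot, the vertices that appear in at least one motif edge.
vfilt = ff_pubmed.new_vertex_property('bool')
vfilt.a = ff_pubmed.get_out_degrees(ff_pubmed.get_vertices()) > 0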
    
In [18]:
    
ff_pubmed_filtered = GraphView(ff_pubmed, vfilt)
    
In [19]:
    
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", output_size=(1200,1200))
    
    
    
    Out[19]:
In [20]:
    
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
    
In [23]:
    
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", vertex_fill_color=node_color, output_size=(1200,1200))
    
    
    
    Out[23]:
In [ ]: