In [1]:
    
import networkx as nx
from graph_tool.all import *
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import sys
    
    
In [3]:
    
with open('citeseer.data', 'rb') as f:
    citeseer = pickle.load(f)
    
In [4]:
    
citeseer.keys()
    
    Out[4]:
In [2]:
    
from motifwalk.utils.Graph import GraphContainer
    
In [3]:
    
from motifwalk.utils import find_meta, set_dataloc, get_metadata
    
In [4]:
    
set_dataloc(path_to_data=os.path.abspath('./'))
    
In [5]:
    
metadata = get_metadata()
    
In [9]:
    
citeseer_meta = find_meta('citeseer')
    
In [10]:
    
citeseer_pack = GraphContainer(citeseer_meta, dataloc=os.path.abspath('./'))
    
In [11]:
    
citeseer_pack.get_labels()
    
    Out[11]:
In [12]:
    
citeseer_gt = citeseer_pack.get_gt_graph()
    
In [43]:
    
# Note: this cell was re-executed (In [43]) after the `label` property map
# was created and filled in the cells below (In [38]-[41]).
graph_draw(citeseer_gt, vertex_fill_color=label, output="citeseer_with_labels.pdf")
    
    
    Out[43]:
In [15]:
    
citeseer_nx = citeseer_pack.get_graph()
    
In [38]:
    
label = citeseer_gt.new_vertex_property("int")
    
In [40]:
    
classes = np.argmax(citeseer_pack.get_labels(), axis=1)
    
In [41]:
    
for i in range(classes.size):
    label[i] = classes[i]
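
As a side note (not part of the original session), a graph-tool property map exposes its underlying array through `.a`, so the per-vertex copy above can be written as one vectorized assignment. This assumes, as the loop does, that vertex index i corresponds to row i of the label matrix:

# Sketch of a vectorized alternative to the loop above: write all class
# ids into the property map's array view at once.
label.a = classes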
    
In [44]:
    
bc_meta = find_meta('blogcatalog')
    
In [52]:
    
bc_pack = GraphContainer(bc_meta, dataloc=os.path.abspath('./'))
    
In [53]:
    
bc_gt = bc_pack.get_gt_graph()
    
In [54]:
    
label = bc_gt.new_vertex_property("int")
    
In [59]:
    
classes = np.argmax(bc_pack.get_labels().toarray(), axis=1)
    
In [60]:
    
for i in range(classes.size):
    label[i] = classes[i]
    
In [62]:
    
from time import time
t = time()
graph_draw(bc_gt, vertex_fill_color=label, output="blogcatalog_with_labels.pdf", output_size=(1200, 1200))
print(time()-t)
    
    
    
In [65]:
    
cora_meta = find_meta('cora')
    
In [66]:
    
cora_pack = GraphContainer(metadata=cora_meta, dataloc=os.path.abspath('./'))
    
In [67]:
    
labels = cora_pack.get_labels()
    
In [69]:
    
cora_gt = cora_pack.get_gt_graph()
    
In [70]:
    
classes = np.argmax(cora_pack.get_labels(), axis=1)
    
In [71]:
    
label = cora_gt.new_vertex_property("int")
    
In [72]:
    
for i in range(classes.size):
    label[i] = classes[i]
    
In [73]:
    
graph_draw(cora_gt, vertex_fill_color=label, output="cora_with_labels.pdf")
    
    
    Out[73]:
Preprocess Amazon co-purchasing data
The cells below read the raw com-amazon files (the top-5000 community list and the undirected edge list), remap node ids to contiguous integers, build a sparse node-by-community label matrix, and pickle the result as amazon.data.
In [2]:
    
aloc = "/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/amazon_copurchasing"
    
In [3]:
    
with open(aloc+'/com-amazon.top5000.cmty.txt') as f:
    top5k = f.read()
    
In [4]:
    
top5k = top5k.split('\n')
    
In [5]:
    
top5klist = [i.split('\t') for i in top5k]
    
In [6]:
    
len(max(top5klist, key=len))
    
    Out[6]:
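
A quick look at the community size distribution can be taken directly from top5klist (a sketch, not part of the original session; the trailing empty entry produced by the final newline is dropped):

# Number of communities plus min / max / mean community size.
sizes = [len(c) for c in top5klist[:-1]]
print(len(sizes), min(sizes), max(sizes), sum(sizes) / len(sizes))
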
In [49]:
    
top5klist[0]
    
    Out[49]:
In [30]:
    
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt')
    
In [8]:
    
amazon_nx.is_directed()
    
    Out[8]:
In [9]:
    
amazon_nx.size()
    
    Out[9]:
In [10]:
    
len(amazon_nx.nodes())
    
    Out[10]:
In [19]:
    
sorted_nodes_amazon = sorted(amazon_nx.nodes(), key=int)
    
In [20]:
    
map_amazon = {}
for i, node_id in enumerate(sorted_nodes_amazon):
    map_amazon[node_id] = i
    
In [21]:
    
len(map_amazon)
    
    Out[21]:
In [22]:
    
max(amazon_nx.nodes(), key=int)
    
    Out[22]:
In [23]:
    
map_amazon['548551']
    
    Out[23]:
In [36]:
    
amazon_nx
    
    Out[36]:
In [37]:
    
def amazon_type_map(s):
    # nodetype callable for read_edgelist: map a raw id string to its 0-based integer index.
    return map_amazon[s]
    
In [38]:
    
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt', nodetype=amazon_type_map)
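
As an aside, the second pass over the edge list could be avoided by relabeling the graph that was already loaded with string ids at In [30]. A sketch, where amazon_str_nx is a hypothetical name standing in for that earlier string-keyed graph (the variable was overwritten here):

# amazon_str_nx: the string-keyed graph from In [30] (name used for illustration).
# relabel_nodes returns a copy whose node ids are looked up in map_amazon.
amazon_int_nx = nx.relabel_nodes(amazon_str_nx, map_amazon)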
    
In [39]:
    
amazon_nx[0]
    
    Out[39]:
In [40]:
    
max(amazon_nx.nodes())
    
    Out[40]:
In [41]:
    
from scipy.sparse import csr_matrix
    
In [59]:
    
label_amazon = np.zeros(shape=(334863, 5000), dtype=np.int8)
    
In [60]:
    
# Mark membership of each node in each of the top-5000 communities.
# The last entry of top5klist is empty (trailing newline), so skip it.
for cmty, nodelist in enumerate(top5klist[:-1]):
    for node in nodelist:
        label_amazon[map_amazon[node]][cmty] = 1
    
In [63]:
    
label_amazon = csr_matrix(label_amazon, dtype=np.int8)
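
The dense 334863 x 5000 int8 buffer above takes roughly 1.6 GB before it is compressed. A lower-memory sketch (assuming top5klist and map_amazon as built above) constructs the sparse matrix directly from coordinate lists:

from scipy.sparse import coo_matrix

# Collect (row, column) coordinates of the 1-entries instead of allocating
# the full dense matrix.
rows, cols = [], []
for cmty, nodelist in enumerate(top5klist[:-1]):  # last entry is empty
    for node in nodelist:
        rows.append(map_amazon[node])
        cols.append(cmty)
ones = np.ones(len(rows), dtype=np.int8)
label_amazon_sparse = coo_matrix((ones, (rows, cols)),
                                 shape=(334863, 5000), dtype=np.int8).tocsr()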
    
In [64]:
    
label_amazon
    
    Out[64]:
In [70]:
    
label_amazon[4]
    
    Out[70]:
In [71]:
    
map_amazon['164985']
    
    Out[71]:
In [72]:
    
label_amazon[100150]
    
    Out[72]:
In [73]:
    
np.nonzero(label_amazon[100150])
    
    Out[73]:
In [74]:
    
amazon = {}
    
In [75]:
    
amazon['Labels'] = label_amazon
    
In [76]:
    
amazon['NXGraph'] = amazon_nx
    
In [77]:
    
with open('amazon.data', 'wb') as f:
    pickle.dump(amazon, f)
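
A quick round-trip check (not part of the original session) confirms that the pickle reloads with the expected shapes:

# Reload the pickle and make sure the label matrix and the graph agree
# on the node count.
with open('amazon.data', 'rb') as f:
    check = pickle.load(f)
assert check['Labels'].shape == (334863, 5000)
assert check['NXGraph'].number_of_nodes() == check['Labels'].shape[0]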
    
In [83]:
    
amazon_meta = find_meta('amazon')
    
In [84]:
    
amazon_pack = GraphContainer(metadata=amazon_meta, dataloc=os.path.abspath('./'))
    
In [85]:
    
amazon_gt = amazon_pack.get_gt_graph()
    
In [86]:
    
amazon_gt
    
    Out[86]:
In [88]:
    
%time graph_draw(amazon_gt, output="amazon_graph.pdf", output_size=(1200, 1200))
    
    
    
    Out[88]:
Preprocess Pubmed citation data
In [89]:
    
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab') as f:
    pubmed = f.read()
    
In [91]:
    
pubmed[0:100]
    
    Out[91]:
In [92]:
    
edges = pubmed.split('\n')
    
In [96]:
    
edges = edges[2:]  # drop the two header lines at the top of the .tab file
    
In [98]:
    
edges[0]
    
    Out[98]:
In [99]:
    
pubmed_graph = nx.DiGraph()
    
In [100]:
    
tuples = []
    
In [101]:
    
edges[-1]
    
    Out[101]:
In [105]:
    
ix, src, _, dst = edges[-2].split('\t')
    
In [109]:
    
edges[-2]
    
    Out[109]:
In [106]:
    
ix
    
    Out[106]:
In [107]:
    
src
    
    Out[107]:
In [108]:
    
dst
    
    Out[108]:
In [112]:
    
src_id = src.split(':')[-1]
    
In [113]:
    
src_id
    
    Out[113]:
In [114]:
    
# Each edge line is tab-separated: an index, "paper:<src_id>", a separator
# column, and "paper:<dst_id>"; keep only the numeric paper ids.
# The final empty entry from the trailing newline is skipped.
for e in edges[:-1]:
    idx, src, _, dst = e.split('\t')
    src_id = src.split(':')[-1]
    dst_id = dst.split(':')[-1]
    tuples.append((src_id, dst_id))
    
In [115]:
    
len(tuples)
    
    Out[115]:
In [116]:
    
pubmed_graph.add_edges_from(tuples)
    
In [117]:
    
pubmed_graph.is_directed()
    
    Out[117]:
In [118]:
    
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.NODE.paper.tab') as f:
    pubmed = f.read()
    
In [122]:
    
data = pubmed.split('\n')
    
In [123]:
    
data[0]
    
    Out[123]:
In [126]:
    
data[2]
    
    Out[126]:
In [129]:
    
template = data[1].split('\t')
    
In [130]:
    
template
    
    Out[130]:
In [131]:
    
kw_id_map = {}
    
In [132]:
    
# Template entries look like "<type>:<word>:<value>"; keep the middle token
# and give each word a consecutive integer id.
for i, words in enumerate(template[1:-1]):
    _, word, _ = words.split(':')
    kw_id_map[word] = i
    
In [134]:
    
kw_id_map['w-use']
    
    Out[134]:
In [135]:
    
pubmed_graph.nodes()[:10]
    
    Out[135]:
In [136]:
    
all_pubmed_nodes = sorted(pubmed_graph.nodes(), key=int)
    
In [138]:
    
all_pubmed_nodes[:5]
    
    Out[138]:
In [142]:
    
all_pubmed_nodes[-1]
    
    Out[142]:
In [143]:
    
len(all_pubmed_nodes)
    
    Out[143]:
In [139]:
    
map_pubmed = {}
for i, node_id in enumerate(all_pubmed_nodes):
    map_pubmed[node_id] = i
    
In [140]:
    
map_pubmed['29094']
    
    Out[140]:
In [144]:
    
def pubmed_type(node_id):
    return map_pubmed[node_id]
    
In [145]:
    
len(kw_id_map)
    
    Out[145]:
In [146]:
    
len(map_pubmed)
    
    Out[146]:
In [148]:
    
pubmed_features = np.zeros(shape=(19717, 500), dtype=np.float32)  # one row per paper, one column per vocabulary word
    
In [149]:
    
pubmed_labels = np.zeros(shape=(19717, 3), dtype=np.uint8)  # one-hot labels over the three classes
    
In [150]:
    
data[-1]
    
    Out[150]:
In [153]:
    
test = data[2]
    
In [156]:
    
node_id, *features_vec, _ = test.split('\t')
    
In [157]:
    
node_id
    
    Out[157]:
In [158]:
    
features_vec
    
    Out[158]:
In [161]:
    
# Each node line is tab-separated: a paper id, a "label=<k>" field, a
# variable number of "<word>=<weight>" feature fields, and a final summary
# field. The header lines and the trailing empty line are skipped.
for d in data[2:-1]:
    node_id, label, *feature_vec, summary = d.split('\t')
    int_id = map_pubmed[node_id]
    # Convert the 1-based class id into a 0-based one-hot column index.
    label = int(label.split('=')[-1]) - 1
    pubmed_labels[int_id][label] = 1
    # Fill in this paper's word-feature values.
    for f in feature_vec:
        word, val = f.split('=')
        feature_id = kw_id_map[word]
        pubmed_features[int_id][feature_id] = float(val)
    
In [162]:
    
map_pubmed["12187484"]
    
    Out[162]:
In [166]:
    
pubmed_labels[11943]
    
    Out[166]:
In [167]:
    
test_labels = np.sum(pubmed_labels, axis=1)  # should be exactly 1 per paper if every node received one label
    
In [172]:
    
np.count_nonzero(test_labels)
    
    Out[172]:
In [173]:
    
len(edges)
    
    Out[173]:
In [180]:
    
pubmed = nx.DiGraph()
    
In [181]:
    
# Rebuild the citation graph on the contiguous integer ids.
for t in pubmed_graph.edges():
    s, d = map(pubmed_type, t)
    pubmed.add_edge(s, d)
    
In [184]:
    
pubmed.size()
    
    Out[184]:
In [185]:
    
pubmed.number_of_nodes()
    
    Out[185]:
In [186]:
    
pubmed.number_of_edges()
    
    Out[186]:
In [187]:
    
pubmed.is_directed()
    
    Out[187]:
In [189]:
    
pubmed.edge[11943]
    
    Out[189]:
In [191]:
    
pubmed.in_edges(11943)
    
    Out[191]:
In [192]:
    
pubmed.out_edges(11943)
    
    Out[192]:
In [193]:
    
all_pubmed = {}
all_pubmed['NXGraph'] = pubmed
all_pubmed['Labels'] = pubmed_labels
all_pubmed['CSRFeatures'] = csr_matrix(pubmed_features)
    
In [194]:
    
all_pubmed['CSRFeatures']
    
    Out[194]:
In [196]:
    
with open('./pubmed.data', 'wb') as f:
    pickle.dump(all_pubmed, f)
    
In [6]:
    
pubmed_meta = find_meta('pubmed')
    
In [7]:
    
pubmed = GraphContainer(pubmed_meta, dataloc='.')
    
In [8]:
    
pubmed_gt = pubmed.get_gt_graph()
    
In [10]:
    
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
    
In [13]:
    
%time graph_draw(pubmed_gt, vertex_fill_color=node_color, output="pubmed_with_labels.png", output_size=(1200,1200))
    
    
    
    Out[13]:
Motif analysis on the Pubmed graph
In [9]:
    
from motifwalk.motifs import all_3
    
In [18]:
    
for m in all_3:
    motif = m.gt_motif
    text = motif.new_vertex_property("string")
    for n in motif.vertices():
        text[n] = str(n)
    graph_draw(m.gt_motif, vertex_text=text, output_size=(80,80))
    
In [10]:
    
feed_forward = all_3[9]
    
In [11]:
    
motif = feed_forward.gt_motif
text = motif.new_vertex_property("string")
for n in motif.vertices():
    text[n] = str(n)
graph_draw(motif, vertex_text=text, output_size=(100,100))
    
    
    Out[11]:
In [11]:
    
feed_forward.anchors = {1,2}
    
In [12]:
    
from motifwalk.motifs.analysis import construct_motif_graph
    
In [13]:
    
ff_pubmed = construct_motif_graph(pubmed, feed_forward)
    
In [14]:
    
ff_pubmed
    
    Out[14]:
In [16]:
    
%time graph_draw(ff_pubmed, output="pubmed_with_labels_feedforward.png", output_size=(1200,1200))
    
    
    
    Out[16]:
In [17]:
    
# Keep only vertices that take part in at least one motif edge.
vfilt = ff_pubmed.new_vertex_property('bool')
for v in ff_pubmed.vertices():
    vfilt[v] = v.out_degree() > 0
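
A vectorized version of the same filter, assuming a graph-tool release that provides Graph.get_vertices() and Graph.get_out_degrees():

# Mark, in one shot, the vertices that appear in at least one motif edge.
vfilt = ff_pubmed.new_vertex_property('bool')
vfilt.a = ff_pubmed.get_out_degrees(ff_pubmed.get_vertices()) > 0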
    
In [18]:
    
ff_pubmed_filtered = GraphView(ff_pubmed, vfilt)
    
In [19]:
    
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", output_size=(1200,1200))
    
    
    
    Out[19]:
In [20]:
    
node_color = pubmed_gt.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)
    
In [23]:
    
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", vertex_fill_color=node_color, output_size=(1200,1200))
    
    
    
    Out[23]:
In [ ]: