In [1]:
import networkx as nx
from graph_tool.all import *
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import sys


/home/gear/anaconda3/envs/network/lib/python3.6/site-packages/graph_tool/draw/cairo_draw.py:1480: RuntimeWarning: Error importing Gtk module: No module named 'gi'; GTK+ drawing will not work.
  warnings.warn(msg, RuntimeWarning)

In [3]:
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open 'citeseer.data' when the file comes from a trusted source.
with open('citeseer.data', 'rb') as f:
    citeseer = pickle.load(f)

In [4]:
citeseer.keys()


Out[4]:
dict_keys(['Labels', 'NXGraph', 'CSRFeatures'])

In [2]:
from motifwalk.utils.Graph import GraphContainer

In [3]:
from motifwalk.utils import find_meta, set_dataloc, get_metadata

In [4]:
set_dataloc(path_to_data=os.path.abspath('./'))

In [5]:
metadata = get_metadata()

In [9]:
citeseer_meta = find_meta('citeseer')

In [10]:
citeseer_pack = GraphContainer(citeseer_meta, dataloc=os.path.abspath('./'))

In [11]:
citeseer_pack.get_labels()


Out[11]:
array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [12]:
citeseer_gt = citeseer_pack.get_gt_graph()

In [43]:
# Build the class-index vertex property inside this cell so the drawing does
# not depend on a `label` variable that no earlier cell has created.
citeseer_classes = np.argmax(citeseer_pack.get_labels(), axis=1)
citeseer_label = citeseer_gt.new_vertex_property("int")
for vertex_index, class_id in enumerate(citeseer_classes):
    citeseer_label[vertex_index] = class_id
graph_draw(citeseer_gt, vertex_fill_color=citeseer_label, output="citeseer_with_labels.pdf")


Out[43]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f1180268a20, at 0x7f113a671470>

In [15]:
citeseer_nx = citeseer_pack.get_graph()

In [38]:
label = citeseer_gt.new_vertex_property("int")

In [40]:
classes = np.argmax(citeseer_pack.get_labels(), axis=1)

In [41]:
# Copy each vertex's class index into the vertex property, one vertex at a time.
for vertex_index, class_id in enumerate(classes):
    label[vertex_index] = class_id

In [44]:
bc_meta = find_meta('blogcatalog')

In [52]:
bc_pack = GraphContainer(bc_meta, dataloc=os.path.abspath('./'))

In [53]:
bc_gt = bc_pack.get_gt_graph()

In [54]:
label = bc_gt.new_vertex_property("int")

In [59]:
classes = np.argmax(bc_pack.get_labels().toarray(), axis=1)

In [60]:
# Copy each vertex's class index into the vertex property, one vertex at a time.
for vertex_index, class_id in enumerate(classes):
    label[vertex_index] = class_id

In [62]:
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time() follows the wall clock and can jump if it is adjusted.
from time import perf_counter
t = perf_counter()
graph_draw(bc_gt, vertex_fill_color=label, output="blogcatalog_with_labels.pdf", output_size=(1200, 1200))
print(perf_counter()-t)


91.13800048828125

In [65]:
cora_meta = find_meta('cora')

In [66]:
cora_pack = GraphContainer(metadata=cora_meta, dataloc=os.path.abspath('./'))

In [67]:
labels = cora_pack.get_labels()

In [69]:
cora_gt = cora_pack.get_gt_graph()

In [70]:
classes = np.argmax(cora_pack.get_labels(), axis=1)

In [71]:
label = cora_gt.new_vertex_property("int")

In [72]:
# Copy each vertex's class index into the vertex property, one vertex at a time.
for vertex_index, class_id in enumerate(classes):
    label[vertex_index] = class_id

In [73]:
graph_draw(cora_gt, vertex_fill_color=label, output="cora_with_labels.pdf")


Out[73]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f1139e06cf8, at 0x7f1139e07c18>

Preprocess Amazon co-purchasing data


In [2]:
aloc = "/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/amazon_copurchasing"

In [3]:
with open(aloc+'/com-amazon.top5000.cmty.txt') as f:
    top5k = f.read()

In [4]:
top5k = top5k.split('\n')

In [5]:
top5klist = [i.split('\t') for i in top5k]

In [6]:
len(max(top5klist, key=len))


Out[6]:
328

In [49]:
top5klist[0]


Out[49]:
['164985', '225214', '232761']

In [30]:
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt')

In [8]:
amazon_nx.is_directed()


Out[8]:
False

In [9]:
amazon_nx.size()


Out[9]:
925872

In [10]:
len(amazon_nx.nodes())


Out[10]:
334863

In [19]:
sorted_nodes_amazon = sorted(amazon_nx.nodes(), key=int)

In [20]:
# Map every original Amazon node id to its position in the numerically sorted
# node list, giving contiguous indices that start at 0.
map_amazon = {node_id: rank for rank, node_id in enumerate(sorted_nodes_amazon)}

In [21]:
len(map_amazon)


Out[21]:
334863

In [22]:
max(amazon_nx.nodes(), key=int)


Out[22]:
'548551'

In [23]:
map_amazon['548551']


Out[23]:
334862

In [36]:
amazon_nx


Out[36]:
False

In [37]:
def amazon_type_map(s):
    """Return the integer index stored in `map_amazon` for the node id `s`.

    Raises KeyError when `s` is not a key of `map_amazon`.
    """
    node_index = map_amazon[s]
    return node_index

In [38]:
amazon_nx = nx.read_edgelist('./raw/amazon_copurchasing/com-amazon.ungraph.txt', nodetype=amazon_type_map)

In [39]:
amazon_nx[0]


Out[39]:
{53525: {},
 71631: {},
 98005: {},
 148223: {},
 209319: {},
 268298: {},
 270059: {},
 302147: {}}

In [40]:
max(amazon_nx.nodes())


Out[40]:
334862

In [41]:
from scipy.sparse import csr_matrix

In [59]:
# One row per remapped node (len(map_amazon)) instead of a hard-coded node
# count; one column for each of the 5000 communities in the top-5000 file.
label_amazon = np.zeros(shape=(len(map_amazon), 5000), dtype=np.int8)

In [60]:
# Mark community membership: row = remapped node index, column = position of
# the community's line in top5klist. The final entry of top5klist is skipped
# (presumably the empty line after the file's trailing newline).
for community_index, members in enumerate(top5klist[:-1]):
    for raw_node_id in members:
        label_amazon[map_amazon[raw_node_id], community_index] = 1

In [63]:
label_amazon = csr_matrix(label_amazon, dtype=np.int8)

In [64]:
label_amazon


Out[64]:
<334863x5000 sparse matrix of type '<class 'numpy.int8'>'
	with 67462 stored elements in Compressed Sparse Row format>

In [70]:
label_amazon[4]


Out[70]:
(array([], dtype=int32), array([], dtype=int32))

In [71]:
map_amazon['164985']


Out[71]:
100150

In [72]:
label_amazon[100150]


Out[72]:
<1x5000 sparse matrix of type '<class 'numpy.int8'>'
	with 2 stored elements in Compressed Sparse Row format>

In [73]:
np.nonzero(label_amazon[100150])


Out[73]:
(array([0, 0], dtype=int32), array([  0, 820], dtype=int32))

In [74]:
amazon = {}

In [75]:
amazon['Labels'] = label_amazon

In [76]:
amazon['NXGraph'] = amazon_nx

In [77]:
with open('amazon.data', 'wb') as f:
    pickle.dump(amazon, f)

In [83]:
amazon_meta = find_meta('amazon')

In [84]:
amazon_pack = GraphContainer(metadata=amazon_meta, dataloc=os.path.abspath('./'))

In [85]:
amazon_gt = amazon_pack.get_gt_graph()

In [86]:
amazon_gt


Out[86]:
<Graph object, undirected, with 334863 vertices and 925872 edges at 0x7fd7e9a0acf8>

In [88]:
%time graph_draw(amazon_gt, output="amazon_graph.pdf", output_size=(1200, 1200))


CPU times: user 2h 33min 2s, sys: 57.1 s, total: 2h 33min 59s
Wall time: 25min 38s
Out[88]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7fd7e9a0acf8, at 0x7fd7dcb23400>

In [89]:
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.DIRECTED.cites.tab') as f:
    pubmed = f.read()

In [91]:
pubmed[0:100]


Out[91]:
'DIRECTED\tcites\nNO_FEATURES\n33824\tpaper:19127292\t|\tpaper:17363749\n37511\tpaper:19668377\t|\tpaper:172938'

In [92]:
edges = pubmed.split('\n')

In [96]:
edges = edges[2:]

In [98]:
edges[0]


Out[98]:
'33824\tpaper:19127292\t|\tpaper:17363749'

In [99]:
pubmed_graph = nx.DiGraph()

In [100]:
tuples = []

In [101]:
edges[-1]


Out[101]:
''

In [105]:
ix, src, _ ,dst = edges[-2].split('\t')

In [109]:
edges[-2]


Out[109]:
'39746\tpaper:2384600\t|\tpaper:2662016'

In [106]:
ix


Out[106]:
'39746'

In [107]:
src


Out[107]:
'paper:2384600'

In [108]:
dst


Out[108]:
'paper:2662016'

In [112]:
src_id = src.split(':')[-1]

In [113]:
src_id


Out[113]:
'2384600'

In [114]:
# Rebuild `tuples` from scratch on every run so that re-executing this cell
# cannot append the same edges a second time.
# Each line looks like '<index>\tpaper:<src>\t|\tpaper:<dst>'; the final
# element of `edges` is skipped because it is the empty string that follows
# the file's last newline.
tuples = [
    (src.split(':')[-1], dst.split(':')[-1])
    for _idx, src, _sep, dst in (line.split('\t') for line in edges[:-1])
]

In [115]:
len(tuples)


Out[115]:
44338

In [116]:
pubmed_graph.add_edges_from(tuples)

In [117]:
pubmed_graph.is_directed()


Out[117]:
True

In [118]:
with open('/home/gear/Dropbox/CompletedProjects/motifwalk/data/raw/pubmed/Pubmed-Diabetes.NODE.paper.tab') as f:
    pubmed = f.read()

In [122]:
data = pubmed.split('\n')

In [123]:
data[0]


Out[123]:
'NODE\tpaper'

In [126]:
data[2]


Out[126]:
'12187484\tlabel=1\tw-rat=0.09393489570187145\tw-common=0.028698458467273157\tw-use=0.01176012652514843\tw-examin=0.019375414753592942\tw-pathogenesi=0.06316131961800078\tw-retinopathi=0.17089058531360632\tw-mous=0.06770248034355311\tw-studi=0.017554610474374233\tw-anim=0.09840151241009497\tw-model=0.06269133038832954\tw-metabol=0.06232233318170418\tw-abnorm=0.11247870345628387\tw-contribut=0.02534773765067718\tw-develop=0.030388826051908086\tw-investig=0.02014612607562432\tw-mice=0.12119873074191996\tw-2=0.020571546813213402\tw-month=0.10361986739277738\tw-compar=0.02367140886552208\tw-obtain=0.03061978039959059\tw-method=0.014469342700659771\tw-induc=0.023516442702830022\tw-6=0.014872498687869398\tw-inject=0.028054999329982466\tw-experiment=0.06866787644053303\tw-normal=0.01777754779525323\tw-diet=0.031956203604979944\tw-30=0.02512131278693402\tw-hyperglycemia=0.02896081409449482\tw-level=0.03654889376239291\tw-lipid=0.030348254033687905\tw-oxid=0.09357481262838539\tw-activ=0.03623879368519283\tw-protein=0.022816081905882666\tw-kinas=0.04216587194300068\tw-c=0.031475602330090724\tw-measur=0.015735336508945104\tw-result=0.0075446006836769695\tw-increas=0.008769967077523864\tw-retin=0.04575957596508121\tw-stress=0.03732992842799811\tw-3=0.01261883005795486\tw-similar=0.01996113997855104\tw-observ=0.01828742887023866\tw-conclus=0.012866895687595546\tw-play=0.03099778146368732\tw-import=0.023158771568589955\tw-role=0.021716016285633605\tw-present=0.020784310286111652\tsummary=w-rat,w-common,w-use,w-examin,w-pathogenesi,w-retinopathi,w-mous,w-studi,w-anim,w-model,w-metabol,w-abnorm,w-contribut,w-develop,w-investig,w-mice,w-2,w-month,w-compar,w-obtain,w-method,w-induc,w-6,w-inject,w-experiment,w-normal,w-diet,w-30,w-hyperglycemia,w-level,w-lipid,w-oxid,w-activ,w-protein,w-kinas,w-c,w-measur,w-result,w-increas,w-retin,w-stress,w-3,w-similar,w-observ,w-conclus,w-play,w-import,w-role,w-present'

In [129]:
template = data[1].split('\t')

In [130]:
template


Out[130]:
['cat=1,2,3:label',
 'numeric:w-rat:0.0',
 'numeric:w-common:0.0',
 'numeric:w-use:0.0',
 'numeric:w-examin:0.0',
 'numeric:w-pathogenesi:0.0',
 'numeric:w-retinopathi:0.0',
 'numeric:w-mous:0.0',
 'numeric:w-studi:0.0',
 'numeric:w-anim:0.0',
 'numeric:w-model:0.0',
 'numeric:w-metabol:0.0',
 'numeric:w-abnorm:0.0',
 'numeric:w-contribut:0.0',
 'numeric:w-develop:0.0',
 'numeric:w-investig:0.0',
 'numeric:w-mice:0.0',
 'numeric:w-2:0.0',
 'numeric:w-month:0.0',
 'numeric:w-compar:0.0',
 'numeric:w-obtain:0.0',
 'numeric:w-method:0.0',
 'numeric:w-induc:0.0',
 'numeric:w-6:0.0',
 'numeric:w-inject:0.0',
 'numeric:w-experiment:0.0',
 'numeric:w-normal:0.0',
 'numeric:w-diet:0.0',
 'numeric:w-30:0.0',
 'numeric:w-hyperglycemia:0.0',
 'numeric:w-level:0.0',
 'numeric:w-lipid:0.0',
 'numeric:w-oxid:0.0',
 'numeric:w-activ:0.0',
 'numeric:w-protein:0.0',
 'numeric:w-kinas:0.0',
 'numeric:w-c:0.0',
 'numeric:w-measur:0.0',
 'numeric:w-result:0.0',
 'numeric:w-increas:0.0',
 'numeric:w-retin:0.0',
 'numeric:w-stress:0.0',
 'numeric:w-3:0.0',
 'numeric:w-similar:0.0',
 'numeric:w-observ:0.0',
 'numeric:w-conclus:0.0',
 'numeric:w-play:0.0',
 'numeric:w-import:0.0',
 'numeric:w-role:0.0',
 'numeric:w-present:0.0',
 'numeric:w-p:0.0',
 'numeric:w-m:0.0',
 'numeric:w-r:0.0',
 'numeric:w-muscl:0.0',
 'numeric:w-control:0.0',
 'numeric:w-chang:0.0',
 'numeric:w-dure:0.0',
 'numeric:w-lower:0.0',
 'numeric:w-higher:0.0',
 'numeric:w-mass:0.0',
 'numeric:w-correl:0.0',
 'numeric:w-decreas:0.0',
 'numeric:w-determin:0.0',
 'numeric:w-concentr:0.0',
 'numeric:w-stimul:0.0',
 'numeric:w-period:0.0',
 'numeric:w-caus:0.0',
 'numeric:w-mark:0.0',
 'numeric:w-group:0.0',
 'numeric:w-evid:0.0',
 'numeric:w-fast:0.0',
 'numeric:w-type:0.0',
 'numeric:w-signific:0.0',
 'numeric:w-differ:0.0',
 'numeric:w-ratio:0.0',
 'numeric:w-suggest:0.0',
 'numeric:w-degre:0.0',
 'numeric:w-occur:0.0',
 'numeric:w-vivo:0.0',
 'numeric:w-respect:0.0',
 'numeric:w-dysfunct:0.0',
 'numeric:w-region:0.0',
 'numeric:w-high:0.0',
 'numeric:w-appear:0.0',
 'numeric:w-sever:0.0',
 'numeric:w-affect:0.0',
 'numeric:w-cardiovascular:0.0',
 'numeric:w-complic:0.0',
 'numeric:w-primari:0.0',
 'numeric:w-death:0.0',
 'numeric:w-patient:0.0',
 'numeric:w-clinic:0.0',
 'numeric:w-suscept:0.0',
 'numeric:w-cardiac:0.0',
 'numeric:w-tissu:0.0',
 'numeric:w-specif:0.0',
 'numeric:w-function:0.0',
 'numeric:w-defect:0.0',
 'numeric:w-possibl:0.0',
 'numeric:w-indic:0.0',
 'numeric:w-state:0.0',
 'numeric:w-onli:0.0',
 'numeric:w-bodi:0.0',
 'numeric:w-weight:0.0',
 'numeric:w-loss:0.0',
 'numeric:w-valu:0.0',
 'numeric:w-howev:0.0',
 'numeric:w-4:0.0',
 'numeric:w-condit:0.0',
 'numeric:w-durat:0.0',
 'numeric:w-8:0.0',
 'numeric:w-week:0.0',
 'numeric:w-onset:0.0',
 'numeric:w-data:0.0',
 'numeric:w-direct:0.0',
 'numeric:w-report:0.0',
 'numeric:w-provid:0.0',
 'numeric:w-addit:0.0',
 'numeric:w-evalu:0.0',
 'numeric:w-sensit:0.0',
 'numeric:w-heart:0.0',
 'numeric:w-object:0.0',
 'numeric:w-mean:0.0',
 'numeric:w-blood:0.0',
 'numeric:w-glucos:0.0',
 'numeric:w-strong:0.0',
 'numeric:w-hba:0.0',
 'numeric:w-1c:0.0',
 'numeric:w-a1c:0.0',
 'numeric:w-variabl:0.0',
 'numeric:w-independ:0.0',
 'numeric:w-assess:0.0',
 'numeric:w-relat:0.0',
 'numeric:w-trial:0.0',
 'numeric:w-research:0.0',
 'numeric:w-design:0.0',
 'numeric:w-profil:0.0',
 'numeric:w-sampl:0.0',
 'numeric:w-particip:0.0',
 'numeric:w-n:0.0',
 'numeric:w-1:0.0',
 'numeric:w-consist:0.0',
 'numeric:w-befor:0.0',
 'numeric:w-min:0.0',
 'numeric:w-predict:0.0',
 'numeric:w-adjust:0.0',
 'numeric:w-sex:0.0',
 'numeric:w-treatment:0.0',
 'numeric:w-7:0.0',
 'numeric:w-gt:0.0',
 'numeric:w-0:0.0',
 'numeric:w-larg:0.0',
 'numeric:w-influenc:0.0',
 'numeric:w-base:0.0',
 'numeric:w-standard:0.0',
 'numeric:w-14:0.0',
 'numeric:w-10:0.0',
 'numeric:w-wherea:0.0',
 'numeric:w-enhanc:0.0',
 'numeric:w-manag:0.0',
 'numeric:w-day:0.0',
 'numeric:w-secret:0.0',
 'numeric:w-cholesterol:0.0',
 'numeric:w-insulin:0.0',
 'numeric:w-24:0.0',
 'numeric:w-h:0.0',
 'numeric:w-low:0.0',
 'numeric:w-rate:0.0',
 'numeric:w-fatti:0.0',
 'numeric:w-acid:0.0',
 'numeric:w-effect:0.0',
 'numeric:w-hormon:0.0',
 'numeric:w-hepat:0.0',
 'numeric:w-contrast:0.0',
 'numeric:w-product:0.0',
 'numeric:w-major:0.0',
 'numeric:w-plasma:0.0',
 'numeric:w-current:0.0',
 'numeric:w-flow:0.0',
 'numeric:w-chronic:0.0',
 'numeric:w-mechan:0.0',
 'numeric:w-test:0.0',
 'numeric:w-therefor:0.0',
 'numeric:w-analys:0.0',
 'numeric:w-mrna:0.0',
 'numeric:w-streptozotocin:0.0',
 'numeric:w-did:0.0',
 'numeric:w-15:0.0',
 'numeric:w-g:0.0',
 'numeric:w-25:0.0',
 'numeric:w-mmol:0.0',
 'numeric:w-l:0.0',
 'numeric:w-5:0.0',
 'numeric:w-reduc:0.0',
 'numeric:w-number:0.0',
 'numeric:w-densiti:0.0',
 'numeric:w-posit:0.0',
 'numeric:w-cell:0.0',
 'numeric:w-17:0.0',
 'numeric:w-mm:0.0',
 'numeric:w-18:0.0',
 'numeric:w-induct:0.0',
 'numeric:w-associ:0.0',
 'numeric:w-express:0.0',
 'numeric:w-glycem:0.0',
 'numeric:w-respons:0.0',
 'numeric:w-therapi:0.0',
 'numeric:w-random:0.0',
 'numeric:w-initi:0.0',
 'numeric:w-ani:0.0',
 'numeric:w-singl:0.0',
 'numeric:w-new:0.0',
 'numeric:w-agent:0.0',
 'numeric:w-metformin:0.0',
 'numeric:w-medic:0.0',
 'numeric:w-glycosyl:0.0',
 'numeric:w-hemoglobin:0.0',
 'numeric:w-analysi:0.0',
 'numeric:w-baselin:0.0',
 'numeric:w-health:0.0',
 'numeric:w-factor:0.0',
 'numeric:w-process:0.0',
 'numeric:w-care:0.0',
 'numeric:w-9:0.0',
 'numeric:w-01:0.0',
 'numeric:w-95:0.0',
 'numeric:w-interv:0.0',
 'numeric:w-ci:0.0',
 'numeric:w-12:0.0',
 'numeric:w-reduct:0.0',
 'numeric:w-achiev:0.0',
 'numeric:w-target:0.0',
 'numeric:w-lt:0.0',
 'numeric:w-diseas:0.0',
 'numeric:w-class:0.0',
 'numeric:w-age:0.0',
 'numeric:w-obes:0.0',
 'numeric:w-renal:0.0',
 'numeric:w-improv:0.0',
 'numeric:w-progress:0.0',
 'numeric:w-noninsulindepend:0.0',
 'numeric:w-mellitus:0.0',
 'numeric:w-becaus:0.0',
 'numeric:w-s:0.0',
 'numeric:w-index:0.0',
 'numeric:w-hypertens:0.0',
 'numeric:w-need:0.0',
 'numeric:w-followup:0.0',
 'numeric:w-year:0.0',
 'numeric:w-mg:0.0',
 'numeric:w-dl:0.0',
 'numeric:w-remain:0.0',
 'numeric:w-subject:0.0',
 'numeric:w-treat:0.0',
 'numeric:w-oral:0.0',
 'numeric:w-requir:0.0',
 'numeric:w-0001:0.0',
 'numeric:w-mortal:0.0',
 'numeric:w-includ:0.0',
 'numeric:w-vs:0.0',
 'numeric:w-background:0.0',
 'numeric:w-poor:0.0',
 'numeric:w-drug:0.0',
 'numeric:w-13:0.0',
 'numeric:w-rang:0.0',
 'numeric:w-combin:0.0',
 'numeric:w-intervent:0.0',
 'numeric:w-daili:0.0',
 'numeric:w-dose:0.0',
 'numeric:w-100:0.0',
 'numeric:w-toler:0.0',
 'numeric:w-receiv:0.0',
 'numeric:w-11:0.0',
 'numeric:w-postprandi:0.0',
 'numeric:w-kg:0.0',
 'numeric:w-hypoglycemia:0.0',
 'numeric:w-frequent:0.0',
 'numeric:w-event:0.0',
 'numeric:w-versus:0.0',
 'numeric:w-symptom:0.0',
 'numeric:w-incid:0.0',
 'numeric:w-parent:0.0',
 'numeric:w-complex:0.0',
 'numeric:w-longterm:0.0',
 'numeric:w-inhibitor:0.0',
 'numeric:w-peripher:0.0',
 'numeric:w-nerv:0.0',
 'numeric:w-stz:0.0',
 'numeric:w-conduct:0.0',
 'numeric:w-demonstr:0.0',
 'numeric:w-frequenc:0.0',
 'numeric:w-inhibit:0.0',
 'numeric:w-neuropathi:0.0',
 'numeric:w-pathway:0.0',
 'numeric:w-shown:0.0',
 'numeric:w-time:0.0',
 'numeric:w-ii:0.0',
 'numeric:w-individu:0.0',
 'numeric:w-adult:0.0',
 'numeric:w-50:0.0',
 'numeric:w-60:0.0',
 'numeric:w-diagnosi:0.0',
 'numeric:w-healthi:0.0',
 'numeric:w-follow:0.0',
 'numeric:w-young:0.0',
 'numeric:w-seen:0.0',
 'numeric:w-alter:0.0',
 'numeric:w-gene:0.0',
 'numeric:w-e:0.0',
 'numeric:w-identifi:0.0',
 'numeric:w-previous:0.0',
 'numeric:w-mediat:0.0',
 'numeric:w-vascular:0.0',
 'numeric:w-lipoprotein:0.0',
 'numeric:w-involv:0.0',
 'numeric:w-phenotyp:0.0',
 'numeric:w-confirm:0.0',
 'numeric:w-variant:0.0',
 'numeric:w-endotheli:0.0',
 'numeric:w-potenti:0.0',
 'numeric:w-disord:0.0',
 'numeric:w-popul:0.0',
 'numeric:w-nonobes:0.0',
 'numeric:w-aim:0.0',
 'numeric:w-serum:0.0',
 'numeric:w-hba1c:0.0',
 'numeric:w-hypoglycaemia:0.0',
 'numeric:w-continu:0.0',
 'numeric:w-case:0.0',
 'numeric:w-impair:0.0',
 'numeric:w-risk:0.0',
 'numeric:w-known:0.0',
 'numeric:w-men:0.0',
 'numeric:w-women:0.0',
 'numeric:w-40:0.0',
 'numeric:w-complet:0.0',
 'numeric:w-estim:0.0',
 'numeric:w-like:0.0',
 'numeric:w-particular:0.0',
 'numeric:w-human:0.0',
 'numeric:w-character:0.0',
 'numeric:w-elev:0.0',
 'numeric:w-synthesi:0.0',
 'numeric:w-greater:0.0',
 'numeric:w-small:0.0',
 'numeric:w-reveal:0.0',
 'numeric:w-liver:0.0',
 'numeric:w-niddm:0.0',
 'numeric:w-genet:0.0',
 'numeric:w-receptor:0.0',
 'numeric:w-growth:0.0',
 'numeric:w-pancreat:0.0',
 'numeric:w-betacel:0.0',
 'numeric:w-molecul:0.0',
 'numeric:w-enzym:0.0',
 'numeric:w-regul:0.0',
 'numeric:w-polymorph:0.0',
 'numeric:w-total:0.0',
 'numeric:w-allel:0.0',
 'numeric:w-02:0.0',
 'numeric:w-resist:0.0',
 'numeric:w-cpeptid:0.0',
 'numeric:w-hypothesi:0.0',
 'numeric:w-perform:0.0',
 'numeric:w-score:0.0',
 'numeric:w-001:0.0',
 'numeric:w-05:0.0',
 'numeric:w-histori:0.0',
 'numeric:w-action:0.0',
 'numeric:w-approxim:0.0',
 'numeric:w-suppress:0.0',
 'numeric:w-glucagon:0.0',
 'numeric:w-ml:0.0',
 'numeric:w-x:0.0',
 'numeric:w-free:0.0',
 'numeric:w-peopl:0.0',
 'numeric:w-uptak:0.0',
 'numeric:w-intens:0.0',
 'numeric:w-relationship:0.0',
 'numeric:w-prevent:0.0',
 'numeric:w-autoimmun:0.0',
 'numeric:w-recent:0.0',
 'numeric:w-preval:0.0',
 'numeric:w-nondiabet:0.0',
 'numeric:w-genotyp:0.0',
 'numeric:w-conclud:0.0',
 'numeric:w-linkag:0.0',
 'numeric:w-islet:0.0',
 'numeric:w-peptid:0.0',
 'numeric:w-form:0.0',
 'numeric:w-membran:0.0',
 'numeric:w-transgen:0.0',
 'numeric:w-failur:0.0',
 'numeric:w-isol:0.0',
 'numeric:w-negat:0.0',
 'numeric:w-earli:0.0',
 'numeric:w-famili:0.0',
 'numeric:w-chromosom:0.0',
 'numeric:w-immun:0.0',
 'numeric:w-support:0.0',
 'numeric:w-16:0.0',
 'numeric:w-cohort:0.0',
 'numeric:w-insulindepend:0.0',
 'numeric:w-outcom:0.0',
 'numeric:w-screen:0.0',
 'numeric:w-approach:0.0',
 'numeric:w-infus:0.0',
 'numeric:w-multipl:0.0',
 'numeric:w-depend:0.0',
 'numeric:w-physic:0.0',
 'numeric:w-transport:0.0',
 'numeric:w-acut:0.0',
 'numeric:w-releas:0.0',
 'numeric:w-presenc:0.0',
 'numeric:w-glycaem:0.0',
 'numeric:w-male:0.0',
 'numeric:w-antibodi:0.0',
 'numeric:w-femal:0.0',
 'numeric:w-pattern:0.0',
 'numeric:w-t2dm:0.0',
 'numeric:w-promot:0.0',
 'numeric:w-fat:0.0',
 'numeric:w-d:0.0',
 'numeric:w-bmi:0.0',
 'numeric:w-haplotyp:0.0',
 'numeric:w-triglycerid:0.0',
 'numeric:w-interact:0.0',
 'numeric:w-marker:0.0',
 'numeric:w-describ:0.0',
 'numeric:w-area:0.0',
 'numeric:w-20:0.0',
 'numeric:w-cytokin:0.0',
 'numeric:w-bind:0.0',
 'numeric:w-bb:0.0',
 'numeric:w-alpha:0.0',
 'numeric:w-beta:0.0',
 'numeric:w-cd4:0.0',
 'numeric:w-spontan:0.0',
 'numeric:w-given:0.0',
 'numeric:w-vitro:0.0',
 'numeric:w-basal:0.0',
 'numeric:w-protect:0.0',
 'numeric:w-pressur:0.0',
 'numeric:w-detect:0.0',
 'numeric:w-exercis:0.0',
 'numeric:w-children:0.0',
 'numeric:w-adolesc:0.0',
 'numeric:w-life:0.0',
 'numeric:w-b:0.0',
 'numeric:w-antigen:0.0',
 'numeric:w-iddm:0.0',
 'numeric:w-american:0.0',
 'numeric:w-hla:0.0',
 'numeric:w-arteri:0.0',
 'numeric:w-nephropathi:0.0',
 'numeric:w-review:0.0',
 'numeric:w-destruct:0.0',
 'numeric:w-content:0.0',
 'numeric:w-autoantibodi:0.0',
 'numeric:w-dm:0.0',
 'numeric:w-select:0.0',
 'numeric:w-infect:0.0',
 'numeric:w-recipi:0.0',
 'numeric:w-intak:0.0',
 'numeric:w-placebo:0.0',
 'numeric:w-db:0.0',
 'numeric:w-pancrea:0.0',
 'numeric:w-diagnos:0.0',
 'numeric:w-glomerular:0.0',
 'numeric:w-albumin:0.0',
 'numeric:w-excret:0.0',
 'numeric:w-syndrom:0.0',
 'numeric:w-t:0.0',
 'numeric:w-lymphocyt:0.0',
 'numeric:w-produc:0.0',
 'numeric:w-coronari:0.0',
 'numeric:w-status:0.0',
 'numeric:w-microalbuminuria:0.0',
 'numeric:w-nod:0.0',
 'numeric:w-mhc:0.0',
 'numeric:w-insul:0.0',
 'numeric:w-administr:0.0',
 'numeric:w-revers:0.0',
 'numeric:w-transplant:0.0',
 'numeric:w-graft:0.0',
 'numeric:w-t1d:0.0',
 'numeric:w-lead:0.0',
 'numeric:w-v:0.0',
 'numeric:w-dietari:0.0',
 'numeric:w-general:0.0',
 'numeric:w-macrophag:0.0',
 'numeric:w-kidney:0.0',
 'numeric:w-urinari:0.0',
 'numeric:w-myocardi:0.0',
 'numeric:w-meal:0.0',
 'numeric:w-ica:0.0',
 'numeric:w-locus:0.0',
 'numeric:w-tcell:0.0',
 'numeric:w-depress:0.0',
 'numeric:w-bone:0.0',
 'numeric:w-mutat:0.0',
 'string:summary']

In [131]:
kw_id_map = {}

In [132]:
# Number the feature columns in template order. The slice leaves out the
# first and the last template entries.
for feature_index, column_spec in enumerate(template[1:-1]):
    _kind, keyword, _default = column_spec.split(':')
    kw_id_map[keyword] = feature_index

In [134]:
kw_id_map['w-use']


Out[134]:
2

In [135]:
pubmed_graph.nodes()[:10]


Out[135]:
['19127292',
 '17363749',
 '19668377',
 '17293876',
 '1313726',
 '3002783',
 '19110882',
 '14578298',
 '18606979',
 '10333910']

In [136]:
all_pubmed_nodes = sorted(pubmed_graph.nodes(), key=int)

In [138]:
all_pubmed_nodes[:5]


Out[138]:
['7145', '29094', '34420', '34548', '37920']

In [142]:
all_pubmed_nodes[-1]


Out[142]:
'20061360'

In [143]:
len(all_pubmed_nodes)


Out[143]:
19717

In [139]:
# Map every PubMed paper id to its position in the numerically sorted node
# list, giving contiguous indices that start at 0.
map_pubmed = {node_id: rank for rank, node_id in enumerate(all_pubmed_nodes)}

In [140]:
map_pubmed['29094']


Out[140]:
1

In [144]:
def pubmed_type(node_id):
    """Return the integer index stored in `map_pubmed` for `node_id`.

    Raises KeyError when `node_id` is not a key of `map_pubmed`.
    """
    node_index = map_pubmed[node_id]
    return node_index

In [145]:
len(kw_id_map)


Out[145]:
500

In [146]:
len(map_pubmed)


Out[146]:
19717

In [148]:
# Shape is derived from the lookup tables instead of hard-coded numbers:
# one row per mapped paper, one column per keyword in kw_id_map.
pubmed_features = np.zeros(shape=(len(map_pubmed), len(kw_id_map)), dtype=np.float32)

In [149]:
# One row per mapped paper; three columns for the categories 1, 2, 3 declared
# by the 'cat=1,2,3:label' entry of the node file's template line.
pubmed_labels = np.zeros(shape=(len(map_pubmed), 3), dtype=np.uint8)

In [150]:
data[-1]


Out[150]:
''

In [153]:
test = data[2]

In [156]:
node_id, *features_vec, _ = test.split('\t')

In [157]:
node_id


Out[157]:
'12187484'

In [158]:
features_vec


Out[158]:
['label=1',
 'w-rat=0.09393489570187145',
 'w-common=0.028698458467273157',
 'w-use=0.01176012652514843',
 'w-examin=0.019375414753592942',
 'w-pathogenesi=0.06316131961800078',
 'w-retinopathi=0.17089058531360632',
 'w-mous=0.06770248034355311',
 'w-studi=0.017554610474374233',
 'w-anim=0.09840151241009497',
 'w-model=0.06269133038832954',
 'w-metabol=0.06232233318170418',
 'w-abnorm=0.11247870345628387',
 'w-contribut=0.02534773765067718',
 'w-develop=0.030388826051908086',
 'w-investig=0.02014612607562432',
 'w-mice=0.12119873074191996',
 'w-2=0.020571546813213402',
 'w-month=0.10361986739277738',
 'w-compar=0.02367140886552208',
 'w-obtain=0.03061978039959059',
 'w-method=0.014469342700659771',
 'w-induc=0.023516442702830022',
 'w-6=0.014872498687869398',
 'w-inject=0.028054999329982466',
 'w-experiment=0.06866787644053303',
 'w-normal=0.01777754779525323',
 'w-diet=0.031956203604979944',
 'w-30=0.02512131278693402',
 'w-hyperglycemia=0.02896081409449482',
 'w-level=0.03654889376239291',
 'w-lipid=0.030348254033687905',
 'w-oxid=0.09357481262838539',
 'w-activ=0.03623879368519283',
 'w-protein=0.022816081905882666',
 'w-kinas=0.04216587194300068',
 'w-c=0.031475602330090724',
 'w-measur=0.015735336508945104',
 'w-result=0.0075446006836769695',
 'w-increas=0.008769967077523864',
 'w-retin=0.04575957596508121',
 'w-stress=0.03732992842799811',
 'w-3=0.01261883005795486',
 'w-similar=0.01996113997855104',
 'w-observ=0.01828742887023866',
 'w-conclus=0.012866895687595546',
 'w-play=0.03099778146368732',
 'w-import=0.023158771568589955',
 'w-role=0.021716016285633605',
 'w-present=0.020784310286111652']

In [161]:
# Fill the one-hot label matrix and the dense feature matrix from the node
# records. data[0] and data[1] are header lines and data[-1] is empty, so
# they are excluded by the slice.
for record in data[2:-1]:
    raw_id, label_field, *weight_fields, _summary = record.split('\t')
    row = map_pubmed[raw_id]
    # 'label=K' with K starting at 1 becomes the zero-based column K - 1.
    class_index = int(label_field.split('=')[-1]) - 1
    pubmed_labels[row, class_index] = 1
    for weight_field in weight_fields:
        keyword, weight = weight_field.split('=')
        pubmed_features[row, kw_id_map[keyword]] = float(weight)

In [162]:
map_pubmed["12187484"]


Out[162]:
11943

In [166]:
pubmed_labels[11943]


Out[166]:
array([1, 0, 0], dtype=uint8)

In [167]:
test_labels = np.sum(pubmed_labels, axis=1)

In [172]:
np.count_nonzero(test_labels)


Out[172]:
19717

In [173]:
len(edges)


Out[173]:
44339

In [180]:
pubmed = nx.DiGraph()

In [181]:
# Re-create every edge of pubmed_graph using the remapped integer node ids.
pubmed.add_edges_from(
    (pubmed_type(source), pubmed_type(target))
    for source, target in pubmed_graph.edges()
)

In [184]:
pubmed.size()


Out[184]:
44338

In [185]:
pubmed.number_of_nodes()


Out[185]:
19717

In [186]:
pubmed.number_of_edges()


Out[186]:
44338

In [187]:
pubmed.is_directed()


Out[187]:
True

In [189]:
pubmed.edge[11943]


Out[189]:
{}

In [191]:
pubmed.in_edges(11943)


Out[191]:
[(16602, 11943), (19317, 11943), (14573, 11943), (18565, 11943)]

In [192]:
pubmed.out_edges(11943)


Out[192]:
[]

In [193]:
# Bundle the remapped graph, the one-hot labels and the sparse feature matrix
# under the same keys the citeseer pickle exposes.
all_pubmed = {
    'NXGraph': pubmed,
    'Labels': pubmed_labels,
    'CSRFeatures': csr_matrix(pubmed_features),
}

In [194]:
all_pubmed['CSRFeatures']


Out[194]:
<19717x500 sparse matrix of type '<class 'numpy.float32'>'
	with 988031 stored elements in Compressed Sparse Row format>

In [196]:
with open('./pubmed.data', 'wb') as f:
    pickle.dump(all_pubmed, f)

In [6]:
pubmed_meta = find_meta('pubmed')

In [7]:
pubmed = GraphContainer(pubmed_meta, dataloc='.')

In [8]:
pubmed_gt = pubmed.get_gt_graph()

In [10]:
# Colour every vertex by the position of the largest entry in its label row.
pubmed_label_rows = pubmed.get_labels()
node_color = pubmed_gt.new_vertex_property("int")
for vertex_index, label_row in enumerate(pubmed_label_rows):
    node_color[vertex_index] = np.argmax(label_row)

In [13]:
%time graph_draw(pubmed_gt, vertex_fill_color=node_color, output="pubmed_with_labels.png", output_size=(1200,1200))


CPU times: user 6min 45s, sys: 5 s, total: 6min 50s
Wall time: 1min 2s
Out[13]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f2b74a3da20, at 0x7f2b2f274c18>

In [9]:
from motifwalk.motifs import all_3

In [18]:
# Draw every motif in all_3 with each vertex labelled by str(vertex).
for m in all_3:
    motif = m.gt_motif
    text = motif.new_vertex_property("string")
    for n in motif.vertices():
        text[n] = str(n)
    # Draw the same graph object that `text` was created on rather than
    # reading m.gt_motif a second time.
    graph_draw(motif, vertex_text=text, output_size=(80,80))



In [10]:
feed_forward = all_3[9]

In [11]:
# Draw the selected motif with every vertex labelled by str(vertex).
motif = feed_forward.gt_motif
text = motif.new_vertex_property("string")
for vertex in motif.vertices():
    text[vertex] = str(vertex)
graph_draw(motif, vertex_text=text, output_size=(100,100))


Out[11]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f93e1664ba8, at 0x7f93e7bd34e0>

In [11]:
feed_forward.anchors = {1,2}

In [12]:
from motifwalk.motifs.analysis import construct_motif_graph

In [13]:
ff_pubmed = construct_motif_graph(pubmed, feed_forward)

In [14]:
ff_pubmed


Out[14]:
<Graph object, undirected, with 19717 vertices and 12375 edges at 0x7f740ad6f7b8>

In [16]:
%time graph_draw(ff_pubmed, output="pubmed_with_labels_feedforward.png", output_size=(1200,1200))


CPU times: user 55min 31s, sys: 6.14 s, total: 55min 37s
Wall time: 7min 3s
Out[16]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f740ad6f7b8, at 0x7f740ea69048>

In [17]:
# Boolean vertex mask: True where out_degree() > 0, False otherwise. It is
# used to hide vertices that have no edges in the motif graph.
vfilt = ff_pubmed.new_vertex_property('bool')
for v in ff_pubmed.vertices():
    # vertices() already yields vertex descriptors, so no extra
    # ff_pubmed.vertex(...) lookup is needed.
    vfilt[v] = v.out_degree() > 0

In [18]:
ff_pubmed_filtered = GraphView(ff_pubmed, vfilt)

In [19]:
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", output_size=(1200,1200))


CPU times: user 1min 19s, sys: 612 ms, total: 1min 20s
Wall time: 11.9 s
Out[19]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f74053f15f8, at 0x7f740ea50eb8>

In [20]:
# Create the colour property on ff_pubmed, the graph whose filtered view is
# drawn with it, instead of on the separate pubmed_gt graph.
# NOTE(review): assumes ff_pubmed keeps the same vertex order as the rows of
# pubmed.get_labels() — TODO confirm against construct_motif_graph.
node_color = ff_pubmed.new_vertex_property("int")
for i, l in enumerate(pubmed.get_labels()):
    node_color[i] = np.argmax(l)

In [23]:
%time graph_draw(ff_pubmed_filtered, output="pubmed_with_labels_feedforward_filtered.png", vertex_fill_color=node_color, output_size=(1200,1200))


CPU times: user 1min 21s, sys: 720 ms, total: 1min 21s
Wall time: 12.6 s
Out[23]:
<PropertyMap object with key type 'Vertex' and value type 'vector<double>', for Graph 0x7f74053f15f8, at 0x7f74053f7da0>

In [ ]: