Extracting degree information from hetio-ind


In [1]:
import gzip

import pandas

import hetio.readwrite
import hetio.stats

Download network


In [2]:
# Git commit of the dhimmel/integrate repository. The (commented-out) download
# cells below interpolate it into their raw.githubusercontent.com URLs so that
# the fetched files are pinned to one fixed revision.
commit = 'f72d32ce09b8884b6ec7e000ec261c116b340198'

In [3]:
# # Download heterogeneous network
# url = 'https://raw.githubusercontent.com/dhimmel/integrate/{}/data/graph.json.gz'.format(commit)
# ! wget --no-verbose --timestamping --directory-prefix download {url}

In [4]:
# # Summary of metaedges starting on Gene
# url = 'https://raw.githubusercontent.com/dhimmel/integrate/{}/data/summary/metaedges.tsv'.format(commit)
# summary_df = pandas.read_table(url)
# summary_df = summary_df[summary_df.metaedge.str.startswith('gene ')]
# summary_df.to_csv('download/network-summary.tsv', sep='\t', index=False)
# summary_df.head(2)

Read network and compute degrees


In [5]:
%%time
# Read the heterogeneous network from the gzipped JSON file in download/.
# The download cell above is commented out, so this file must already exist locally.
# NOTE: slow -- the recorded run of this cell took about 2.5 minutes of wall time.
graph = hetio.readwrite.read_json('download/graph.json.gz')


CPU times: user 2min 20s, sys: 15.4 s, total: 2min 36s
Wall time: 2min 36s

In [6]:
# Calculate degrees for genes: look up the 'gene' metanode in the graph's
# metagraph and pass it to hetio.stats.get_degrees_for_metanode.
# The next cell relies on the result having node_id, node_name, metaedge and
# degree columns (presumably one row per gene/metaedge pair -- see Out[7]).
gene = graph.metagraph.get_node('gene')
degree_df = hetio.stats.get_degrees_for_metanode(graph, gene)

In [7]:
# Entrez genes and symbols
# Derive the Entrez Gene ID from each node_id (the field following the first
# '::' of its string form, parsed as an integer), expose node_name as 'symbol',
# and keep only the columns that are written to disk.
degree_df = (
    degree_df
    .assign(entrez_gene_id=lambda frame: frame['node_id'].map(
        lambda node_id: int(str(node_id).split('::')[1])))
    .rename(columns={'node_name': 'symbol'})
    [['entrez_gene_id', 'symbol', 'metaedge', 'degree']]
)
degree_df.head(2)


Out[7]:
entrez_gene_id symbol metaedge degree
428420 1 A1BG gene - association - disease 1
428419 1 A1BG gene - binding - compound 0

In [9]:
# Write the gene degree table as a gzip-compressed, tab-separated file,
# omitting the DataFrame index.
with gzip.open('data/gene-degrees.tsv.gz', 'wt') as tsv_handle:
    degree_df.to_csv(tsv_handle, sep='\t', index=False)