Goal: Create Jupyter notebook that runs an enrichment analysis in ToppGene through the API and runs Pathview to visualize the significant pathways outputted by ToppGene.
toppgene website: https://toppgene.cchmc.org/enrichment.jsp
Steps:
In [ ]:
#Import Python modules
import os
import pandas
import qgrid
import mygene
#Change directory
os.chdir("/data/test")
In [ ]:
#Read in DESeq2 results
genes=pandas.read_csv("DE_genes.csv")
#View top of file
genes.head(10)
In [ ]:
#Extract genes that are differentially expressed with a pvalue less than a certain cutoff (pvalue < 0.05 or padj < 0.05)
genes_DE_only = genes.loc[(genes.pvalue < 0.05)]
#View top of file
genes_DE_only.head(10)
In [ ]:
#Check how many rows in original genes file
len(genes)
In [ ]:
#Check how many rows in DE genes file
len(genes_DE_only)
In [ ]:
#Extract list of DE genes (Check to make sure this code works, this was adapted from a different notebook)
de_list = genes_DE_only[genes_DE_only.columns[0]]
#Remove .* from end of Ensembl ID
de_list2 = de_list.replace("\.\d","",regex=True)
#Add new column with reformatted Ensembl IDs
genes_DE_only["Full_Ensembl"] = de_list2
#View top of file
genes_DE_only.head(10)
In [ ]:
#Set up mygene.info API and query
mg = mygene.MyGeneInfo()
gene_ids = mg.getgenes(de_list2, 'name, symbol, entrezgene', as_dataframe=True)
gene_ids.index.name = "Ensembl"
gene_ids.reset_index(inplace=True)
#View top of file
gene_ids.head(10)
In [ ]:
#Merge mygene.info query results with original DE genes list
DE_with_ids = genes_DE_only.merge(gene_ids, left_on="Full_Ensembl", right_on="Ensembl", how="outer")
#View top of file
DE_with_ids.head(10)
In [ ]:
#Write results to file
DE_with_ids.to_csv("./DE_genes_converted.csv")
In [ ]:
#Dataframe to only contain gene symbol
DE_with_ids=pandas.read_csv("./DE_genes_converted.csv")
cols = DE_with_ids.columns.tolist()
cols.insert(0, cols.pop(cols.index('symbol')))
for_xmlfile = DE_with_ids.reindex(columns= cols)
#Condense dataframe to contain only gene symbol
for_xmlfile.drop(for_xmlfile.columns[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14]], axis=1, inplace=True)
#Exclude NaN values
for_xmlfile.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
#View top of file
for_xmlfile.head(10)
In [ ]:
#Write results to file
for_xmlfile.to_csv("./for_xmlfile.csv", index=False)
In [ ]:
#.XML file generator from gene list in .csv file
import xml.etree.cElementTree as ET
import xml.etree.cElementTree as ElementTree
import lxml
#Root element of .xml "Tree"
root=ET.Element("requests")
#Title/identifier for the gene list inputted into ToppGene API
#Name it whatever you like
doc=ET.SubElement(root, "toppfun", id= "nicole's gene list")
config=ET.SubElement(doc, "enrichment-config")
gene_list=ET.SubElement(doc, "trainingset")
gene_list.set('accession-source','HGNC')
#For gene symbol in gene_list
#Parse through gene_list to create the .xml file
toppgene = pandas.read_csv("./for_xmlfile.csv")
for i in toppgene.ix[:,0]:
gene_symbol = i
gene = ET.SubElement(gene_list, "gene")
gene.text= gene_symbol
tree = ET.ElementTree(root)
#Function needed for proper indentation of the .xml file
def indent(elem, level=0):
i = "\n" + level*" "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
indent(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
indent(root)
import xml.dom.minidom
from lxml import etree
#File to write the .xml file to
#Include DOCTYPE
with open('/data/test/test.xml', 'w') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE requests SYSTEM "https://toppgene.cchmc.org/toppgenereq.dtd">')
ElementTree.ElementTree(root).write(f, 'utf-8')
#Display .xml file
xml = xml.dom.minidom.parse('/data/test/test.xml')
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
Include path for the input .xml file and path and name of the output .xml file.
Outputs all 17 features of ToppGene.
In [ ]:
!curl -v -H 'Content-Type: text/xml' --data @/data/test/test.xml -X POST https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB > /data/test/testoutfile.xml
In [ ]:
#Display output .xml file
import xml.dom.minidom
xml = xml.dom.minidom.parse("/data/test/testoutfile.xml")
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
In [ ]:
import xml.dom.minidom
import pandas as pd
import numpy
#Parse through .xml file
def load_parse_xml(data_file):
"""Check if file exists. If file exists, load and parse the data file. """
if os.path.isfile(data_file):
print "File exists. Parsing..."
data_parse = ET.ElementTree(file=data_file)
print "File parsed."
return data_parse
xmlfile = load_parse_xml("/data/test/testoutfile.xml")
In [ ]:
#Generate array of annotation arrays for .csv file
root_tree = xmlfile.getroot()
gene_list=[]
for child in root_tree:
child.find("enrichment-results")
new_array = []
array_of_arrays=[]
for type in child.iter("enrichment-result"):
count = 0
for annotation in type.iter("annotation"):
array_of_arrays.append(new_array)
new_array = []
new_array.append(type.attrib['type'])
new_array.append(annotation.attrib['name'])
new_array.append(annotation.attrib['id'])
new_array.append(annotation.attrib['pvalue'])
new_array.append(annotation.attrib['genes-in-query'])
new_array.append(annotation.attrib['genes-in-term'])
new_array.append(annotation.attrib['source'])
for gene in annotation.iter("gene"):
gene_list.append(gene.attrib['symbol'])
new_array.append(gene_list)
gene_list =[]
count+= 1
print "Number of Annotations for ToppGene Feature - %s: " % type.attrib['type'] + str(count)
print "Total number of significant gene sets from ToppGene: " + str(len(array_of_arrays))
#print array_of_arrays
In [ ]:
#Convert array of annotation arrays into .csv file (to be viewed as dataframe)
import pyexcel
data = array_of_arrays
pyexcel.save_as(array = data, dest_file_name = '/data/test/results.csv')
In [ ]:
#Reading in the .csv ToppGene results
df=pandas.read_csv('/data/test/results.csv', header=None)
#Label dataframe columns
df.columns=['ToppGene Feature','Annotation Name','ID','pValue','Genes-in-Query','Genes-in-Term','Source','Genes']
Display the dataframe of each ToppGene feature
In [ ]:
#Dataframe for GeneOntologyMolecularFunction
df.loc[df['ToppGene Feature'] == 'GeneOntologyMolecularFunction']
In [ ]:
#Dataframe for GeneOntologyBiologicalProcess
df.loc[df['ToppGene Feature'] == 'GeneOntologyBiologicalProcess']
In [ ]:
#Dataframe for GeneOntologyCellularComponent
df.loc[df['ToppGene Feature'] == 'GeneOntologyCellularComponent']
In [ ]:
#Dataframe for Human Phenotype
df.loc[df['ToppGene Feature'] == 'HumanPheno']
In [ ]:
#Dataframe for Mouse Phenotype
df.loc[df['ToppGene Feature'] == 'MousePheno']
In [ ]:
#Dataframe for Domain
df.loc[df['ToppGene Feature'] == 'Domain']
In [ ]:
#Dataframe for Pathways
df.loc[df['ToppGene Feature'] == 'Pathway']
In [ ]:
#Dataframe for Pubmed
df.loc[df['ToppGene Feature'] == 'Pubmed']
In [ ]:
#Dataframe for Interactions
df.loc[df['ToppGene Feature'] == 'Interaction']
In [ ]:
#Dataframe for Cytobands
df.loc[df['ToppGene Feature'] == 'Cytoband']
In [ ]:
#Dataframe for Transcription Factor Binding Sites
df.loc[df['ToppGene Feature'] == 'TranscriptionFactorBindingSite']
In [ ]:
#Dataframe for Gene Family
df.loc[df['ToppGene Feature'] == 'GeneFamily']
In [ ]:
#Dataframe for Coexpression
df.loc[df['ToppGene Feature'] == 'Coexpression']
In [ ]:
#DataFrame for Coexpression Atlas
df.loc[df['ToppGene Feature'] == 'CoexpressionAtlas']
In [ ]:
#Dataframe for Computational
df.loc[df['ToppGene Feature'] == 'Computational']
In [ ]:
#Dataframe for MicroRNAs
df.loc[df['ToppGene Feature'] == 'MicroRNA']
In [ ]:
#Dataframe for Drugs
df.loc[df['ToppGene Feature'] == 'Drug']
In [ ]:
#Dataframe for Diseases
df.loc[df['ToppGene Feature'] == 'Disease']
In [ ]:
#Number of significant KEGG pathways
total_KEGG_pathways = df.loc[df['Source'] == 'BioSystems: KEGG']
print "Number of significant KEGG pathways: " + str(len(total_KEGG_pathways.index))
In [ ]:
df = df.loc[df['Source'] == 'BioSystems: KEGG']
df.to_csv('/data/test/keggpathways.csv', index=False)
In [ ]:
mapping_df = pandas.read_csv('/data/test/KEGGmap.csv')
mapping_df = mapping_df.loc[mapping_df['Organism'] == 'Homo sapiens ']
mapping_df.head(10)
Create dataframe that includes the KEGG IDs that correspond to the significant pathways outputted by ToppGene
In [ ]:
#Create array of KEGG IDs that correspond to the significant pathways outputted by ToppGene
KEGG_ID_array = []
for ID in df.ix[:,2]:
x = int(ID)
for index,BSID in enumerate(mapping_df.ix[:,0]):
y = int(BSID)
if x == y:
KEGG_ID_array.append(mapping_df.get_value(index,1,takeable=True))
print KEGG_ID_array
In [ ]:
#Transform array into KEGG ID dataframe
KEGG_IDs = pandas.DataFrame()
KEGG_IDs['KEGG ID'] = KEGG_ID_array
KEGG_IDs.to_csv('/data/test/keggidlist.csv', index=False)
In [ ]:
no_KEGG_ID = pandas.read_csv('/data/test/keggpathways.csv')
KEGG_IDs = pandas.read_csv('/data/test/keggidlist.csv')
In [ ]:
#Append KEGG ID dataframe to dataframe containing the significant pathways outputted by ToppGene
KEGG_ID_included = pd.concat([no_KEGG_ID, KEGG_IDs], axis = 1)
KEGG_ID_included.to_csv('/data/test/KEGG_ID_included.csv', index=False)
KEGG_ID_included
Switch to R kernel here
In [ ]:
#Set working directory
working_dir <- "/data/test"
setwd(working_dir)
date <- Sys.Date()
In [ ]:
#Set R options
options(jupyter.plot_mimetypes = 'image/png')
options(useHTTPS=FALSE)
options(scipen=500)
In [ ]:
#Load R packages from CRAN and Bioconductor
require(limma)
require(edgeR)
require(DESeq2)
require(RColorBrewer)
require(cluster)
library(gplots)
library(SPIA)
library(graphite)
library(PoiClaClu)
library(ggplot2)
library(pathview)
library(KEGG.db)
library(mygene)
library(splitstackshape)
library(reshape)
library(hwriter)
library(ReportingTools)
library("EnrichmentBrowser")
library(IRdisplay)
library(repr)
library(png)
Create matrix-like structure to contain entrez ID and log2FC for gene.data input
In [ ]:
#Extract entrez ID and log2FC from the input DE genes
#Read in differential expression results as a Pandas data frame to get differentially expressed gene list
#Read in DE_genes_converted results (generated in jupyter notebook)
genes <- read.csv("DE_genes_converted.csv")[,c('entrezgene', 'log2FoldChange')]
#Remove NA values
genes<-genes[complete.cases(genes),]
head(genes,10)
In [ ]:
#Transform data frame into matrix (gene.data in Pathview only takes in a matrix formatted data)
genedata<-matrix(c(genes[,2]),ncol=1,byrow=TRUE)
rownames(genedata)<-c(genes[,1])
colnames(genedata)<-c("log2FoldChange")
genedata <- as.matrix(genedata)
head(genedata,10)
Create vector containing the KEGG IDs of all the significant target pathways
In [ ]:
#Read in pathways that you want to map to (from toppgene pathway results)
#Store as a vector
pathways <- read.csv("/data/test/keggidlist.csv")
head(pathways, 12)
pathways.vector<-as.vector(pathways$KEGG.ID)
pathways.vector
In [ ]:
#Loop through all the pathways in pathways.vector
#Generate Pathview pathways for each one (native KEGG graphs)
i<-1
for (i in pathways.vector){
pv.out <- pathview(gene.data = genedata[, 1], pathway.id = i,
species = "hsa", out.suffix = "toppgene_native_kegg_graph", kegg.native = T)
#str(pv.out)
#head(pv.out$plot.data.gene)
}
In [ ]:
#Loop through all the pathways in pathways.vector
#Generate Pathview pathways for each one (Graphviz layouts)
i<-1
for (i in pathways.vector){
pv.out <- pathview(gene.data = genedata[, 1], pathway.id = i,
species = "hsa", out.suffix = "toppgene_graphviz_layout", kegg.native = F)
str(pv.out)
head(pv.out$plot.data.gene)
#head(pv.out$plot.data.gene)
}
Switch back to py27 kernel here
In [ ]:
#Display native KEGG graphs
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas
%matplotlib inline
#for loop that iterates through the pathway images and displays them
pathways = pandas.read_csv("/data/test/keggidlist.csv")
pathways
for i in pathways.ix[:,0]:
image = i
address = "/data/test/%s.toppgene_native_kegg_graph.png" % image
img = mpimg.imread(address)
plt.imshow(img)
plt.gcf().set_size_inches(50,50)
print i
plt.show()
Weijun Luo and Cory Brouwer. Pathview: an R/Bioconductor package for pathway-based data integration and visualization. Bioinformatics, 29(14):1830-1831, 2013. doi: 10.1093/bioinformatics/btt285.
Only works for one pathway (first one)
In [ ]:
#Import more python modules
import sys
#To access visJS_module and entrez_to_symbol module
sys.path.append(os.getcwd().replace('/data/test', '/data/CCBB_internal/interns/Lilith/PathwayViz'))
import visJS_module
from ensembl_to_entrez import entrez_to_symbol
import networkx as nx
import matplotlib.pyplot as plt
import pymongo
from itertools import islice
import requests
import math
import spectra
from bioservices.kegg import KEGG
import imp
imp.reload(visJS_module)
In [ ]:
#Latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')
% matplotlib inline
In [ ]:
s = KEGG()
In [ ]:
#Lowest p value pathway
#But you can change the first parameter in pathways.get_value to see different pathways in the pathways list!
pathway = pathways.get_value(0,0, takeable=True)
print pathway
address = "/data/test/%s.xml" % pathway
#Parse pathway's xml file and get the root of the xml file
tree = ET.parse(address)
root = tree.getroot()
In [ ]:
res = s.parse_kgml_pathway(pathway)
print res['relations']
In [ ]:
print res['entries']
In [ ]:
G=nx.DiGraph()
In [ ]:
#Add nodes to networkx graph
for entry in res['entries']:
G.add_node(entry['id'], entry )
print len(G.nodes(data=True))
In [ ]:
#Get symbol of each node
temp_node_id_array = []
for node, data in G.nodes(data=True):
if data['type'] == 'gene':
if ' ' not in data['name']:
G.node[node]['symbol'] = data['gene_names'].split(',', 1)[0]
else:
result = data['name'].split("hsa:")
result = ''.join(result)
result = result.split()
for index, gene in enumerate(result):
if index == 0:
gene_symbol = str(entrez_to_symbol(gene))
else:
gene_symbol = gene_symbol + ', ' + str(entrez_to_symbol(gene))
G.node[node]['symbol'] = gene_symbol
elif data['type'] == 'compound':
gene_symbol = s.parse(s.get(data['name']))['NAME']
G.node[node]['gene_names'] = ' '.join(gene_symbol)
G.node[node]['symbol'] = gene_symbol[0].replace(';', '')
print G.nodes(data=True)
In [ ]:
#Get x and y coordinates for each node
seen_coord = set()
coord_array = []
dupes_coord = []
for entry in root.findall('entry'):
node_id = entry.attrib['id']
graphics = entry.find('graphics')
if (graphics.attrib['x'], graphics.attrib['y']) in seen_coord:
G.node[node_id]['x'] = (int(graphics.attrib['x']) + .1) * 2.5
G.node[node_id]['y'] = (int(graphics.attrib['y']) + .1) * 2.5
seen_coord.add((G.node[node_id]['x'], G.node[node_id]['y']))
print node_id
else:
seen_coord.add((graphics.attrib['x'], graphics.attrib['y']))
G.node[node_id]['x'] = int(graphics.attrib['x']) * 2.5
G.node[node_id]['y'] = int(graphics.attrib['y']) * 2.5
print dupes_coord
print seen_coord
In [ ]:
#Handle undefined nodes
comp_dict = dict()
node_to_comp = dict()
comp_array_total = [] #Array containing all component nodes
for entry in root.findall('entry'):
#Array to store components of undefined nodes
component_array = []
if entry.attrib['name'] == 'undefined':
node_id = entry.attrib['id']
#Find components
for index, component in enumerate(entry.iter('component')):
component_array.append(component.get('id'))
#Check to see which elements are components
comp_array_total.append(component.get('id'))
node_to_comp[component.get('id')] = node_id
#Store into node dictionary
G.node[node_id]['component'] = component_array
comp_dict[node_id] = component_array
#Store gene names
gene_name_array = []
for index, component_id in enumerate(component_array):
if index == 0:
gene_name_array.append(G.node[component_id]['gene_names'])
else:
gene_name_array.append('\n' + G.node[component_id]['gene_names'])
G.node[node_id]['gene_names'] = gene_name_array
#Store gene symbols
gene_symbol_array = []
for index, component_id in enumerate(component_array):
if index == 0:
gene_symbol_array.append(G.node[component_id]['symbol'])
else:
gene_symbol_array.append('\n' + G.node[component_id]['symbol'])
G.node[node_id]['symbol'] = gene_symbol_array
print G.node
In [ ]:
edge_list = []
edge_pairs = []
#Add edges to networkx graph
#Redirect edges to point to undefined nodes containing components in order to connect graph
for edge in res['relations']:
source = edge['entry1']
dest = edge['entry2']
if (edge['entry1'] in comp_array_total) == True:
source = node_to_comp[edge['entry1']]
if (edge['entry2'] in comp_array_total) == True:
dest = node_to_comp[edge['entry2']]
edge_list.append((source, dest, edge))
edge_pairs.append((source,dest))
#Check for duplicates
if (source, dest) in G.edges():
name = []
value = []
link = []
name.append(G.edge[source][dest]['name'])
value.append(G.edge[source][dest]['value'])
link.append(G.edge[source][dest]['link'])
name.append(edge['name'])
value.append(edge['value'])
link.append(edge['link'])
G.edge[source][dest]['name'] = '\n'.join(name)
G.edge[source][dest]['value'] = '\n'.join(value)
G.edge[source][dest]['link'] = '\n'.join(link)
else:
G.add_edge(source, dest, edge)
print G.edges(data=True)
In [ ]:
edge_to_name = dict()
for edge in G.edges():
edge_to_name[edge] = G.edge[edge[0]][edge[1]]['name']
print edge_to_name
In [ ]:
#Set colors of edges
edge_to_color = dict()
for edge in G.edges():
if 'activation' in G.edge[edge[0]][edge[1]]['name']:
edge_to_color[edge] = 'green'
elif 'inhibition' in G.edge[edge[0]][edge[1]]['name']:
edge_to_color[edge] = 'red'
else:
edge_to_color[edge] = 'blue'
print edge_to_color
In [ ]:
#Remove component nodes from graph
G.remove_nodes_from(comp_array_total)
#Get nodes in graph
nodes = G.nodes()
numnodes = len(nodes)
print numnodes
In [ ]:
print G.node
In [ ]:
#Get symbol of nodes
node_to_symbol = dict()
for node in G.node:
if G.node[node]['type'] == 'map':
node_to_symbol[node] = G.node[node]['gene_names']
else:
if 'symbol' in G.node[node]:
node_to_symbol[node] = G.node[node]['symbol']
elif 'gene_names'in G.node[node]:
node_to_symbol[node] = G.node[node]['gene_names']
else:
node_to_symbol[node] = G.node[node]['name']
In [ ]:
#Get name of nodes
node_to_gene = dict()
for node in G.node:
node_to_gene[node] = G.node[node]['gene_names']
In [ ]:
#Get x coord of nodes
node_to_x = dict()
for node in G.node:
node_to_x[node] = G.node[node]['x']
In [ ]:
#Get y coord of nodes
node_to_y = dict()
for node in G.node:
node_to_y[node] = G.node[node]['y']
In [ ]:
#Log2FoldChange
DE_genes_df = pandas.read_csv("/data/test/DE_genes_converted.csv")
DE_genes_df.head(10)
In [ ]:
short_df = DE_genes_df[['_id', 'Ensembl', 'log2FoldChange']]
short_df.head(10)
In [ ]:
short_df.to_dict('split')
In [ ]:
#Remove NA values
gene_to_log2fold = dict()
for entry in short_df.to_dict('split')['data']:
if isinstance(entry[0], float):
if math.isnan(entry[0]):
gene_to_log2fold[entry[1]] = entry[2]
else:
gene_to_log2fold[entry[0]] = entry[2]
else:
gene_to_log2fold[entry[0]] = entry[2]
print gene_to_log2fold
In [ ]:
#Create color scale with negative as green and positive as red
my_scale = spectra.scale([ "green", "#CCC", "red" ]).domain([ -4, 0, 4 ])
In [ ]:
id_to_log2fold = dict()
for node in res['entries']:
log2fold_array = []
if node['name'] == 'undefined':
print 'node is undefined'
elif node['type'] == 'map':
print 'node is a pathway'
else:
#print node['name']
result = node['name'].split("hsa:")
result = ''.join(result)
result = result.split()
#print result
for item in result:
if item in gene_to_log2fold.keys():
log2fold_array.append(gene_to_log2fold[item])
if len(log2fold_array) > 0:
id_to_log2fold[node['id']] = log2fold_array
print id_to_log2fold
In [ ]:
#Color nodes based on log2fold data
node_to_color = dict()
for node in G.nodes():
if node in id_to_log2fold:
node_to_color[node] = my_scale(id_to_log2fold[node][0]).hexcode
else:
node_to_color[node] = '#f1f1f1'
print node_to_color
In [ ]:
#Get number of edges in graph
edges = G.edges()
numedges = len(edges)
print numedges
In [ ]:
print G.edges(data=True)
In [ ]:
#Change directory
os.chdir("/data/CCBB_internal/interns/Nicole/ToppGene")
#Map to indices for source/target in edges
node_map = dict(zip(nodes,range(numnodes)))
#Dictionaries that hold per node and per edge attributes
nodes_dict = [{"id":node_to_gene[n],"degree":G.degree(n),"color":node_to_color[n], "node_shape":"box",
"node_size":10,'border_width':1, "id_num":node_to_symbol[n], "x":node_to_x[n], "y":node_to_y[n]} for n in nodes]
edges_dict = [{"source":node_map[edges[i][0]], "target":node_map[edges[i][1]],
"color":edge_to_color[edges[i]], "id":edge_to_name[edges[i]], "edge_label":'',
"hidden":'false', "physics":'true'} for i in range(numedges)]
#HTML file label for first graph (must manually increment later)
time = 1700
#Make edges thicker
#Create and display the graph here
visJS_module.visjs_network(nodes_dict, edges_dict, time_stamp = time, node_label_field = "id_num",
edge_width = 3, border_color = "black", edge_arrow_to = True, edge_font_size = 15, edge_font_align= "top",
physics_enabled = False, graph_width = 1000, graph_height = 1000)
In [ ]: