ToppGene API

Authors: N. Mouchamel, T. Nguyen & K. Fisch

Email: Kfisch@ucsd.edu

Date: June 2016

Goal: Create Jupyter notebook cell that runs an enrichment analysis in ToppGene through the API

Toppgene website https://toppgene.cchmc.org/enrichment.jsp *Note: request ToppGene API through ToppGene developers

Steps:

  1. Read in differentially expressed gene list
  2. Convert differentially expressed gene list to xml file as input to ToppGene API
  3. Run enrichment analysis of DE genes through ToppGene API
  4. Parse ToppGene API results from xml to csv and Pandas data frame
  5. Display results in notebook

In [ ]:
##importing python module
import os
import pandas
import qgrid
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)

import mygene

##change directory
os.chdir("/Users/nicole/Documents/CCBB Internship")

Read in differential expression results as a Pandas data frame to get differentially expressed gene list


In [ ]:
##read in DESeq2 results
genes=pandas.read_csv("DE_genes.csv")

##View interactive table
##qgrid.show_grid(genes.sample(1000), grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})

#View top of file
genes.head(10)

In [ ]:
#Extract genes that are differentially expressed with a pvalue less than a certain cutoff (pvalue < 0.05 or padj < 0.05)
genes_DE_only = genes.loc[(genes.pvalue < 0.05)]

#View top of file
genes_DE_only.head(10)

In [ ]:
#Check how many rows in original genes file
len(genes)

In [ ]:
#Check how many rows in DE genes file
len(genes_DE_only)

Translate Ensembl IDs to Gene Symbols and Entrez IDs using mygene.info API


In [ ]:
#Extract list of DE genes  (Check to make sure this code works, this was adapted from a different notebook)
de_list = genes_DE_only[genes_DE_only.columns[0]]

#Remove .* from end of Ensembl ID
de_list2 = de_list.replace("\.\d","",regex=True)

#Add new column with reformatted Ensembl IDs
genes_DE_only["Full_Ensembl"] = de_list2

#View top of file 
genes_DE_only.head(10)

In [ ]:
#Set up mygene.info API and query
mg = mygene.MyGeneInfo()
gene_ids = mg.getgenes(de_list2, 'name, symbol, entrezgene', as_dataframe=True)
gene_ids.index.name = "Ensembl"
gene_ids.reset_index(inplace=True)

#View top of file
gene_ids.head(10)

In [ ]:
#Merge mygene.info query results with original DE genes list
DE_with_ids = genes_DE_only.merge(gene_ids, left_on="Full_Ensembl", right_on="Ensembl", how="outer")

#Check top of file
DE_with_ids.head(10)

In [ ]:
#Write results to file
DE_with_ids.to_csv("./DE_genes_converted.csv")

In [ ]:
#Dataframe to only contain gene symbol
DE_with_ids=pandas.read_csv("./DE_genes_converted.csv")

cols = DE_with_ids.columns.tolist()
cols.insert(0, cols.pop(cols.index('symbol')))

for_xmlfile = DE_with_ids.reindex(columns= cols)

#Condense dataframe to contain only symbol
for_xmlfile.drop(for_xmlfile.columns[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14]], axis=1, inplace=True) 

#Exclude NaN values
for_xmlfile.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

for_xmlfile.head(10)

In [ ]:
#Write results to file
for_xmlfile.to_csv("./for_xmlfile.csv", index=False)

In [ ]:
#.XML file generator from gene list in .csv file
import xml.etree.cElementTree as ET
import xml.etree.cElementTree as ElementTree
import lxml

root=ET.Element("requests")

doc=ET.SubElement(root, "toppfun", id= "nicole's gene list")

config=ET.SubElement(doc, "enrichment-config")

gene_list=ET.SubElement(doc, "trainingset")
gene_list.set('accession-source','HGNC')

#for gene symbol in gene_list
toppgene = pandas.read_csv("./for_xmlfile.csv")

for i in toppgene.ix[:,0]:
    gene_symbol = i
    gene = ET.SubElement(gene_list, "gene")
    gene.text= gene_symbol


tree = ET.ElementTree(root)

def indent(elem, level=0):
    i = "\n" + level*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indent(elem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
indent(root)

import xml.dom.minidom
from lxml import etree


with open('/Users/nicole/Documents/test.xml', 'w') as f:
    f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE requests SYSTEM "https://toppgene.cchmc.org/toppgenereq.dtd">')
    ElementTree.ElementTree(root).write(f, 'utf-8')


xml = xml.dom.minidom.parse("/Users/nicole/Documents/test.xml")
pretty_xml_as_string = xml.toprettyxml()

print(pretty_xml_as_string)

Reformat list of genes to xml input format as in the test file z.xml

Reformat list of gene IDs (DE_with_ids, symbol) into xml format to look like example below. Documentation (along with google searching)

http://stackoverflow.com/questions/3605680/creating-a-simple-xml-file-using-python


In [ ]:
import xml.etree.cElementTree as ET
import lxml

root=ET.Element("requests")
doc=ET.SubElement(root, "toppfun", id="a")

config=ET.SubElement(doc, "enrichment-config")

#training=ET.SubElement(doc, "trainingset accession-source='HGNC'") 
training=ET.SubElement(doc, "trainingset")
training.set('accession-source','HGNC')

#features=ET.SubElement(config, "features")
gene1=ET.SubElement(training, "gene") 
gene2=ET.SubElement(training, "gene")
gene3=ET.SubElement(training, "gene")
gene1.text="APOB"
gene2.text="APOE"
gene3.text="AIR1"


####

doc2= ET.SubElement(root, "toppgene", id="b")
config2=ET.SubElement(doc2, "enrichment-config")
prior=ET.SubElement(doc2,"prioritization-config")

training2=ET.SubElement(doc2, "trainingset")
training2.set('accession-source','HGNC')

gene4=ET.SubElement(training2, "gene")
gene4.text="SRF"

#testset=ET.SubElement(doc2, "testset accession-source='HGNC'")
testset=ET.SubElement(doc2, "testset")
testset.set('accession-source','HGNC')

gene5=ET.SubElement(testset, "gene")
gene5.text="APOB"

####

doc3= ET.SubElement(root, "toppnet", id="Pattern")
config3=ET.SubElement(doc3, "net-config") ##net-config
markov=ET.SubElement (config3, "k-step-markov step-size='6'") #k-step-markov step-size='6'
visual=ET.SubElement (config3, "visualizer neighborhood-distance='1'") ##visualizer neighborhood-distance="1"

training3=ET.SubElement(doc3, "trainingset")
training3.set('accession-source','HGNC')

gene6=ET.SubElement(training3, "gene")
gene6.text="SRF"


#testset2=ET.SubElement(doc3, "testset accession-source='ENTREZ'")
testset2=ET.SubElement(doc3, "testset")
testset2.set('accession-source','ENTREZ')

gene7=ET.SubElement(testset2,"gene")
gene7.text="338"

###
doc4= ET.SubElement (root, "toppgenet", id="d")
config4=ET.SubElement(doc4, "enrichment-config")

training4=ET.SubElement(doc4, "trainingset")
training4.set('accession-source','HGNC')

gene8=ET.SubElement(training4, "gene")
gene8.text="APOB"

tree = ET.ElementTree(root)

def indent(elem, level=0):
    i = "\n" + level*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for elem in elem:
            indent(elem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i

import xml.dom.minidom
from lxml import etree

indent(root)

with open('/Users/nicole/Documents/nicolez.xml', 'w') as f:
    f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE requests SYSTEM "https://toppgene.cchmc.org/toppgenereq.dtd">')
    ElementTree.ElementTree(root).write(f,'utf-8')


xml = xml.dom.minidom.parse("/Users/nicole/Documents/nicolez.xml")
pretty_xml_as_string = xml.toprettyxml()

print(pretty_xml_as_string)

In [ ]:
#!sed -e 's/X/ /g' -e 's/Q/=/g' -e 's/9/\"/g' -e 's/\../\-/g' filename.xml >newfile.xml

In [ ]:
import xml.dom.minidom

xml = xml.dom.minidom.parse("/Users/nicole/Documents/zcopy.xml")
pretty_xml_as_string = xml.toprettyxml()

print(pretty_xml_as_string)

Run ToppGene API via command line through provided instructions

https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB The input data is an XML payload that conforms to the DTD specified by https://toppgene.cchmc.org/toppgenereq.dtd An example of how you might run a command line query from a UNIX (including Linux and Mac) command line: curl -v -H 'Content-Type: text/xml' --data @z.xml -X POST https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB

fix on server

vim ~/.curlrc

then add this to file

cacert=/etc/ssl/certs/ca-certificates.crt


In [ ]:
#Run command line tool through Python #this works on test file
!curl -v -H 'Content-Type: text/xml' --data @/Users/nicole/Documents/test.xml -X POST https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB > /Users/nicole/Documents/testoutfile.xml

In [ ]:
import xml.dom.minidom

xml = xml.dom.minidom.parse("/Users/nicole/Documents/testoutfile.xml") 
pretty_xml_as_string = xml.toprettyxml()

print(pretty_xml_as_string)

Parse ToppGene results into Pandas data frame


In [ ]:
#xml to pandas dataframe http://pandas.pydata.org/pandas-docs/stable/api.html#input-output
import xml.dom.minidom
import pandas as pd
import numpy
import pyexcel

#Parse through .xml file
def load_parse_xml(data_file):
        """Check if file exists. If file exists, load and parse the data file. """
        if os.path.isfile(data_file):
                print "File exists. Parsing..."
                data_parse = ET.ElementTree(file=data_file)
                print "File parsed."
                return data_parse
 
xmlfile = load_parse_xml("/Users/nicole/Documents/testoutfile.xml")
 
#Generate array of annotation arrays for .csv file
root_tree = xmlfile.getroot()

gene_list=[]

for child in root_tree:
    
    child.find("enrichment-results")
    
    new_array = []
    array_of_arrays=[]
    
    for type in child.iter("enrichment-result"):
        
        for annotation in type.iter("annotation"):
            array_of_arrays.append(new_array)
            new_array = []
            new_array.append(type.attrib['type'])
            new_array.append(annotation.attrib['name'])
            new_array.append(annotation.attrib['id'])
            new_array.append(annotation.attrib['pvalue'])
            new_array.append(annotation.attrib['genes-in-query'])
            new_array.append(annotation.attrib['genes-in-term'])
            new_array.append(annotation.attrib['source'])
           
            for gene in annotation.iter("gene"):
                gene_list.append(gene.attrib['symbol'])
            new_array.append(gene_list)
            gene_list =[]
     
    print array_of_arrays

In [ ]:
import pyexcel
data = array_of_arrays
pyexcel.save_as(array = data, dest_file_name = '/Users/nicole/Documents/plswork.csv')

In [ ]:
df=pandas.read_csv('/Users/nicole/Documents/plswork.csv', header=None)
df.columns=['ToppGene Feature','Annotation Name','ID','pValue','Genes-in-Query','Genes-in-Term','Source','Genes']
#df.head(15)

In [ ]:
#Dataframe for GeneOntologyMolecularFunction
df.loc[df['ToppGene Feature'] == 'GeneOntologyMolecularFunction']

In [ ]:
#Dataframe for GeneOntologyBiologicalProcess
df.loc[df['ToppGene Feature'] == 'GeneOntologyBiologicalProcess']

In [ ]:
#Dataframe for GeneOntologyCellularComponent
df.loc[df['ToppGene Feature'] == 'GeneOntologyCellularComponent']

In [ ]:
#Dataframe for Human Phenotype
df.loc[df['ToppGene Feature'] == 'HumanPheno']

In [ ]:
#Dataframe for Mouse Phenotype
df.loc[df['ToppGene Feature'] == 'MousePheno']

In [ ]:
#Dataframe for Domain
df.loc[df['ToppGene Feature'] == 'Domain']

In [ ]:
#Dataframe for Pathways
df.loc[df['ToppGene Feature'] == 'Pathway']

In [ ]:
#Dataframe for Pubmed
df.loc[df['ToppGene Feature'] == 'Pubmed']

In [ ]:
#Dataframe for Interactions
df.loc[df['ToppGene Feature'] == 'Interaction']

In [ ]:
#Dataframe for Cytobands
df.loc[df['ToppGene Feature'] == 'Cytoband']

In [ ]:
#Dataframe for Transcription Factor Binding Sites
df.loc[df['ToppGene Feature'] == 'TranscriptionFactorBindingSite']

In [ ]:
#Dataframe for Gene Family
df.loc[df['ToppGene Feature'] == 'GeneFamily']

In [ ]:
#Dataframe for Coexpression
df.loc[df['ToppGene Feature'] == 'Coexpression']

In [ ]:
#DataFrame for Coexpression Atlas
df.loc[df['ToppGene Feature'] == 'CoexpressionAtlas']

In [ ]:
#Dataframe for Computational
df.loc[df['ToppGene Feature'] == 'Computational']

In [ ]:
#Dataframe for MicroRNAs
df.loc[df['ToppGene Feature'] == 'MicroRNA']

In [ ]:
#Dataframe for Drugs
df.loc[df['ToppGene Feature'] == 'Drug']

In [ ]:
#Dataframe for Diseases
df.loc[df['ToppGene Feature'] == 'Disease']

In [ ]: