Goal: Create Jupyter notebook cell that runs an enrichment analysis in ToppGene through the API
Toppgene website https://toppgene.cchmc.org/enrichment.jsp *Note: request ToppGene API through ToppGene developers
Steps:
In [ ]:
##importing python module
import os
import pandas
import qgrid
qgrid.nbinstall(overwrite=True)
qgrid.set_defaults(remote_js=True, precision=4)
import mygene
##change directory
os.chdir("/Users/nicole/Documents/CCBB Internship")
In [ ]:
##read in DESeq2 results
genes=pandas.read_csv("DE_genes.csv")
##View interactive table
##qgrid.show_grid(genes.sample(1000), grid_options={'forceFitColumns': False, 'defaultColumnWidth': 100})
#View top of file
genes.head(10)
In [ ]:
#Extract genes that are differentially expressed with a pvalue less than a certain cutoff (pvalue < 0.05 or padj < 0.05)
genes_DE_only = genes.loc[(genes.pvalue < 0.05)]
#View top of file
genes_DE_only.head(10)
In [ ]:
#Check how many rows in original genes file
len(genes)
In [ ]:
#Check how many rows in DE genes file
len(genes_DE_only)
In [ ]:
#Extract list of DE genes (Check to make sure this code works, this was adapted from a different notebook)
de_list = genes_DE_only[genes_DE_only.columns[0]]
#Remove .* from end of Ensembl ID
de_list2 = de_list.replace("\.\d","",regex=True)
#Add new column with reformatted Ensembl IDs
genes_DE_only["Full_Ensembl"] = de_list2
#View top of file
genes_DE_only.head(10)
In [ ]:
#Set up mygene.info API and query
mg = mygene.MyGeneInfo()
gene_ids = mg.getgenes(de_list2, 'name, symbol, entrezgene', as_dataframe=True)
gene_ids.index.name = "Ensembl"
gene_ids.reset_index(inplace=True)
#View top of file
gene_ids.head(10)
In [ ]:
#Merge mygene.info query results with original DE genes list
DE_with_ids = genes_DE_only.merge(gene_ids, left_on="Full_Ensembl", right_on="Ensembl", how="outer")
#Check top of file
DE_with_ids.head(10)
In [ ]:
#Write results to file
DE_with_ids.to_csv("./DE_genes_converted.csv")
In [ ]:
#Dataframe to only contain gene symbol
DE_with_ids=pandas.read_csv("./DE_genes_converted.csv")
cols = DE_with_ids.columns.tolist()
cols.insert(0, cols.pop(cols.index('symbol')))
for_xmlfile = DE_with_ids.reindex(columns= cols)
#Condense dataframe to contain only symbol
for_xmlfile.drop(for_xmlfile.columns[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,12,13,14]], axis=1, inplace=True)
#Exclude NaN values
for_xmlfile.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
for_xmlfile.head(10)
In [ ]:
#Write results to file
for_xmlfile.to_csv("./for_xmlfile.csv", index=False)
In [ ]:
#.XML file generator from gene list in .csv file
import xml.etree.cElementTree as ET
import xml.etree.cElementTree as ElementTree
import lxml
root=ET.Element("requests")
doc=ET.SubElement(root, "toppfun", id= "nicole's gene list")
config=ET.SubElement(doc, "enrichment-config")
gene_list=ET.SubElement(doc, "trainingset")
gene_list.set('accession-source','HGNC')
#for gene symbol in gene_list
toppgene = pandas.read_csv("./for_xmlfile.csv")
for i in toppgene.ix[:,0]:
gene_symbol = i
gene = ET.SubElement(gene_list, "gene")
gene.text= gene_symbol
tree = ET.ElementTree(root)
def indent(elem, level=0):
i = "\n" + level*" "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
indent(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
indent(root)
import xml.dom.minidom
from lxml import etree
with open('/Users/nicole/Documents/test.xml', 'w') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE requests SYSTEM "https://toppgene.cchmc.org/toppgenereq.dtd">')
ElementTree.ElementTree(root).write(f, 'utf-8')
xml = xml.dom.minidom.parse("/Users/nicole/Documents/test.xml")
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
http://stackoverflow.com/questions/3605680/creating-a-simple-xml-file-using-python
In [ ]:
import xml.etree.cElementTree as ET
import lxml
root=ET.Element("requests")
doc=ET.SubElement(root, "toppfun", id="a")
config=ET.SubElement(doc, "enrichment-config")
#training=ET.SubElement(doc, "trainingset accession-source='HGNC'")
training=ET.SubElement(doc, "trainingset")
training.set('accession-source','HGNC')
#features=ET.SubElement(config, "features")
gene1=ET.SubElement(training, "gene")
gene2=ET.SubElement(training, "gene")
gene3=ET.SubElement(training, "gene")
gene1.text="APOB"
gene2.text="APOE"
gene3.text="AIR1"
####
doc2= ET.SubElement(root, "toppgene", id="b")
config2=ET.SubElement(doc2, "enrichment-config")
prior=ET.SubElement(doc2,"prioritization-config")
training2=ET.SubElement(doc2, "trainingset")
training2.set('accession-source','HGNC')
gene4=ET.SubElement(training2, "gene")
gene4.text="SRF"
#testset=ET.SubElement(doc2, "testset accession-source='HGNC'")
testset=ET.SubElement(doc2, "testset")
testset.set('accession-source','HGNC')
gene5=ET.SubElement(testset, "gene")
gene5.text="APOB"
####
doc3= ET.SubElement(root, "toppnet", id="Pattern")
config3=ET.SubElement(doc3, "net-config") ##net-config
markov=ET.SubElement (config3, "k-step-markov step-size='6'") #k-step-markov step-size='6'
visual=ET.SubElement (config3, "visualizer neighborhood-distance='1'") ##visualizer neighborhood-distance="1"
training3=ET.SubElement(doc3, "trainingset")
training3.set('accession-source','HGNC')
gene6=ET.SubElement(training3, "gene")
gene6.text="SRF"
#testset2=ET.SubElement(doc3, "testset accession-source='ENTREZ'")
testset2=ET.SubElement(doc3, "testset")
testset2.set('accession-source','ENTREZ')
gene7=ET.SubElement(testset2,"gene")
gene7.text="338"
###
doc4= ET.SubElement (root, "toppgenet", id="d")
config4=ET.SubElement(doc4, "enrichment-config")
training4=ET.SubElement(doc4, "trainingset")
training4.set('accession-source','HGNC')
gene8=ET.SubElement(training4, "gene")
gene8.text="APOB"
tree = ET.ElementTree(root)
def indent(elem, level=0):
i = "\n" + level*" "
if len(elem):
if not elem.text or not elem.text.strip():
elem.text = i + " "
if not elem.tail or not elem.tail.strip():
elem.tail = i
for elem in elem:
indent(elem, level+1)
if not elem.tail or not elem.tail.strip():
elem.tail = i
else:
if level and (not elem.tail or not elem.tail.strip()):
elem.tail = i
import xml.dom.minidom
from lxml import etree
indent(root)
with open('/Users/nicole/Documents/nicolez.xml', 'w') as f:
f.write('<?xml version="1.0" encoding="UTF-8" ?><!DOCTYPE requests SYSTEM "https://toppgene.cchmc.org/toppgenereq.dtd">')
ElementTree.ElementTree(root).write(f,'utf-8')
xml = xml.dom.minidom.parse("/Users/nicole/Documents/nicolez.xml")
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
In [ ]:
#!sed -e 's/X/ /g' -e 's/Q/=/g' -e 's/9/\"/g' -e 's/\../\-/g' filename.xml >newfile.xml
In [ ]:
import xml.dom.minidom
xml = xml.dom.minidom.parse("/Users/nicole/Documents/zcopy.xml")
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB The input data is an XML payload that conforms to the DTD specified by https://toppgene.cchmc.org/toppgenereq.dtd An example of how you might run a command line query from a UNIX (including Linux and Mac) command line: curl -v -H 'Content-Type: text/xml' --data @z.xml -X POST https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB
In [ ]:
#Run command line tool through Python #this works on test file
!curl -v -H 'Content-Type: text/xml' --data @/Users/nicole/Documents/test.xml -X POST https://toppgene.cchmc.org/api/44009585-27C5-41FD-8279-A5FE1C86C8DB > /Users/nicole/Documents/testoutfile.xml
In [ ]:
import xml.dom.minidom
xml = xml.dom.minidom.parse("/Users/nicole/Documents/testoutfile.xml")
pretty_xml_as_string = xml.toprettyxml()
print(pretty_xml_as_string)
In [ ]:
#xml to pandas dataframe http://pandas.pydata.org/pandas-docs/stable/api.html#input-output
import xml.dom.minidom
import pandas as pd
import numpy
import pyexcel
#Parse through .xml file
def load_parse_xml(data_file):
"""Check if file exists. If file exists, load and parse the data file. """
if os.path.isfile(data_file):
print "File exists. Parsing..."
data_parse = ET.ElementTree(file=data_file)
print "File parsed."
return data_parse
xmlfile = load_parse_xml("/Users/nicole/Documents/testoutfile.xml")
#Generate array of annotation arrays for .csv file
root_tree = xmlfile.getroot()
gene_list=[]
for child in root_tree:
child.find("enrichment-results")
new_array = []
array_of_arrays=[]
for type in child.iter("enrichment-result"):
for annotation in type.iter("annotation"):
array_of_arrays.append(new_array)
new_array = []
new_array.append(type.attrib['type'])
new_array.append(annotation.attrib['name'])
new_array.append(annotation.attrib['id'])
new_array.append(annotation.attrib['pvalue'])
new_array.append(annotation.attrib['genes-in-query'])
new_array.append(annotation.attrib['genes-in-term'])
new_array.append(annotation.attrib['source'])
for gene in annotation.iter("gene"):
gene_list.append(gene.attrib['symbol'])
new_array.append(gene_list)
gene_list =[]
print array_of_arrays
In [ ]:
import pyexcel
data = array_of_arrays
pyexcel.save_as(array = data, dest_file_name = '/Users/nicole/Documents/plswork.csv')
In [ ]:
df=pandas.read_csv('/Users/nicole/Documents/plswork.csv', header=None)
df.columns=['ToppGene Feature','Annotation Name','ID','pValue','Genes-in-Query','Genes-in-Term','Source','Genes']
#df.head(15)
In [ ]:
#Dataframe for GeneOntologyMolecularFunction
df.loc[df['ToppGene Feature'] == 'GeneOntologyMolecularFunction']
In [ ]:
#Dataframe for GeneOntologyBiologicalProcess
df.loc[df['ToppGene Feature'] == 'GeneOntologyBiologicalProcess']
In [ ]:
#Dataframe for GeneOntologyCellularComponent
df.loc[df['ToppGene Feature'] == 'GeneOntologyCellularComponent']
In [ ]:
#Dataframe for Human Phenotype
df.loc[df['ToppGene Feature'] == 'HumanPheno']
In [ ]:
#Dataframe for Mouse Phenotype
df.loc[df['ToppGene Feature'] == 'MousePheno']
In [ ]:
#Dataframe for Domain
df.loc[df['ToppGene Feature'] == 'Domain']
In [ ]:
#Dataframe for Pathways
df.loc[df['ToppGene Feature'] == 'Pathway']
In [ ]:
#Dataframe for Pubmed
df.loc[df['ToppGene Feature'] == 'Pubmed']
In [ ]:
#Dataframe for Interactions
df.loc[df['ToppGene Feature'] == 'Interaction']
In [ ]:
#Dataframe for Cytobands
df.loc[df['ToppGene Feature'] == 'Cytoband']
In [ ]:
#Dataframe for Transcription Factor Binding Sites
df.loc[df['ToppGene Feature'] == 'TranscriptionFactorBindingSite']
In [ ]:
#Dataframe for Gene Family
df.loc[df['ToppGene Feature'] == 'GeneFamily']
In [ ]:
#Dataframe for Coexpression
df.loc[df['ToppGene Feature'] == 'Coexpression']
In [ ]:
#DataFrame for Coexpression Atlas
df.loc[df['ToppGene Feature'] == 'CoexpressionAtlas']
In [ ]:
#Dataframe for Computational
df.loc[df['ToppGene Feature'] == 'Computational']
In [ ]:
#Dataframe for MicroRNAs
df.loc[df['ToppGene Feature'] == 'MicroRNA']
In [ ]:
#Dataframe for Drugs
df.loc[df['ToppGene Feature'] == 'Drug']
In [ ]:
#Dataframe for Diseases
df.loc[df['ToppGene Feature'] == 'Disease']
In [ ]: