In [87]:
# Download the NCBI gene_info table for human.
# The scheme is spelled out instead of relying on wget's deprecated scheme-less shorthand, and
# -N (timestamping) refreshes the existing file on a re-run; without it wget keeps the old
# download and saves the new one as "Homo_sapiens.gene_info.gz.1", which later cells never read.
!wget -N https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
In [88]:
# Download the NCBI gene_info table for mouse.
# The scheme is spelled out instead of relying on wget's deprecated scheme-less shorthand, and
# -N (timestamping) refreshes the existing file on a re-run; without it wget keeps the old
# download and saves the new one as "Mus_musculus.gene_info.gz.1", which later cells never read.
!wget -N https://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz
In [89]:
# Skip the header row, keep tab-separated columns 3 and 5 of the human gene table and join
# them with '|' (presumably symbol and synonyms -- confirm against the gene_info README).
!gzip -dc Homo_sapiens.gene_info.gz | tail -n +2 | cut -f3,5 | tr '\t' '|' > Homo_sapiens_gene_to_aliases.txt
In [90]:
# Skip the header row, keep tab-separated columns 3 and 5 of the mouse gene table and join
# them with '|' (presumably symbol and synonyms -- confirm against the gene_info README).
!gzip -dc Mus_musculus.gene_info.gz | tail -n +2 | cut -f3,5 | tr '\t' '|' > Mus_musculus_gene_to_aliases.txt
In [91]:
# Build `alias_lists`: one upper-cased list of '|'-separated names per line of the
# alias files, with all human records first and all mouse records after them.
alias_lists = []
for alias_path in ('Homo_sapiens_gene_to_aliases.txt',
                   'Mus_musculus_gene_to_aliases.txt'):
    with open(alias_path) as alias_file:
        alias_lists.extend(
            record.strip().upper().split('|') for record in alias_file
        )
In [100]:
def motif_to_genes(term,alias_list):
    """Return every gene name that shares an alias record with ``term``.

    Parameters
    ----------
    term : str
        Gene symbol or alias to look up. It is upper-cased before matching, so
        the records in ``alias_list`` are expected to be upper-cased as well.
    alias_list : iterable of list of str
        One list of names per gene. A ``'-'`` entry is treated as an
        empty-field placeholder (the marker NCBI gene_info uses for "no value").

    Returns
    -------
    list of str
        The unique names of all records that contain ``term``, in first-seen
        order and never including the ``'-'`` placeholder. Empty when nothing
        matches or when ``term`` is empty or the placeholder itself.
    """
    placeholder = '-'
    term = term.upper()
    # An empty term or the placeholder is not a gene name; matching it would
    # pull in every record that merely lacks aliases.
    if not term or term == placeholder:
        return []

    seen = set()
    unique_names = []
    for gene_aliases in alias_list:
        if term not in gene_aliases:
            continue
        for name in gene_aliases:
            # Filter *every* placeholder: list.remove('-') would only drop the
            # first one, letting '-' leak out when several records match.
            if name != placeholder and name not in seen:
                seen.add(name)
                unique_names.append(name)
    return unique_names
In [103]:
# Spot-check: look up a single symbol with motif_to_genes against the loaded alias_lists.
motif_to_genes('NANOG',alias_lists)
Out[103]:
In [107]:
# Map each motif of the redundant JASPAR CORE 2016 vertebrate MEME file to gene names.
# For every "MOTIF <id> <name>" record the name is split into its components
# (separated by '::' or '_') and one tab-separated row
#   <motif id>  <motif name>  <comma-joined gene names, component first>
# is written per component. Relies on `motif_to_genes` and `alias_lists` from earlier cells.
meme_filename='JASPAR_CORE_REDUNDANT_2016_vertebrates.meme'
mapping_filename='JASPAR_CORE_REDUNDANT_2016_vertebrates_mapped_to_gene_human_mouse.txt'
# A single `with` closes the MEME input as well as the output file.
with open(meme_filename) as infile, open(mapping_filename,'w') as outfile:
    for line in infile:
        fields=line.split()
        # Require the MOTIF keyword as the first token so that lines which merely
        # contain the text "MOTIF" are not parsed as records.
        if len(fields)<2 or fields[0]!='MOTIF':
            continue
        motif_id=fields[1]
        # The name after the id may be absent; fall back to the id instead of raising IndexError.
        motif_name=fields[2] if len(fields)>2 else motif_id
        print(motif_name)
        cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_').replace('(var.2)','').replace('(var.3)','')
        for term in cleaned_name.split('_'):
            symbol=term.upper()
            # The lookup normally returns the symbol itself as well; list it only once, first.
            aliases=[gene for gene in motif_to_genes(term,alias_lists) if gene!=symbol]
            outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join([symbol]+aliases)))
In [108]:
# Map each motif of the JASPAR CORE 2016 vertebrate MEME file to gene names.
# For every "MOTIF <id> <name>" record the name is split into its components
# (separated by '::' or '_') and one tab-separated row
#   <motif id>  <motif name>  <comma-joined gene names, component first>
# is written per component. Relies on `motif_to_genes` and `alias_lists` from earlier cells.
meme_filename='JASPAR_CORE_2016_vertebrates.meme'
mapping_filename='JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt'
# A single `with` closes the MEME input as well as the output file.
with open(meme_filename) as infile, open(mapping_filename,'w') as outfile:
    for line in infile:
        fields=line.split()
        # Require the MOTIF keyword as the first token so that lines which merely
        # contain the text "MOTIF" are not parsed as records.
        if len(fields)<2 or fields[0]!='MOTIF':
            continue
        motif_id=fields[1]
        # The name after the id may be absent; fall back to the id instead of raising IndexError.
        motif_name=fields[2] if len(fields)>2 else motif_id
        print(motif_name)
        cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_').replace('(var.2)','').replace('(var.3)','')
        for term in cleaned_name.split('_'):
            symbol=term.upper()
            # The lookup normally returns the symbol itself as well; list it only once, first.
            aliases=[gene for gene in motif_to_genes(term,alias_lists) if gene!=symbol]
            outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join([symbol]+aliases)))
In [106]:
# Map each motif of the FACTORBOOK MEME file to gene names.
# For every "MOTIF <id> <name>" record the name is split into its components
# (separated by '::' or '_') and one tab-separated row
#   <motif id>  <motif name>  <comma-joined gene names, component first>
# is written per component. Relies on `motif_to_genes` and `alias_lists` from earlier cells.
meme_filename='FACTORBOOK.meme'
mapping_filename='FACTORBOOK_mapped_to_gene_human_mouse.txt'
# A single `with` closes the MEME input as well as the output file.
with open(meme_filename) as infile, open(mapping_filename,'w') as outfile:
    for line in infile:
        fields=line.split()
        # Require the MOTIF keyword as the first token so that lines which merely
        # contain the text "MOTIF" are not parsed as records.
        if len(fields)<2 or fields[0]!='MOTIF':
            continue
        motif_id=fields[1]
        # The name after the id may be absent; fall back to the id instead of raising IndexError.
        motif_name=fields[2] if len(fields)>2 else motif_id
        print(motif_name)
        cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_')
        for term in cleaned_name.split('_'):
            symbol=term.upper()
            # The lookup normally returns the symbol itself as well; list it only once, first.
            aliases=[gene for gene in motif_to_genes(term,alias_lists) if gene!=symbol]
            outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join([symbol]+aliases)))
In [ ]: