In [87]:
!wget ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz


--2016-04-25 15:48:40--  http://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
Resolving ftp.ncbi.nih.gov... 130.14.250.12, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nih.gov|130.14.250.12|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2784876 (2.7M) [application/x-gzip]
Saving to: “Homo_sapiens.gene_info.gz”

100%[======================================>] 2,784,876   2.53M/s   in 1.1s    

2016-04-25 15:48:41 (2.53 MB/s) - “Homo_sapiens.gene_info.gz” saved [2784876/2784876]


In [88]:
!wget ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz


--2016-04-25 15:48:50--  http://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Mus_musculus.gene_info.gz
Resolving ftp.ncbi.nih.gov... 130.14.250.12, 2607:f220:41e:250::10
Connecting to ftp.ncbi.nih.gov|130.14.250.12|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2886858 (2.8M) [application/x-gzip]
Saving to: “Mus_musculus.gene_info.gz”

100%[======================================>] 2,886,858   2.57M/s   in 1.1s    

2016-04-25 15:48:52 (2.57 MB/s) - “Mus_musculus.gene_info.gz” saved [2886858/2886858]


In [89]:
!zcat Homo_sapiens.gene_info.gz | sed '1d' | cut -f3,5 | sed $'s/\t/|/' > Homo_sapiens_gene_to_aliases.txt

In [90]:
!zcat Mus_musculus.gene_info.gz | sed '1d' | cut -f3,5 | sed $'s/\t/|/' > Mus_musculus_gene_to_aliases.txt

In [91]:
alias_lists=[]
with open('Homo_sapiens_gene_to_aliases.txt') as infile:
    for line in infile:
        alias_lists.append(line.strip().upper().split('|'))

        
with open('Mus_musculus_gene_to_aliases.txt') as infile:
    for line in infile:
        alias_lists.append(line.strip().upper().split('|'))

In [100]:
def motif_to_genes(term,alias_list):
    term=term.upper()
    potential_mapping=[]
    for gene_aliases in alias_list:
        if term in gene_aliases:
            potential_mapping+=gene_aliases

            
    if '-' in potential_mapping:
        potential_mapping.remove('-')
    return list(set(potential_mapping))

In [103]:
#test one
motif_to_genes('NANOG',alias_lists)


Out[103]:
['ECAT4', 'PN8', 'NANOG', 'NANOGP8', 'ENK', 'NANOGP1', '2410002E02RIK']

In [107]:
meme_filename='JASPAR_CORE_REDUNDANT_2016_vertebrates.meme'
with open ('JASPAR_CORE_REDUNDANT_2016_vertebrates_mapped_to_gene_human_mouse.txt','w+') as outfile:
    
    for line in open(meme_filename):
        if 'MOTIF' in line:
            fields= line.strip().split()
            motif_id=fields[1]
            motif_name=fields[2]
            print  motif_name
            cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_').replace('(var.2)','').replace('(var.3)','')
            for term in cleaned_name.split('_'):
                #print 'PASRSING:',term
                mapped_genes=motif_to_genes(term,alias_lists)
                mapped_genes=[term.upper()]+mapped_genes
                outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join(mapped_genes)))


RUNX1
RUNX1
TFAP2A
TFAP2A
TFAP2A
Arnt
Ahr::Arnt
Ar
AR
Ar
T
T
Pax5
PAX5
NR2F1
NR2F1
CREB1
CREB1
Ddit3::Cebpa
E2F1
E2F1
E2F1
NFIL3
En1
EN1
ELK1
ELK1
Mecom
FOXF2
FOXD1
FOXC1
FOXC1
FOXL1
FOXL1
Gata1
Gata1
Gata1
GATA2
GATA2
GATA3
GATA3
Gfi1
Klf4
Klf4
Foxq1
Foxd3
FOXI1
FOXI1
HLF
HLF
HNF1A
HNF1A
Foxa2
Foxa2
NHLH1
NHLH1
IRF1
IRF1
IRF2
MEF2A
MEF2A
MEF2A
MZF1
MZF1(var.2)
MAX
MAX
MAX
MAX::MYC
NFYA
NFYA
GABPA
Gabpa
Nkx2-5
PPARG::RXRA
Pparg::Rxra
PPARG
Pax2
Pax4
PAX4
Pax6
PBX1
RORA
RORA(var.2)
RREB1
RXRA::VDR
Prrx2
Prrx2
ELK4
ELK4
SOX9
Sox17
SP1
SP1
SP1
SPI1
SPI1
Spi1
SPI1
SPIB
SRF
SRF
SRF
SRY
Sox5
znf143
ZNF143
MAFG::NFE2L1
TEAD1
TEAD1
TAL1::TCF3
Hand1::Tcf3
USF1
USF1
YY1
YY1
ETS1
Ets1
ETS1
JUN::FOS
FOS::JUN
Myb
Myb
REL
Cebpa
CEBPA
CEBPA
ZEB1
ZEB1
Mycn
Mycn
Mycn
NFKB1
NFKB1
NFKB1
NFKB1
TP53
TP53
TP53
RELA
TBP
TBP
HLTF
Spz1
ESR1
ESR1
ESR1
NR3C1
NR3C1
NR3C1
HNF4A
HNF4A
Hnf4a
NR1H2::RXRA
Znf423
Mafb
Mafb
NFIC::TLX1
Nkx3-2
NKX3-2
NKX3-1
Nkx3-1
Nobox
ZNF354C
HINFP
HINFP
Pdx1
PDX1
Lhx3
ELF5
ELF5
STAT1
STAT1
STAT1
REST
REST
CTCF
Tal1::Gata1
GATA1::TAL1
Esrrb
Esrrb
ESRRB
Pou5f1::Sox2
Sox2
Sox2
Sox2
Stat3
STAT3
Tcfcp2l1
Tcfcp2l1
TFCP2
Zfx
Zfx
Myc
Myc
FOXA1
FOXA1
FOXA1
EWSR1-FLI1
NFE2L2
Nfe2l2
Arid3a
NFATC2
HNF1B
HNF1B
EBF1
EBF1
EBF1
INSM1
FEV
FEV
FOXO3
FOXO3
HOXA5
RARA::RXRA
NR4A2
NFIC
Egr1
EGR1
PLAG1
Nr2e3
ESR2
ESR2
ARNT::HIF1A
SOX10
Atoh1
Atoh1
BATF::JUN
Bcl6
Bhlhe40
BHLHE40
CDX2
CEBPB
CEBPB
Crx
DUX4
E2F3
E2F3
E2F4
E2F6
EGR2
EGR2
ELF1
ELF1
Erg
ERG
FLI1
FLI1
FOS
FOSL1
FOSL2
FOXH1
Foxo1
FOXP1
Gata4
Gfi1b
HNF4G
Hoxc9
HSF1
HSF1
JUN
JUN(var.2)
JUNB
JUND
JUND(var.2)
Klf1
Nr1h3::Rxra
MAFF
MAFK
MEF2C
Meis1
MEIS1
Myod1
Myog
MAF::NFE2
NFYB
Nkx2-5(var.2)
NR2C2
Nr5a2
NRF1
POU2F2
PRDM1
Rfx1
RFX5
RFX5
RUNX2
RUNX2
Rxra
Rxra
SMAD2::SMAD3::SMAD4
Sox3
Sox6
SP2
STAT1::STAT2
Stat4
Stat5a::Stat5b
Stat6
Tcf12
Tcf3
TCF3
TCF7L2
TFAP2C
TFAP2C
TP63
TP63
USF2
ZBTB33
ZNF263
Bach1::Mafk
ESRRA
Esrra
FOXP2
Hoxa9
SREBF1
SREBF2
THAP1
EHF
EHF
KLF5
RFX2
RFX2
Arid3b
Arid5a
Arntl
Atf1
Atf3
NFAT5
Bhlha15
Creb3l2
Crem
DMRT3
Dux
EMX1
FOXG1
Foxj2
Gmeb1
Hes2
Id2
LBX1
LIN54
Mitf
mix-a
Mlxip
Neurog1
NFATC1
NFATC3
Npas2
Pou2f3
POU6F1
Rhox11
SHOX
Six3
Tcfl5
Twist2
ALX3
BARHL2
BHLHE41
CENPB
CREB3
DBP
ELF3
ELF4
EN2
Esrrg
ESX1
ETV6
GCM1
GRHL1
GSC
HEY2
HOXA13
HOXC11
IRF8
IRF9
ISX
JDP2
JDP2(var.2)
KLF13
LHX6
MAFG
MEF2B
MEOX1
MIXL1
MLX
MLXIPL
MSC
MSX1
MYF6
NEUROD2
NEUROG2
NFIA
NFIX
NKX2-3
NKX2-8
NKX6-1
NKX6-2
Nr2e1
Nr2f6
OLIG2
ONECUT1
PAX7
Phox2b
Pitx1
POU4F2
RUNX3
SP4
SPDEF
SPIC
TBX2
TBX20
TBX21
TFAP4
TFEB
Vdr
ZBTB7B
ZBTB7C
ZIC1
ZIC3
ZBTB18
LBX2
LHX2
LHX9
LMX1A
LMX1B
Lhx4
Lhx8
MEOX2
MNX1
MSX2
Msx3
NOTO
OTX1
OTX2
PHOX2A
PITX3
PROP1
PRRX1
RAX2
RAX
RHOXF1
Shox2
UNCX
VAX1
VAX2
VENTX
VSX1
VSX2
NR3C2
Nr2f6(var.2)
RARA
RARA(var.2)
BCL6B
EGR3
EGR4
GLI2
GLIS1
GLIS2
GLIS3
HIC2
Hic1
KLF14
KLF16
Klf12
SCRT1
SCRT2
SNAI2
SP3
SP8
YY2
ZBED1
ZBTB7A
ZIC4
ZNF410
ZNF740
CUX1
CUX2
ONECUT2
ONECUT3
E2F7
ELK3
ERF
ETV1
ETV2
ETV3
ETV4
ETV5
GATA5
GCM2
LEF1
Tcf7
HSF2
HSF4
IRF7
MEF2D
MEIS2
MEIS3
MYBL1
MYBL2
NFKB2
PAX1
PAX3
PAX9
PKNOX1
PKNOX2
POU1F1
POU2F1
POU3F1
POU3F2
POU3F3
POU3F4
POU4F1
POU4F3
POU5F1B
POU6F2
PROX1
SMAD3
TGIF1
TGIF2
RFX3
RFX4
EOMES
MGA
TBR1
TBX15
TBX19
TBX1
TBX4
TBX5
TEAD3
TEAD4
TFAP2A(var.2)
TFAP2B
TFAP2B(var.2)
TFAP2B(var.3)
TFAP2C(var.2)
TFAP2C(var.3)
Ascl2
BHLHE23
BHLHE22
CLOCK
FIGLA
HES5
HES7
HEY1
ID4
MNT
OLIG1
OLIG3
SREBF2(var.2)
Srebf1(var.2)
TCF4
TFE3
Tcf21
ATF4
ATF7
BATF3
CEBPD
CEBPE
CEBPG
CREB3L1
Creb5
NFE2
NRL
TEF
XBP1
FOXB1
FOXC2
FOXD2
FOXO4
FOXO6
FOXP3
Foxj3
Foxk1
Alx4
Alx1
RXRB
RXRG
Rarb
Rarb(var.2)
Rarg
Rarg(var.2)
TP73
GMEB2
MTF1
E2F2
E2F8
SOX21
SOX4
SOX8
Sox11
Sox1
TFEC
TFAP2A(var.3)
HOXD12
Arx
BARX1
BSX
Barhl1
CDX1
Dlx1
Dlx3
Dlx4
DLX6
Dmbx1
DUXA
Dlx2
EMX2
EVX1
EVX2
GBX1
GBX2
GSC2
GSX1
GSX2
HESX1
HMBOX1
Hmx1
Hmx2
Hmx3
HOXA10
HOXA2
HOXB13
HOXB2
HOXB3
Hoxb5
HOXC10
HOXC12
HOXC13
HOXD11
HOXD13
Hoxd8
Hoxa11
Hoxd3
Hoxd9
ISL2
Hes1

In [108]:
meme_filename='JASPAR_CORE_2016_vertebrates.meme'
with open ('JASPAR_CORE_2016_vertebrates_mapped_to_gene_human_mouse.txt','w+') as outfile:
    
    for line in open(meme_filename):
        if 'MOTIF' in line:
            fields= line.strip().split()
            motif_id=fields[1]
            motif_name=fields[2]
            print  motif_name
            cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_').replace('(var.2)','').replace('(var.3)','')
            for term in cleaned_name.split('_'):
                #print 'PASRSING:',term
                mapped_genes=motif_to_genes(term,alias_lists)
                mapped_genes=[term.upper()]+mapped_genes
                outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join(mapped_genes)))


RUNX1
TFAP2A
Arnt
Ahr::Arnt
Ar
T
PAX5
NR2F1
CREB1
Ddit3::Cebpa
E2F1
NFIL3
EN1
ELK1
Mecom
FOXF2
FOXD1
FOXC1
FOXL1
Gata1
GATA2
GATA3
Gfi1
Klf4
Foxq1
Foxd3
FOXI1
HLF
HNF1A
Foxa2
NHLH1
IRF1
IRF2
MEF2A
MZF1
MZF1(var.2)
MAX
MAX::MYC
NFYA
Gabpa
Nkx2-5
Pparg::Rxra
PPARG
Pax2
PAX4
Pax6
PBX1
RORA
RORA(var.2)
RREB1
RXRA::VDR
Prrx2
ELK4
SOX9
Sox17
SP1
SPI1
SPIB
SRF
SRY
Sox5
ZNF143
MAFG::NFE2L1
TEAD1
TAL1::TCF3
Hand1::Tcf3
USF1
YY1
ETS1
FOS::JUN
Myb
REL
CEBPA
ZEB1
Mycn
NFKB1
TP53
RELA
TBP
HLTF
Spz1
ESR1
NR3C1
Hnf4a
NR1H2::RXRA
Znf423
Mafb
NFIC::TLX1
NKX3-2
Nkx3-1
Nobox
ZNF354C
HINFP
PDX1
Lhx3
ELF5
STAT1
REST
CTCF
GATA1::TAL1
ESRRB
Pou5f1::Sox2
Sox2
STAT3
TFCP2
Zfx
Myc
FOXA1
EWSR1-FLI1
Nfe2l2
Arid3a
NFATC2
HNF1B
EBF1
INSM1
FEV
FOXO3
HOXA5
RARA::RXRA
NR4A2
NFIC
EGR1
PLAG1
Nr2e3
ESR2
ARNT::HIF1A
SOX10
Atoh1
BATF::JUN
Bcl6
BHLHE40
CDX2
CEBPB
Crx
DUX4
E2F3
E2F4
E2F6
EGR2
ELF1
ERG
FLI1
FOS
FOSL1
FOSL2
FOXH1
Foxo1
FOXP1
Gata4
Gfi1b
HNF4G
Hoxc9
HSF1
JUN
JUN(var.2)
JUNB
JUND
JUND(var.2)
Klf1
Nr1h3::Rxra
MAFF
MAFK
MEF2C
MEIS1
Myod1
Myog
MAF::NFE2
NFYB
Nkx2-5(var.2)
NR2C2
Nr5a2
NRF1
POU2F2
PRDM1
Rfx1
RFX5
RUNX2
Rxra
SMAD2::SMAD3::SMAD4
Sox3
Sox6
SP2
STAT1::STAT2
Stat4
Stat5a::Stat5b
Stat6
Tcf12
TCF3
TCF7L2
TFAP2C
TP63
USF2
ZBTB33
ZNF263
Bach1::Mafk
Esrra
FOXP2
Hoxa9
SREBF1
SREBF2
THAP1
EHF
KLF5
RFX2
Arid3b
Arid5a
Arntl
Atf1
Atf3
NFAT5
Bhlha15
Creb3l2
Crem
DMRT3
Dux
EMX1
FOXG1
Foxj2
Gmeb1
Hes2
Id2
LBX1
LIN54
Mitf
mix-a
Mlxip
Neurog1
NFATC1
NFATC3
Npas2
Pou2f3
POU6F1
Rhox11
SHOX
Six3
Tcfl5
Twist2
ALX3
BARHL2
BHLHE41
CENPB
CREB3
DBP
ELF3
ELF4
EN2
Esrrg
ESX1
ETV6
GCM1
GRHL1
GSC
HEY2
HOXA13
HOXC11
IRF8
IRF9
ISX
JDP2
JDP2(var.2)
KLF13
LHX6
MAFG
MEF2B
MEOX1
MIXL1
MLX
MLXIPL
MSC
MSX1
MYF6
NEUROD2
NEUROG2
NFIA
NFIX
NKX2-3
NKX2-8
NKX6-1
NKX6-2
Nr2e1
Nr2f6
OLIG2
ONECUT1
PAX7
Phox2b
Pitx1
POU4F2
RUNX3
SP4
SPDEF
SPIC
TBX2
TBX20
TBX21
TFAP4
TFEB
Vdr
ZBTB7B
ZBTB7C
ZIC1
ZIC3
ZBTB18
LBX2
LHX2
LHX9
LMX1A
LMX1B
Lhx4
Lhx8
MEOX2
MNX1
MSX2
Msx3
NOTO
OTX1
OTX2
PHOX2A
PITX3
PROP1
PRRX1
RAX2
RAX
RHOXF1
Shox2
UNCX
VAX1
VAX2
VENTX
VSX1
VSX2
NR3C2
Nr2f6(var.2)
RARA
RARA(var.2)
BCL6B
EGR3
EGR4
GLI2
GLIS1
GLIS2
GLIS3
HIC2
Hic1
KLF14
KLF16
Klf12
SCRT1
SCRT2
SNAI2
SP3
SP8
YY2
ZBED1
ZBTB7A
ZIC4
ZNF410
ZNF740
CUX1
CUX2
ONECUT2
ONECUT3
E2F7
ELK3
ERF
ETV1
ETV2
ETV3
ETV4
ETV5
GATA5
GCM2
LEF1
Tcf7
HSF2
HSF4
IRF7
MEF2D
MEIS2
MEIS3
MYBL1
MYBL2
NFKB2
PAX1
PAX3
PAX9
PKNOX1
PKNOX2
POU1F1
POU2F1
POU3F1
POU3F2
POU3F3
POU3F4
POU4F1
POU4F3
POU5F1B
POU6F2
PROX1
SMAD3
TGIF1
TGIF2
RFX3
RFX4
EOMES
MGA
TBR1
TBX15
TBX19
TBX1
TBX4
TBX5
TEAD3
TEAD4
TFAP2A(var.2)
TFAP2B
TFAP2B(var.2)
TFAP2B(var.3)
TFAP2C(var.2)
TFAP2C(var.3)
Ascl2
BHLHE23
BHLHE22
CLOCK
FIGLA
HES5
HES7
HEY1
ID4
MNT
OLIG1
OLIG3
SREBF2(var.2)
Srebf1(var.2)
TCF4
TFE3
Tcf21
ATF4
ATF7
BATF3
CEBPD
CEBPE
CEBPG
CREB3L1
Creb5
NFE2
NRL
TEF
XBP1
FOXB1
FOXC2
FOXD2
FOXO4
FOXO6
FOXP3
Foxj3
Foxk1
Alx4
Alx1
RXRB
RXRG
Rarb
Rarb(var.2)
Rarg
Rarg(var.2)
TP73
GMEB2
MTF1
E2F2
E2F8
SOX21
SOX4
SOX8
Sox11
Sox1
TFEC
TFAP2A(var.3)
HOXD12
Arx
BARX1
BSX
Barhl1
CDX1
Dlx1
Dlx3
Dlx4
DLX6
Dmbx1
DUXA
Dlx2
EMX2
EVX1
EVX2
GBX1
GBX2
GSC2
GSX1
GSX2
HESX1
HMBOX1
Hmx1
Hmx2
Hmx3
HOXA10
HOXA2
HOXB13
HOXB2
HOXB3
Hoxb5
HOXC10
HOXC12
HOXC13
HOXD11
HOXD13
Hoxd8
Hoxa11
Hoxd3
Hoxd9
ISL2
Hes1

In [106]:
meme_filename='FACTORBOOK.meme'
with open ('FACTORBOOK_mapped_to_gene_human_mouse.txt','w+') as outfile:
    
    for line in open(meme_filename):
        if 'MOTIF' in line:
            fields= line.strip().split()
            motif_id=fields[1]
            motif_name=fields[2]
            print  motif_name
            cleaned_name=motif_name.replace(',','').replace('-','').replace('::','_')
            for term in cleaned_name.split('_'):
                #print 'PASRSING:',term
                mapped_genes=motif_to_genes(term,alias_lists)
                mapped_genes=[term.upper()]+mapped_genes
                outfile.write('%s\t%s\t%s\n' %(motif_id,motif_name,','.join(mapped_genes)))


A-Box
AP1
AP2
BARHL2
B-Box
BHLHE40
CEBPB
CREB
CREB-ext
CTCF
CTCF-ext
E2F1
E2F4
EBF1
EGR1
ELF1
ELK4
ESR1
ESRRA
ETS1
FOXA
GABP
GATA1
GATA1-ext
GATA3
GFI1
HNF4
HSF1
MAX
MEF2
MYC
NFE2
NFKB1
NFY
NFY-UA2
NR2C2
NR3C1
NRF1
PAX5
POU2F2
PRDM1
PU1
REST
RFX5
RUNX1
RXRA
SOX2
SOX2-OCT4
SP1
SREBF1
SRF
STAT1
STAT2
TAL1
TBP
TCF12
TCF7L2
TEAD1
UA10
UA11
UA12
UA1
UA2
UA3
UA4
UA5
UA6
UA7
UA8
UA9
USF
v-JUN
v-Maf
YY1
ZEB1
ZNF143
ZNF143-ext
ZNF263
ZNF281

In [ ]: