In [1]:
import os, os.path
from itertools import groupby
import numpy as np
import matplotlib.pyplot as plt
from Bio import Motif
from Bio.Seq import Seq, Alphabet

os.chdir('/home/will/Dropbox/IpythonShare/HIVTfFig/')

In [2]:
from collections import Counter
seqfiles = [('pwms/AP1.fasta', 'AP1'),
             ('pwms/coup1.fasta', 'COUP1'),
             ('pwms/coup2.fasta', 'COUP2')]
count_dict = {}
order = 'ACGT'
for f, name in seqfiles:
    seqs = [line.strip().upper() for line in open(f) if not line.startswith('>')]
    cols = []
    for col in zip(*seqs):
        cols.append(Counter(col))
    #print(cols)
    ostr = u''
    for l in order:
        ostr += l
        for count in cols:
            ostr += ' ' + str(count[l])
        ostr += '\n'
    count_dict[name] = ostr

In [3]:
print(count_dict['COUP2'])


A 0 12 0 0 1 11 0 0 0 0 0 0 0 13
C 13 1 12 0 0 0 11 12 0 0 0 0 0 0
G 0 0 0 0 12 0 2 0 0 0 0 13 13 0
T 0 0 1 13 0 2 0 1 13 13 13 0 0 0


In [4]:
from io import StringIO

motif_dict = {}
for key, tmp in count_dict.items():
    print key, type(tmp)
    motif_dict[key] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0  0 16  5  3  0 16
C  1  0  2 12  0 15  0
G  0 15  0  1  1  3  1
T  17  3  0  0 14  0  1"""

motif_dict['AP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  3  1  4  2  4  2 18 18  0
C  0  1  1  9  2 15  0  0  6
G  0  4  6  2 10  0  0  0  2
T  15 12  7  5  2  1  0  0 10"""

motif_dict['CEBP'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0  0  0  4  2  0  1  0  6  3
C  32 30 35 27  5 28 31 24 25 26
G  1  1  0  0 15  1  0  3  0  3
T  2  4  0  4 13  6  3  8  4  3"""

motif_dict['SP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0  1 12  6  0  0  0  1  2  6  6  1  3  0
C  0  0  0  7 13  3  2  0  0  4  5 10  6  3
G  2 12  1  0  0  0  0  0 11  3  1  1  0  3
T 11  0  0  0  0 10 11 12  0  0  1  1  4  7"""

motif_dict['NR2F1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u""" A 7  4  1  1  0  0 12 11 10  0  0  0  0 11
C 3  0  1  1  3 11  0  0  0  0  0  1 12  2
G 3  6 10  5  4  0  0  2  3 13  7  0  0  0
T 0  3  1  6  6  2  1  0  0  0  6 12  1  0"""


motif_dict['HNF4A'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A 0  0 6 1 0 0 0 4 2 2 0 0 3 
C  1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0  
G  0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 
T  6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""

motif_dict['COUP-T2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')


tmp = u"""A  8 13  0  3  2  0 14  3
C  1  0  0  0  2 13  0  8
G  3  1 13 11  0  0  0  2
T  1  0  1  0 10  1  0  0"""

motif_dict['NR4A2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmo = """A  4  2  4  6  7  0  0 16
C  4  5  4  4  3  0  0  0
G  5  6  3  5  2 16  0  0
T  3  3  5  1  4  0 16  0"""

motif_dict['FoxC1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  2  9  0  0 13 12  7  0
C  9  3  0  0  0  0  0  0
G  1  1 13 13  0  0  6  0
T  1  0  0  0  0  1  0 13"""


motif_dict['FEV'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A 33  4  4 42 41  5  3
C  3  0  0  0  0  3  5
G  6 37 38  0  1 33  2
T  0  1  0  0  0  1 32"""

motif_dict['SPI1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0   0   0  22  19  55  53  19   9
C  0 185 185  71  57  44  30  16  78
G 185   0   0  46  61  67  91 137  79
T   0   0   0  46  48  19  11  13  19"""

motif_dict['TFAP2A'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A 4 17  0  0  0  5
C 16  0  1 39 39  3
G  4  0  0  1  0 17
T 16 23 39  0  1 15"""

motif_dict['ETS1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A 13  0 52  0 25
C 13  5  0  0  7
G 18 48  1  0 15
T  9  0  0 53  6"""


motif_dict['GATA2'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  7 10  6 13  4 21  0 22
C  1  4  3  4 10  0  2  1
G  4  2  6  4  2  2  0  0
T 11  7  8  2  7  0 21  0"""


motif_dict['FOXC1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  2  7  0  6 14 14  0  0
C 13  0  7  0  0  0  0  1
G  0  5  5  1  0  2  0  6
T  0  4  4  9  2  0 16  9"""

motif_dict['HOXA5'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A 25  0 61  0 39 15
C 14  1  0  0  1  3
G  4 62  1  5  4 37
T 20  0  1 58 19  8"""

motif_dict['GATA3'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  4 17  0  0  0  5
C 16  0  1 39 39  3
G  4  0  0  1  0 17
T 16 23 39  0  1 15"""

motif_dict['ETS1'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  7  3  0  0 16  0
C  6  2 16 16  0 15
G  3  0  0  0  0  1
T  0 11  0  0  0  0"""


motif_dict['ZNF354C'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  358   81   81   91 1226 3298
C 1832   67   67   88 5364  981
G  176  300 6713 6643  160 1186
T 4546 6464   51   90  162 1447"""


motif_dict['NFIC'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0  8  0  0  0  0
C 19  2  1  0  3  0
G  0  2  4  0 19  1
T  3 10 17 22  0 21"""


motif_dict['SOX10'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tmp = u"""A  0  0 11  0  1  0  2  8
C  1  1  0  9  0  3  7  0
G  1 10  0  2 10  0  1  1
T  9  0  0  0  0  8  1  2"""

motif_dict['CREB'] = Motif.read(StringIO(tmp), 'jaspar-pfm')

tfs = list(motif_dict.keys())

for m in tfs:
    if not m.endswith('-R'):
        motif_dict[m+'-R'] = motif_dict[m].reverse_complement()


COUP2 <type 'unicode'>
AP1 <type 'unicode'>
COUP1 <type 'unicode'>

In [17]:
test_seqs = [('SP-1', 1, 'GAGGCGTGGC')
             #('AP1', 105, 'GGGATCA'),
             #('AP1', 120, 'CTGACCT'),
             #('AP1', 155, 'TGAGCCA'),
             #('CEBP', 281, 'ATTTCATCA'),
             #('CREB', 330, 'TGACATCG'),
             #('CEBP', 340, 'CTTTCTACA'),
             #('SP1', 377, 'GAGGCGTGGC'),
             #('NR2F1', 95, 'ACCAGGGCCAGGGAT'),
             #('NR2F1', 118, 'CACTGACCTTTGGAT'),
             #('NR2F1-R', 98, 'AGGGCCAGGGATCAG'),
             #('NR2F1', 121, 'TGACCTTTGGATGGT'),
             #('HNF4A', 99, 'GGGCCAGGGATCAG'),
             #('COUP-T2', 95, 'ACCAGGGCCAGGGA'),
             #('COUP-T2', 118, 'CACTGACCTTTGGA'),
             #('COUP-T2', 121, 'TGACCTTTGGATGG'),
             #('COUP-T2-R', 98, 'AGGGCCAGGGATCA'),
             #('NR4A2', 64, 'AAGGCTAC'),
             #('FOXC1-R', 69, 'TACTTCCC'),
             #('FEV-R', 70, 'ACTTCCCT'),
             #('SPI1-R', 70, 'ACTTCCC'),
             ('TFAP2A', 101, 'GCCAGGGATC'),
             #('ETS1-R', 105, 'GGGATC'),
             #('GATA2', 106, 'GGATC'),
             #('GATA2-R', 107, 'GATCA'),
             #('FOXC1', 108, 'ATCAGATA'),
             #('HOXA5', 110, 'CAGATATC'),
             #('GATA2', 111, 'AGATA'),
             #('GATA3', 111, 'AGATAT'),
             #('GATA3-R', 113, 'ATATCC'),
             #('ETS1', 114, 'TATCCA'),
             #('GATA2-R', 114, 'TATCC'),
             #('ZNF354C', 115, 'ATCCAC'),
             #('NR4A2', 120, 'CTGACCTT'),
             #('AP1-R', 155, 'TGAGCCA'),
             #('NFIC-R', 157, 'AGCCAG'),
             #('GATA3', 161, 'AGAGAA'),
             #('NR4A2', 174, 'GAGGCCAA'),
             #('NFIC', 176, 'GGCCAA'),
             #('SOX10-R', 178, 'CCAATG')
             
]

In [18]:
def make_seq(seq, comp = False):
    if comp:
        return Seq(seq,Alphabet.IUPAC.unambiguous_dna).reverse_complement()
    else:
        return Seq(seq,Alphabet.IUPAC.unambiguous_dna) 

def score_seq(mot, seq, comp = False):
    return mot.scanPWM(make_seq(seq, comp = comp))[0]

In [19]:
res = []
lets = 'ACTG'
for name, startpos, base_seq in test_seqs:
    bs = list(base_seq)
    TM = motif_dict[name]
    mat = np.zeros((6, len(bs)+2))
    for n in range(len(bs)):
        olet = bs[n]
        for ln, let in enumerate(lets):
            bs[n] = let
            mat[ln+1, n+1] = score_seq(TM, ''.join(bs))
        bs[n] = olet
    res.append((name, startpos, base_seq, mat.copy(), score_seq(TM, base_seq)))

In [20]:
def add_letters(lets):
    yvals = {'A':1, 'C':2, 'T':3,'G':4}
    for n, l in enumerate(lets):
        yval = yvals[l]
        plt.annotate(l, xy=(n+1, yval), fontsize=60)

def add_sig(sig_mask):
    nrows, ncols = sig_mask.shape
    rows, cols = np.where(sig_mask)
    for row, col in zip(rows, cols):
        plt.annotate('*', xy=(col, row), fontsize=60)

In [21]:
from pylab import get_cmap

#plt.figure(figsize = (10, 90))
#fig, subs = plt.subplots(3,1, figsize = (5,15))
width_per_let = 0.75
for name, startpos, seq, r, conB_score in res:
    if True:
        plt.figure(figsize = (width_per_let*len(seq),3.4))
        plt.title(name + '-' + str(startpos))
        plt.pcolor(-(r-conB_score), cmap = get_cmap('RdBu'))
        plt.yticks([1.5,2.5,3.5,4.5], ('A', 'C', 'T', 'G'))
        plt.xticks(np.arange(1.5, len(seq)+1), range(startpos,startpos+len(seq)+1))
        plt.ylim([1,5])
        plt.xlim([1,len(seq)+1])
        plt.clim([-10, 10])
        add_letters(seq)
        add_sig(abs(r[0:-1, 0:-1]-conB_score)>3.84)
        #plt.colorbar()
        plt.savefig('%s-%i-color.png' % (name, startpos))



In [10]:
for name, startpos, seq, r, _ in res:
    if True:
        bnum = score_seq(motif_dict[name], seq)
        print(bnum)
        plt.figure(figsize = (15,5))
        plt.title(name + '-' + str(startpos))
        mask = 0.5*((-(r-bnum))>3.84) + -0.5*((-(r-bnum))<-3.84)
        plt.pcolor(mask, cmap = get_cmap('RdBu'))
        plt.yticks([1.5,2.5,3.5,4.5], ('A', 'C', 'T', 'G'))
        plt.xticks(np.arange(1.5, len(seq)+1), range(startpos,startpos+len(seq)+1))
        plt.ylim([1,5])
        plt.xlim([1,len(seq)+1])
        mval = abs(r).max()
        plt.clim([-0.5, 0.5])
        add_letters(seq)
        plt.colorbar()
        plt.savefig('%s-%i-stat.png' % (name, startpos))


-1.62644
-11.0681
2.60117

In [10]:


In [11]:
len(check_seq)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-c4a9abbc94b2> in <module>()
----> 1 len(check_seq)

NameError: name 'check_seq' is not defined

In [ ]: