notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
from bokeh.plotting import *
from bokeh.models import HoverTool, ColumnDataSource



In [3]:

    
output_notebook()









    





    
        
        Loading BokehJS ...



In [4]:

    
import pandas as pd
import numpy as np
from ggplot import *
from utils import *



In [1]:

    
%%writefile parse.py
def is_empty(line):
    """Returns True empty lines and lines consisting only of whitespace."""
    return (not line) or line.isspace()

def is_fasta_label(line):
    return line.strip().startswith('>')
    

def LabeledRecordFinder(is_label_line, ignore=is_empty):
    """Returns function that returns successive labeled records from file.
    Includes label line in return value. Returns list of relevant lines.
    Skips over any lines for which ignore(line) evaluates True (default is
    to skip empty lines).
    """
    def parser(lines):
        with open(lines, 'r') as lines:
            curr = []
            for l in lines:
                line = l.strip()
                if ignore(line):
                    continue
                # if we find the label, return the previous record
                if is_label_line(line):
                    if curr:
                        yield curr
                        curr = []
                curr.append(line)
            # don't forget to return the last record in the file
            if curr:
                yield curr
    return parser

    
FastaFinder = LabeledRecordFinder(is_fasta_label)


def parse_fasta(infile, finder=FastaFinder):
    """Generator of labels and sequences from a fasta file.
    """

    for rec in finder(infile):
        # first line must be a label line
        if not rec[0].startswith('>'):
            raise ValueError("Found Fasta record without label line: %s" % rec)
        # record must have at least one sequence
        if len(rec) < 2:
            raise ValueError("Found label line without sequences: %s" % rec)

        # remove the label character from the beginning of the label
        label = rec[0][1:].strip()
        seq = ''.join(rec[1:])

        yield label, seq









    



Writing parse.py



In [6]:

    
df = trange_df('../data/ROSE1', trange=range(25,46))



In [7]:

    
df









    Out[7]:






  
    
      
      pos
      Diff
      Temp
    
  
  
    
      0
      0
      0.000010
      26
    
    
      1
      1
      0.000008
      26
    
    
      2
      2
      0.000005
      26
    
    
      3
      3
      0.000023
      26
    
    
      4
      4
      0.000002
      26
    
    
      5
      5
      0.000028
      26
    
    
      6
      6
      0.000000
      26
    
    
      7
      7
      0.000000
      26
    
    
      8
      8
      0.000000
      26
    
    
      9
      9
      0.000000
      26
    
    
      10
      10
      0.000586
      26
    
    
      11
      11
      0.000558
      26
    
    
      12
      12
      0.000556
      26
    
    
      13
      13
      0.000534
      26
    
    
      14
      14
      0.000505
      26
    
    
      15
      15
      0.000046
      26
    
    
      16
      16
      0.000014
      26
    
    
      17
      17
      0.000013
      26
    
    
      18
      18
      0.000270
      26
    
    
      19
      19
      0.000288
      26
    
    
      20
      20
      0.000002
      26
    
    
      21
      21
      0.000074
      26
    
    
      22
      22
      0.000008
      26
    
    
      23
      23
      0.000025
      26
    
    
      24
      24
      0.000005
      26
    
    
      25
      25
      0.000005
      26
    
    
      26
      26
      0.000014
      26
    
    
      27
      27
      0.000006
      26
    
    
      28
      28
      0.000006
      26
    
    
      29
      29
      0.000006
      26
    
    
      ...
      ...
      ...
      ...
    
    
      87
      87
      0.000054
      45
    
    
      88
      88
      0.921019
      45
    
    
      89
      89
      0.838824
      45
    
    
      90
      90
      0.004779
      45
    
    
      91
      91
      0.003466
      45
    
    
      92
      92
      0.002476
      45
    
    
      93
      93
      0.006121
      45
    
    
      94
      94
      0.006333
      45
    
    
      95
      95
      0.010703
      45
    
    
      96
      96
      0.022966
      45
    
    
      97
      97
      3.354810
      45
    
    
      98
      98
      4.385154
      45
    
    
      99
      99
      0.055384
      45
    
    
      100
      100
      0.016521
      45
    
    
      101
      101
      0.009610
      45
    
    
      102
      102
      3.105612
      45
    
    
      103
      103
      3.111791
      45
    
    
      104
      104
      3.111899
      45
    
    
      105
      105
      0.701318
      45
    
    
      106
      106
      0.700406
      45
    
    
      107
      107
      2.875615
      45
    
    
      108
      108
      2.875617
      45
    
    
      109
      109
      2.884241
      45
    
    
      110
      110
      0.988213
      45
    
    
      111
      111
      0.989322
      45
    
    
      112
      112
      0.012669
      45
    
    
      113
      113
      0.007946
      45
    
    
      114
      114
      0.018693
      45
    
    
      115
      115
      0.010069
      45
    
    
      116
      116
      0.001594
      45
    
  

2340 rows × 3 columns



In [8]:

    
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
    id, seq = rec
    
#df['seq'] = 12 * [base for base in seq]
df['seq'] = [seq[x-1] for x in df['pos']]



In [9]:

    
def heatmap_T_range(df, temps, title=None):
    from bokeh.models import LinearAxis, SingleIntervalTicker
    data = df[['pos', 'seq', 'Temp', 'Diff']]
    row_range = [str(x) for x in sorted(temps)]
    col_range = [str(x) for x in set(df['pos'])]
    source = ColumnDataSource(
        data=dict(
            row=[str(x) for x in data['Temp']],
            col=[str(x) for x in data['pos']],
            base=data['seq'],
            alphas=data['Diff'],
        )
    )

    p = figure(title=title,
               tools='resize,save,box_zoom,reset,pan',
               x_range=col_range, 
               y_range=row_range,
               x_axis_type=None,
               plot_width=800)

    p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
    p.grid.grid_line_color = None
    ticker = SingleIntervalTicker(interval=100, num_minor_ticks=10)
    xaxis = LinearAxis(ticker=ticker)
    p.add_layout(xaxis, 'below')
    show(p)



In [10]:

    
temps = set(df['Temp'])
heatmap_T_range(df, temps, title='Human HSR1')



In [ ]:

    
df_dHSR = trange_df('data/dHSR1', trange=range(25,38))



In [ ]:

    
fasta = parse_fasta('data/dHSR1.fa')
for rec in fasta:
    id, seq = rec
    
df_dHSR['seq'] = [seq[x-1] for x in df_dHSR['pos']]



In [ ]:

    
temps = set(df_dHSR['Temp'])
heatmap_T_range(df_dHSR, temps, title='Drosophila HSR-1')



In [ ]:

    
df_ROSE = trange_df('../data/ROSE1', trange=range(25,46))
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
    id, seq = rec
    
df_ROSE['seq'] = [seq[x-1] for x in df_ROSE['pos']]



In [ ]:

    
temps = set(df_ROSE['Temp'])
heatmap_T_range(df_ROSE, temps, title='ROSE-1')



In [ ]:

    
def sequence_heatmap(df, temp, num_cols=60):
    from bokeh.models import LinearAxis, SingleIntervalTicker
    data = df[df['Temp'] == temp]
    num_rows = data.shape[0] // num_cols
    if data.shape[0] % num_cols:
        num_rows += 1

    rows = [(x // num_cols) + 1 for x in range(data.shape[0])]
    cols = [(x - x // num_cols * num_cols) + 1 for x in range(data.shape[0])]
    

    row_range = [str(x) for x in reversed(range(1,num_rows+1))]
    col_range = [str(x) for x in range(1, num_cols)]
    
    source = ColumnDataSource(
        data=dict(
            row=[str(x) for x in rows],
            col=[str(x) for x in cols],
            base=data['seq'],
            alphas=data['Diff'],
        )
    )
    
    p = figure(title='Human HSR1 prob diff', tools='resize,hover,save',
           x_range=col_range, y_range=row_range, x_axis_type=None, plot_width=800)

    p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
    text_props = {
        'source': source,
        'angle': 0,
        'color': 'black',
        'text_color': '#586e75',
        'text_align': 'center',
        'text_baseline': 'middle',
    }

    p.text(x=dict(field='col', units='data'),
           y=dict(field='row', units='data'),
           text=dict(field='base', units='data'),
           text_font_size='10pt', text_font_style='bold', **text_props)

    p.grid.grid_line_color = None
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [
        ('base', '@base'),
        ('shape', '@alphas')
    ]
    ticker = SingleIntervalTicker(interval=10, num_minor_ticks=10)
    xaxis = LinearAxis(ticker=ticker)
    p.add_layout(xaxis, 'below')
    
    show(p)



In [ ]:

    
sequence_heatmap(df, temp=37)



In [ ]:

    
sequence_heatmap(df_ROSE, temp=37, num_cols=30)



In [ ]:

    
ggplot(df, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')



In [ ]:

    
from pandas.stats.moments import rolling_mean

def compute_rolling_mean(data, window=10):
    df = data.copy()
    df['rmean'] = 0
    for temp in set(df['Temp']):
        df.loc[df['Temp'] == temp, 'rmean'] = rolling_mean(df[df['Temp'] == temp]['Diff'], window=window)
    return df



In [ ]:

    
df = trange_df('data/hHSR', range(32,48))
ggplot(compute_rolling_mean(df, window=15), aes(x='pos', y='rmean')) \
        + geom_line() \
        + scale_y_continuous(limits=(0,0.25)) \
        + facet_wrap('Temp')



In [ ]:

    
df = trange_df('data/dHSR1', range(25,37))
ggplot(compute_rolling_mean(df, window=12), aes(x='pos', y='rmean')) \
        + geom_line() \
        + facet_wrap('Temp')



In [ ]:

    
from IPython.html.widgets import interact

def plot_T(temp):
    p = ggplot(df[df['Temp'] == temp], aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect()
    print p
    
interact(plot_T, temp=(36,47))



In [ ]:

    
trange = range(35,48)
df_hHSR = compute_diff_df('data/hHSR', trange)



In [ ]:

    
dfd = trange_df('data/dHSR1', range(25,38))
ggplot(dfd, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')



In [ ]:

    
def get_sig_positions(df, trange=range(37,43), num_sigma=6):
    '''
    Given the dataframe of probability differences for a T range
    and the level of significannce in sigmas returns positions in the
    dataframe where the probability difference at the highest T
    exceeds the sigma threshold.
    '''
    colnames = ['Diff_%d' % temp for temp in trange[1:]]
    diff_cols = [df[colname] for colname in colnames]
    all_diff = pd.concat(diff_cols)
    mean = all_diff.mean()
    sigma = all_diff.std()
    threshold = num_sigma * sigma
    print 'Mean:\t%f\nSigma:\t%f\nThreshold:\t%f\n' % (mean, sigma, threshold)
    return df[abs(df['Diff_%d' % trange[-1]] - mean) > threshold].sort(['Position'])

get_sig_positions(df_hHSR, trange=trange, num_sigma=3)



In [ ]:

    
def plot_RMSD(df, trange=range(37,43)):
    df_sum = pd.DataFrame()
    df_sum['Temp'] = trange[1:]
    mean = df['Diff'].mean()
    df_sum['RMSD'] = [np.sqrt(((df[df['Temp'] == T]['Diff']-mean)**2).sum()) for T in trange[1:]]
    #df_sum['RMSD'] = [df[df['Temp'] == T]['Diff'].sum() for T in trange[1:]]
    p = ggplot(df_sum, aes(x='Temp', y='RMSD')) + geom_line()
    print p



In [ ]:

    
plot_RMSD(df, trange=range(32,47))



In [ ]:

    
plot_RMSD(df, trange=range(34,42))



In [ ]:

    
trange1 = range(25,38)
df_dHSR1 = compute_diff_df('data/dHSR1', trange1)
get_sig_positions(df_dHSR1, trange1)



In [ ]:

    
df_dHSR1 = trange_df('data/dHSR1', range(26,37))
plot_RMSD(df_dHSR1, trange=range(26,37))



In [ ]:

    
df_435 = compute_diff_df('data/hHSR435', trange)
get_sig_positions(df_435, trange=trange)



In [ ]:

	pos	Diff	Temp
0	0	0.000010	26
1	1	0.000008	26
2	2	0.000005	26
3	3	0.000023	26
4	4	0.000002	26
5	5	0.000028	26
6	6	0.000000	26
7	7	0.000000	26
8	8	0.000000	26
9	9	0.000000	26
10	10	0.000586	26
11	11	0.000558	26
12	12	0.000556	26
13	13	0.000534	26
14	14	0.000505	26
15	15	0.000046	26
16	16	0.000014	26
17	17	0.000013	26
18	18	0.000270	26
19	19	0.000288	26
20	20	0.000002	26
21	21	0.000074	26
22	22	0.000008	26
23	23	0.000025	26
24	24	0.000005	26
25	25	0.000005	26
26	26	0.000014	26
27	27	0.000006	26
28	28	0.000006	26
29	29	0.000006	26
...	...	...	...
87	87	0.000054	45
88	88	0.921019	45
89	89	0.838824	45
90	90	0.004779	45
91	91	0.003466	45
92	92	0.002476	45
93	93	0.006121	45
94	94	0.006333	45
95	95	0.010703	45
96	96	0.022966	45
97	97	3.354810	45
98	98	4.385154	45
99	99	0.055384	45
100	100	0.016521	45
101	101	0.009610	45
102	102	3.105612	45
103	103	3.111791	45
104	104	3.111899	45
105	105	0.701318	45
106	106	0.700406	45
107	107	2.875615	45
108	108	2.875617	45
109	109	2.884241	45
110	110	0.988213	45
111	111	0.989322	45
112	112	0.012669	45
113	113	0.007946	45
114	114	0.018693	45
115	115	0.010069	45
116	116	0.001594	45