In [1]:
%matplotlib inline

In [2]:
from bokeh.plotting import *
from bokeh.models import HoverTool, ColumnDataSource

In [3]:
output_notebook()


Loading BokehJS ...

In [4]:
import pandas as pd
import numpy as np
from ggplot import *
from utils import *

In [1]:
%%writefile parse.py
def is_empty(line):
    """Returns True empty lines and lines consisting only of whitespace."""
    return (not line) or line.isspace()

def is_fasta_label(line):
    return line.strip().startswith('>')
    

def LabeledRecordFinder(is_label_line, ignore=is_empty):
    """Returns function that returns successive labeled records from file.
    Includes label line in return value. Returns list of relevant lines.
    Skips over any lines for which ignore(line) evaluates True (default is
    to skip empty lines).
    """
    def parser(lines):
        with open(lines, 'r') as lines:
            curr = []
            for l in lines:
                line = l.strip()
                if ignore(line):
                    continue
                # if we find the label, return the previous record
                if is_label_line(line):
                    if curr:
                        yield curr
                        curr = []
                curr.append(line)
            # don't forget to return the last record in the file
            if curr:
                yield curr
    return parser

    
FastaFinder = LabeledRecordFinder(is_fasta_label)


def parse_fasta(infile, finder=FastaFinder):
    """Generator of labels and sequences from a fasta file.
    """

    for rec in finder(infile):
        # first line must be a label line
        if not rec[0].startswith('>'):
            raise ValueError("Found Fasta record without label line: %s" % rec)
        # record must have at least one sequence
        if len(rec) < 2:
            raise ValueError("Found label line without sequences: %s" % rec)

        # remove the label character from the beginning of the label
        label = rec[0][1:].strip()
        seq = ''.join(rec[1:])

        yield label, seq


Writing parse.py

In [6]:
df = trange_df('../data/ROSE1', trange=range(25,46))

In [7]:
df


Out[7]:
pos Diff Temp
0 0 0.000010 26
1 1 0.000008 26
2 2 0.000005 26
3 3 0.000023 26
4 4 0.000002 26
5 5 0.000028 26
6 6 0.000000 26
7 7 0.000000 26
8 8 0.000000 26
9 9 0.000000 26
10 10 0.000586 26
11 11 0.000558 26
12 12 0.000556 26
13 13 0.000534 26
14 14 0.000505 26
15 15 0.000046 26
16 16 0.000014 26
17 17 0.000013 26
18 18 0.000270 26
19 19 0.000288 26
20 20 0.000002 26
21 21 0.000074 26
22 22 0.000008 26
23 23 0.000025 26
24 24 0.000005 26
25 25 0.000005 26
26 26 0.000014 26
27 27 0.000006 26
28 28 0.000006 26
29 29 0.000006 26
... ... ... ...
87 87 0.000054 45
88 88 0.921019 45
89 89 0.838824 45
90 90 0.004779 45
91 91 0.003466 45
92 92 0.002476 45
93 93 0.006121 45
94 94 0.006333 45
95 95 0.010703 45
96 96 0.022966 45
97 97 3.354810 45
98 98 4.385154 45
99 99 0.055384 45
100 100 0.016521 45
101 101 0.009610 45
102 102 3.105612 45
103 103 3.111791 45
104 104 3.111899 45
105 105 0.701318 45
106 106 0.700406 45
107 107 2.875615 45
108 108 2.875617 45
109 109 2.884241 45
110 110 0.988213 45
111 111 0.989322 45
112 112 0.012669 45
113 113 0.007946 45
114 114 0.018693 45
115 115 0.010069 45
116 116 0.001594 45

2340 rows × 3 columns


In [8]:
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
    id, seq = rec
    
#df['seq'] = 12 * [base for base in seq]
df['seq'] = [seq[x-1] for x in df['pos']]

In [9]:
def heatmap_T_range(df, temps, title=None):
    from bokeh.models import LinearAxis, SingleIntervalTicker
    data = df[['pos', 'seq', 'Temp', 'Diff']]
    row_range = [str(x) for x in sorted(temps)]
    col_range = [str(x) for x in set(df['pos'])]
    source = ColumnDataSource(
        data=dict(
            row=[str(x) for x in data['Temp']],
            col=[str(x) for x in data['pos']],
            base=data['seq'],
            alphas=data['Diff'],
        )
    )

    p = figure(title=title,
               tools='resize,save,box_zoom,reset,pan',
               x_range=col_range, 
               y_range=row_range,
               x_axis_type=None,
               plot_width=800)

    p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
    p.grid.grid_line_color = None
    ticker = SingleIntervalTicker(interval=100, num_minor_ticks=10)
    xaxis = LinearAxis(ticker=ticker)
    p.add_layout(xaxis, 'below')
    show(p)

In [10]:
temps = set(df['Temp'])
heatmap_T_range(df, temps, title='Human HSR1')



In [ ]:
df_dHSR = trange_df('data/dHSR1', trange=range(25,38))

In [ ]:
fasta = parse_fasta('data/dHSR1.fa')
for rec in fasta:
    id, seq = rec
    
df_dHSR['seq'] = [seq[x-1] for x in df_dHSR['pos']]

In [ ]:
temps = set(df_dHSR['Temp'])
heatmap_T_range(df_dHSR, temps, title='Drosophila HSR-1')

In [ ]:
df_ROSE = trange_df('../data/ROSE1', trange=range(25,46))
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
    id, seq = rec
    
df_ROSE['seq'] = [seq[x-1] for x in df_ROSE['pos']]

In [ ]:
temps = set(df_ROSE['Temp'])
heatmap_T_range(df_ROSE, temps, title='ROSE-1')

In [ ]:
def sequence_heatmap(df, temp, num_cols=60):
    from bokeh.models import LinearAxis, SingleIntervalTicker
    data = df[df['Temp'] == temp]
    num_rows = data.shape[0] // num_cols
    if data.shape[0] % num_cols:
        num_rows += 1

    rows = [(x // num_cols) + 1 for x in range(data.shape[0])]
    cols = [(x - x // num_cols * num_cols) + 1 for x in range(data.shape[0])]
    

    row_range = [str(x) for x in reversed(range(1,num_rows+1))]
    col_range = [str(x) for x in range(1, num_cols)]
    
    source = ColumnDataSource(
        data=dict(
            row=[str(x) for x in rows],
            col=[str(x) for x in cols],
            base=data['seq'],
            alphas=data['Diff'],
        )
    )
    
    p = figure(title='Human HSR1 prob diff', tools='resize,hover,save',
           x_range=col_range, y_range=row_range, x_axis_type=None, plot_width=800)

    p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
    text_props = {
        'source': source,
        'angle': 0,
        'color': 'black',
        'text_color': '#586e75',
        'text_align': 'center',
        'text_baseline': 'middle',
    }

    p.text(x=dict(field='col', units='data'),
           y=dict(field='row', units='data'),
           text=dict(field='base', units='data'),
           text_font_size='10pt', text_font_style='bold', **text_props)

    p.grid.grid_line_color = None
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [
        ('base', '@base'),
        ('shape', '@alphas')
    ]
    ticker = SingleIntervalTicker(interval=10, num_minor_ticks=10)
    xaxis = LinearAxis(ticker=ticker)
    p.add_layout(xaxis, 'below')
    
    show(p)

In [ ]:
sequence_heatmap(df, temp=37)

In [ ]:
sequence_heatmap(df_ROSE, temp=37, num_cols=30)

In [ ]:
ggplot(df, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')

In [ ]:
from pandas.stats.moments import rolling_mean

def compute_rolling_mean(data, window=10):
    df = data.copy()
    df['rmean'] = 0
    for temp in set(df['Temp']):
        df.loc[df['Temp'] == temp, 'rmean'] = rolling_mean(df[df['Temp'] == temp]['Diff'], window=window)
    return df

In [ ]:
df = trange_df('data/hHSR', range(32,48))
ggplot(compute_rolling_mean(df, window=15), aes(x='pos', y='rmean')) \
        + geom_line() \
        + scale_y_continuous(limits=(0,0.25)) \
        + facet_wrap('Temp')

In [ ]:
df = trange_df('data/dHSR1', range(25,37))
ggplot(compute_rolling_mean(df, window=12), aes(x='pos', y='rmean')) \
        + geom_line() \
        + facet_wrap('Temp')

In [ ]:
from IPython.html.widgets import interact

def plot_T(temp):
    p = ggplot(df[df['Temp'] == temp], aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect()
    print p
    
interact(plot_T, temp=(36,47))

In [ ]:
trange = range(35,48)
df_hHSR = compute_diff_df('data/hHSR', trange)

In [ ]:
dfd = trange_df('data/dHSR1', range(25,38))
ggplot(dfd, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')

In [ ]:
def get_sig_positions(df, trange=range(37,43), num_sigma=6):
    '''
    Given the dataframe of probability differences for a T range
    and the level of significannce in sigmas returns positions in the
    dataframe where the probability difference at the highest T
    exceeds the sigma threshold.
    '''
    colnames = ['Diff_%d' % temp for temp in trange[1:]]
    diff_cols = [df[colname] for colname in colnames]
    all_diff = pd.concat(diff_cols)
    mean = all_diff.mean()
    sigma = all_diff.std()
    threshold = num_sigma * sigma
    print 'Mean:\t%f\nSigma:\t%f\nThreshold:\t%f\n' % (mean, sigma, threshold)
    return df[abs(df['Diff_%d' % trange[-1]] - mean) > threshold].sort(['Position'])

get_sig_positions(df_hHSR, trange=trange, num_sigma=3)

In [ ]:
def plot_RMSD(df, trange=range(37,43)):
    df_sum = pd.DataFrame()
    df_sum['Temp'] = trange[1:]
    mean = df['Diff'].mean()
    df_sum['RMSD'] = [np.sqrt(((df[df['Temp'] == T]['Diff']-mean)**2).sum()) for T in trange[1:]]
    #df_sum['RMSD'] = [df[df['Temp'] == T]['Diff'].sum() for T in trange[1:]]
    p = ggplot(df_sum, aes(x='Temp', y='RMSD')) + geom_line()
    print p

In [ ]:
plot_RMSD(df, trange=range(32,47))

In [ ]:
plot_RMSD(df, trange=range(34,42))

In [ ]:
trange1 = range(25,38)
df_dHSR1 = compute_diff_df('data/dHSR1', trange1)
get_sig_positions(df_dHSR1, trange1)

In [ ]:
df_dHSR1 = trange_df('data/dHSR1', range(26,37))
plot_RMSD(df_dHSR1, trange=range(26,37))

In [ ]:
df_435 = compute_diff_df('data/hHSR435', trange)
get_sig_positions(df_435, trange=trange)

In [ ]: