In [1]:
%matplotlib inline
In [2]:
from bokeh.plotting import *
from bokeh.models import HoverTool, ColumnDataSource
In [3]:
output_notebook()
In [4]:
import pandas as pd
import numpy as np
from ggplot import *
from utils import *
In [1]:
%%writefile parse.py
def is_empty(line):
"""Returns True empty lines and lines consisting only of whitespace."""
return (not line) or line.isspace()
def is_fasta_label(line):
return line.strip().startswith('>')
def LabeledRecordFinder(is_label_line, ignore=is_empty):
"""Returns function that returns successive labeled records from file.
Includes label line in return value. Returns list of relevant lines.
Skips over any lines for which ignore(line) evaluates True (default is
to skip empty lines).
"""
def parser(lines):
with open(lines, 'r') as lines:
curr = []
for l in lines:
line = l.strip()
if ignore(line):
continue
# if we find the label, return the previous record
if is_label_line(line):
if curr:
yield curr
curr = []
curr.append(line)
# don't forget to return the last record in the file
if curr:
yield curr
return parser
FastaFinder = LabeledRecordFinder(is_fasta_label)
def parse_fasta(infile, finder=FastaFinder):
"""Generator of labels and sequences from a fasta file.
"""
for rec in finder(infile):
# first line must be a label line
if not rec[0].startswith('>'):
raise ValueError("Found Fasta record without label line: %s" % rec)
# record must have at least one sequence
if len(rec) < 2:
raise ValueError("Found label line without sequences: %s" % rec)
# remove the label character from the beginning of the label
label = rec[0][1:].strip()
seq = ''.join(rec[1:])
yield label, seq
In [6]:
df = trange_df('../data/ROSE1', trange=range(25,46))
In [7]:
df
Out[7]:
In [8]:
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
id, seq = rec
#df['seq'] = 12 * [base for base in seq]
df['seq'] = [seq[x-1] for x in df['pos']]
In [9]:
def heatmap_T_range(df, temps, title=None):
from bokeh.models import LinearAxis, SingleIntervalTicker
data = df[['pos', 'seq', 'Temp', 'Diff']]
row_range = [str(x) for x in sorted(temps)]
col_range = [str(x) for x in set(df['pos'])]
source = ColumnDataSource(
data=dict(
row=[str(x) for x in data['Temp']],
col=[str(x) for x in data['pos']],
base=data['seq'],
alphas=data['Diff'],
)
)
p = figure(title=title,
tools='resize,save,box_zoom,reset,pan',
x_range=col_range,
y_range=row_range,
x_axis_type=None,
plot_width=800)
p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
p.grid.grid_line_color = None
ticker = SingleIntervalTicker(interval=100, num_minor_ticks=10)
xaxis = LinearAxis(ticker=ticker)
p.add_layout(xaxis, 'below')
show(p)
In [10]:
temps = set(df['Temp'])
heatmap_T_range(df, temps, title='Human HSR1')
In [ ]:
df_dHSR = trange_df('data/dHSR1', trange=range(25,38))
In [ ]:
fasta = parse_fasta('data/dHSR1.fa')
for rec in fasta:
id, seq = rec
df_dHSR['seq'] = [seq[x-1] for x in df_dHSR['pos']]
In [ ]:
temps = set(df_dHSR['Temp'])
heatmap_T_range(df_dHSR, temps, title='Drosophila HSR-1')
In [ ]:
df_ROSE = trange_df('../data/ROSE1', trange=range(25,46))
fasta = parse_fasta('../data/rose.fa')
for rec in fasta:
id, seq = rec
df_ROSE['seq'] = [seq[x-1] for x in df_ROSE['pos']]
In [ ]:
temps = set(df_ROSE['Temp'])
heatmap_T_range(df_ROSE, temps, title='ROSE-1')
In [ ]:
def sequence_heatmap(df, temp, num_cols=60):
from bokeh.models import LinearAxis, SingleIntervalTicker
data = df[df['Temp'] == temp]
num_rows = data.shape[0] // num_cols
if data.shape[0] % num_cols:
num_rows += 1
rows = [(x // num_cols) + 1 for x in range(data.shape[0])]
cols = [(x - x // num_cols * num_cols) + 1 for x in range(data.shape[0])]
row_range = [str(x) for x in reversed(range(1,num_rows+1))]
col_range = [str(x) for x in range(1, num_cols)]
source = ColumnDataSource(
data=dict(
row=[str(x) for x in rows],
col=[str(x) for x in cols],
base=data['seq'],
alphas=data['Diff'],
)
)
p = figure(title='Human HSR1 prob diff', tools='resize,hover,save',
x_range=col_range, y_range=row_range, x_axis_type=None, plot_width=800)
p.rect('col', 'row', 0.9, 0.9, source=source, alpha='alphas', color='#FF7F00', line_color=None)
text_props = {
'source': source,
'angle': 0,
'color': 'black',
'text_color': '#586e75',
'text_align': 'center',
'text_baseline': 'middle',
}
p.text(x=dict(field='col', units='data'),
y=dict(field='row', units='data'),
text=dict(field='base', units='data'),
text_font_size='10pt', text_font_style='bold', **text_props)
p.grid.grid_line_color = None
hover = p.select(dict(type=HoverTool))
hover.tooltips = [
('base', '@base'),
('shape', '@alphas')
]
ticker = SingleIntervalTicker(interval=10, num_minor_ticks=10)
xaxis = LinearAxis(ticker=ticker)
p.add_layout(xaxis, 'below')
show(p)
In [ ]:
sequence_heatmap(df, temp=37)
In [ ]:
sequence_heatmap(df_ROSE, temp=37, num_cols=30)
In [ ]:
ggplot(df, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')
In [ ]:
from pandas.stats.moments import rolling_mean
def compute_rolling_mean(data, window=10):
df = data.copy()
df['rmean'] = 0
for temp in set(df['Temp']):
df.loc[df['Temp'] == temp, 'rmean'] = rolling_mean(df[df['Temp'] == temp]['Diff'], window=window)
return df
In [ ]:
df = trange_df('data/hHSR', range(32,48))
ggplot(compute_rolling_mean(df, window=15), aes(x='pos', y='rmean')) \
+ geom_line() \
+ scale_y_continuous(limits=(0,0.25)) \
+ facet_wrap('Temp')
In [ ]:
df = trange_df('data/dHSR1', range(25,37))
ggplot(compute_rolling_mean(df, window=12), aes(x='pos', y='rmean')) \
+ geom_line() \
+ facet_wrap('Temp')
In [ ]:
from IPython.html.widgets import interact
def plot_T(temp):
p = ggplot(df[df['Temp'] == temp], aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect()
print p
interact(plot_T, temp=(36,47))
In [ ]:
trange = range(35,48)
df_hHSR = compute_diff_df('data/hHSR', trange)
In [ ]:
dfd = trange_df('data/dHSR1', range(25,38))
ggplot(dfd, aes(xmin='pos', xmax='pos+1', ymin=0, ymax='Diff')) + geom_rect() + facet_wrap('Temp')
In [ ]:
def get_sig_positions(df, trange=range(37,43), num_sigma=6):
'''
Given the dataframe of probability differences for a T range
and the level of significannce in sigmas returns positions in the
dataframe where the probability difference at the highest T
exceeds the sigma threshold.
'''
colnames = ['Diff_%d' % temp for temp in trange[1:]]
diff_cols = [df[colname] for colname in colnames]
all_diff = pd.concat(diff_cols)
mean = all_diff.mean()
sigma = all_diff.std()
threshold = num_sigma * sigma
print 'Mean:\t%f\nSigma:\t%f\nThreshold:\t%f\n' % (mean, sigma, threshold)
return df[abs(df['Diff_%d' % trange[-1]] - mean) > threshold].sort(['Position'])
get_sig_positions(df_hHSR, trange=trange, num_sigma=3)
In [ ]:
def plot_RMSD(df, trange=range(37,43)):
df_sum = pd.DataFrame()
df_sum['Temp'] = trange[1:]
mean = df['Diff'].mean()
df_sum['RMSD'] = [np.sqrt(((df[df['Temp'] == T]['Diff']-mean)**2).sum()) for T in trange[1:]]
#df_sum['RMSD'] = [df[df['Temp'] == T]['Diff'].sum() for T in trange[1:]]
p = ggplot(df_sum, aes(x='Temp', y='RMSD')) + geom_line()
print p
In [ ]:
plot_RMSD(df, trange=range(32,47))
In [ ]:
plot_RMSD(df, trange=range(34,42))
In [ ]:
trange1 = range(25,38)
df_dHSR1 = compute_diff_df('data/dHSR1', trange1)
get_sig_positions(df_dHSR1, trange1)
In [ ]:
df_dHSR1 = trange_df('data/dHSR1', range(26,37))
plot_RMSD(df_dHSR1, trange=range(26,37))
In [ ]:
df_435 = compute_diff_df('data/hHSR435', trange)
get_sig_positions(df_435, trange=trange)
In [ ]: