In [1]:
%matplotlib inline
In [132]:
#%%writefile utils.py
import pandas as pd
import numpy as np
from ggplot import *
def compute_prob_vector(ps_file, prob_paired=True):
'''
Given a text file derived from the RNAfold output of the form
i j sqrt(prob) ubox
computes a vector (dict) of probabilities for every nucleotide
to be in paired (or unpaired) state.
'''
prob_vector = {}
with open(ps_file) as fi:
for line in fi.readlines():
line = line.strip()
posi, posj, sqrt_prob, box = line.split()
curr_i = prob_vector.get(int(posi), 0)
curr_j = prob_vector.get(int(posj), 0)
prob_vector.update({
int(posi)-1: curr_i + float(sqrt_prob)**2,
int(posj)-1: curr_j + float(sqrt_prob)**2,
})
if prob_paired:
return prob_vector
else:
return dict([(pos, 1-p) for pos,p in prob_vector.items()])
def compute_prob_vector_max(ps_file, prob_paired=True):
'''
Given a text file derived from the RNAfold output of the form
i j sqrt(prob) ubox
computes a vector (dict) of probabilities for every nucleotide
to be in paired (or unpaired) state.
'''
prob_vector = {}
with open(ps_file) as fi:
for line in fi.readlines():
line = line.strip()
posi, posj, sqrt_prob, box = line.split()
curr_i = prob_vector.get(int(posi), 0)
curr_j = prob_vector.get(int(posj), 0)
curr_prob = float(sqrt_prob)**2
indi = int(posi)
indj = int(posj)
if indi in prob_vector:
if curr_prob > prob_vector[indi]:
prob_vector[indi] = curr_prob
else:
prob_vector[indi] = curr_prob
if indj in prob_vector:
if curr_prob > prob_vector[indj]:
prob_vector[indj] = curr_prob
else:
prob_vector[indj] = curr_prob
if prob_paired:
return prob_vector
else:
return dict([(pos, 1-p) for pos,p in prob_vector.items()])
def trange_df(base_name, prob_func=compute_prob_vector,
trange=range(35,43), abs_value=True):
'''
Same as `compute_diff_df` but builds dataframe in a long format
suitable for ggplot faceting.
'''
T0 = trange[0]
prob0 = prob_func('%s_%d.txt' % (base_name, T0))
chunks = []
for temp in trange[1:]:
df = pd.DataFrame()
prob_vector = prob_func('%s_%d.txt' % (base_name,temp))
npos = max(set(prob0.keys()) | set(prob_vector.keys())) + 1
d0 = np.zeros(npos)
dt = np.zeros(npos)
d0[list(prob0.keys())] = list(prob0.values())
dt[list(prob_vector.keys())] = list(prob_vector.values())
df['pos'] = range(npos)
if abs_value:
df['Diff'] = abs(d0 - dt)
else:
df['Diff'] = dt - d0
df['Temp'] = temp
chunks.append(df)
return pd.concat(chunks)
def sig_positions(df, num_sigma=6):
mean = df['Diff'].mean()
sigma = df['Diff'].std()
threshold = num_sigma * sigma
return abs(df['Diff'] - mean) > threshold
def compute_diff_df(base_name, trange=range(35,43), abs_value=True):
'''
Given the base_name for tab-delimited files containing base
pairing probabilities calculated by RNAfold computes a
dataframe containing probability difference vectors for each
temperature value in the range relative to the lowest T in the
range.
'''
T0 = trange[0]
prob = compute_prob_vector('%s_%d.txt' % (base_name, T0))
df = pd.DataFrame(prob.items(), columns=['Position', 'Prob_%d' % T0])
for temp in trange[1:]:
prob = compute_prob_vector('%s_%d.txt' % (base_name, temp))
prob_key = 'Prob_%d' % temp
df[prob_key] = pd.Series(prob.values())
if abs_value:
df['Diff_%d' % temp] = abs(df[prob_key] - df['Prob_%d' % T0])
else:
df['Diff_%d' % temp] = df[prob_key] - df['Prob_%d' % T0]
return df
def get_sig_positions(df, trange=range(37,43), num_sigma=6):
'''
Given the dataframe of probability differences for a T range
and the level of significannce in sigmas returns positions in the
dataframe where the probability difference at the highest T
exceeds the sigma threshold.
'''
colnames = ['Diff_%d' % temp for temp in trange[1:]]
diff_cols = [df[colname] for colname in colnames]
all_diff = pd.concat(diff_cols)
mean = all_diff.mean()
sigma = all_diff.std()
threshold = num_sigma * sigma
print('Mean:\t%f\nSigma:\t%f\nThreshold:\t%f\n' % (mean, sigma, threshold))
return df[abs(df['Diff_%d' % trange[-1]] - mean) > threshold].sort(['Position'])
def plot_RSMD(df, trange=range(37,43)):
df_sum = pd.DataFrame()
df_sum['Temp'] = trange[1:]
df_sum['RMSD'] = [np.sqrt(((df[df['Temp'] == T]['Diff'])**2).sum()) for T in trange[1:]]
p = ggplot(df_sum, aes(x='Temp', y='RMSD')) + geom_line()
return p
In [131]:
ls -lah
In [112]:
df = trange_df('../data/ROSE1', trange=range(25,37))
df
Out[112]:
In [113]:
compute_prob_vector('../data/ROSE1_25.txt')
Out[113]:
In [114]:
compute_prob_vector_max('../data/ROSE1_25.txt')
Out[114]:
In [115]:
df = trange_df('../data/ROSE1', trange=range(25,37))
df
Out[115]:
In [116]:
df_max = trange_df('../data/ROSE1', prob_func=compute_prob_vector_max,
trange=range(25,37))
df_max
Out[116]:
In [117]:
df.describe()
Out[117]:
In [118]:
df['Diff'].mean()
Out[118]:
In [119]:
df['Diff'].std()
Out[119]:
In [120]:
df[df['Temp'] == 35]
Out[120]:
In [122]:
g = df.groupby('Temp')
for name,group in g:
print(name, group)
In [123]:
%matplotlib inline
In [124]:
for name,group in g:
group.plot(x='pos', y='Diff', title=name)
In [125]:
g = ggplot(df, aes(xmin='pos-1',xmax='pos', ymin=0, ymax='Diff')) \
+ geom_rect() \
+ facet_wrap('Temp')
In [127]:
print(g)
In [128]:
g = ggplot(df_max, aes(xmin='pos-1',xmax='pos', ymin=0, ymax='Diff')) \
+ geom_rect() \
+ facet_wrap('Temp')
print(g)
In [ ]: