In [1]:
from __future__ import division
%pylab inline
import seaborn as sns
sns.set_style('ticks')
sns.set_context('paper')
import glob
from scipy.stats import ks_2samp
def parse_rnafold_ensemble(fn,norm=False):
"""Parse raw RNAfold output (w/ ensemble) into dict.
RNAfold output format:
>seqname
seq
MFE structure (deltaG)
ensemble structure ensemble free energy
centroid
mfe freq, ensmeble diversty
Return : dict
"""
def r2dict(n,s,ms,es,c,mdg,mf,edg,cdg,cd,dv):
"""Return a dictionary entry of the supplied RNAfold data"""
return n[1:],{'seq':s,'mfe_struct':ms,'ensemble_struct':es,
'centroid_struct':c,'mfe_deltaG':mdg,
'mfe_freq':mf,'ensemble_deltaG':edg,
'diversity':dv,'centroid_deltaG':cdg,
'centroid_dist':cd}
rdict = {}
name = seq = mstruct = estruct = cstruct = None
mdg = edg = cdg = dv = cd = None
ctr = 0
with open(fn,'r') as f:
for line in f:
line = line.rstrip()
if '>' == line[0]:
if name is not None:
name, d = r2dict(name,seq,mstruct,estruct,
cstruct,mdg,mf,edg,cdg,cd,dv)
rdict[name] = d
name = line
ctr = 1
continue
line = line.split()
if ctr == 1:
seq = line[0]
elif ctr == 2:
mstruct = line[0]
mdg = float(line[-1].strip('()').strip(' '))
elif ctr == 3:
estruct = line[0]
elif ctr == 4:
edg = float(line[-2])
elif ctr == 5:
cstruct = line[0]
cdg = float(line[-2].strip('{}'))
cd = float(line[-1].strip('d={}'))
elif ctr == 6:
mf = float(line[6].strip(';'))
dv = float(line[-1])
ctr +=1
# Get the straggler
try:
name, d = r2dict(name,seq,mstruct,estruct,cstruct,mdg,mf,edg,
cdg,cd,dv)
rdict[name] = d
except:
pass
if norm:
for name in rdict.iterkeys():
L = len(rdict[name]['seq'])
rdict[name]['mfe_deltaG'] /= L
rdict[name]['ensemble_deltaG'] /=L
rdict[name]['centroid_deltaG'] /=L
return rdict
def parse_lfold(fn,maxlen=100):
name = None
data = {}
with open(fn,'r') as f:
for line in f:
line = line.strip()
if '>' in line:
name = line[1:]
if name is not None:
data[name] = {}
grabnext = False
continue
if 'A' in line and 'C' in line and 'G' in line and 'T' in line:
grabnext= True
data[name]['len'] = len(line.rstrip())
continue
if grabnext:
data[name]['deltaG'] = float(line.rstrip().strip('()'))
data[name]['deltaG'] /= data[name]['len']
grabnext=False
continue
line = line.split()
try:
pos = int(line[-1])-1
val = float(line[-2].strip('()'))
plen = len(line[0])
if plen > maxlen:
continue
for i in range(plen):
if i+pos in data[name]:
v = data[name][i+pos]
else:
v = 0
data[name][i+pos] = min(v,val)
# data[name][i+pos] = v+val
except:
pass
return data
def lfold2arr(ldict):
arr = np.zeros(np.max(ldict.keys())+1)
for pos in ldict.keys():
arr[pos] = ldict[pos]
return arr
def movingaverage (values, window):
weights = np.repeat(1.0, window)/window
sma = np.convolve(values, weights, 'valid')
return sma
In [2]:
data = []
names = []
# fns = ['yeast/sc2.cen.rnafold.txt','yeast/high_loss_cde2.rnafold.txt',
fns = ['../data/rnafold/yeast/sc2.cen.rnafold.txt',
'../data/rnafold/yeast/smik.cen.rnafold.txt',
'../data/rnafold/yeast/skud.cen.rnafold.txt',
'../data/rnafold/yeast/n_dairenensis.rnafold.txt',
'../data/rnafold/yeast/n_castelli.rnafold.txt']
ns = [r"$\it{S. cerevisiae}$",r'$\it{S. mikatae}$',r'$\it{S. kudriavzevii}$',
r'$\it{S. diarenensis}$',r'$\it{S. castellii}$']
spec_data = []
for fn,nss in zip(fns,ns):
# for fn in glob.glob('yeast/*rnafold.txt'):
d = parse_rnafold_ensemble(fn)
x = [d[name]['ensemble_deltaG'] for name in d.keys()]
data.extend(x)
names.extend([nss]*len(x))
spec_data.append(x)
In [ ]:
corr = np.ones((len(spec_data),len(spec_data)))
for i,d1 in enumerate(spec_data):
for j,d2 in enumerate(spec_data):
s,p = ks_2samp(d1,d2)
corr[i,j] = p
fig, (ax, cax) = plt.subplots(nrows=2,figsize=(3,3.5),
gridspec_kw={"height_ratios":[1, 0.1]})
im = ax.imshow(corr,cmap=cm.RdBu,
rasterized=True,norm=matplotlib.colors.LogNorm(vmin=1e-3,vmax=1))
ax.vlines([2.5],-0.5,4.5,linestyles='dotted',lw=2)
ax.hlines([2.5],-0.5,4.5,linestyles='dotted',lw=2)
ax.set_xticks([])
ax.set_yticks([])
fig.colorbar(im, cax=cax,extend='both',orientation="horizontal")
setp(cax.get_xticklabels(),size=16)
cax.set_xlabel('Kolmogorov-Smirnov p-value',size=16)
plt.savefig('../figures/ks_pvalue_yeast.svg')
In [32]:
plt.figure(figsize=(1.5,2.5))
# cols = ['#e41a1c','#377eb8','#e41a1c','#e41a1c','#377eb8','#377eb8']
cols = ['#377eb8','#377eb8','#377eb8','#e41a1c','#e41a1c']
# cols = ['#fb9a99','#a6cee3','#fb9a99','#fb9a99','#a6cee3','#a6cee3']
ax = sns.violinplot(x=data,y=names, color=None,
width=0.8,linewidth=1,palette=cols,bw=0.5,inner=None)
sns.stripplot(x=data,y=names,
jitter=True, size=4, color="black", linewidth=0,marker='o')
ax.set_xticks([-20,-15,-10,-5])
# ax.set_xticklabels([-20,-11,-4],size=14)
ax.set_xlim([-20,-5])
# ax.set_yticklabels(['Native','High loss','Low loss'],size=14)
ax.tick_params(axis='y',length=0)
ax.xaxis.tick_top()
setp(ax.get_yticklabels(),size=12)
sns.despine(ax=ax,left=True,trim=False,top=False,bottom=True,offset=5)
ax.set_title('DNA secondary structure',y=1.35,size=12)
ax.xaxis.set_label_position('top')
setp(ax.get_xticklabels(),size=10,y=0.97)
ax.set_xlabel('Ensemble '+r'$\Delta$'+'G (kcal/mol)',size=10)
plt.savefig('../figures/yeast_cen_loss.svg')
In [ ]:
data = []
names = []
fns = ['../data/rnafold/homo_sapiens.2500.rnafold.txt',
'../data/rnafold/pan_troglodytes.2500.rnafold.txt',
'../data/rnafold/pan_paniscus.2500.rnafold.txt',
'../data/rnafold/gorilla_gorilla.2500.rnafold.txt',
'../data/rnafold/gorilla_berengei.2500.rnafold.txt',
'../data/rnafold/pongo_abelii.2500.rnafold.txt',
'../data/rnafold/pongo_pygmaeus.2500.rnafold.txt',
'../data/rnafold/chlorocebus_aethiops.2500.rnafold.txt',
'../data/rnafold/chlorocebus_sabaeus.2500.rnafold.txt',
'../data/rnafold/chlorocebus_pygerythrus.2500.rnafold.txt',
'../data/rnafold/macaca_mulatta.2500.rnafold.txt',
'../data/rnafold/macaca_fascicularis.2500.rnafold.txt',
'../data/rnafold/papio_anubis.2500.rnafold.txt',
'../data/rnafold/misat.sample.rnafold.txt',
'../data/rnafold/chicken_unique_cen.sim_reads_art.rnafold.txt']
spec_data = []
for fn in fns:
print fn
d = parse_rnafold_ensemble(fn)
# x = [d[name]['ensemble_deltaG'] for name in d.keys()]
# x = [d[name]['mfe_freq'] for name in d.keys()]
x = [100*d[name]['ensemble_deltaG']/len(d[name]['seq']) for name in d.keys()]
data.extend(x)
names.extend([fn]*len(x))
spec_data.append(np.random.choice(x,size=100))
In [12]:
corr = np.ones((len(spec_data),len(spec_data)))
for i,d1 in enumerate(spec_data):
for j,d2 in enumerate(spec_data):
s,p = ks_2samp(d1,d2)
corr[i,j] = p
fig, (ax, cax) = plt.subplots(nrows=2,figsize=(8,9),
gridspec_kw={"height_ratios":[1, 0.05]})
im = ax.imshow(corr,cmap=cm.RdBu,norm=matplotlib.colors.LogNorm(vmin=1e-15,vmax=1),
rasterized=True)
# ax.vlines([6.5,10.5,17.5],-0.5,21.5,linestyles='dotted',lw=1.25)
# ax.hlines([6.5,10.5,17.5],-0.5,21.5,linestyles='dotted',lw=1.25)
ax.vlines([6.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.vlines([12.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.vlines([13.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.hlines([6.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.hlines([12.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.hlines([13.5],-0.5,14.5,linestyles='dotted',lw=1.5)
ax.set_xticks([])
ax.set_yticks([])
fig.colorbar(im, cax=cax,extend='both',orientation="horizontal")
setp(cax.get_xticklabels(),size=16)
cax.set_xlabel('Kolmogorov-Smirnov p-value',size=16)
plt.savefig('../figures/ks_pvalue_multispecies.svg')
def hex2rgb(h): h = h.strip('#') return tuple([int(h[i:i+2], 16)/255 for i in (0, 2 ,4)]+[1])
In [29]:
plt.figure(figsize(1.25,1.05))
cols = ['#377eb8','#e41a1c']
# cols = ['#a6cee3','#fb9a99']
ax = sns.violinplot(x=(misat+masat)[::-1],y=names[::-1], palette=cols,notch=True,
width=0.95,linewidth=1,bw=0.4)
setp(ax.get_yticklabels(),size=12)
ax.tick_params(axis='y',length=0)
ax.set_xlabel('Ensemble '+r'$\Delta$'+'G\n(kcal/mol)',size=10)
ax.set_xlim([-40,0])
sns.despine(ax=ax,left=True,bottom=False,trim=True,offset=5)
ax.set_xticks([0,-20,-40])
setp(ax.get_yticklabels(),size=12)
setp(ax.get_xticklabels(),size=10)
ax.set_title('DNA secondary\nstructure',size=12)
plt.savefig('../figures/mouse.rnafold.svg')
In [ ]:
bins = np.linspace(-20,-5,15)
# bins = np.linspace(0,40,20)
data = []
names = []
fns = [
'../data/rnafold/homo_sapiens.2500.rnafold.txt',
'../data/rnafold/pan_troglodytes.2500.rnafold.txt',
'../data/rnafold/pan_paniscus.2500.rnafold.txt',
'../data/rnafold/gorilla_gorilla.2500.rnafold.txt',
'../data/rnafold/gorilla_berengei.2500.rnafold.txt',
'../data/rnafold/pongo_abelii.2500.rnafold.txt',
'../data/rnafold/pongo_pygmaeus.2500.rnafold.txt',
'../data/rnafold/chlorocebus_aethiops.2500.rnafold.txt',
'../data/rnafold/chlorocebus_sabaeus.2500.rnafold.txt',
'../data/rnafold/chlorocebus_pygerythrus.2500.rnafold.txt',
'../data/rnafold/macaca_mulatta.2500.rnafold.txt',
'../data/rnafold/macaca_fascicularis.2500.rnafold.txt',
'../data/rnafold/papio_anubis.2500.rnafold.txt',
'../data/rnafold/misat.sample.1000.rnafold.txt',
'../data/rnafold/equus_caballus.2500.rnafold.txt',
'../data/rnafold/chicken_unique_cen.sim_reads_art.rnafold.txt',
'../data/rnafold/stickleback.rnafold.txt',
'../data/rnafold/pombe.cen_core.sim_reads.rnafold.txt',
'../data/rnafold/oryza_sativa.2500.rnafold.txt',
'../data/rnafold/zea_mays.2500.rnafold.txt',
'../data/rnafold/arabidopsis_thaliana.2500_sim.rnafold.txt'
]
for fn in fns:
print fn
d = parse_rnafold_ensemble(fn)
x = [100*d[name]['ensemble_deltaG']/len(d[name]['seq']) for name in d.keys()]
hgram,_ = np.histogram(x,bins=bins,normed=True)
data.append(hgram)
names.append(fn)
# data = np.array(data)
In [242]:
# data
imshow(data,cmap=cm.bone_r,interpolation='none',rasterized=True)
plt.axhline(y=6.5,lw=1,c='black',ls='dotted')
plt.axhline(y=12.5,lw=1,c='black',ls='dotted')
plt.axhline(y=13.5,lw=1,c='black',ls='dotted')
plt.axhline(y=14.5,lw=1,c='black',ls='dotted')
plt.axhline(y=15.5,lw=1,c='black',ls='dotted')
plt.axhline(y=16.5,lw=1,c='black',ls='dotted')
plt.axhline(y=17.5,lw=1,c='black',ls='dotted')
plt.axvline(x=7.5,lw=1,c='black',ls='dotted')
plt.xticks([])
plt.yticks([])
plt.savefig('../figures/rnafold.heatmap.svg')
In [3]:
fns = glob.glob('../data/rnafold/human_hors/*.sim.rnafold.txt')
data = []
names = []
spec_data = []
for fn in fns:
print fn
d = parse_rnafold_ensemble(fn)
# x = [d[name]['ensemble_deltaG'] for name in d.keys()]
# x = [d[name]['mfe_freq'] for name in d.keys()]
x = [100*d[name]['ensemble_deltaG']/len(d[name]['seq']) for name in d.keys()]
data.extend(x)
names.extend([fn]*len(x))
spec_data.append(np.random.choice(x,size=50))
In [ ]:
bins = np.linspace(-25,-5,15)
data = []
names = []
fns = ['../data/rnafold/human_hors/D5Z2.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/DXZ1.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D7Z1.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/DYZ3.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D17Z1B.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D11Z1.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D7Z2.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D4Z2bn.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D19Z1.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/Xmono.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/3mono.monomers.sim.rnafold.txt',
'../data/rnafold/human_hors/D5Z1.monomers.sim.rnafold.txt']
for fn in fns:
print fn
d = parse_rnafold_ensemble(fn)
x = [100*d[name]['ensemble_deltaG']/len(d[name]['seq']) for name in d.keys()]
hgram,_ = np.histogram(x,bins=bins,normed=True)
data.append(hgram)
names.append(fn)
In [ ]:
plt.figure(figsize=(4,4))
species = ['D5Z2','DXZ1','D7Z1','DYZ3','D171B','D11Z1','D7Z2','D4Z2bn','D19Z1','Xmono','3mono','D5Z1']
cols = ['blue']*len(species)
cols = ['#e41a1c']*4 + ['#377eb8']*4 + ['grey']*4#,'grey'] + ['#6a3d9a'] + ['#4daf4a']
# cols = ['#fb9a99']*11 + ['#a6cee3']*7 + ['#e5d8bd']*4
ax = sns.violinplot(x=data,y=names, palette=cols,notch=False,
width=0.8,linewidth=1,bw=0.5,gridsize=50,inner='box',scale='width',cut=0)
# ax = sns.boxplot(x=data,y=names, color=None,notch=True,
# width=0.5,linewidth=1,whis=0.5,fliersize=0)
sns.stripplot(x=data,y=names,
jitter=True, size=4, color="black", linewidth=0,marker='o')
# ax.set_xticks([-20,-11,-4])
# ax.set_xticklabels([-20,-11,-4],size=14)
# ax.set_xlim([-20,-4])
# ax.set_yticklabels(['Native','High loss','Low loss'],size=14)
ax.set_title('Ensemble '+r'$\Delta$'+'G (kcal/mol)',size=10)
ax.tick_params(axis='y',length=0)
ax.xaxis.tick_top()
ax.set_yticklabels(species)
setp(ax.get_yticklabels(),size=10)
setp(ax.get_xticklabels(),size=10)
ax.set_xlim(-30,-5)
# ax.set_xticks([-20,-10,0])
# ax.set_xticklabels(['','-20','','-10','','0'])
ax.grid(axis='x',ls='dotted')
sns.despine(ax=ax,left=True,trim=True,top=False,bottom=True)
# plt.savefig('../figures/human_hors.svg')
In [6]:
plt.figure(figsize=(2.5,4))
imshow(data,cmap=cm.bone_r,interpolation='none',rasterized=True)
plt.axhline(y=2.5,lw=1,c='black',ls='dotted')
plt.axhline(y=3.5,lw=1,c='black',ls='dotted')
plt.axhline(y=7.5,lw=1,c='black',ls='dotted')
plt.xticks([])
plt.yticks([])
plt.savefig('../figures/hors.rnafold.heatmap.svg')
Out[6]: