In [1]:
%matplotlib inline
In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from ggplot import *
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
In [3]:
# Sample ids mapped to their barcodes:
#   s9      WT
#   s9+bcm  WT +BCM
#   s17     triple sRNA mutant
samples = dict([
    ('s9', ['ATCACG', 'ACAGTG']),
    ('s9+bcm', ['CGATGT', 'GCCAAT']),
    ('s17', ['TTAGGC', 'GATCAG']),
])
# Every barcode, flat, in the same order as listed per sample above.
barcodes = [
    'ATCACG', 'ACAGTG',
    'CGATGT', 'GCCAAT',
    'TTAGGC', 'GATCAG',
]
In [4]:
# Load one result table per (offset, window size) combination and stack them
# into a single frame, tagging every row with the parameters it came from.
offsets = [150, 200, 300]
winsizes = [50, 80, 100, 200]
output_tpl = '../results/dfa_mp.offset_{}.win_{}.csv'
output = []
for offset in offsets:
    for winsize in winsizes:
        # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
        # read_csv with index_col=0 and parse_dates=True reproduces its defaults.
        df = pd.read_csv(output_tpl.format(offset, winsize),
                         index_col=0, parse_dates=True)
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)
# The per-file indexes are kept as-is, so index labels may repeat across files.
dfa = pd.concat(output)
In [5]:
# UTR length is the difference between the `end_x` and `start_x` columns.
# Note: this adds a column to `dfa` in place.
dfa['UTR_length'] = dfa['end_x'].sub(dfa['start_x'])
dfa
Out[5]:
In [7]:
# thiM rows with UTR_length > 80, across all window/offset settings.
dfa.loc[
    (dfa['gene'] == 'thiM') & (dfa['UTR_length'] > 80),
    ['TSS', 'gene', 'strand', 'UTR_length',
     'ratio_ATCACG', 'ratio_ACAGTG', 'ratio_CGATGT', 'ratio_GCCAAT',
     'win', 'offset']
]
Out[7]:
In [6]:
# rpoS rows with UTR_length > 500, across all window/offset settings.
dfa.loc[
    (dfa['gene'] == 'rpoS') & (dfa['UTR_length'] > 500),
    ['TSS', 'gene', 'strand', 'UTR_length',
     'ratio_ATCACG', 'ratio_ACAGTG', 'ratio_CGATGT', 'ratio_GCCAAT',
     'win', 'offset']
]
Out[6]:
In [8]:
# Subset for offset=200 / win=80: UTR_length > 80 and ratio_ATCACG > 2,
# with log10-transformed ratio and length columns appended.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['ratio_ATCACG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
In [9]:
# `loglen` vs `log-bcm` scatter with a `lowess` smoother.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/5.)
)
print(p)
In [10]:
# `loglen` vs `log-bcm` scatter with an `ma` smoother (window=25).
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='ma', window=25)
)
print(p)
In [11]:
# `loglen` vs `log+bcm` scatter with a `lowess` smoother; y limited to (-1, 2.5).
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/5.)
    + scale_y_continuous(limits=(-1,2.5))
)
print(p)
In [12]:
# `loglen` vs `log+bcm` scatter with an `ma` smoother (window=25); y limited to (-1, 2.5).
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='ma', window=25)
    + scale_y_continuous(limits=(-1,2.5))
)
print(p)
In [13]:
# Same filter as before (offset=200, win=80, UTR_length > 80, ratio_ATCACG > 2),
# this time keeping the `TSS` and `gene` columns as well.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['ratio_ATCACG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
In [14]:
# Show the rpoS rows of the current subset.
d.loc[d['gene'] == 'rpoS']
Out[14]:
In [15]:
# `loglen` vs untransformed `ratio_ATCACG` with a `lowess` smoother (span 1/17).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/17.)
)
print(p)
In [16]:
# `loglen` vs untransformed `ratio_ATCACG` with an `ma` smoother (window=20).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='ma', window=20)
)
print(p)
In [17]:
# `loglen` vs `ratio_ATCACG`, points only, y limited to (0, 10).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + scale_y_continuous(limits=(0,10))
)
print(p)
In [18]:
# `loglen` vs `ratio_CGATGT`, points only, y limited to (0, 10).
p = (
    ggplot(d, aes(x='loglen', y='ratio_CGATGT'))
    + geom_point(alpha=0.1)
    + scale_y_continuous(limits=(0,10))
)
print(p)
In [19]:
# All fadE rows, across every window/offset setting.
dfa.loc[
    dfa['gene'] == 'fadE',
    ['TSS', 'gene', 'strand', 'UTR_length',
     'ratio_ATCACG', 'ratio_ACAGTG', 'ratio_CGATGT', 'ratio_GCCAAT',
     'win', 'offset']
]
Out[19]:
In [20]:
# UTR_length > 80 and ratio_ATCACG > 2 for every offset/win combination;
# `offset` and `win` are kept so the plots can facet on them.
d = dfa.loc[
    (dfa['UTR_length'] > 80) & (dfa['ratio_ATCACG'] > 2),
    ['TSS', 'gene', 'UTR_length',
     'ratio_ATCACG', 'ratio_CGATGT', 'offset', 'win']
].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
In [21]:
# `loglen` vs `log-bcm` with a `lowess` smoother, faceted by offset and win.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_grid('offset ~ win')
)
print(p)
In [22]:
# `loglen` vs `log-bcm` with an `ma` smoother (window=20), faceted by offset and win.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='ma', window=20)
    + facet_grid('offset ~ win')
)
print(p)
In [23]:
# `loglen` vs `log+bcm` with a `lowess` smoother, faceted by offset and win.
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_grid('offset ~ win')
)
print(p)
In [24]:
# `loglen` vs `log+bcm` with an `ma` smoother (window=20), faceted by offset and win.
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='ma', window=20)
    + facet_grid('offset ~ win')
)
print(p)
In [25]:
# offset=200 / win=80 rows with 80 < UTR_length < 600 and ratio_ATCACG > 2.
# This cell uses log2 for the transformed columns.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['UTR_length'] < 600)
    & (dfa['ratio_ATCACG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

# Long format: one frame per condition sharing a `logratio` column, then stacked.
d1 = d.loc[:, ['UTR_length', 'loglen', 'log-bcm']]
d1 = d1.rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d.loc[:, ['UTR_length', 'loglen', 'log+bcm']]
d2 = d2.rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'
_d = pd.concat([d1, d2])
In [26]:
# Show the rpoS rows of the current subset.
d.loc[d['gene'] == 'rpoS']
Out[26]:
In [27]:
# UTR length vs `logratio`, coloured by `bcm`, with a `lowess` smoother and
# enlarged axis text.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
print(p)
In [28]:
# UTR length vs `logratio`, coloured by `bcm`, with an `ma` smoother (window=25, level=0.9).
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', level=0.9, window=25, size=7)
)
print(p)
In [29]:
# UTR length vs `diff` with a `lowess` smoother.
p = (
    ggplot(d, aes(x='UTR_length', y='diff'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=4)
)
print(p)
In [30]:
# UTR length vs `diff` with an `ma` smoother (window=35, se=False).
p = (
    ggplot(d, aes(x='UTR_length', y='diff'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', se=False, window=35, size=4)
)
print(p)
In [31]:
# offset=200 / win=80 rows with 80 < UTR_length < 600 and ratio_ACAGTG > 2,
# using the ACAGTG / GCCAAT barcode pair and log10 transforms.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['UTR_length'] < 600)
    & (dfa['ratio_ACAGTG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ACAGTG', 'ratio_GCCAAT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ACAGTG'])
d['log+bcm'] = np.log10(d['ratio_GCCAAT'])
d['loglen'] = np.log10(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

# Long format: one frame per condition sharing a `logratio` column, then stacked.
d1 = d.loc[:, ['UTR_length', 'loglen', 'log-bcm']]
d1 = d1.rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d.loc[:, ['UTR_length', 'loglen', 'log+bcm']]
d2 = d2.rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'
_d = pd.concat([d1, d2])
In [32]:
# UTR length vs `logratio`, coloured by `bcm`, with a `lowess` smoother and
# enlarged axis text.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
print(p)
In [33]:
# UTR length vs `logratio`, coloured by `bcm`, with an `ma` smoother (window=25, level=0.9).
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', level=0.9, window=25, size=7)
)
print(p)
In [34]:
# Short UTRs: offset=200 / win=80 rows with 0 < UTR_length < 80.
# No ratio threshold is applied in this subset.
d = dfa.loc[
    (dfa['UTR_length'] < 80)
    & (dfa['UTR_length'] > 0)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

# Long format: one frame per condition sharing a `logratio` column, then stacked.
d1 = d.loc[:, ['UTR_length', 'loglen', 'log-bcm']]
d1 = d1.rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d.loc[:, ['UTR_length', 'loglen', 'log+bcm']]
d2 = d2.rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'
_d = pd.concat([d1, d2])
In [35]:
# UTR length vs `logratio`, coloured by `bcm`, with a `lowess` smoother;
# y limited to (-1, 2) and axis text enlarged.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=7)
    + scale_y_continuous(limits=(-1,2))
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
print(p)
In [36]:
# UTR length vs `logratio`, coloured by `bcm`, with an `ma` smoother (window=25);
# y limited to (-1, 2).
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', window=25, size=7)
    + scale_y_continuous(limits=(-1,2))
)
print(p)
In [37]:
# UTR length vs `diff` with a `lowess` smoother.
p = (
    ggplot(d, aes(x='UTR_length', y='diff'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=4)
)
print(p)
In [38]:
# UTR length vs `diff` with an `ma` smoother (window=35, se=False).
p = (
    ggplot(d, aes(x='UTR_length', y='diff'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', se=False, window=35, size=4)
)
print(p)
In [39]:
# Short UTRs (0 < UTR_length < 80, offset=200, win=80) for the
# ACAGTG / GCCAAT barcode pair. This cell uses log2 transforms.
d = dfa.loc[
    (dfa['UTR_length'] < 80)
    & (dfa['UTR_length'] > 0)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ACAGTG', 'ratio_GCCAAT']
].copy()
d['log-bcm'] = np.log2(d['ratio_ACAGTG'])
d['log+bcm'] = np.log2(d['ratio_GCCAAT'])
d['loglen'] = np.log2(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']

# Long format: one frame per condition sharing a `logratio` column, then stacked.
d1 = d.loc[:, ['UTR_length', 'loglen', 'log-bcm']]
d1 = d1.rename(columns={'log-bcm': 'logratio'})
d1['bcm'] = '-'
d2 = d.loc[:, ['UTR_length', 'loglen', 'log+bcm']]
d2 = d2.rename(columns={'log+bcm': 'logratio'})
d2['bcm'] = '+'
_d = pd.concat([d1, d2])
In [40]:
# UTR length vs `logratio`, coloured by `bcm`, with a `lowess` smoother and
# enlarged axis text.
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
print(p)
In [41]:
# UTR length vs `logratio`, coloured by `bcm`, with an `ma` smoother (window=25).
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', window=25, size=7)
)
print(p)
In [42]:
# offset=200 / win=80 rows with 80 < UTR_length < 700 for the
# ACAGTG / GCCAAT barcode pair; no ratio threshold in the filter itself.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['UTR_length'] < 700)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ACAGTG', 'ratio_GCCAAT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ACAGTG'])
d['log+bcm'] = np.log10(d['ratio_GCCAAT'])
d['loglen'] = np.log10(d['UTR_length'])
d['diff'] = d['log-bcm'] - d['log+bcm']
# NOTE(review): here `bcm` marks rows with ratio_ACAGTG > 2 ('+') versus the
# rest ('-'); it is a ratio-threshold flag, not a treatment label -- confirm
# the column name is intended.
d['bcm'] = np.where(d['ratio_ACAGTG'] > 2, '+', '-')
In [43]:
# UTR length vs `diff`, coloured by `bcm`, with a `lowess` smoother.
# NOTE(review): the y aesthetic is `diff` while the axis label reads
# "log(proximal/distal)" -- confirm the label is the intended one.
p = (
    ggplot(d, aes(x='UTR_length', y='diff', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + theme(axis_title=element_text(size=20),
            axis_text=element_text(size=20))
)
print(p)
In [44]:
# UTR length vs `diff`, coloured by `bcm`, with an `ma` smoother (window=50).
p = (
    ggplot(d, aes(x='UTR_length', y='diff', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='ma', window=50, size=3)
)
print(p)
In [45]:
# offset=200 / win=80 rows with UTR_length > 80 and ratio_ATCACG > 2.
d = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['ratio_ATCACG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']
].copy()
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])

# Long format on the untransformed ratios: one frame per condition sharing a
# `ratio` column, then stacked.
d1 = d.loc[:, ['loglen', 'ratio_ATCACG']]
d1 = d1.rename(columns={'ratio_ATCACG': 'ratio'})
d1['bcm'] = '-'
d2 = d.loc[:, ['loglen', 'ratio_CGATGT']]
d2 = d2.rename(columns={'ratio_CGATGT': 'ratio'})
d2['bcm'] = '+'
_d = pd.concat([d1, d2])
In [46]:
# `loglen` vs untransformed `ratio`, coloured by `bcm`, with a `lowess` smoother.
p = (
    ggplot(_d, aes(x='loglen', y='ratio', color='bcm'))
    + geom_point(alpha=0.2)
    + geom_smooth(method='lowess', span=1/5., size=3)
)
print(p)
In [47]:
# `loglen` vs untransformed `ratio`, coloured by `bcm`, with an `ma` smoother (window=20).
p = (
    ggplot(_d, aes(x='loglen', y='ratio', color='bcm'))
    + geom_point(alpha=0.2)
    + geom_smooth(method='ma', window=20, size=5)
)
print(p)
In [48]:
# `loglen` vs `ratio`, coloured by `bcm`, points only, y limited to (0, 10).
p = (
    ggplot(_d, aes(x='loglen', y='ratio', color='bcm'))
    + geom_point(alpha=0.3)
    + scale_y_continuous(limits=(0,10))
)
print(p)
In [50]:
# Sample id -> barcodes; `utr_scatter` looks this name up when it is called.
samples_dict = dict([
    ('s9', ['ATCACG', 'ACAGTG']),
    ('s9+bcm', ['CGATGT', 'GCCAAT']),
    ('s17', ['TTAGGC', 'GATCAG']),
])

# Columns carried into both UTR subsets.
utr_cols = [
    'TSS', 'gene', 'UTR_length',
    'ratio_ATCACG', 'ratio_CGATGT',
    'ratio_ACAGTG', 'ratio_GCCAAT',
]

# Long UTRs: 80 < UTR_length < 600 with ratio_ACAGTG > 2 (offset=200, win=80).
long_utrs = dfa.loc[
    (dfa['UTR_length'] > 80)
    & (dfa['UTR_length'] < 600)
    & (dfa['ratio_ACAGTG'] > 2)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    utr_cols
].copy()

# Short UTRs: 0 < UTR_length < 80, no ratio threshold (offset=200, win=80).
short_utrs = dfa.loc[
    (dfa['UTR_length'] < 80)
    & (dfa['UTR_length'] > 0)
    & (dfa['offset'] == 200)
    & (dfa['win'] == 80),
    utr_cols
].copy()
def utr_scatter(data, samples, cond=None, save_csv=False,
                barcodes_by_sample=None,
                csv_path='../../results/redux/fig_1b.df.csv'):
    '''
    Scatter-plot the mean log10 ratio against UTR length, one colour per sample.

    For each sample id, the `ratio_<barcode>` columns of that sample's
    barcodes are log10-transformed and averaged row-wise into `logratio`.
    The per-sample frames are stacked, optionally written to disk, and
    drawn with a lowess smoother. Rows whose gene is rpoS and whose
    UTR_length exceeds 500 are labelled 'rpoS' on the plot.

    `data`: DataFrame with `UTR_length`, `gene` and the required
            `ratio_<barcode>` columns
    `samples`: list of sample_ids (keys of `barcodes_by_sample`)
    `cond`: list of condition names, one per sample; when missing or of a
            different length than `samples`, 'cond_<i>' is used instead
    `save_csv`: when true, write the stacked frame to `csv_path`
    `barcodes_by_sample`: mapping sample_id -> list of barcodes; defaults to
            the module-level `samples_dict` as it is bound at call time
    `csv_path`: output file for `save_csv` (tab-separated)

    Returns None; the plot is rendered via print().
    Raises ValueError when `samples` is empty.
    '''
    if barcodes_by_sample is None:
        # Backward-compatible fallback: earlier callers rely on the global.
        barcodes_by_sample = samples_dict
    if len(samples) == 0:
        raise ValueError('utr_scatter needs at least one sample id')

    use_cond = bool(cond) and len(cond) == len(samples)
    frames = []
    for i, sample in enumerate(samples):
        ratio_cols = ['ratio_{}'.format(bc) for bc in barcodes_by_sample[sample]]
        # Explicit copy: assigning columns onto a bare slice of `data` is
        # chained assignment and is not guaranteed to behave consistently.
        d = data[['UTR_length', 'gene']].copy()
        d['loglen'] = np.log10(d['UTR_length'])
        # Mean of the log10 ratios across this sample's barcodes.
        d['logratio'] = np.log10(data[ratio_cols]).mean(axis=1)
        d['cond'] = cond[i] if use_cond else 'cond_{}'.format(i)
        # Vectorised label; also well-defined for an empty frame, unlike a
        # row-wise apply.
        is_long_rpos = (d['gene'] == 'rpoS') & (d['UTR_length'] > 500)
        d['label'] = np.where(is_long_rpos, 'rpoS', '')
        frames.append(d)
    df = pd.concat(frames)

    if save_csv:
        # Tab-separated despite the .csv extension; kept for compatibility
        # with whatever already reads this file.
        df.to_csv(csv_path, sep='\t')

    p = (
        ggplot(df, aes(x='UTR_length', y='logratio', color='cond', label='label'))
        + geom_point(alpha=0.25)
        + geom_text(color="black", nudge_x=20)
        + geom_smooth(method='lowess', span=1/5., size=3)
        + xlab("5' UTR length")
        + ylab("log(proximal/distal)")
        + theme(axis_title=element_text(size=20),
                axis_text=element_text(size=20))
    )
    print(p)
In [51]:
# Long UTRs, -bcm vs +bcm, using the current `samples_dict`; also writes the CSV.
utr_scatter(
    long_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm'],
    save_csv=True
)
In [73]:
# Short UTRs, -bcm vs +bcm, using the current `samples_dict`.
utr_scatter(
    short_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm']
)
In [78]:
# Rebind the module-level `samples_dict` to one barcode per sample
# (ATCACG / CGATGT); `utr_scatter` calls made after this use it.
samples_dict = dict([
    ('s9', ['ATCACG']),
    ('s9+bcm', ['CGATGT']),
])
In [79]:
# Long UTRs, -bcm vs +bcm, using the current `samples_dict`.
utr_scatter(
    long_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm']
)
In [80]:
# Short UTRs, -bcm vs +bcm, using the current `samples_dict`.
utr_scatter(
    short_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm']
)
In [81]:
# Rebind the module-level `samples_dict` to the other barcode of each sample
# (ACAGTG / GCCAAT); `utr_scatter` calls made after this use it.
samples_dict = dict([
    ('s9', ['ACAGTG']),
    ('s9+bcm', ['GCCAAT']),
])
In [82]:
# Long UTRs, -bcm vs +bcm, using the current `samples_dict`.
utr_scatter(
    long_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm']
)
In [83]:
# Short UTRs, -bcm vs +bcm, using the current `samples_dict`.
utr_scatter(
    short_utrs,
    ['s9', 's9+bcm'],
    cond=['-bcm', '+bcm']
)
In [ ]: