In [1]:
    
# IPython magic: show matplotlib figures inline, below the cell that draws them.
%matplotlib inline
    
In [2]:
    
#from __future__ import division
import pandas as pd
import numpy as np
from plotnine import *
    
In [3]:
    
# Shell escape: list the CSV files under ../data (long format, human-readable sizes).
!ls -lah ../data/*csv
    
    
In [4]:
    
# Load one CSV per (offset, window size) combination and stack them into a
# single frame, tagging every row with the parameters of the file it came from.
offsets = [150, 200, 300]
winsizes = [50, 80, 100, 200]
output_tpl = '../data/dfa_mp.offset_{}.win_{}.csv'
output = []
for offset in offsets:
    for winsize in winsizes:
        # DataFrame.from_csv was deprecated and later removed from pandas.
        # read_csv with index_col=0 and parse_dates=True reproduces its defaults
        # (first column as index, index parsed as dates where possible).
        df = pd.read_csv(output_tpl.format(offset, winsize),
                         index_col=0, parse_dates=True)
        df['win'] = winsize
        df['offset'] = offset
        output.append(df)

# Row labels are kept as read from each file, so they may repeat across files.
dfa = pd.concat(output)
    
In [5]:
    
# Add UTR_length as end_x minus start_x, then display the combined frame.
dfa['UTR_length'] = dfa['end_x'].sub(dfa['start_x'])
dfa
    
    Out[5]:
In [6]:
    
# Working subset for the offset=200 / win=80 run: keep rows with
# UTR_length > 80 and ratio_ATCACG > 2, and only the columns needed for plotting.
keep_rows = (
    dfa['UTR_length'].gt(80)
    & dfa['ratio_ATCACG'].gt(2)
    & dfa['offset'].eq(200)
    & dfa['win'].eq(80)
)
d = dfa.loc[keep_rows, ['UTR_length', 'ratio_ATCACG', 'ratio_CGATGT']].copy()

# log10-transformed versions of the two ratio columns and of the length.
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
    
In [7]:
    
# Scatter of log-bcm against loglen with a lowess trend line (span = 1/7).
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/7.)
)
print(p)
    
    
    
    
In [11]:
    
# Scatter of log-bcm against loglen, smoothed with the 'mavg' method.
# NOTE(review): no window size is passed here. plotnine's 'mavg' smoother
# appears to expect one via method_args={'window': N} — confirm this cell
# still runs on a fresh kernel and pick an explicit window if it does not.
p = ggplot(d, aes(x='loglen', y='log-bcm')) \
        + geom_point(alpha=0.1) \
        + geom_smooth(method='mavg')
print(p)
    
    
    
    
In [12]:
    
# Scatter of log+bcm against loglen with a lowess trend line (span = 1/5);
# the y axis is limited to [-1, 2.5].
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/5.)
    + scale_y_continuous(limits=(-1, 2.5))
)
print(p)
    
    
    
    
In [12]:
    
# Scatter of log+bcm against loglen, smoothed with a 25-point moving average;
# the y axis is limited to [-1, 2.5].
# plotnine calls this smoother 'mavg' and receives the window size through
# method_args; method='ma' with a bare window= keyword is not part of its API
# (presumably carried over from a different ggplot port).
p = (
    ggplot(d, aes(x='loglen', y='log+bcm'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='mavg', method_args={'window': 25})
    + scale_y_continuous(limits=(-1, 2.5))
)
print(p)
    
    
    
    
In [13]:
    
# Untransformed ratio_ATCACG against loglen with a tight lowess fit (span = 1/17).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='lowess', span=1/17.)
)
print(p)
    
    
    
In [14]:
    
# Untransformed ratio_ATCACG against loglen, smoothed with a 20-point moving
# average.
# plotnine calls this smoother 'mavg' and receives the window size through
# method_args; method='ma' with a bare window= keyword is not part of its API
# (presumably carried over from a different ggplot port).
p = (
    ggplot(d, aes(x='loglen', y='ratio_ATCACG'))
    + geom_point(alpha=0.1)
    + geom_smooth(method='mavg', method_args={'window': 20})
)
print(p)
    
    
    
    
In [15]:
    
# Same row thresholds (UTR_length > 80, ratio_ATCACG > 2) but across every
# offset / window combination; offset and win are kept so plots can facet on them.
keep_rows = dfa['UTR_length'].gt(80) & dfa['ratio_ATCACG'].gt(2)
keep_cols = [
    'TSS', 'gene', 'UTR_length',
    'ratio_ATCACG', 'ratio_CGATGT', 'offset', 'win',
]
d = dfa.loc[keep_rows, keep_cols].copy()

# log10-transformed versions of the two ratio columns and of the length.
d['log-bcm'] = np.log10(d['ratio_ATCACG'])
d['log+bcm'] = np.log10(d['ratio_CGATGT'])
d['loglen'] = np.log10(d['UTR_length'])
    
In [16]:
    
# log-bcm against loglen with a lowess trend (span = 1/5), one panel per window size.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_wrap('win')
)
print(p)
    
    
    
In [17]:
    
# log-bcm against loglen with a lowess trend (span = 1/5), one panel per offset.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_wrap('offset')
)
print(p)
    
    
    
In [18]:
    
# log-bcm against loglen with a lowess trend (span = 1/5), laid out as a grid:
# offsets as rows, window sizes as columns.
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='lowess', span=1/5.)
    + facet_grid('offset ~ win')
)
print(p)
    
    
    
In [19]:
    
# log-bcm against loglen smoothed with a 20-point moving average, laid out as
# a grid: offsets as rows, window sizes as columns.
# plotnine calls this smoother 'mavg' and receives the window size through
# method_args; method='ma' with a bare window= keyword is not part of its API
# (presumably carried over from a different ggplot port).
p = (
    ggplot(d, aes(x='loglen', y='log-bcm'))
    + geom_point(alpha=0.1, size=1)
    + geom_smooth(method='mavg', method_args={'window': 20})
    + facet_grid('offset ~ win')
)
print(p)
    
    
    
    
In [20]:
    
# Subset for the offset=200 / win=80 run, restricted to 80 < UTR_length < 600
# and ratio_ATCACG > 2.
keep_rows = (
    dfa['UTR_length'].gt(80)
    & dfa['UTR_length'].lt(600)
    & dfa['ratio_ATCACG'].gt(2)
    & dfa['offset'].eq(200)
    & dfa['win'].eq(80)
)
d = dfa.loc[
    keep_rows,
    ['TSS', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT'],
].copy()

# This cell uses log2 (not log10) for the ratios and the length.
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])
d['diff'] = d['log-bcm'].sub(d['log+bcm'])

# Reshape to long format: one row per (observation, bcm label), with the
# corresponding log ratio in a shared 'logratio' column.
d1 = (
    d[['UTR_length', 'loglen', 'log-bcm']]
    .rename(columns={'log-bcm': 'logratio'})
    .assign(bcm='-')
)
d2 = (
    d[['UTR_length', 'loglen', 'log+bcm']]
    .rename(columns={'log+bcm': 'logratio'})
    .assign(bcm='+')
)
_d = pd.concat([d1, d2])
    
In [21]:
    
# logratio against UTR_length, coloured by the bcm label, with a thick lowess
# trend (span = 1/5) per colour and enlarged axis titles / tick labels.
big_axis_text = theme(
    axis_title=element_text(size=20),
    axis_text=element_text(size=20),
)
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + big_axis_text
)
print(p)
    
    
    
In [22]:
    
# Rows with 80 < UTR_length < 600 and ratio_ATCACG > 2 across every
# offset / window combination; win and offset are kept for faceting.
keep_rows = (
    dfa['UTR_length'].gt(80)
    & dfa['UTR_length'].lt(600)
    & dfa['ratio_ATCACG'].gt(2)
)
d = dfa.loc[
    keep_rows,
    ['TSS', 'win', 'offset', 'gene', 'UTR_length', 'ratio_ATCACG', 'ratio_CGATGT'],
].copy()

# log2-transformed ratios and length.
d['log-bcm'] = np.log2(d['ratio_ATCACG'])
d['log+bcm'] = np.log2(d['ratio_CGATGT'])
d['loglen'] = np.log2(d['UTR_length'])

# Reshape to long format: one row per (observation, bcm label), with the
# corresponding log ratio in a shared 'logratio' column.
d1 = (
    d[['UTR_length', 'win', 'offset', 'loglen', 'log-bcm']]
    .rename(columns={'log-bcm': 'logratio'})
    .assign(bcm='-')
)
d2 = (
    d[['UTR_length', 'win', 'offset', 'loglen', 'log+bcm']]
    .rename(columns={'log+bcm': 'logratio'})
    .assign(bcm='+')
)
_d = pd.concat([d1, d2])
    
In [23]:
    
# logratio against UTR_length, coloured by the bcm label, with a thick lowess
# trend (span = 1/5) per colour; panels form a grid with offsets as rows and
# window sizes as columns. Axis titles and tick labels are enlarged.
big_axis_text = theme(
    axis_title=element_text(size=20),
    axis_text=element_text(size=20),
)
p = (
    ggplot(_d, aes(x='UTR_length', y='logratio', color='bcm'))
    + geom_point(alpha=0.25)
    + geom_smooth(method='lowess', span=1/5., size=3)
    + xlab("5' UTR length")
    + ylab("log(proximal/distal)")
    + big_axis_text
    + facet_grid('offset ~ win')
)
print(p)
    
    
    
In [ ]: