In [1]:
%matplotlib inline
In [2]:
#from __future__ import division
import pandas as pd
import numpy as np
from ggplot import *
import os
import sys
In [3]:
# Sample titles with corresponding barcodes
# s9: WT
# s9+bcm: WT +BCM
# s17: triple sRNA mutant
samples = {
's9': ['ATCACG', 'ACAGTG'],
's9+bcm': ['CGATGT', 'GCCAAT'],
's17': ['TTAGGC', 'GATCAG'],
}
# Barcodes
barcodes = ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT', 'TTAGGC', 'GATCAG']
In [4]:
def extract_gff(line, ftype='gene'):
rec = {}
fields = line.strip().split('\t')
if fields[2] != ftype:
return None
opts = {}
for f in fields[8].split(';'):
k,v = f.split('=')
opts[k] = v
rec.update({
'gene': opts['gene'],
'start': int(fields[3]),
'end': int(fields[4]),
'strand': fields[6]
})
return rec
def extract_bed(line):
rec = {}
fields = line.strip().split('\t')
rec.update({
'TU': ''.join([x for x in fields[3].split('-')[2:]]),
'start': int(fields[1]),
'end': int(fields[2]),
'strand': fields[5]
})
return rec
def anno_df(annofile, extract=extract_gff):
result = []
with open(annofile) as fi:
for line in fi:
if not line.startswith('#'):
rec = extract(line)
if rec:
result.append(rec)
return pd.DataFrame.from_records(result)
In [5]:
gff = anno_df('../ref/NC_000913.gff')
gff
Out[5]:
In [6]:
df = pd.read_table('../ref/UTR_5_3_sequence.new.bed', sep='\t',
header=None,
names=['TSS', 'TU_name', 'coord_3', 'coord_5', 'first_gene_3',
'first_gene_5', 'first_gene_name', 'operon', 'promoter',
'seq_5UTR', 'strand'])
df
Out[6]:
In [7]:
utrs = df[['TSS', 'coord_3', 'coord_5', 'first_gene_name', 'strand']].copy()
utrs.rename(columns={'coord_3': 'end', 'coord_5': 'start', 'first_gene_name': 'gene'}, inplace=True)
utrs.loc[utrs['strand'] == 'forward', 'strand'] = '+'
utrs.loc[utrs['strand'] == 'reverse', 'strand'] = '-'
utrs
Out[7]:
In [8]:
# 5' UTR annotations
"""
res = []
with open('../../results/redux/utrs_corr.bed', 'r') as fi:
for line in fi:
fields = line.strip().split()
res.append({
'gene': fields[3],
'start': int(fields[1]),
'end': int(fields[2]),
})
utrs = pd.DataFrame.from_records(res)
utrs['UTR_length'] = utrs.end - utrs.start
utrs
"""
Out[8]:
In [9]:
dfa = utrs.merge(gff, on='gene', how='left').dropna()
dfa['strand'] = dfa['strand_x']
dfa
Out[9]:
In [10]:
import pysam
def get_ratio(df, bamfile, reference, win=50, offset=200):
def _utr_orf_ratio(rec):
def _counts(start, end):
count = 1
#start = int(rec[key5])
#end = int(rec[key3])
#if end < start:
# start, end = end, start
for read in bam.fetch(reference, start, end):
count += 1
return float(count)
if rec['strand'] == '-':
end_utr = rec['end_x']
start_utr = end_utr - win
end_orf = end_utr - offset
start_orf = end_orf - win
else:
start_utr = rec['start_x']
end_utr = start_utr + win
start_orf = rec['start_x'] + offset
end_orf = start_orf + win
return _counts(start_utr, end_utr) / _counts(start_orf, end_orf)
bam = pysam.AlignmentFile(bamfile, 'rb')
return df.apply(_utr_orf_ratio, axis=1)
def get_utr_ratio_df(data, barcodes, res_dir='../results', win=50, offset=200):
'''
Calculates 5'UTR coverage
Iterates over files whose name ends with '_sorted.bam' in `res_dir` and contains barcodes
specified in `barcodes`.
Adds `utr_<barcode>` column to df DataFrame
'''
d, _, filenames = next(os.walk(res_dir))
infiles = [f for f in filenames if f.endswith('_sorted.bam')]
df = data.copy()
for barcode in barcodes:
bamfile = os.path.join(d, [f for f in infiles if barcode in f][0])
df['ratio_{0}'.format(barcode)] = get_ratio(df,
bamfile,
'gi|556503834|ref|NC_000913.3|',
win, offset)
output.put({'data': df, 'winsize': win, 'offset': offset})
In [11]:
import multiprocessing as mp
output = mp.Queue()
offsets = [150,200,300]
winsizes = [50,80,100,200]
output_tpl = '../results/dfa_mp.offset_{}.win_{}.csv'
def utr_ratio(data, barcodes, res_dir='../results', win=50, offset=200):
'''
Calculates 5'UTR coverage
Iterates over files whose name ends with '_sorted.bam' in `res_dir` and contains barcodes
specified in `barcodes`.
Adds `utr_<barcode>` column to df DataFrame
'''
d, _, filenames = next(os.walk(res_dir))
infiles = [f for f in filenames if f.endswith('_sorted.bam')]
df = data.copy()
for barcode in barcodes:
bamfile = os.path.join(d, [f for f in infiles if barcode in f][0])
df['ratio_{0}'.format(barcode)] = get_ratio(df,
bamfile,
'gi|556503834|ref|NC_000913.3|',
win, offset)
df.to_csv(output_tpl.format(offset, win))
output.put({'winsize': win, 'offset': offset, 'filename': output_tpl.format(offset, win)})
pool = []
for offset in offsets:
for winsize in winsizes:
d = dfa.copy()
pool.append(mp.Process(target=utr_ratio,
args=(d, ['ATCACG', 'ACAGTG', 'CGATGT', 'GCCAAT']),
kwargs={'res_dir': '../results/',
'offset': offset,
'win': winsize}))
for p in pool:
p.start()
for p in pool:
p.join()
results = [output.get() for p in pool]
In [12]:
results
Out[12]:
In [ ]:
output.get()