In [6]:
%%bash
gunzip -c ../data/ACATCG_R2_fx.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/ACATCG_R2.bam
In [9]:
%%bash
samtools sort ../results/ACATCG_R2.bam ../results/ACATCG_R2_sorted
samtools index ../results/ACATCG_R2_sorted.bam
In [12]:
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/ACATCG_R2_sorted.bam > ../results/ACATCG_R2.pileup
In [16]:
import csv
import pandas as pd
with open('../results/ACATCG_R2.pileup', 'rb') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])
data = [{'pos': int(rec['pos']),
'base': rec['base'],
'reads': rec['details'].count('^')} for rec in reader]
df = pd.DataFrame.from_records(data)
df
Out[16]:
In [18]:
%%bash
gunzip -c ../data/trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/trimmed.bam
In [22]:
with open('../results/trimmed.pileup', 'rb') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])
data = [{'pos': int(rec['pos']),
'base': rec['base'],
'reads': rec['details'].count('^')} for rec in reader]
df = pd.DataFrame.from_records(data)
In [25]:
%matplotlib inline
In [26]:
df.plot(x='pos', y='reads')
Out[26]:
In [32]:
import gzip
from itertools import izip
from skbio.parse.sequences import parse_fastq
bc_minus = 'YYYR'
bc_plus = 'RRRY'
fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'
with gzip.open('../data/ACATCG_R1_fx.fastq.gz', 'rb') as gzr1, \
gzip.open('../data/ACATCG_R2_fx.fastq.gz', 'rb') as gzr2, \
gzip.open('../results/minus.fastq.gz', 'wb') as gz_minus, \
gzip.open('../results/plus.fastq.gz', 'wb') as gz_plus:
mates = izip(parse_fastq(gzr1), parse_fastq(gzr2))
for rec1, rec2 in mates:
id1, seq1, qual1 = rec1
id2, seq2, qual2 = rec2
qual_str = ''.join([chr(q+33) for q in qual2])
if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
In [ ]:
def split_pools(barcode, dirname='../data'):
d, _, filenames = os.walk(dirname)
files = [f for f in filenames if f.startswith(barcode)]
files_R1 = [f for f in files if 'R1' in f]
files_R2 = [f for f in files if 'R2' in f]
fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'
for file_R1,file_R2 in zip(files_R1, files_R2):
with gzip.open(file_R1, 'rb') as gzr1, gzip.open(file_R2, 'rb') as gzr2, \
gzip.open(os.path.join(dirname, barcode+'_minus.fastq.gz'), 'wb') as gz_minus, \
gzip.open(os.path.join(dirname, barcode+'_plus.fastq.gz'), 'wb') as gz_plus:
for rec1,rec2 in izip(parse_fastq(gzr1), parse_fastq(gzr2)):
id1, seq1, qual1 = rec1
id2, seq2, qual2 = rec2
qual_str = ''.join([chr(33+q) for q in qual2])
if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
In [ ]:
%%bash
# Trim adapters from 3' ends of the reads
cutadapt -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -o plus_trimmed.fastq.gz plus.fastq.gz
In [33]:
%%bash
# Minus pool
%%bash
gunzip -c ../data/minus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/minus_trimmed.bam
In [34]:
%%bash
# Plus pool
%%bash
gunzip -c ../data/plus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/plus_trimmed.bam
In [35]:
%%bash
samtools sort ../results/minus_trimmed.bam ../results/minus_trimmed_sorted
samtools index ../results/minus_trimmed_sorted.bam
In [36]:
%%bash
samtools sort ../results/plus_trimmed.bam ../results/plus_trimmed_sorted
samtools index ../results/plus_trimmed_sorted.bam
In [37]:
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/minus_trimmed_sorted.bam > ../results/minus.pileup
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/plus_trimmed_sorted.bam > ../results/plus.pileup
In [38]:
def parse_pileup(filename):
with open(filename, 'rb') as csvfile:
reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])
data = [{'pos': int(rec['pos']),
'base': rec['base'],
'reads': rec['details'].count('^')} for rec in reader]
return pd.DataFrame.from_records(data)
In [39]:
df_minus = parse_pileup('../results/minus.pileup')
df_plus = parse_pileup('../results/plus.pileup')
In [43]:
df_minus['plus'] = df_plus['reads']
df_minus.plot(x='pos', y='reads')
Out[43]:
In [44]:
df_minus.plot(x='pos', y='plus')
Out[44]:
In [49]:
df_minus.rename(columns={'reads': 'minus'}, inplace=True)
In [52]:
import numpy as np
# the order needs to be reversed
df_minus['theta'] = np.log(1-df_minus['minus']/df_minus['minus'].cumsum()) - np.log(1-df_minus['plus']/df_minus['plus'].cumsum())
df_minus.plot(x='pos', y='theta')
Out[52]:
In [80]:
df = df_minus[['pos','base','plus','minus']]
df = df[df['pos'] < 655 ]
df = df.reindex(index=df.index[::-1])
df['theta'] = np.log(1-df['minus']/df['minus'].cumsum()) - np.log(1-df['plus']/df['plus'].cumsum())
#df.plot(x='pos', y='theta')
df[df['theta'] < 0]['theta'] = 0
df.plot(x='pos', y='theta')
Out[80]:
In [70]:
df.max()
Out[70]:
In [ ]: