In [6]:
%%bash
gunzip -c ../data/ACATCG_R2_fx.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/ACATCG_R2.bam


[samopen] SAM header is present: 1 sequences.
396005 reads; of these:
  396005 (100.00%) were unpaired; of these:
    222411 (56.16%) aligned 0 times
    173594 (43.84%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
43.84% overall alignment rate

In [9]:
%%bash
samtools sort ../results/ACATCG_R2.bam ../results/ACATCG_R2_sorted
samtools index ../results/ACATCG_R2_sorted.bam

In [12]:
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/ACATCG_R2_sorted.bam > ../results/ACATCG_R2.pileup


[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000

In [16]:
import csv
import pandas as pd
with open('../results/ACATCG_R2.pileup', 'rb') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
    data = [{'pos': int(rec['pos']),
             'base': rec['base'],
             'reads': rec['details'].count('^')} for rec in reader]
    
df = pd.DataFrame.from_records(data)
df


Out[16]:
base pos reads
0 T 1 12
1 C 2 2
2 T 3 9
3 C 4 0
4 G 5 1
5 C 6 1
6 T 7 7
7 A 8 23
8 T 9 0
9 G 10 1
10 C 11 0
11 G 12 7
12 T 13 22
13 C 14 2
14 A 15 85
15 C 16 1
16 G 17 1
17 T 18 22
18 T 19 37
19 A 20 27
20 A 21 16
21 A 22 20
22 A 23 9
23 A 24 56
24 T 25 40
25 A 26 40
26 A 27 66
27 T 28 9
28 G 29 2
29 A 30 13
... ... ... ...
655 C 656 0
656 C 657 0
657 C 658 0
658 G 659 0
659 A 660 0
660 T 661 0
661 G 662 0
662 T 663 0
663 A 664 0
664 A 665 0
665 A 666 0
666 T 667 0
667 C 668 0
668 G 669 0
669 G 670 0
670 G 671 0
671 C 672 0
672 T 673 0
673 T 674 0
674 C 675 0
675 G 676 0
676 G 677 0
677 T 678 0
678 C 679 0
679 C 680 0
680 G 681 0
681 G 682 0
682 T 683 0
683 T 684 0
684 C 685 0

685 rows × 3 columns


In [18]:
%%bash
gunzip -c ../data/trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/trimmed.bam


[samopen] SAM header is present: 1 sequences.
396005 reads; of these:
  396005 (100.00%) were unpaired; of these:
    84945 (21.45%) aligned 0 times
    311060 (78.55%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.55% overall alignment rate

In [22]:
with open('../results/trimmed.pileup', 'rb') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
    data = [{'pos': int(rec['pos']),
             'base': rec['base'],
             'reads': rec['details'].count('^')} for rec in reader]
    
df = pd.DataFrame.from_records(data)

In [25]:
%matplotlib inline

In [26]:
df.plot(x='pos', y='reads')


Out[26]:
<matplotlib.axes.AxesSubplot at 0x7f3ab526f5d0>

In [32]:
import gzip
from itertools import izip
from skbio.parse.sequences import parse_fastq

bc_minus = 'YYYR'
bc_plus = 'RRRY'

fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'

with gzip.open('../data/ACATCG_R1_fx.fastq.gz', 'rb') as gzr1, \
     gzip.open('../data/ACATCG_R2_fx.fastq.gz', 'rb') as gzr2, \
     gzip.open('../results/minus.fastq.gz', 'wb') as gz_minus, \
     gzip.open('../results/plus.fastq.gz', 'wb') as gz_plus:
    mates = izip(parse_fastq(gzr1), parse_fastq(gzr2))
    for rec1, rec2 in mates:
        id1, seq1, qual1 = rec1
        id2, seq2, qual2 = rec2
        qual_str = ''.join([chr(q+33) for q in qual2])
        if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
            gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
        elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
            gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))

In [ ]:
def split_pools(barcode, dirname='../data'):
    d, _, filenames = os.walk(dirname)
    files = [f for f in filenames if f.startswith(barcode)]
    files_R1 = [f for f in files if 'R1' in f]
    files_R2 = [f for f in files if 'R2' in f]
    fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'
    for file_R1,file_R2 in zip(files_R1, files_R2):
        with gzip.open(file_R1, 'rb') as gzr1, gzip.open(file_R2, 'rb') as gzr2, \
             gzip.open(os.path.join(dirname, barcode+'_minus.fastq.gz'), 'wb') as gz_minus, \
             gzip.open(os.path.join(dirname, barcode+'_plus.fastq.gz'), 'wb') as gz_plus:
            for rec1,rec2 in izip(parse_fastq(gzr1), parse_fastq(gzr2)):
                id1, seq1, qual1 = rec1
                id2, seq2, qual2 = rec2
                qual_str = ''.join([chr(33+q) for q in qual2])
                if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
                    gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
                elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
                    gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))

In [ ]:
%%bash
# Trim adapters from 3' ends of the reads
cutadapt -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -o plus_trimmed.fastq.gz plus.fastq.gz

In [33]:
%%bash
# Minus pool
%%bash
gunzip -c ../data/minus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/minus_trimmed.bam


bash: line 2: fg: no job control
[samopen] SAM header is present: 1 sequences.
190066 reads; of these:
  190066 (100.00%) were unpaired; of these:
    40867 (21.50%) aligned 0 times
    149199 (78.50%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.50% overall alignment rate

In [34]:
%%bash
# Plus pool
%%bash
gunzip -c ../data/plus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/plus_trimmed.bam


bash: line 2: fg: no job control
[samopen] SAM header is present: 1 sequences.
188089 reads; of these:
  188089 (100.00%) were unpaired; of these:
    40194 (21.37%) aligned 0 times
    147895 (78.63%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.63% overall alignment rate

In [35]:
%%bash
samtools sort ../results/minus_trimmed.bam ../results/minus_trimmed_sorted
samtools index ../results/minus_trimmed_sorted.bam

In [36]:
%%bash
samtools sort ../results/plus_trimmed.bam ../results/plus_trimmed_sorted
samtools index ../results/plus_trimmed_sorted.bam

In [37]:
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/minus_trimmed_sorted.bam > ../results/minus.pileup
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/plus_trimmed_sorted.bam > ../results/plus.pileup


[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000
[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000

In [38]:
def parse_pileup(filename):
    with open(filename, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
        data = [{'pos': int(rec['pos']),
                 'base': rec['base'],
                 'reads': rec['details'].count('^')} for rec in reader]
    
    return pd.DataFrame.from_records(data)

In [39]:
df_minus = parse_pileup('../results/minus.pileup')
df_plus = parse_pileup('../results/plus.pileup')

In [43]:
df_minus['plus'] = df_plus['reads']
df_minus.plot(x='pos', y='reads')


Out[43]:
<matplotlib.axes.AxesSubplot at 0x7f3ab4ee6450>

In [44]:
df_minus.plot(x='pos', y='plus')


Out[44]:
<matplotlib.axes.AxesSubplot at 0x7f3aad07dcd0>

In [49]:
df_minus.rename(columns={'reads': 'minus'}, inplace=True)

In [52]:
import numpy as np
# the order needs to be reversed

df_minus['theta'] = np.log(1-df_minus['minus']/df_minus['minus'].cumsum()) - np.log(1-df_minus['plus']/df_minus['plus'].cumsum())
df_minus.plot(x='pos', y='theta')


Out[52]:
<matplotlib.axes.AxesSubplot at 0x7f3aacf8e490>

In [80]:
df = df_minus[['pos','base','plus','minus']]
df = df[df['pos'] < 655 ]
df = df.reindex(index=df.index[::-1])
df['theta'] = np.log(1-df['minus']/df['minus'].cumsum()) - np.log(1-df['plus']/df['plus'].cumsum())
#df.plot(x='pos', y='theta')
df[df['theta'] < 0]['theta'] = 0
df.plot(x='pos', y='theta')


Out[80]:
<matplotlib.axes.AxesSubplot at 0x7f3aac5162d0>

In [70]:
df.max()


Out[70]:
pos            654
base             T
plus          2275
minus         2302
theta    0.1032137
dtype: object

In [ ]: