notebook.community

Edit and run



In [6]:

    
%%bash
gunzip -c ../data/ACATCG_R2_fx.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/ACATCG_R2.bam









    



[samopen] SAM header is present: 1 sequences.
396005 reads; of these:
  396005 (100.00%) were unpaired; of these:
    222411 (56.16%) aligned 0 times
    173594 (43.84%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
43.84% overall alignment rate



In [9]:

    
%%bash
samtools sort ../results/ACATCG_R2.bam ../results/ACATCG_R2_sorted
samtools index ../results/ACATCG_R2_sorted.bam



In [12]:

    
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/ACATCG_R2_sorted.bam > ../results/ACATCG_R2.pileup









    



[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000



In [16]:

    
import csv
import pandas as pd
with open('../results/ACATCG_R2.pileup', 'rb') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
    data = [{'pos': int(rec['pos']),
             'base': rec['base'],
             'reads': rec['details'].count('^')} for rec in reader]
    
df = pd.DataFrame.from_records(data)
df









    Out[16]:






  
    
      
      base
      pos
      reads
    
  
  
    
      0  
       T
         1
       12
    
    
      1  
       C
         2
        2
    
    
      2  
       T
         3
        9
    
    
      3  
       C
         4
        0
    
    
      4  
       G
         5
        1
    
    
      5  
       C
         6
        1
    
    
      6  
       T
         7
        7
    
    
      7  
       A
         8
       23
    
    
      8  
       T
         9
        0
    
    
      9  
       G
        10
        1
    
    
      10 
       C
        11
        0
    
    
      11 
       G
        12
        7
    
    
      12 
       T
        13
       22
    
    
      13 
       C
        14
        2
    
    
      14 
       A
        15
       85
    
    
      15 
       C
        16
        1
    
    
      16 
       G
        17
        1
    
    
      17 
       T
        18
       22
    
    
      18 
       T
        19
       37
    
    
      19 
       A
        20
       27
    
    
      20 
       A
        21
       16
    
    
      21 
       A
        22
       20
    
    
      22 
       A
        23
        9
    
    
      23 
       A
        24
       56
    
    
      24 
       T
        25
       40
    
    
      25 
       A
        26
       40
    
    
      26 
       A
        27
       66
    
    
      27 
       T
        28
        9
    
    
      28 
       G
        29
        2
    
    
      29 
       A
        30
       13
    
    
      ...
      ...
      ...
      ...
    
    
      655
       C
       656
        0
    
    
      656
       C
       657
        0
    
    
      657
       C
       658
        0
    
    
      658
       G
       659
        0
    
    
      659
       A
       660
        0
    
    
      660
       T
       661
        0
    
    
      661
       G
       662
        0
    
    
      662
       T
       663
        0
    
    
      663
       A
       664
        0
    
    
      664
       A
       665
        0
    
    
      665
       A
       666
        0
    
    
      666
       T
       667
        0
    
    
      667
       C
       668
        0
    
    
      668
       G
       669
        0
    
    
      669
       G
       670
        0
    
    
      670
       G
       671
        0
    
    
      671
       C
       672
        0
    
    
      672
       T
       673
        0
    
    
      673
       T
       674
        0
    
    
      674
       C
       675
        0
    
    
      675
       G
       676
        0
    
    
      676
       G
       677
        0
    
    
      677
       T
       678
        0
    
    
      678
       C
       679
        0
    
    
      679
       C
       680
        0
    
    
      680
       G
       681
        0
    
    
      681
       G
       682
        0
    
    
      682
       T
       683
        0
    
    
      683
       T
       684
        0
    
    
      684
       C
       685
        0
    
  

685 rows × 3 columns



In [18]:

    
%%bash
gunzip -c ../data/trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/trimmed.bam









    



[samopen] SAM header is present: 1 sequences.
396005 reads; of these:
  396005 (100.00%) were unpaired; of these:
    84945 (21.45%) aligned 0 times
    311060 (78.55%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.55% overall alignment rate



In [22]:

    
with open('../results/trimmed.pileup', 'rb') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
    data = [{'pos': int(rec['pos']),
             'base': rec['base'],
             'reads': rec['details'].count('^')} for rec in reader]
    
df = pd.DataFrame.from_records(data)



In [25]:

    
%matplotlib inline



In [26]:

    
df.plot(x='pos', y='reads')









    Out[26]:





<matplotlib.axes.AxesSubplot at 0x7f3ab526f5d0>



In [32]:

    
import gzip
from itertools import izip
from skbio.parse.sequences import parse_fastq

bc_minus = 'YYYR'
bc_plus = 'RRRY'

fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'

with gzip.open('../data/ACATCG_R1_fx.fastq.gz', 'rb') as gzr1, \
     gzip.open('../data/ACATCG_R2_fx.fastq.gz', 'rb') as gzr2, \
     gzip.open('../results/minus.fastq.gz', 'wb') as gz_minus, \
     gzip.open('../results/plus.fastq.gz', 'wb') as gz_plus:
    mates = izip(parse_fastq(gzr1), parse_fastq(gzr2))
    for rec1, rec2 in mates:
        id1, seq1, qual1 = rec1
        id2, seq2, qual2 = rec2
        qual_str = ''.join([chr(q+33) for q in qual2])
        if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
            gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
        elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
            gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))



In [ ]:

    
def split_pools(barcode, dirname='../data'):
    d, _, filenames = os.walk(dirname)
    files = [f for f in filenames if f.startswith(barcode)]
    files_R1 = [f for f in files if 'R1' in f]
    files_R2 = [f for f in files if 'R2' in f]
    fastq_tpl='@{id}\n{seq}\n+\n{qual}\n'
    for file_R1,file_R2 in zip(files_R1, files_R2):
        with gzip.open(file_R1, 'rb') as gzr1, gzip.open(file_R2, 'rb') as gzr2, \
             gzip.open(os.path.join(dirname, barcode+'_minus.fastq.gz'), 'wb') as gz_minus, \
             gzip.open(os.path.join(dirname, barcode+'_plus.fastq.gz'), 'wb') as gz_plus:
            for rec1,rec2 in izip(parse_fastq(gzr1), parse_fastq(gzr2)):
                id1, seq1, qual1 = rec1
                id2, seq2, qual2 = rec2
                qual_str = ''.join([chr(33+q) for q in qual2])
                if seq1[0] in 'CTN' and seq1[1] in 'CTN' and seq1[2] in 'CTN' and seq1[3] in 'AGN':
                    gz_minus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))
                elif seq1[0] in 'AGN' and seq1[1] in 'AGN' and seq1[2] in 'AGN' and seq1[3] in 'CTN':
                    gz_plus.write(fastq_tpl.format(id=id2,seq=seq2,qual=qual_str))



In [ ]:

    
%%bash
# Trim adapters from 3' ends of the reads
cutadapt -a GATCGGAAGAGCACACGTCTGAACTCCAGTCAC -o plus_trimmed.fastq.gz plus.fastq.gz



In [33]:

    
%%bash
# Minus pool
%%bash
gunzip -c ../data/minus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/minus_trimmed.bam









    



bash: line 2: fg: no job control
[samopen] SAM header is present: 1 sequences.
190066 reads; of these:
  190066 (100.00%) were unpaired; of these:
    40867 (21.50%) aligned 0 times
    149199 (78.50%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.50% overall alignment rate



In [34]:

    
%%bash
# Plus pool
%%bash
gunzip -c ../data/plus_trimmed.fastq.gz | bowtie2 -N 1 -x ../ref/dhsr1_acatcg -U - | samtools view -bhS - > ../results/plus_trimmed.bam









    



bash: line 2: fg: no job control
[samopen] SAM header is present: 1 sequences.
188089 reads; of these:
  188089 (100.00%) were unpaired; of these:
    40194 (21.37%) aligned 0 times
    147895 (78.63%) aligned exactly 1 time
    0 (0.00%) aligned >1 times
78.63% overall alignment rate



In [35]:

    
%%bash
samtools sort ../results/minus_trimmed.bam ../results/minus_trimmed_sorted
samtools index ../results/minus_trimmed_sorted.bam



In [36]:

    
%%bash
samtools sort ../results/plus_trimmed.bam ../results/plus_trimmed_sorted
samtools index ../results/plus_trimmed_sorted.bam



In [37]:

    
%%bash
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/minus_trimmed_sorted.bam > ../results/minus.pileup
samtools mpileup -f ../ref/dhsr1_acatcg.fasta ../results/plus_trimmed_sorted.bam > ../results/plus.pileup









    



[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000
[mpileup] 1 samples in 1 input files
<mpileup> Set max per-file depth to 8000



In [38]:

    
def parse_pileup(filename):
    with open(filename, 'rb') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])    
        data = [{'pos': int(rec['pos']),
                 'base': rec['base'],
                 'reads': rec['details'].count('^')} for rec in reader]
    
    return pd.DataFrame.from_records(data)



In [39]:

    
df_minus = parse_pileup('../results/minus.pileup')
df_plus = parse_pileup('../results/plus.pileup')



In [43]:

    
df_minus['plus'] = df_plus['reads']
df_minus.plot(x='pos', y='reads')









    Out[43]:





<matplotlib.axes.AxesSubplot at 0x7f3ab4ee6450>



In [44]:

    
df_minus.plot(x='pos', y='plus')









    Out[44]:





<matplotlib.axes.AxesSubplot at 0x7f3aad07dcd0>



In [49]:

    
df_minus.rename(columns={'reads': 'minus'}, inplace=True)



In [52]:

    
import numpy as np
# the order needs to be reversed

df_minus['theta'] = np.log(1-df_minus['minus']/df_minus['minus'].cumsum()) - np.log(1-df_minus['plus']/df_minus['plus'].cumsum())
df_minus.plot(x='pos', y='theta')









    Out[52]:





<matplotlib.axes.AxesSubplot at 0x7f3aacf8e490>



In [80]:

    
df = df_minus[['pos','base','plus','minus']]
df = df[df['pos'] < 655 ]
df = df.reindex(index=df.index[::-1])
df['theta'] = np.log(1-df['minus']/df['minus'].cumsum()) - np.log(1-df['plus']/df['plus'].cumsum())
#df.plot(x='pos', y='theta')
df[df['theta'] < 0]['theta'] = 0
df.plot(x='pos', y='theta')









    Out[80]:





<matplotlib.axes.AxesSubplot at 0x7f3aac5162d0>



In [70]:

    
df.max()









    Out[70]:





pos            654
base             T
plus          2275
minus         2302
theta    0.1032137
dtype: object



In [ ]:

	base	pos	reads
0	T	1	12
1	C	2	2
2	T	3	9
3	C	4	0
4	G	5	1
5	C	6	1
6	T	7	7
7	A	8	23
8	T	9	0
9	G	10	1
10	C	11	0
11	G	12	7
12	T	13	22
13	C	14	2
14	A	15	85
15	C	16	1
16	G	17	1
17	T	18	22
18	T	19	37
19	A	20	27
20	A	21	16
21	A	22	20
22	A	23	9
23	A	24	56
24	T	25	40
25	A	26	40
26	A	27	66
27	T	28	9
28	G	29	2
29	A	30	13
...	...	...	...
655	C	656	0
656	C	657	0
657	C	658	0
658	G	659	0
659	A	660	0
660	T	661	0
661	G	662	0
662	T	663	0
663	A	664	0
664	A	665	0
665	A	666	0
666	T	667	0
667	C	668	0
668	G	669	0
669	G	670	0
670	G	671	0
671	C	672	0
672	T	673	0
673	T	674	0
674	C	675	0
675	G	676	0
676	G	677	0
677	T	678	0
678	C	679	0
679	C	680	0
680	G	681	0
681	G	682	0
682	T	683	0
683	T	684	0
684	C	685	0

	base	pos	reads
0	T	1	12
1	C	2	2
2	T	3	9
3	C	4	0
4	G	5	1
5	C	6	1
6	T	7	7
7	A	8	23
8	T	9	0
9	G	10	1
10	C	11	0
11	G	12	7
12	T	13	22
13	C	14	2
14	A	15	85
15	C	16	1
16	G	17	1
17	T	18	22
18	T	19	37
19	A	20	27
20	A	21	16
21	A	22	20
22	A	23	9
23	A	24	56
24	T	25	40
25	A	26	40
26	A	27	66
27	T	28	9
28	G	29	2
29	A	30	13
...	...	...	...
655	C	656	0
656	C	657	0
657	C	658	0
658	G	659	0
659	A	660	0
660	T	661	0
661	G	662	0
662	T	663	0
663	A	664	0
664	A	665	0
665	A	666	0
666	T	667	0
667	C	668	0
668	G	669	0
669	G	670	0
670	G	671	0
671	C	672	0
672	T	673	0
673	T	674	0
674	C	675	0
675	G	676	0
676	G	677	0
677	T	678	0
678	C	679	0
679	C	680	0
680	G	681	0
681	G	682	0
682	T	683	0
683	T	684	0
684	C	685	0

	base	pos	reads
0	T	1	12
1	C	2	2
2	T	3	9
3	C	4	0
4	G	5	1
5	C	6	1
6	T	7	7
7	A	8	23
8	T	9	0
9	G	10	1
10	C	11	0
11	G	12	7
12	T	13	22
13	C	14	2
14	A	15	85
15	C	16	1
16	G	17	1
17	T	18	22
18	T	19	37
19	A	20	27
20	A	21	16
21	A	22	20
22	A	23	9
23	A	24	56
24	T	25	40
25	A	26	40
26	A	27	66
27	T	28	9
28	G	29	2
29	A	30	13
...	...	...	...
655	C	656	0
656	C	657	0
657	C	658	0
658	G	659	0
659	A	660	0
660	T	661	0
661	G	662	0
662	T	663	0
663	A	664	0
664	A	665	0
665	A	666	0
666	T	667	0
667	C	668	0
668	G	669	0
669	G	670	0
670	G	671	0
671	C	672	0
672	T	673	0
673	T	674	0
674	C	675	0
675	G	676	0
676	G	677	0
677	T	678	0
678	C	679	0
679	C	680	0
680	G	681	0
681	G	682	0
682	T	683	0
683	T	684	0
684	C	685	0