In [ ]:
import sys
import os
import gzip
import numpy as np
from itertools import zip_longest
In [ ]:
# The following was shamelessly lifted from scikit-bio circa v0.2.0
def _ascii_to_phred(s, offset):
"""Convert ascii to Phred quality score with specified ASCII offset."""
return np.fromstring(s, dtype='|S1').view(np.int8) - offset
def ascii_to_phred33(s):
    """Convert an ASCII string to Phred scores using an offset of 33.

    The standard "Sanger" offset of 33, used by Illumina in CASAVA
    versions after 1.8.0 and in most other places. Internal Illumina
    files still use an offset of 64.
    """
    return _ascii_to_phred(s, offset=33)
def ascii_to_phred64(s):
    """Convert an ASCII string to Phred scores using an offset of 64.

    The Illumina-specific offset of 64, used by Illumina in CASAVA
    versions prior to 1.8.0 and in Illumina internal formats (e.g.,
    export.txt files).
    """
    return _ascii_to_phred(s, offset=64)
def _drop_id_marker(s):
"""Drop the first character and decode bytes to text"""
id_ = s[1:]
try:
return str(id_.decode('utf-8'))
except AttributeError:
return id_
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=None):
    r"""Yield (label, seq, qual) records from a gzip-compressed FASTQ file.

    Parameters
    ----------
    data : str
        Path to a gzip-compressed FASTQ file. It is opened here with
        ``gzip.open`` in text mode, so plain (uncompressed) files are
        not supported by this implementation.
    strict : bool, optional
        Defaults to ``False``. If ``True``, a ``ValueError`` is raised
        when the sequence and quality labels of a record don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception is raised when a
        converted quality score falls outside the range [0, 62].
    phred_offset : {33, 64, None}, optional
        Phred offset used when converting quality symbols to integers.
        If ``None``, the quality line is yielded as an unconverted string.

    Yields
    ------
    tuple of (str, str, str or np.ndarray)
        Label, sequence, and quality for each entry. Quality is an int
        array when ``phred_offset`` is 33 or 64, otherwise the raw string.

    Raises
    ------
    ValueError
        On an incomplete trailing record, a label mismatch (when
        ``strict``), or an out-of-range quality score (when
        ``enforce_qual_range`` and ``phred_offset`` are in effect).
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        phred_f = None
    with gzip.open(data, mode='rt') as fi:
        # Group the stream into 4-line FASTQ records; zip_longest pads a
        # truncated final record with None so we can detect it below.
        iters = [iter(fi)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blank line, do not error.
            # Use equality, not `is`: identity comparison with a string
            # literal is implementation-dependent (SyntaxWarning in 3.8+).
            if seqid == '':
                continue
            # Error if an incomplete record is found.
            # Note: seqid cannot be None here, because if all 4 values
            # were None, zip_longest would not have yielded this tuple.
            if seq is None or qualid is None or qual is None:
                raise ValueError("Incomplete FASTQ record found at end "
                                 "of file")
            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()
            seqid = _drop_id_marker(seqid)
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                # Already text (text-mode read under Python 3).
                pass
            qualid = _drop_id_marker(qualid)
            if strict and seqid != qualid:
                raise ValueError('ID mismatch: {} != {}'.format(
                    seqid, qualid))
            # Bounds based on Illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            if phred_f is not None:
                qual = phred_f(qual)
                if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                    raise ValueError("Failed qual conversion for seq id: %s. "
                                     "This may be because you passed an "
                                     "incorrect value for phred_offset." %
                                     seqid)
            yield (seqid, seq, qual)
In [ ]:
# Sample name -> inline barcode sequence. Note the barcodes vary in
# length (4-9 nt); the demultiplexer accounts for this when matching
# read prefixes.
samples = {
    'control_pool_phageP1': 'TTTT',
    'control_t1_phageP1': 'GACCA',
    'infection_24_phageP1': 'CGACGT',
    'infection_28_phageP1': 'ACGAGGA',
    'rif_wt_pool': 'TACGAGTC',
    'rif_H526Y_pool': 'GTACGATCG',
}
def hamming(s1, s2):
    """Return the Hamming distance between equal-length sequences.

    Raises
    ------
    ValueError
        If ``s1`` and ``s2`` differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError('s1 and s2 must be the same length to compute Hamming distance!')
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches
def dmux_and_trim(fastq, samples, result_dir=None, transposon='TAACAGGTTGGATGATAAG', progress=1000000):
    """Demultiplex a gzipped FASTQ by inline barcode and trim the transposon.

    Parameters
    ----------
    fastq : str
        Path to a gzip-compressed FASTQ file (passed to ``parse_fastq``).
    samples : dict
        Mapping of sample name -> barcode sequence.
    result_dir : str, optional
        Directory for per-barcode output files. Defaults to the current
        directory.
    transposon : str, optional
        Transposon sequence expected 14 nt after the barcode; one
        mismatch is tolerated, and both that position and position+1
        are checked (14- and 15-nt spacers).
    progress : int, optional
        Print a progress message every ``progress`` reads.

    Returns
    -------
    dict
        Mapping of barcode -> number of reads written for that barcode.
    """
    # Per-barcode gzip output handles, in the form `barcode`: `file handle`.
    # We keep track of them so they can be closed on exit.
    output_files = {}
    stat_dict = {}
    # All distinct barcode lengths, so each possible prefix can be tried.
    bcls = {len(bc) for bc in samples.values()}
    # Hoisted set of valid barcodes: O(1) membership per read instead of
    # scanning samples.values() for every read.
    barcodes = set(samples.values())
    tlen = len(transposon)
    cnt = 0
    try:
        for seqid, seq, qual in parse_fastq(fastq):
            cnt += 1
            # Candidate barcodes: every prefix length that hits a known barcode.
            all_bc = [seq[:x] for x in bcls if seq[:x] in barcodes]
            if len(all_bc) != 1:
                # Barcode corresponds to more than one sample or is unknown,
                # drop the read. Potentially we might want to implement
                # error correction here.
                continue
            bc = all_bc[0]
            # Trim the transposon allowing for 1 mismatch; check for both
            # 14- and 15-nt spacers between barcode and transposon.
            tpos = 14 + len(bc)
            if len(seq) < tpos + tlen + 1:
                # Read is too short, skip it.
                continue
            if hamming(seq[tpos:tpos + tlen], transposon) < 2:
                seq = seq[len(bc):tpos]
                qual = qual[len(bc):tpos]
            elif hamming(seq[tpos + 1:tpos + tlen + 1], transposon) < 2:
                seq = seq[len(bc):tpos + 1]
                qual = qual[len(bc):tpos + 1]
            else:
                # No transposon sequence in the read, skip.
                continue
            # Output to the corresponding file and update the stats,
            # lazily opening one output file per barcode.
            if bc not in output_files:
                basename = '{}_trimmed.fastq.gz'.format(bc)
                if result_dir is not None:
                    out_filename = os.path.join(result_dir, basename)
                else:
                    out_filename = basename
                output_files[bc] = gzip.open(out_filename, 'wt')
                print('\tcreated output file:\t{}'.format(out_filename))
            output_files[bc].write('@{seqid}\n{seq}\n+\n{qual}\n'.format(
                seqid=seqid,
                seq=seq,
                qual=qual
            ))
            stat_dict[bc] = stat_dict.get(bc, 0) + 1
            if cnt % progress == 0:
                print('processed: {} reads...'.format(cnt))
    finally:
        # Cleanup: close output files even if parsing fails mid-stream,
        # so handles are not leaked on error.
        for fh in output_files.values():
            fh.close()
    return stat_dict
In [ ]: