In [ ]:
import sys
import os
import gzip
import numpy as np
from itertools import zip_longest
In [ ]:
# The following was shamelessly lifted from scikit-bio circa v0.2.0
def _ascii_to_phred(s, offset):
"""Convert ascii to Phred quality score with specified ASCII offset."""
return np.fromstring(s, dtype='|S1').view(np.int8) - offset
def ascii_to_phred33(s):
    """Convert an ASCII string to Phred scores using an offset of 33.

    The standard "Sanger" offset of 33, used by Illumina in CASAVA
    versions after 1.8.0 and in most other places. Internal Illumina
    files still use an offset of 64.
    """
    return _ascii_to_phred(s, offset=33)
def ascii_to_phred64(s):
    """Convert an ASCII string to Phred scores using an offset of 64.

    The Illumina-specific offset of 64, used by Illumina in CASAVA
    versions prior to 1.8.0 and in Illumina internal formats (e.g.,
    export.txt files).
    """
    return _ascii_to_phred(s, offset=64)
def _drop_id_marker(s):
"""Drop the first character and decode bytes to text"""
id_ = s[1:]
try:
return str(id_.decode('utf-8'))
except AttributeError:
return id_
def parse_fastq(data, strict=False, enforce_qual_range=True, phred_offset=None):
    r"""Yield (label, seq, qual) records from a gzip-compressed FASTQ file.

    Parameters
    ----------
    data : str
        Path to a gzip-compressed FASTQ file. It is opened here with
        ``gzip.open`` in text mode, so plain (uncompressed) files are
        not supported by this implementation.
    strict : bool, optional
        Defaults to ``False``. If ``True``, a ``ValueError`` is raised
        when the sequence and quality labels of a record don't match.
    enforce_qual_range : bool, optional
        Defaults to ``True``. If ``True``, an exception is raised when a
        converted quality score falls outside the range [0, 62].
    phred_offset : {33, 64, None}, optional
        Phred offset used when converting quality symbols to integers.
        If ``None``, the quality line is yielded as an unconverted string.

    Yields
    ------
    tuple of (str, str, str or np.ndarray)
        Label, sequence, and quality for each entry. Quality is an int
        array when ``phred_offset`` is 33 or 64, otherwise the raw string.

    Raises
    ------
    ValueError
        On an incomplete trailing record, a label mismatch (when
        ``strict``), or an out-of-range quality score (when
        ``enforce_qual_range`` and ``phred_offset`` are in effect).
    """
    if phred_offset == 33:
        phred_f = ascii_to_phred33
    elif phred_offset == 64:
        phred_f = ascii_to_phred64
    else:
        phred_f = None
    with gzip.open(data, mode='rt') as fi:
        # Group the stream into 4-line FASTQ records; zip_longest pads a
        # truncated final record with None so we can detect it below.
        iters = [iter(fi)] * 4
        for seqid, seq, qualid, qual in zip_longest(*iters):
            seqid = seqid.strip()
            # If the file simply ended in a blank line, do not error.
            # Use equality, not `is`: identity comparison with a string
            # literal is implementation-dependent (SyntaxWarning in 3.8+).
            if seqid == '':
                continue
            # Error if an incomplete record is found.
            # Note: seqid cannot be None here, because if all 4 values
            # were None, zip_longest would not have yielded this tuple.
            if seq is None or qualid is None or qual is None:
                raise ValueError("Incomplete FASTQ record found at end "
                                 "of file")
            seq = seq.strip()
            qualid = qualid.strip()
            qual = qual.strip()
            seqid = _drop_id_marker(seqid)
            try:
                seq = str(seq.decode("utf-8"))
            except AttributeError:
                # Already text (text-mode read under Python 3).
                pass
            qualid = _drop_id_marker(qualid)
            if strict and seqid != qualid:
                raise ValueError('ID mismatch: {} != {}'.format(
                    seqid, qualid))
            # Bounds based on Illumina limits, see:
            # http://nar.oxfordjournals.org/content/38/6/1767/T1.expansion.html
            if phred_f is not None:
                qual = phred_f(qual)
                if enforce_qual_range and ((qual < 0).any() or (qual > 62).any()):
                    raise ValueError("Failed qual conversion for seq id: %s. "
                                     "This may be because you passed an "
                                     "incorrect value for phred_offset." %
                                     seqid)
            yield (seqid, seq, qual)
In [ ]:
# Sample name -> inline barcode sequence. Note the barcodes vary in
# length (4-9 nt); the demultiplexer accounts for this when matching
# read prefixes.
samples = {
    'control_pool_phageP1': 'TTTT',
    'control_t1_phageP1': 'GACCA',
    'infection_24_phageP1': 'CGACGT',
    'infection_28_phageP1': 'ACGAGGA',
    'rif_wt_pool': 'TACGAGTC',
    'rif_H526Y_pool': 'GTACGATCG',
}
def hamming(s1, s2):
    """Return the Hamming distance between equal-length sequences.

    Raises
    ------
    ValueError
        If ``s1`` and ``s2`` differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError('s1 and s2 must be the same length to compute Hamming distance!')
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches
def dmux_and_trim(fastq, samples, result_dir=None, transposon='TAACAGGTTGGATGATAAG', progress=1000000):
    """Demultiplex a gzipped FASTQ by inline barcode and trim the transposon.

    Parameters
    ----------
    fastq : str
        Path to a gzip-compressed FASTQ file (passed to ``parse_fastq``).
    samples : dict
        Mapping of sample name -> barcode sequence.
    result_dir : str, optional
        Directory for per-barcode output files. Defaults to the current
        directory.
    transposon : str, optional
        Transposon sequence expected 14 nt after the barcode; one
        mismatch is tolerated, and both that position and position+1
        are checked (14- and 15-nt spacers).
    progress : int, optional
        Print a progress message every ``progress`` reads.

    Returns
    -------
    dict
        Mapping of barcode -> number of reads written for that barcode.
    """
    # Per-barcode gzip output handles, in the form `barcode`: `file handle`.
    # We keep track of them so they can be closed on exit.
    output_files = {}
    stat_dict = {}
    # All distinct barcode lengths, so each possible prefix can be tried.
    bcls = {len(bc) for bc in samples.values()}
    # Hoisted set of valid barcodes: O(1) membership per read instead of
    # scanning samples.values() for every read.
    barcodes = set(samples.values())
    tlen = len(transposon)
    cnt = 0
    try:
        for seqid, seq, qual in parse_fastq(fastq):
            cnt += 1
            # Candidate barcodes: every prefix length that hits a known barcode.
            all_bc = [seq[:x] for x in bcls if seq[:x] in barcodes]
            if len(all_bc) != 1:
                # Barcode corresponds to more than one sample or is unknown,
                # drop the read. Potentially we might want to implement
                # error correction here.
                continue
            bc = all_bc[0]
            # Trim the transposon allowing for 1 mismatch; check for both
            # 14- and 15-nt spacers between barcode and transposon.
            tpos = 14 + len(bc)
            if len(seq) < tpos + tlen + 1:
                # Read is too short, skip it.
                continue
            if hamming(seq[tpos:tpos + tlen], transposon) < 2:
                seq = seq[len(bc):tpos]
                qual = qual[len(bc):tpos]
            elif hamming(seq[tpos + 1:tpos + tlen + 1], transposon) < 2:
                seq = seq[len(bc):tpos + 1]
                qual = qual[len(bc):tpos + 1]
            else:
                # No transposon sequence in the read, skip.
                continue
            # Output to the corresponding file and update the stats,
            # lazily opening one output file per barcode.
            if bc not in output_files:
                basename = '{}_trimmed.fastq.gz'.format(bc)
                if result_dir is not None:
                    out_filename = os.path.join(result_dir, basename)
                else:
                    out_filename = basename
                output_files[bc] = gzip.open(out_filename, 'wt')
                print('\tcreated output file:\t{}'.format(out_filename))
            output_files[bc].write('@{seqid}\n{seq}\n+\n{qual}\n'.format(
                seqid=seqid,
                seq=seq,
                qual=qual
            ))
            stat_dict[bc] = stat_dict.get(bc, 0) + 1
            if cnt % progress == 0:
                print('processed: {} reads...'.format(cnt))
    finally:
        # Cleanup: close output files even if parsing fails mid-stream,
        # so handles are not leaked on error.
        for fh in output_files.values():
            fh.close()
    return stat_dict
In [ ]: