In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
The normalized frequency of a sequence $f_{seq}$ is determined by the number of counts of that sequence relative to all counts in the data set.
The enrichment of the sequence in dataset 2 vs. dataset 1 is given by:
$$E_{seq} = \frac{f_{seq,2}}{f_{seq,1}}$$where $f_{seq,1}$ and $f_{seq,2}$ are the normalized frequencies of the sequence in dataset 1 and 2.
where $p$ is the probability that the call is wrong.
By rearranging Eq. $(1)$ above, we get: \begin{equation} p = 10^{- \frac{Q}{10}} \qquad \qquad \text{$(2)$} \end{equation}
In [ ]:
In [2]:
print(chr(33))
print(chr(34))
print(chr(35))
print("...")
print(chr(74))
print(chr(75))
In [3]:
Q_dict = {}
p_dict = {}
for i in range(33,76):
Q_dict[chr(i)] = i-33
p_dict[chr(i)] = 10**(-(Q_dict[chr(i)])/10.)
p_dict["K"]
Out[3]:
In [4]:
qual_string = "IIIIIIIIIIIIIIIIIIIIIIIIIIIIII9IG9ICIIIIIIIIIIIIIIIIIIIIDIII"
p_correct = 1.0
for q in qual_string:
p_correct = p_correct*(1-p_dict[q])
print(p_correct)
In [5]:
import gzip
get_line = False
seqs = {}
with gzip.open("files/example.fastq.gz") as f:
for l in f:
l_ascii = l.decode("ascii")
if l_ascii[0] == "@":
get_line = True
continue
if get_line:
try:
seqs[l_ascii.strip()] += 1
except KeyError:
seqs[l_ascii.strip()] = 1
get_line = False
In [ ]:
In [ ]: