You just need to download this ~28 MB file only once
In [1]:
!rm -f SRR003265.filt.fastq.gz 2>/dev/null
!wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase3/data/NA18489/sequence_read/SRR003265.filt.fastq.gz
In [2]:
from collections import defaultdict
import gzip
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO
In [3]:
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz'), 'fastq')
rec = next(recs)
print(rec.id, rec.description, rec.seq)
print(rec.letter_annotations)
In [4]:
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz'), 'fastq')
cnt = defaultdict(int)
for rec in recs:
for letter in rec.seq:
cnt[letter] += 1
tot = sum(cnt.values())
for letter, cnt in cnt.items():
print('%s: %.2f %d' % (letter, 100. * cnt / tot, cnt))
In [5]:
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz'), 'fastq')
n_cnt = defaultdict(int)
for rec in recs:
for i, letter in enumerate(rec.seq):
pos = i + 1
if letter == 'N':
n_cnt[pos] += 1
seq_len = max(n_cnt.keys())
positions = range(1, seq_len + 1)
fig, ax = plt.subplots()
ax.plot(positions, [n_cnt[x] for x in positions])
ax.set_xlim(1, seq_len)
Out[5]:
In [6]:
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz'), 'fastq')
cnt_qual = defaultdict(int)
for rec in recs:
for i, qual in enumerate(rec.letter_annotations['phred_quality']):
if i < 25:
continue
cnt_qual[qual] += 1
tot = sum(cnt_qual.values())
for qual, cnt in cnt_qual.items():
print('%d: %.2f %d' % (qual, 100. * cnt / tot, cnt))
In [7]:
recs = SeqIO.parse(gzip.open('SRR003265.filt.fastq.gz'), 'fastq')
qual_pos = defaultdict(list)
for rec in recs:
for i, qual in enumerate(rec.letter_annotations['phred_quality']):
if i < 25 or qual == 40:
continue
pos = i + 1
qual_pos[pos].append(qual)
vps = []
poses = qual_pos.keys()
poses.sort()
for pos in poses:
vps.append(qual_pos[pos])
fig, ax = plt.subplots()
sns.boxplot(vps, ax=ax)
ax.set_xticklabels([str(x) for x in range(26, max(qual_pos.keys()) + 1)])
pass
Be careful as this will be 1GB of data (and fully optional)
In [8]:
!rm -f SRR003265_1.filt.fastq.gz 2>/dev/null
!rm -f SRR003265_2.filt.fastq.gz 2>/dev/null
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18489/sequence_read/SRR003265_1.filt.fastq.gz
!wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data/NA18489/sequence_read/SRR003265_2.filt.fastq.gz
In [ ]:
f1 = gzip.open('SRR003265_1.filt.fastq.gz')
f2 = gzip.open('SRR003265_2.filt.fastq.gz')
recs1 = SeqIO.parse(f1, 'fastq')
recs2 = SeqIO.parse(f2, 'fastq')
cnt = 0
for rec1 in recs1:
next(recs2)
cnt +=1
print('Number of pairs: %d' % cnt)
In [ ]:
#f1 = gzip.open('SRR003265_1.filt.fastq.gz', 'rt', encoding='utf8')
#f2 = gzip.open('SRR003265_2.filt.fastq.gz', 'rt', encoding='utf8')
#recs1 = SeqIO.parse(f1, 'fastq')
#recs2 = SeqIO.parse(f2, 'fastq')
#cnt = 0
#for rec1, rec2 in zip(recs1, recs2):
# cnt +=1
#
#print('Number of pairs: %d' % cnt)