In [1]:
!rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz -O atroparvus.fa.gz
In [1]:
from __future__ import division
import gzip
from Bio import SeqIO, SeqUtils
In [2]:
gambiae_name = 'gambiae.fa.gz'
atroparvus_name = 'atroparvus.fa.gz'
In [3]:
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
for rec in recs:
print(rec.description)
#Do not do this with atroparvus
In [4]:
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
chrom_Ns = {}
chrom_sizes = {}
for rec in recs:
chrom = rec.description.split(':')[2]
if chrom in ['UNKN', 'Y_unplaced']:
continue
chrom_Ns[chrom] = []
on_N = False
curr_size = 0
for pos, nuc in enumerate(rec.seq):
if nuc in ['N', 'n']:
curr_size += 1
on_N = True
else:
if on_N:
chrom_Ns[chrom].append(curr_size)
curr_size = 0
on_N = False
if on_N:
chrom_Ns[chrom].append(curr_size)
chrom_sizes[chrom] = len(rec.seq)
In [5]:
for chrom, Ns in chrom_Ns.items():
size = chrom_sizes[chrom]
print('%s (%s): %%Ns (%.1f), num Ns: %d, max N: %d' % (chrom, size, 100 * sum(Ns) / size, len(Ns), max(Ns)))
In [6]:
import numpy as np
#SeqUtils
recs = SeqIO.parse(gzip.open(atroparvus_name), 'fasta')
sizes = []
size_N = []
for rec in recs:
size = len(rec.seq)
sizes.append(size)
count_N = 0
for nuc in rec.seq:
if nuc in ['n', 'N']:
count_N += 1
size_N.append((size, count_N / size))
In [7]:
print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes),
np.percentile(sizes, 10), np.percentile(sizes, 90))
In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
small_split = 4800
large_split = 540000
fig, axs = plt.subplots(1, 3, figsize=(16, 9), squeeze=False)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split])
axs[0, 0].plot(xs, ys, '.')
axs[0, 0].set_ylim(-0.1, 3.5)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split])
axs[0, 1].plot(xs, ys, '.')
axs[0, 1].set_xlim(small_split, large_split)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split])
axs[0, 2].plot(xs, ys, '.')
axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12)
axs[0, 1].set_xlabel('Contig size', fontsize=12)
fig.suptitle('Fraction of Ns per contig size', fontsize=26)
Out[8]:
In [ ]: