In [1]:
!rm -f atroparvus.fa.gz gambiae.fa.gz 2>/dev/null
!wget ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz -O gambiae.fa.gz
!wget https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz -O atroparvus.fa.gz


--2015-07-05 17:14:17--  ftp://ftp.vectorbase.org/public_data/organism_data/agambiae/Genome/agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz
           => ‘gambiae.fa.gz’
Resolving ftp.vectorbase.org (ftp.vectorbase.org)... 129.74.255.228
Connecting to ftp.vectorbase.org (ftp.vectorbase.org)|129.74.255.228|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /public_data/organism_data/agambiae/Genome ... done.
==> SIZE agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... 81591806
==> PASV ... done.    ==> RETR agambiae.CHROMOSOMES-PEST.AgamP3.fa.gz ... done.
Length: 81591806 (78M) (unauthoritative)

agambiae.CHROMOSOME 100%[=====================>]  77.81M   583KB/s   in 3m 54s 

2015-07-05 17:18:12 (341 KB/s) - ‘gambiae.fa.gz’ saved [81591806]

--2015-07-05 17:18:12--  https://www.vectorbase.org/download/anopheles-atroparvus-ebroscaffoldsaatre1fagz
Resolving www.vectorbase.org (www.vectorbase.org)... 129.74.255.228
Connecting to www.vectorbase.org (www.vectorbase.org)|129.74.255.228|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-atroparvus-EBRO_SCAFFOLDS_AatrE1.fa.gz [following]
--2015-07-05 17:18:14--  https://www.vectorbase.org/sites/default/files/ftp/downloads/Anopheles-atroparvus-EBRO_SCAFFOLDS_AatrE1.fa.gz
Reusing existing connection to www.vectorbase.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 60941709 (58M) [application/x-gzip]
Saving to: ‘atroparvus.fa.gz’

atroparvus.fa.gz    100%[=====================>]  58.12M   282KB/s   in 2m 54s 

2015-07-05 17:21:09 (341 KB/s) - ‘atroparvus.fa.gz’ saved [60941709/60941709]


In [1]:
from __future__ import division
import gzip
from Bio import SeqIO, SeqUtils

In [2]:
gambiae_name = 'gambiae.fa.gz'
atroparvus_name = 'atroparvus.fa.gz'

In [3]:
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
for rec in recs:
    print(rec.description)
#Do not do this with atroparvus


chromosome:AgamP3:2L:1:49364325:1 chromosome 2L
chromosome:AgamP3:2R:1:61545105:1 chromosome 2R
chromosome:AgamP3:3L:1:41963435:1 chromosome 3L
chromosome:AgamP3:3R:1:53200684:1 chromosome 3R
chromosome:AgamP3:UNKN:1:42389979:1 chromosome UNKN
chromosome:AgamP3:X:1:24393108:1 chromosome X
chromosome:AgamP3:Y_unplaced:1:237045:1 chromosome Y_unplaced

In [4]:
recs = SeqIO.parse(gzip.open(gambiae_name), 'fasta')
chrom_Ns = {}
chrom_sizes = {}
for rec in recs:
    chrom = rec.description.split(':')[2]
    if chrom in ['UNKN', 'Y_unplaced']:
        continue
    chrom_Ns[chrom] = []
    on_N = False
    curr_size = 0
    for pos, nuc in enumerate(rec.seq):
        if nuc in ['N', 'n']:
            curr_size += 1
            on_N = True
        else:
            if on_N:
                chrom_Ns[chrom].append(curr_size)
                curr_size = 0
            on_N = False
    if on_N:
        chrom_Ns[chrom].append(curr_size)
    chrom_sizes[chrom] = len(rec.seq)

In [5]:
for chrom, Ns in chrom_Ns.items():
    size = chrom_sizes[chrom]
    print('%s (%s): %%Ns (%.1f), num Ns: %d, max N: %d' % (chrom, size, 100 * sum(Ns) / size, len(Ns), max(Ns)))


2L (49364325): %Ns (1.7), num Ns: 957, max N: 28884
3R (53200684): %Ns (1.8), num Ns: 1128, max N: 24292
X (24393108): %Ns (4.1), num Ns: 1287, max N: 21132
2R (61545105): %Ns (2.3), num Ns: 1658, max N: 36427
3L (41963435): %Ns (2.9), num Ns: 1272, max N: 31063

Atroparvus super-contigs


In [6]:
import numpy as np
#SeqUtils
recs = SeqIO.parse(gzip.open(atroparvus_name), 'fasta')
sizes = []
size_N = []
for rec in recs:
    size = len(rec.seq)
    sizes.append(size)
    count_N = 0
    for nuc in rec.seq:
        if nuc in ['n', 'N']:
            count_N += 1
    size_N.append((size, count_N / size))

In [7]:
print(len(sizes), np.median(sizes), np.mean(sizes), max(sizes), min(sizes),
      np.percentile(sizes, 10), np.percentile(sizes, 90))


(1371, 8304.0, 163596.00656455141, 20238125, 1004, 1563.0, 56612.0)

In [8]:
%matplotlib inline
import matplotlib.pyplot as plt

small_split = 4800
large_split = 540000
fig, axs = plt.subplots(1, 3, figsize=(16, 9), squeeze=False)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x <= small_split])
axs[0, 0].plot(xs, ys, '.')
axs[0, 0].set_ylim(-0.1, 3.5)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > small_split and x <= large_split])
axs[0, 1].plot(xs, ys, '.')
axs[0, 1].set_xlim(small_split, large_split)
xs, ys = zip(*[(x, 100 * y) for x, y in size_N if x > large_split])
axs[0, 2].plot(xs, ys, '.')
axs[0, 0].set_ylabel('Fraction of Ns', fontsize=12)
axs[0, 1].set_xlabel('Contig size', fontsize=12)
fig.suptitle('Fraction of Ns per contig size', fontsize=26)


Out[8]:
<matplotlib.text.Text at 0x7f67d4b1b910>

In [ ]: