In [3]:
from __future__ import print_function
import dendropy
from dendropy import popgenstat

Genes


In [4]:
import os
from collections import OrderedDict
genes_species = OrderedDict()
my_species = ['RESTV', 'SUDV']
my_genes = ['NP', 'L', 'VP35', 'VP40']

for name in my_genes:
    gene_name = name.split('.')[0]
    char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta')
    genes_species[gene_name] = {}
    
    for species in my_species:
        genes_species[gene_name][species] = dendropy.DnaCharacterMatrix()
    for taxon, char_map in char_mat.items():
        species = taxon.label.split('_')[0]
        if species in my_species:
            genes_species[gene_name][species].extend_map({taxon: char_map})

In [5]:
import numpy as np
import pandas as pd
summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species)))
stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta']
for row, (gene, species_data) in enumerate(genes_species.items()):
    for col_base, species in enumerate(my_species):
        summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species])
        summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species])
        summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species])
        summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species])
columns = []
for species in my_species:
    columns.extend(['%s (%s)' % (stat, species) for stat in stats])
df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns)
df # vs print(df)


Out[5]:
seg_sites (RESTV) nuc_div (RESTV) taj_d (RESTV) wat_theta (RESTV) seg_sites (SUDV) nuc_div (SUDV) taj_d (SUDV) wat_theta (SUDV)
NP 113 0.020659 -0.482275 49.489051 118 0.029630 1.203522 56.64
L 288 0.018143 -0.295386 126.131387 282 0.024193 1.412350 135.36
VP35 42 0.017099 -0.530330 18.394161 50 0.027761 1.069061 24.00
VP40 61 0.026155 -0.188135 26.715328 41 0.023517 1.269160 19.68

Genomes


In [6]:
def do_basic_popgen(seqs):
    num_seg_sites = popgenstat.num_segregating_sites(seqs)
    avg_pair = popgenstat.average_number_of_pairwise_differences(seqs)
    nuc_div = popgenstat.nucleotide_diversity(seqs)
    print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div))
    print("Watterson's theta: %s" % popgenstat.wattersons_theta(seqs))
    print("Tajima's D: %s" % popgenstat.tajimas_d(seqs))

In [7]:
ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta',
                                                      schema='fasta', data_type='dna')
sl_2014 = []
drc_2007 = []
ebov2007_set = dendropy.DnaCharacterMatrix()
ebov2014_set = dendropy.DnaCharacterMatrix()
for taxon, char_map in ebov_seqs.items():
    print(taxon.label)
    if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8:
        sl_2014.append(char_map)
        ebov2014_set.extend_map({taxon: char_map})
    elif taxon.label.startswith('EBOV_2007'):
        drc_2007.append(char_map)
        ebov2007_set.extend_map({taxon: char_map})
del ebov_seqs


BDBV_KC545393 18728 bp
BDBV_KC545395 18728 bp
BDBV_KC545394 18728 bp
BDBV_KC545396 18728 bp
BDBV_FJ217161 18728 bp
TAFV_FJ217162 18728 bp
EBOV_2014_KM034549 18728 bp
EBOV_2014_KM034550 18728 bp
EBOV_2014_KM034554 18728 bp
EBOV_2014_KM034555 18728 bp
EBOV_2014_KM034556 18728 bp
EBOV_2014_KM034557 18728 bp
EBOV_2014_KM034560 18728 bp
EBOV_2014_KM034551 18728 bp
EBOV_2014_KM034552 18728 bp
EBOV_2014_KM034553 18728 bp
EBOV_2014_KM034558 18728 bp
EBOV_2014_KM034559 18728 bp
EBOV_2014_KM034561 18728 bp
EBOV_2014_KM034562 18728 bp
EBOV_2014_KM034563 18728 bp
EBOV_1976_AF272001 18728 bp
EBOV_1976_KC242801 18728 bp
EBOV_1995_KC242796 18728 bp
EBOV_1995_KC242799 18728 bp
EBOV_2007_KC242784 18728 bp
EBOV_2007_KC242785 18728 bp
EBOV_2007_KC242787 18728 bp
EBOV_2007_KC242786 18728 bp
EBOV_2007_KC242789 18728 bp
EBOV_2007_KC242788 18728 bp
EBOV_2007_KC242790 18728 bp
RESTV_AB050936 18728 bp
RESTV_JX477166 18728 bp
RESTV_FJ621585 18728 bp
RESTV_JX477165 18728 bp
RESTV_FJ621583 18728 bp
RESTV_FJ621584 18728 bp
SUDV_KC242783 18728 bp
SUDV_FJ968794 18728 bp
SUDV_EU338380 18728 bp
SUDV_AY729654 18728 bp
SUDV_JN638998 18728 bp
SUDV_KC589025 18728 bp

In [14]:
print('2007 outbreak:')
print('Number of individuals: %s' % len(ebov2007_set.taxon_set))
do_basic_popgen(ebov2007_set)

print('\n2014 outbreak:')
print('Number of individuals: %s' % len(ebov2014_set.taxon_set))
do_basic_popgen(ebov2014_set)


2007 outbreak:
Number of individuals: 7
Segregating sites: 25, Avg pairwise diffs: 7.71, Nucleotide diversity 0.000412
Watterson's theta: 10.2040816327
Tajima's D: -1.38311415748

2014 outbreak:
Number of individuals: 8
Segregating sites: 6, Avg pairwise diffs: 2.79, Nucleotide diversity 0.000149
Watterson's theta: 2.31404958678
Tajima's D: 0.950120802758

In [9]:
print(len(sl_2014))
print(len(drc_2007))


8
7

In [15]:
pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)

In [17]:
print('Average number of pairwise differences irrespective of population: %.2f' %
      pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %.2f' %
      pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %.2f' %
      pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences : %.2f' %
      pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %d' %
      pair_stats.num_segregating_sites)
print("Watterson's theta: %.2f" %
      pair_stats.wattersons_theta)
print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi)
print("Tajima's D: %.2f" % pair_stats.tajimas_d)


Average number of pairwise differences irrespective of population: 284.46
Average number of pairwise differences between populations: 529.07
Average number of pairwise differences within populations: 10.50
Average number of net pairwise differences : 518.57
Number of segregating sites: 549
Watterson's theta: 168.84
Wakeley's Psi: 0.296
Tajima's D: 3.05

In [ ]: