In [3]:

    
from __future__ import print_function
import dendropy
from dendropy import popgenstat

Genes



In [4]:

    
import os
from collections import OrderedDict
genes_species = OrderedDict()
my_species = ['RESTV', 'SUDV']
my_genes = ['NP', 'L', 'VP35', 'VP40']

for name in my_genes:
    gene_name = name.split('.')[0]
    char_mat = dendropy.DnaCharacterMatrix.get_from_path('%s_align.fasta' % name, 'fasta')
    genes_species[gene_name] = {}
    
    for species in my_species:
        genes_species[gene_name][species] = dendropy.DnaCharacterMatrix()
    for taxon, char_map in char_mat.items():
        species = taxon.label.split('_')[0]
        if species in my_species:
            genes_species[gene_name][species].extend_map({taxon: char_map})



In [5]:

    
import numpy as np
import pandas as pd
summary = np.ndarray(shape=(len(genes_species), 4 * len(my_species)))
stats = ['seg_sites', 'nuc_div', 'taj_d', 'wat_theta']
for row, (gene, species_data) in enumerate(genes_species.items()):
    for col_base, species in enumerate(my_species):
        summary[row, col_base * 4] = popgenstat.num_segregating_sites(species_data[species])
        summary[row, col_base * 4 + 1] = popgenstat.nucleotide_diversity(species_data[species])
        summary[row, col_base * 4 + 2] = popgenstat.tajimas_d(species_data[species])
        summary[row, col_base * 4 + 3] = popgenstat.wattersons_theta(species_data[species])
columns = []
for species in my_species:
    columns.extend(['%s (%s)' % (stat, species) for stat in stats])
df = pd.DataFrame(summary, index=genes_species.keys(), columns=columns)
df # vs print(df)









    Out[5]:






  
    
      
      seg_sites (RESTV)
      nuc_div (RESTV)
      taj_d (RESTV)
      wat_theta (RESTV)
      seg_sites (SUDV)
      nuc_div (SUDV)
      taj_d (SUDV)
      wat_theta (SUDV)
    
  
  
    
      NP
      113
      0.020659
      -0.482275
      49.489051
      118
      0.029630
      1.203522
      56.64
    
    
      L
      288
      0.018143
      -0.295386
      126.131387
      282
      0.024193
      1.412350
      135.36
    
    
      VP35
      42
      0.017099
      -0.530330
      18.394161
      50
      0.027761
      1.069061
      24.00
    
    
      VP40
      61
      0.026155
      -0.188135
      26.715328
      41
      0.023517
      1.269160
      19.68

Genomes



In [6]:

    
def do_basic_popgen(seqs):
    num_seg_sites = popgenstat.num_segregating_sites(seqs)
    avg_pair = popgenstat.average_number_of_pairwise_differences(seqs)
    nuc_div = popgenstat.nucleotide_diversity(seqs)
    print('Segregating sites: %d, Avg pairwise diffs: %.2f, Nucleotide diversity %.6f' % (num_seg_sites, avg_pair, nuc_div))
    print("Watterson's theta: %s" % popgenstat.wattersons_theta(seqs))
    print("Tajima's D: %s" % popgenstat.tajimas_d(seqs))



In [7]:

    
ebov_seqs = dendropy.DnaCharacterMatrix.get_from_path('trim.fasta',
                                                      schema='fasta', data_type='dna')
sl_2014 = []
drc_2007 = []
ebov2007_set = dendropy.DnaCharacterMatrix()
ebov2014_set = dendropy.DnaCharacterMatrix()
for taxon, char_map in ebov_seqs.items():
    print(taxon.label)
    if taxon.label.startswith('EBOV_2014') and len(sl_2014) < 8:
        sl_2014.append(char_map)
        ebov2014_set.extend_map({taxon: char_map})
    elif taxon.label.startswith('EBOV_2007'):
        drc_2007.append(char_map)
        ebov2007_set.extend_map({taxon: char_map})
del ebov_seqs









    



BDBV_KC545393 18728 bp
BDBV_KC545395 18728 bp
BDBV_KC545394 18728 bp
BDBV_KC545396 18728 bp
BDBV_FJ217161 18728 bp
TAFV_FJ217162 18728 bp
EBOV_2014_KM034549 18728 bp
EBOV_2014_KM034550 18728 bp
EBOV_2014_KM034554 18728 bp
EBOV_2014_KM034555 18728 bp
EBOV_2014_KM034556 18728 bp
EBOV_2014_KM034557 18728 bp
EBOV_2014_KM034560 18728 bp
EBOV_2014_KM034551 18728 bp
EBOV_2014_KM034552 18728 bp
EBOV_2014_KM034553 18728 bp
EBOV_2014_KM034558 18728 bp
EBOV_2014_KM034559 18728 bp
EBOV_2014_KM034561 18728 bp
EBOV_2014_KM034562 18728 bp
EBOV_2014_KM034563 18728 bp
EBOV_1976_AF272001 18728 bp
EBOV_1976_KC242801 18728 bp
EBOV_1995_KC242796 18728 bp
EBOV_1995_KC242799 18728 bp
EBOV_2007_KC242784 18728 bp
EBOV_2007_KC242785 18728 bp
EBOV_2007_KC242787 18728 bp
EBOV_2007_KC242786 18728 bp
EBOV_2007_KC242789 18728 bp
EBOV_2007_KC242788 18728 bp
EBOV_2007_KC242790 18728 bp
RESTV_AB050936 18728 bp
RESTV_JX477166 18728 bp
RESTV_FJ621585 18728 bp
RESTV_JX477165 18728 bp
RESTV_FJ621583 18728 bp
RESTV_FJ621584 18728 bp
SUDV_KC242783 18728 bp
SUDV_FJ968794 18728 bp
SUDV_EU338380 18728 bp
SUDV_AY729654 18728 bp
SUDV_JN638998 18728 bp
SUDV_KC589025 18728 bp



In [14]:

    
print('2007 outbreak:')
print('Number of individuals: %s' % len(ebov2007_set.taxon_set))
do_basic_popgen(ebov2007_set)

print('\n2014 outbreak:')
print('Number of individuals: %s' % len(ebov2014_set.taxon_set))
do_basic_popgen(ebov2014_set)









    



2007 outbreak:
Number of individuals: 7
Segregating sites: 25, Avg pairwise diffs: 7.71, Nucleotide diversity 0.000412
Watterson's theta: 10.2040816327
Tajima's D: -1.38311415748

2014 outbreak:
Number of individuals: 8
Segregating sites: 6, Avg pairwise diffs: 2.79, Nucleotide diversity 0.000149
Watterson's theta: 2.31404958678
Tajima's D: 0.950120802758



In [9]:

    
print(len(sl_2014))
print(len(drc_2007))

8
7



In [15]:

    
pair_stats = popgenstat.PopulationPairSummaryStatistics(sl_2014, drc_2007)



In [17]:

    
print('Average number of pairwise differences irrespective of population: %.2f' %
      pair_stats.average_number_of_pairwise_differences)
print('Average number of pairwise differences between populations: %.2f' %
      pair_stats.average_number_of_pairwise_differences_between)
print('Average number of pairwise differences within populations: %.2f' %
      pair_stats.average_number_of_pairwise_differences_within)
print('Average number of net pairwise differences : %.2f' %
      pair_stats.average_number_of_pairwise_differences_net)
print('Number of segregating sites: %d' %
      pair_stats.num_segregating_sites)
print("Watterson's theta: %.2f" %
      pair_stats.wattersons_theta)
print("Wakeley's Psi: %.3f" % pair_stats.wakeleys_psi)
print("Tajima's D: %.2f" % pair_stats.tajimas_d)









    



Average number of pairwise differences irrespective of population: 284.46
Average number of pairwise differences between populations: 529.07
Average number of pairwise differences within populations: 10.50
Average number of net pairwise differences : 518.57
Number of segregating sites: 549
Watterson's theta: 168.84
Wakeley's Psi: 0.296
Tajima's D: 3.05



In [ ]:

	seg_sites (RESTV)	nuc_div (RESTV)	taj_d (RESTV)	wat_theta (RESTV)	seg_sites (SUDV)	nuc_div (SUDV)	taj_d (SUDV)	wat_theta (SUDV)
NP	113	0.020659	-0.482275	49.489051	118	0.029630	1.203522	56.64
L	288	0.018143	-0.295386	126.131387	282	0.024193	1.412350	135.36
VP35	42	0.017099	-0.530330	18.394161	50	0.027761	1.069061	24.00
VP40	61	0.026155	-0.188135	26.715328	41	0.023517	1.269160	19.68