In [65]:
import networkx as nx
import custom_funcs as cf
import pandas as pd
import matplotlib.pyplot as plt
import dendropy
from Levenshtein import distance
from collections import defaultdict, Counter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment
from itertools import product
%load_ext autoreload
%autoreload 2
%matplotlib inline
The autoreload extension is already loaded. To reload it, use:
%reload_ext autoreload
In [2]:
# Load the pickled graph from disk, then pass it through each of the project's
# preprocessing helpers in turn (host-name cleanup, reassortant-status imputation,
# edge-weight imputation -- descriptions inferred from the helper names; see custom_funcs).
# NOTE(review): read_gpickle unpickles the file, so only load graph files from a trusted source.
G = nx.read_gpickle('20150902_all_ird Final Graph.pkl')
for preprocess in (cf.clean_host_species_names,
                   cf.impute_reassortant_status,
                   cf.impute_weights):
    G = preprocess(G)
In [3]:
# Spot-check a single entry of the (node, attribute-dict) sequence to see which metadata
# fields each node carries.
# NOTE(review): integer indexing assumes G.nodes(data=True) returns a list -- confirm this
# holds for the installed networkx version.
G.nodes(data=True)[1286]
Out[3]:
('A/Tennessee/F2018A/2011',
{'collection_date': Timestamp('2011-01-24 00:00:00'),
'country': 'USA',
'host_species': 'Human',
'reassortant': False,
'state': 'Tennessee',
'subtype': 'H3N2'})
In [4]:
# Load the host-species table, using the first CSV column as the row index.
# Each row is one host species; the columns hold its TOL species name and URL, its
# `sequence`, its BOLD URL, and free-text notes. Hosts that could not be resolved
# have NaN in those fields.
hosts_with_coi = pd.read_csv('host_species.csv', index_col=0)
hosts_with_coi
Out[4]:
host_species
TOL_species_name
TOL_url
sequence
BOLD_url
notes
0
Sparrow
NaN
NaN
NaN
NaN
ambiguous term
1
American Green-Winged Teal
Anas carolinensis
http://tolweb.org/Anas_carolinensis/89249
TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
2
Turkey
Meleagris gallopavo
http://tolweb.org/Meleagris/57202
GTGACTTTCATCAACCGATGATTATTTTCAACCAACCATAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
3
Semipalmated Sandpiper
Calidris pusilla
http://tolweb.org/Calidris_pusilla/90811
GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
4
Murre
NaN
NaN
NaN
NaN
not available on TOL
5
Heron
NaN
NaN
NaN
NaN
more than one record on TOL
6
Wood Duck
Aix sponsa
http://tolweb.org/Aix/89196
CTTGTACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
7
Myna
NaN
NaN
NaN
NaN
not available on TOL
8
Ostrich
Struthio camelus
http://tolweb.org/Struthio_camelus/26289
GTGACCTTCATTACTCGATGACTTTTTTCAACAAATCACAAAGACA...
http://boldsystems.org/index.php/Public_Record...
NaN
9
Ruddy Turnstone
Arenaria interpres
http://tolweb.org/Arenaria_interpres/90804
GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
10
Sanderling
Calidris alba
http://tolweb.org/Calidris_alba/90810
GTGACTTTTATCAACCGATGACTATTCTCAACCAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
11
Camel
Camelus dromedarius
http://tolweb.org/Camelus_dromedarius/30349
CTAAGCTTATTAATTCGTGCTGAATTGGGGCAGCCTGGGACATTGC...
http://boldsystems.org/index.php/Public_Record...
NaN
12
Redhead
Aythya americana
http://tolweb.org/Aythya_americana/89260
ATGGCACACCAAGCACACTCCTACCACATAGTAGACCCAAGCCCCT...
http://boldsystems.org/index.php/Public_Record...
NaN
13
Eagle
NaN
NaN
NaN
NaN
ambiguous term
14
Mallard
Anas platyrhynchos
http://tolweb.org/Anas_platyrhynchos/89217
GGGGCATGAGCCGGAATAATTGGCACAGCACTCAGCCTACTGATCC...
http://boldsystems.org/index.php/Public_Record...
NaN
15
Swan
NaN
NaN
NaN
NaN
ambiguous term
16
Great Crested Grebe
Podiceps cristatus
http://tolweb.org/Podiceps_cristatus/89462
TCTATACTTAATCTTTGGTGCATGAGCCGGCATAGTCGGCACCGCC...
http://boldsystems.org/index.php/Public_Record...
NaN
17
Herring Gull
Larus smithsonianus
http://tolweb.org/Larus_smithsonianus/129496
TAGGTACTGCCCTCAGCCTGCTTATCCGTGCAGAACTTGGCCAACC...
http://boldsystems.org/index.php/Public_Record...
also have european herring gull on TOL
18
Northern Shoveler
Anas clypeata
http://tolweb.org/Anas_clypeata/89233
CAAGACATTGGCACTCTATACCTTATCTTCGGGGCATGAGCCGGAA...
http://boldsystems.org/index.php/Public_Record...
NaN
19
Blue-Winged Teal
Anas discors
http://tolweb.org/Anas_discors/89228
TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
20
Greater White-Fronted Goose
Anser albifrons
http://tolweb.org/Anser_albifrons/89141
GTGACCTTCATCAACCGATGACTATTTTCCACTAACCATAAGGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
21
White-Rumped Sandpiper
Calidris fuscicollis
http://tolweb.org/Calidris_fuscicollis/90818
GTGACTTTTATTAATCGATGACTATTCTCAACCAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
22
Ring-Billed Gull
Larus delawarensis
http://tolweb.org/Larus_delawarensis/90604
GTGACCTTTATCAATCGATGATTATTTTCAACAAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
23
Parrot
NaN
NaN
NaN
NaN
ambiguous term
24
Tundra Swan
Cygnus columbianus
http://tolweb.org/Cygnus_columbianus/89161
GTGACCTTCATCAACCGATGACTATTTTCCACTAACCATAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
25
Ring-Necked Duck
Aythya collaris
http://tolweb.org/Aythya_collaris/89267
CCTATATCTTATCTTTGGGGCATGAGCCGGAATAATCGGCACAGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
26
Chicken
Gallus gallus
http://tolweb.org/Gallus/57162
ATCGTCACAGCCCATGCTTTCGTCATAATCTTCTTTATAGTTATAC...
http://boldsystems.org/index.php/Public_Record...
other terms found, but gallus gallus is the kn...
27
American Black Duck
Anas rubripes
http://tolweb.org/Anas_rubripes/89216
TATACCTTATCTTCGGGACATGAGCCGGAATAATTGGCACAGCACT...
http://boldsystems.org/index.php/Public_Record...
NaN
28
Whiskered Tern
Chlidonias hybrida
http://tolweb.org/Chlidonias_hybrida/90682
GTGACCTTCATCAACCGATGATTATTTTCAACAAACCACAAAGATA...
http://boldsystems.org/index.php/Public_Record...
NaN
29
Mink
NaN
NaN
NaN
NaN
not available on TOL
...
...
...
...
...
...
...
122
Gadwall
Anas strepera
http://tolweb.org/Anas_strepera/89210
CTTATCTTCGGGGCATGGGCCGGAATAATTGGCACAGCACTCAGCC...
http://boldsystems.org/index.php/Public_Record...
NaN
123
Gull
NaN
NaN
NaN
NaN
ambiguous term
124
Barnacle Goose
Branta leucopsis
http://tolweb.org/Branta_leucopsis/89153
CTCATCTTCGGAGCATGAGCAGGAATAGTCGGCACCGCACTCAGCC...
http://boldsystems.org/index.php/Public_Record...
NaN
125
Panda
Ailuropoda melanoleuca
https://en.wikipedia.org/wiki/Giant_panda
ATGTTCATTAACCGATGACTGTTTTCCACCAACCACAAAGATATTG...
http://boldsystems.org/index.php/Public_Record...
species name found on Wikipedia
126
Condor
NaN
NaN
NaN
NaN
ambiguous term
127
Unknown
NaN
NaN
NaN
NaN
ambiguous term
128
Flycatcher
NaN
NaN
NaN
NaN
ambiguous term
129
Waterfowl
NaN
NaN
NaN
NaN
ambiguous term
130
Grey Heron
Ardea cinerea
http://tolweb.org/Ardea_cinerea/89637
ATCTTCGGAGCATGAGCCGGCATAATTGGAACCGCCCTAAGCCTAC...
http://boldsystems.org/index.php/Public_Record...
NaN
131
Grebe
NaN
NaN
NaN
NaN
ambiguous term
132
Mallard-Black Duck Hybrid
NaN
NaN
NaN
NaN
not available on TOL
133
Iceland Gull
Larus glaucoides
http://tolweb.org/Larus/90592
TCTTCGGCGCATGAGCTGGCATAGTAGGTACTGCCCTCAGCCTGCT...
http://boldsystems.org/index.php/Public_Record...
NaN
134
Shoveler
NaN
NaN
NaN
NaN
ambiguous term
135
Snow Goose
Chen caerulescens
http://tolweb.org/Chen_caerulescens/89145
CCTATACCTCATNTTCGGGGCATGAGCAGGAATAGTCGGCACCGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
136
Sea Mammal
NaN
NaN
NaN
NaN
ambiguous term
137
Domestic Cat
Felis catus
http://tolweb.org/Felis_catus/123531
TCCGGGCCGAACTGGGCCAACCTGGTACACTACTAGGAGATGATCA...
http://boldsystems.org/index.php/Public_Record...
NaN
138
Red Knot
Calidris canutus
http://tolweb.org/Calidris_canutus/90809
TTTTCTCCAACCCACAAAGACATTGGCACCCTATACCTAATCTTCG...
http://boldsystems.org/index.php/Public_Record...
NaN
139
Spot-Billed Duck
NaN
NaN
NaN
NaN
ambiguous term
140
Quail
NaN
NaN
NaN
NaN
ambiguous term
141
Coot
NaN
NaN
NaN
NaN
ambiguous term
142
Magpie
Anseranas semipalmata
http://tolweb.org/Anseranas_semipalmata/26294
GTGACCTTCATTAACCGCTGACTATTCTCAACTAACCATAAAGACA...
http://boldsystems.org/index.php/Public_Record...
NaN
143
Babbler
Eupetes macrocerus
http://tolweb.org/Eupetidae/79175
NaN
NaN
not available on BOLD
144
Guinea Fowl
NaN
NaN
NaN
NaN
ambiguous term
145
Black Scoter
Melanitta nigra
http://tolweb.org/Melanitta_nigra/89281
CTTATCTACNCGGCATGAGCCGGAATAATTGGCACAGCACTCAGCC...
http://boldsystems.org/index.php/Public_Record...
NaN
146
Magpie Robin
NaN
NaN
NaN
NaN
not available on TOL
147
Pigeon
NaN
NaN
NaN
NaN
ambiguous term
148
Sharp-Tailed Sandpiper
Calidris acuminata
http://tolweb.org/Calidris_acuminata/90821
GTGACTTTCATCAACCGATGATTATTCTCAACCAACCACAAAGACA...
http://boldsystems.org/index.php/Public_Record...
NaN
149
Common Goldeneye
Bucephala clangula
http://tolweb.org/Bucephala_clangula/89286
TTCTCCAACCACAAAGACATTGGCACCCTATATCTTATCTTCGGAG...
http://boldsystems.org/index.php/Public_Record...
NaN
150
Chukar
NaN
NaN
NaN
NaN
ambiguous term
151
Garganey
Anas querquedula
http://tolweb.org/Anas_querquedula/89246
TCTATACCTTATCTTCGGGGCATGAGCCGGAATAATTGGCACAGCA...
http://boldsystems.org/index.php/Public_Record...
NaN
152 rows × 6 columns
In [5]:
# Gather the COI sequence of every host that has one and write them all to a single
# FASTA file, which is the input for the multiple sequence alignment.
# Record ids have the form '<host_species>.<TOL_species_name>' with spaces turned into
# underscores.
hosts_with_sequence = hosts_with_coi[hosts_with_coi['sequence'].notnull()]
coi_sequences = [
    SeqRecord(
        Seq(row['sequence']),
        id='{0}.{1}'.format(row['host_species'].replace(' ', '_'),
                            row['TOL_species_name'].replace(' ', '_')),
    )
    for _, row in hosts_with_sequence.iterrows()
]
SeqIO.write(coi_sequences, 'host_coi_unaligned.fasta', 'fasta')
Out[5]:
74
In [6]:
# After aligning using clustal omega (default parameters), load back the alignment.
# The alignment is an external step: 'host_coi_aligned.fasta' is not written by any cell
# shown here and must already exist on disk (presumably produced from
# 'host_coi_unaligned.fasta' -- confirm).
coi_aligned = AlignIO.read('host_coi_aligned.fasta', 'fasta')
coi_aligned
Out[6]:
<<class 'Bio.Align.MultipleSeqAlignment'> instance (74 records of length 1638, SingleLetterAlphabet()) at 7f6e604a9710>
In [7]:
# Count the gap characters ('-') in every alignment column and plot the counts against
# column position; the plot is used to decide where the alignment should be trimmed.
num_gaps = {
    position: coi_aligned[:, position].count('-')
    for position in range(coi_aligned.get_alignment_length())
}
positions = list(num_gaps.keys())
gap_counts = list(num_gaps.values())
plt.plot(positions, gap_counts)
plt.xlabel('position in alignment')
plt.ylabel('number of gap characters')
Out[7]:
<matplotlib.text.Text at 0x7f6e602967f0>
In [8]:
# Given the distribution of gaps plotted above, trim the alignment with a cut-off of
# 3 gaps: any alignment column containing more than MAX_GAPS_PER_COLUMN gap characters
# is removed. Column labels of the result are the original alignment positions.
MAX_GAPS_PER_COLUMN = 3

# One row per aligned record (labelled by record id), one column per alignment position.
index = [record.id for record in coi_aligned]
coi_untrimmed = pd.DataFrame(
    [list(str(record.seq)) for record in coi_aligned],
    index=index,
)

# Count gaps for all columns at once and keep the acceptable columns in a single
# selection, rather than dropping columns one at a time (each drop copies the frame).
# A dedicated name is used so the `num_gaps` dict from the plotting cell is not clobbered.
gaps_per_column = (coi_untrimmed == '-').sum(axis=0)
coi_df = coi_untrimmed.loc[:, gaps_per_column <= MAX_GAPS_PER_COLUMN]
coi_df
Out[8]:
111
112
113
114
115
116
117
118
119
120
...
736
737
738
739
740
741
742
743
744
745
Human.Homo_sapiens
T
T
G
G
T
T
C
G
G
G
...
T
C
G
T
G
A
T
G
T
C
Kelp_Gull.Larus_dominicanus
A
T
G
G
C
C
C
C
A
A
...
A
A
C
A
C
T
A
A
C
C
Baikal_Teal.Anas_formosa
A
T
G
G
C
C
C
C
A
A
...
A
A
C
C
C
T
A
A
C
C
Redhead.Aythya_americana
-
C
A
A
C
C
T
C
A
-
...
C
G
A
G
G
C
A
T
T
C
Ferret.Mustela_putorius
A
T
C
C
G
T
G
C
T
G
...
A
C
A
C
T
T
A
T
T
T
Sloth_Bear.Melursus_ursinus
A
T
T
C
G
T
G
C
C
G
...
A
C
A
C
T
T
G
T
T
C
Panda.Ailuropoda_melanoleuca
A
T
C
C
G
T
G
C
T
G
...
A
C
A
T
C
T
A
T
T
T
Camel.Camelus_dromedarius
A
T
T
C
G
T
G
C
T
G
...
A
C
A
C
C
T
A
T
T
T
Horse.Equus_ferus_caballus
A
T
C
C
G
T
G
C
T
G
...
A
C
A
C
C
T
A
T
T
C
Domestic_Cat.Felis_catus
-
T
C
C
G
G
G
C
C
G
...
A
C
A
C
T
T
A
T
T
C
Swine.Sus_scrofa
A
T
T
C
G
C
G
C
T
G
...
A
C
A
C
T
T
G
T
T
C
Turkey.Meleagris_gallopavo
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
Japanese_Quail.Coturnix_japonica
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
T
T
A
T
T
T
Chicken.Gallus_gallus
-
-
-
-
-
-
-
-
-
-
...
A
C
A
C
C
T
A
T
T
C
Chinese_Francolin.Francolinus_pintadeanus
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Garganey.Anas_querquedula
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
Northern_Shoveler.Anas_clypeata
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
Blue-Winged_Teal.Anas_discors
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
-
-
-
Cinnamon_Teal.Anas_cyanoptera
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
-
-
-
American_Green-Winged_Teal.Anas_carolinensis
A
T
C
C
G
C
G
C
A
G
...
G
C
A
C
C
T
A
T
T
T
Green-Winged_Teal.Anas_carolinensis
A
T
C
C
G
C
G
C
A
G
...
G
C
A
C
C
T
A
T
T
T
Mallard.Anas_platyrhynchos
A
T
C
C
G
G
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
American_Black_Duck.Anas_rubripes
A
T
C
C
G
G
G
C
A
G
...
G
C
A
C
C
T
A
T
T
T
American_Wigeon.Anas_americana
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
Gadwall.Anas_strepera
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
T
Red-Crested_Pochard.Netta_rufina
A
T
C
C
G
C
G
C
A
G
...
A
C
A
T
C
T
A
T
T
C
Hooded_Merganser.Lophodytes_cucullatus
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
G
T
T
C
Rosy-Billed_Pochard.Netta_peposaca
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Ring-Necked_Duck.Aythya_collaris
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Canvasback.Aythya_valisineria
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
Snow_Goose.Chen_caerulescens
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Pink-Footed_Goose.Anser_brachyrhynchus
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Bar-Headed_Goose.Anser_indicus
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Canada_Goose.Branta_canadensis
A
T
C
C
G
C
G
C
A
G
...
G
C
A
C
C
T
A
T
T
C
Barnacle_Goose.Branta_leucopsis
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Whiskered_Tern.Chlidonias_hybrida
A
T
T
C
G
T
G
C
A
G
...
A
C
A
T
C
T
A
T
T
C
Common_Murre.Uria_aalge
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Brown-Headed_Gull.Larus_brunnicephalus
A
T
T
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Laughing_Gull.Larus_atricilla
A
T
T
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Glaucous_Gull.Larus_hyperboreus
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Ring-Billed_Gull.Larus_delawarensis
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Herring_Gull.Larus_smithsonianus
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
-
-
-
Iceland_Gull.Larus_glaucoides
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
T
Ostrich.Struthio_camelus
A
T
T
C
G
T
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Peregrine_Falcon.Falco_peregrinus
A
T
T
C
G
A
A
C
A
G
...
A
C
A
C
C
T
A
T
T
C
Saker_Falcon.Falco_cherrug
A
T
T
C
G
A
A
C
A
G
...
A
C
A
C
T
T
A
T
T
C
Ruddy_Turnstone.Arenaria_interpres
A
T
T
C
G
C
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
Sharp-Tailed_Sandpiper.Calidris_acuminata
A
T
T
C
G
T
G
C
A
G
...
G
C
A
C
C
T
G
T
T
C
Red-Necked_Stint.Calidris_ruficollis
A
T
C
C
G
T
G
C
A
G
...
A
C
A
C
C
T
T
T
T
C
Sanderling.Calidris_alba
A
T
T
C
G
A
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
White-Rumped_Sandpiper.Calidris_fuscicollis
A
T
T
C
G
T
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
Semipalmated_Sandpiper.Calidris_pusilla
A
T
T
C
G
T
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
Least_Sandpiper.Calidris_minutilla
A
T
T
C
G
T
G
C
A
G
...
G
C
A
C
C
T
C
T
T
C
Dunlin.Calidris_alpina
A
T
T
C
G
T
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
Red_Knot.Calidris_canutus
A
T
T
C
G
C
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Little_Egret.Egretta_garzetta
A
T
C
C
G
A
G
C
T
G
...
A
C
A
C
C
T
A
T
T
C
Magpie.Anseranas_semipalmata
A
T
C
C
G
C
G
C
A
G
...
G
C
A
C
C
T
C
T
T
C
Grey_Heron.Ardea_cinerea
A
T
C
C
G
A
G
C
T
G
...
A
C
A
T
C
T
C
T
T
C
Great_Crested_Grebe.Podiceps_cristatus
A
T
C
C
G
C
G
C
A
G
...
A
C
A
C
C
T
C
T
T
C
Little_Grebe.Tachybaptus_ruficollis
A
T
C
C
G
T
G
C
A
G
...
A
C
A
T
C
T
C
T
T
C
74 rows × 580 columns
In [9]:
# Re-join each row of single-letter columns in `coi_df` into one sequence string, wrap it
# in a SeqRecord whose id is the row label, and write all records to a FASTA file.
# `iterrows` already yields the row's letters, so they are joined directly; this avoids
# the `.ix` indexer (removed in pandas 1.0) and repeated string concatenation.
trimmed_coi = [
    SeqRecord(Seq(''.join(letters)), description='', id=host_name, name='')
    for host_name, letters in coi_df.iterrows()
]
SeqIO.write(trimmed_coi, 'host_coi_trimmed.fasta', 'fasta')
Out[9]:
74
In [10]:
# Display the trimmed records to check ids and sequences by eye.
trimmed_coi
Out[10]:
[SeqRecord(seq=Seq('TTGGTTCGGGGTATGG-----------------------GGTTAGCAGCGGTGT...GTC', Alphabet()), id='Human.Homo_sapiens', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATGGCCCCAAATCTACGAAAATCTCACCCCCTCCTCAAAATAGTTAACAACTCA...ACC', Alphabet()), id='Kelp_Gull.Larus_dominicanus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATGGCCCCAAATATCCGCAAATCCCACCCCCTACTAAAAATAATCAACAACTCC...ACC', Alphabet()), id='Baikal_Teal.Anas_formosa', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('-CAACCTCA---------------GGACTAATCATATGATTCCACTATAACTCA...TTC', Alphabet()), id='Redhead.Aythya_americana', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCTGAACTAGGTCAACCTGGCACTCTGCTAGGAGACGACCAGATTTAT...TTT', Alphabet()), id='Ferret.Mustela_putorius', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCCGAACTAGGTCAACCCGGGGCTCTGTTGGGGGATGATCAGATCTAC...TTC', Alphabet()), id='Sloth_Bear.Melursus_ursinus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGTCAGCCTGGAGCTCTGTTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Panda.Ailuropoda_melanoleuca', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCTGAATTGGGGCAGCCTGGGACATTGCTTGGAGATGACCAAATCTAT...TTT', Alphabet()), id='Camel.Camelus_dromedarius', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGCCAACCTGGGACCCTACTAGGAGATGATCAGATCTAC...TTC', Alphabet()), id='Horse.Equus_ferus_caballus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('-TCCGGGCCGAACTGGGCCAACCTGGTACACTACTAGGAGATGATCAGATTTAC...TTC', Alphabet()), id='Domestic_Cat.Felis_catus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCTGAACTAGGTCAGCCCGGAACCCTACTTGGCGATGATCAAATCTAC...TTC', Alphabet()), id='Swine.Sus_scrofa', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTGGGACAACCTGGGACACTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Turkey.Meleagris_gallopavo', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGCACCCTCCTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Japanese_Quail.Coturnix_japonica', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('------------------------------------------------------...TTC', Alphabet()), id='Chicken.Gallus_gallus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGGCAACCCGGAACCCTCTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Chinese_Francolin.Francolinus_pintadeanus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGAACCCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Garganey.Anas_querquedula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGGACTCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Northern_Shoveler.Anas_clypeata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAGCTTGGTCAACCCGGGACTCTCCTGGGCGATGACCAAATTTAC...---', Alphabet()), id='Blue-Winged_Teal.Anas_discors', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAGCTTGGTCAACCCGGGACTCTCCTGGGCGATGACCAAATTTAC...---', Alphabet()), id='Cinnamon_Teal.Anas_cyanoptera', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGGGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Mallard.Anas_platyrhynchos', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGGGCAGAGCTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='American_Black_Duck.Anas_rubripes', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGGACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Wigeon.Anas_americana', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Gadwall.Anas_strepera', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATCTAC...TTC', Alphabet()), id='Red-Crested_Pochard.Netta_rufina', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTAGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Hooded_Merganser.Lophodytes_cucullatus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCGGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Rosy-Billed_Pochard.Netta_peposaca', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Ring-Necked_Duck.Aythya_collaris', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Canvasback.Aythya_valisineria', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Tufted_Duck.Aythya_fuligula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Lesser_Scaup.Aythya_affinis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Greater_Scaup.Aythya_marila', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCTGAACTAGGCCAGCCAGGAACCCTCCTAGGTGATGACCAAATTTAT...TTC', Alphabet()), id='Wood_Duck.Aix_sponsa', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCGGGAACCCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Black_Scoter.Melanitta_nigra', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Bufflehead.Bucephala_albeola', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Common_Goldeneye.Bucephala_clangula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCCGGAACCCTCCTAGGTGATGACCAAATTTAC...TTT', Alphabet()), id='Common_Eider.Somateria_mollissima', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAACCAGGAACCCTCCTGGGTGATGACCAAATTTAC...TTC', Alphabet()), id='Long-Tailed_Duck.Clangula_hyemalis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACCCTCCTCGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Mute_Swan.Cygnus_olor', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCGGGAACTCTCCTTGGTGACGACCAGATCTAT...TTC', Alphabet()), id='Tundra_Swan.Cygnus_columbianus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCAGGAACCCTCCTTGGTGACGACCAGATCTAC...TTC', Alphabet()), id='Whooper_Swan.Cygnus_cygnus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCAGGGACTCTCCTGGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Muscovy_Duck.Cairina_moschata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Greater_White-Fronted_Goose.Anser_albifrons', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Snow_Goose.Chen_caerulescens', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Pink-Footed_Goose.Anser_brachyrhynchus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Bar-Headed_Goose.Anser_indicus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Canada_Goose.Branta_canadensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGTGACGACCAAATTTAC...TTC', Alphabet()), id='Barnacle_Goose.Branta_leucopsis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCAGGAACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Whiskered_Tern.Chlidonias_hybrida', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGGACCCTCCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Common_Murre.Uria_aalge', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGGGACGACCAAATCTAT...TTC', Alphabet()), id='Brown-Headed_Gull.Larus_brunnicephalus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Laughing_Gull.Larus_atricilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Glaucous_Gull.Larus_hyperboreus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGGACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Ring-Billed_Gull.Larus_delawarensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...---', Alphabet()), id='Herring_Gull.Larus_smithsonianus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Iceland_Gull.Larus_glaucoides', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAATTAGGACAACCAGGGACACTACTTGGAGACGATCAAATCTAC...TTC', Alphabet()), id='Ostrich.Struthio_camelus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGAACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Peregrine_Falcon.Falco_peregrinus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGGACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Saker_Falcon.Falco_cherrug', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGTCAACCGGGGACCCTCTTAGGAGACGATCAAATTTAC...TTC', Alphabet()), id='Ruddy_Turnstone.Arenaria_interpres', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCCGGAACTCTCTTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Sharp-Tailed_Sandpiper.Calidris_acuminata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAGCCCGGAACCCTTCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Red-Necked_Stint.Calidris_ruficollis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAGCAGAACTAGGTCAACCCGGGACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Sanderling.Calidris_alba', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGAACTCTTTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='White-Rumped_Sandpiper.Calidris_fuscicollis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAGATCTAC...TTC', Alphabet()), id='Semipalmated_Sandpiper.Calidris_pusilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Least_Sandpiper.Calidris_minutilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGGACTCTTTTAGGAGATGACCAAATTTAC...TTC', Alphabet()), id='Dunlin.Calidris_alpina', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCCGGAACCCTCTTAGGAGATGACCAAATTTAT...TTC', Alphabet()), id='Red_Knot.Calidris_canutus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGCCAGCCAGGAACGCTCCTAGGAGACGACCAGATCTAT...TTC', Alphabet()), id='Little_Egret.Egretta_garzetta', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACCCTCCTAGGCGATGACCAAATCTAT...TTC', Alphabet()), id='Magpie.Anseranas_semipalmata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGACAACCAGGGACGCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Grey_Heron.Ardea_cinerea', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAGCCAGGAACCCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Great_Crested_Grebe.Podiceps_cristatus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGAACCCTTCTAGGAGACGACCAGATCTAC...TTC', Alphabet()), id='Little_Grebe.Tachybaptus_ruficollis', name='', description='', dbxrefs=[])]
In [11]:
# Keep only the trimmed COI records whose sequence contains no gap character ('-'),
# so that the remaining sequences can be used to build a phylogenetic tree.
no_gaps = [record for record in trimmed_coi if '-' not in record.seq]
no_gaps
Out[11]:
[SeqRecord(seq=Seq('ATCCGTGCTGAACTAGGTCAACCTGGCACTCTGCTAGGAGACGACCAGATTTAT...TTT', Alphabet()), id='Ferret.Mustela_putorius', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCCGAACTAGGTCAACCCGGGGCTCTGTTGGGGGATGATCAGATCTAC...TTC', Alphabet()), id='Sloth_Bear.Melursus_ursinus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGTCAGCCTGGAGCTCTGTTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Panda.Ailuropoda_melanoleuca', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCTGAATTGGGGCAGCCTGGGACATTGCTTGGAGATGACCAAATCTAT...TTT', Alphabet()), id='Camel.Camelus_dromedarius', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCTGAATTAGGCCAACCTGGGACCCTACTAGGAGATGATCAGATCTAC...TTC', Alphabet()), id='Horse.Equus_ferus_caballus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCTGAACTAGGTCAGCCCGGAACCCTACTTGGCGATGATCAAATCTAC...TTC', Alphabet()), id='Swine.Sus_scrofa', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTGGGACAACCTGGGACACTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Turkey.Meleagris_gallopavo', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGCACCCTCCTAGGAGATGACCAAATTTAC...TTT', Alphabet()), id='Japanese_Quail.Coturnix_japonica', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGGCAACCCGGAACCCTCTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Chinese_Francolin.Francolinus_pintadeanus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGAACCCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Garganey.Anas_querquedula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTTGGTCAACCAGGGACTCTCCTGGGCGATGACCAAATTTAC...TTT', Alphabet()), id='Northern_Shoveler.Anas_clypeata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='Green-Winged_Teal.Anas_carolinensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGGGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Mallard.Anas_platyrhynchos', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGGGCAGAGCTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='American_Black_Duck.Anas_rubripes', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGGACCCTCCTGGGCGACGACCAAATTTAC...TTT', Alphabet()), id='American_Wigeon.Anas_americana', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAGCCAGGGACCCTCCTGGGCGACGACCAAATTTAT...TTT', Alphabet()), id='Gadwall.Anas_strepera', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATCTAC...TTC', Alphabet()), id='Red-Crested_Pochard.Netta_rufina', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTAGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Hooded_Merganser.Lophodytes_cucullatus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCGGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Rosy-Billed_Pochard.Netta_peposaca', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Ring-Necked_Duck.Aythya_collaris', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Canvasback.Aythya_valisineria', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTGGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Tufted_Duck.Aythya_fuligula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Lesser_Scaup.Aythya_affinis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGCCAACCAGGAACCCTCCTAGGTGATGACCAGATTTAC...TTC', Alphabet()), id='Greater_Scaup.Aythya_marila', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCTGAACTAGGCCAGCCAGGAACCCTCCTAGGTGATGACCAAATTTAT...TTC', Alphabet()), id='Wood_Duck.Aix_sponsa', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCGGGAACCCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Black_Scoter.Melanitta_nigra', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAACCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Bufflehead.Bucephala_albeola', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTCGGCCAGCCAGGGACCCTCCTGGGCGATGACCAAATTTAC...TTC', Alphabet()), id='Common_Goldeneye.Bucephala_clangula', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCCGGAACCCTCCTAGGTGATGACCAAATTTAC...TTT', Alphabet()), id='Common_Eider.Somateria_mollissima', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAACCAGGAACCCTCCTGGGTGATGACCAAATTTAC...TTC', Alphabet()), id='Long-Tailed_Duck.Clangula_hyemalis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACCCTCCTCGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Mute_Swan.Cygnus_olor', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCGGGAACTCTCCTTGGTGACGACCAGATCTAT...TTC', Alphabet()), id='Tundra_Swan.Cygnus_columbianus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGACAACCAGGAACCCTCCTTGGTGACGACCAGATCTAC...TTC', Alphabet()), id='Whooper_Swan.Cygnus_cygnus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCAGGGACTCTCCTGGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Muscovy_Duck.Cairina_moschata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAGCCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Greater_White-Fronted_Goose.Anser_albifrons', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Snow_Goose.Chen_caerulescens', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Pink-Footed_Goose.Anser_brachyrhynchus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Bar-Headed_Goose.Anser_indicus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGCGACGACCAAATTTAC...TTC', Alphabet()), id='Canada_Goose.Branta_canadensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGGACTCTCCTAGGTGACGACCAAATTTAC...TTC', Alphabet()), id='Barnacle_Goose.Branta_leucopsis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCAGGAACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Whiskered_Tern.Chlidonias_hybrida', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGGACCCTCCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Common_Murre.Uria_aalge', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGGGACGACCAAATCTAT...TTC', Alphabet()), id='Brown-Headed_Gull.Larus_brunnicephalus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Laughing_Gull.Larus_atricilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Glaucous_Gull.Larus_hyperboreus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGGACCCTCCTAGGAGACGACCAAATCTAT...TTC', Alphabet()), id='Ring-Billed_Gull.Larus_delawarensis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTTGGCCAACCCGGAACCCTCCTAGGAGACGACCAAATCTAT...TTT', Alphabet()), id='Iceland_Gull.Larus_glaucoides', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAATTAGGACAACCAGGGACACTACTTGGAGACGATCAAATCTAC...TTC', Alphabet()), id='Ostrich.Struthio_camelus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGAACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Peregrine_Falcon.Falco_peregrinus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAACAGAACTTGGCCAACCAGGGACTCTCCTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Saker_Falcon.Falco_cherrug', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGTCAACCGGGGACCCTCTTAGGAGACGATCAAATTTAC...TTC', Alphabet()), id='Ruddy_Turnstone.Arenaria_interpres', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGCCAACCCGGAACTCTCTTAGGAGACGATCAAATCTAT...TTC', Alphabet()), id='Sharp-Tailed_Sandpiper.Calidris_acuminata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAGCCCGGAACCCTTCTAGGAGATGACCAAATCTAT...TTC', Alphabet()), id='Red-Necked_Stint.Calidris_ruficollis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGAGCAGAACTAGGTCAACCCGGGACCCTCCTAGGAGATGATCAAATCTAC...TTC', Alphabet()), id='Sanderling.Calidris_alba', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGAACTCTTTTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='White-Rumped_Sandpiper.Calidris_fuscicollis', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAGATCTAC...TTC', Alphabet()), id='Semipalmated_Sandpiper.Calidris_pusilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCCGGAACCCTTTTAGGAGATGACCAAATCTAC...TTC', Alphabet()), id='Least_Sandpiper.Calidris_minutilla', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGTGCAGAACTAGGTCAACCTGGGACTCTTTTAGGAGATGACCAAATTTAC...TTC', Alphabet()), id='Dunlin.Calidris_alpina', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATTCGCGCAGAACTAGGCCAACCCGGAACCCTCTTAGGAGATGACCAAATTTAT...TTC', Alphabet()), id='Red_Knot.Calidris_canutus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGCCAGCCAGGAACGCTCCTAGGAGACGACCAGATCTAT...TTC', Alphabet()), id='Little_Egret.Egretta_garzetta', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTAGGACAACCAGGAACCCTCCTAGGCGATGACCAAATCTAT...TTC', Alphabet()), id='Magpie.Anseranas_semipalmata', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGAGCTGAACTTGGACAACCAGGGACGCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Grey_Heron.Ardea_cinerea', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGCGCAGAACTGGGCCAGCCAGGAACCCTCCTAGGAGACGACCAAATCTAC...TTC', Alphabet()), id='Great_Crested_Grebe.Podiceps_cristatus', name='', description='', dbxrefs=[]),
SeqRecord(seq=Seq('ATCCGTGCAGAACTAGGCCAACCAGGAACCCTTCTAGGAGACGACCAGATCTAC...TTC', Alphabet()), id='Little_Grebe.Tachybaptus_ruficollis', name='', description='', dbxrefs=[])]
In [12]:
# Write the `no_gaps` records to disk twice: once as FASTA and once as PHYLIP.
# SeqIO.write returns the number of records written; only the second call's
# return value is displayed as the cell output.
SeqIO.write(no_gaps, 'host_coi_nogaps.fasta', 'fasta')
SeqIO.write(no_gaps, 'host_coi_nogaps.phylip', 'phylip')
Out[12]:
65
In [13]:
# Collect the pairwise distances between every unordered pair of sequences in
# `no_gaps`.
# NOTE: `Levenshtein.distance` is the Levenshtein (edit) distance, not the
# Hamming distance; the two coincide only when the cheapest edit script consists
# of substitutions alone.
from itertools import combinations
from Levenshtein import distance

distances = [
    distance(str(rec_a.seq), str(rec_b.seq))
    for rec_a, rec_b in combinations(no_gaps, 2)
]
In [14]:
# Histogram (matplotlib's default 10 bins) of the pairwise sequence distances.
plt.hist(distances)
Out[14]:
(array([ 23., 14., 38., 232., 225., 193., 692., 334., 239., 90.]),
array([ 0. , 14.9, 29.8, 44.7, 59.6, 74.5, 89.4, 104.3,
119.2, 134.1, 149. ]),
<a list of 10 Patch objects>)
In [15]:
# Run RAxML on the FASTA alignment; the run ID given with -n becomes the suffix
# of the RAxML_* output files.
# NOTE(review): per the captured output, -T has no effect with this
# (sequential) binary, and RAxML exits without running when output files with
# this run ID already exist, so stale RAxML_*.host_coi_nogaps.tree files must be
# removed before this cell can be re-run.
!! raxmlHPC -p 100 -# 3 -m GTRGAMMA -s host_coi_nogaps.fasta -n host_coi_nogaps.tree -T 2
Out[15]:
['Option -T does not have any effect with the sequential or parallel MPI version.',
'It is used to specify the number of threads for the Pthreads-based parallelization',
'',
"RAxML can't, parse the alignment file as phylip file ",
'it will now try to parse it as FASTA file',
'',
'RAxML output files with the run ID <host_coi_nogaps.tree> already exist ',
'in directory /home/ericmjl/influenza-global-reassortment/ ...... exiting']
In [16]:
# List the files ending in .tree in the working directory (the RAxML outputs).
!! ls *.tree
Out[16]:
['RAxML_bestTree.host_coi_nogaps.tree', 'RAxML_info.host_coi_nogaps.tree']
In [26]:
# Load the best RAxML tree and build the patristic-distance matrix over it.
from dendropy import Tree
from dendropy.calculate.treemeasure import PatristicDistanceMatrix

# Open the Newick file in a context manager so the handle is closed as soon as
# the tree has been parsed (the bare open(...) previously leaked it).
with open('RAxML_bestTree.host_coi_nogaps.tree', 'r') as tree_file:
    coi_tree = Tree.get(file=tree_file, schema='newick')

coi_pds = PatristicDistanceMatrix(coi_tree)
# Last expression of the cell: shown as the cell output.
coi_pds.sum_of_distances()
Out[26]:
1015.7242854563826
In [31]:
# Take the taxa attached to the first two leaves of the tree as a sample pair.
leaves = coi_tree.leaf_nodes()
taxon1 = leaves[0].taxon
taxon2 = leaves[1].taxon
In [50]:
# Inspect the string form of the second leaf's taxon. The string is kept in its
# own variable so that `taxon2` remains the taxon object: rebinding `taxon2` to
# a str here would pass a string, not a taxon, to the distance lookup in the
# next cell when the notebook is run top to bottom.
taxon2_label = str(coi_tree.leaf_nodes()[1].taxon)
# Strip any quote characters from the label for display.
taxon2_label.replace("'", "")
Out[50]:
'Horse.Equus ferus caballus'
In [34]:
# Patristic distance between the two sample taxa (the matrix object is callable).
coi_pds(taxon1, taxon2)
Out[34]:
0.7119165546950544
In [67]:
# Build an undirected graph keyed by host name in which every edge stores the
# patristic distance between the two hosts under the 'pd' attribute, and collect
# the same distances in `pds` for plotting.
patristic_distances = nx.Graph()
pds = []

# Fetch the leaves once; the same list feeds both sides of the product.
leaves = coi_tree.leaf_nodes()

# product() yields every ordered pair, including each leaf paired with itself,
# so `pds` holds each distance in both directions plus the zero self-distances.
for leaf_a, leaf_b in product(leaves, leaves):
    # Bound to `dist` rather than `pd` so the `pandas as pd` alias imported at
    # the top of the notebook is not overwritten by a float.
    dist = coi_pds(leaf_a.taxon, leaf_b.taxon)
    # Taxon labels have the form "<host>.<species>"; drop quote characters and
    # keep only the host part as the node name.
    host_a = str(leaf_a.taxon).replace("'", "").split('.')[0]
    host_b = str(leaf_b.taxon).replace("'", "").split('.')[0]
    patristic_distances.add_edge(host_a, host_b, pd=dist)
    pds.append(dist)
In [68]:
# Histogram (matplotlib's default 10 bins) of the collected patristic distances.
plt.hist(pds)
Out[68]:
(array([ 403., 782., 364., 896., 990., 106., 88., 162., 354., 80.]),
array([ 0. , 0.12690987, 0.25381974, 0.38072961, 0.50763948,
0.63454935, 0.76145921, 0.88836908, 1.01527895, 1.14218882,
1.26909869]),
<a list of 10 Patch objects>)
In [69]:
# Largest patristic distance among all leaf pairs.
max(pds)
Out[69]:
1.2690986907853712
In [70]:
# Smallest patristic distance; `pds` includes each leaf paired with itself.
min(pds)
Out[70]:
0.0
In [71]:
# Pickle the host patristic-distance graph to disk.
# NOTE(review): presumably the supp_data/ directory must already exist — confirm.
nx.write_gpickle(patristic_distances, 'supp_data/patristic_distances.pkl')
In [73]:
# Show every host adjacent to 'Mallard' together with its edge data.
# Subscripting the graph returns the node's adjacency mapping; the `Graph.edge`
# attribute used previously was removed in networkx 2.0.
patristic_distances['Mallard']
Out[73]:
{'American Black Duck': {'pd': 0.00978530479098707},
'American Green-Winged Teal': {'pd': 0.0760159068606335},
'American Wigeon': {'pd': 0.05329789418179376},
'Bar-Headed Goose': {'pd': 0.16440706032963423},
'Barnacle Goose': {'pd': 0.1525078793951098},
'Black Scoter': {'pd': 0.1313820887207964},
'Brown-Headed Gull': {'pd': 0.5359870461849322},
'Bufflehead': {'pd': 0.13844900323749146},
'Camel': {'pd': 1.139610961267169},
'Canada Goose': {'pd': 0.1487864476189279},
'Canvasback': {'pd': 0.12777814104578533},
'Chinese Francolin': {'pd': 0.41679378743034057},
'Common Eider': {'pd': 0.12320986574712678},
'Common Goldeneye': {'pd': 0.1250179531566235},
'Common Murre': {'pd': 0.5037977010375587},
'Dunlin': {'pd': 0.5541468716317777},
'Ferret': {'pd': 1.0052196033816883},
'Gadwall': {'pd': 0.06077763709949219},
'Garganey': {'pd': 0.09094589474066593},
'Glaucous Gull': {'pd': 0.5522423313203503},
'Great Crested Grebe': {'pd': 0.4322418927584293},
'Greater Scaup': {'pd': 0.12209641915797556},
'Greater White-Fronted Goose': {'pd': 0.16375245824132825},
'Green-Winged Teal': {'pd': 0.0760159068606335},
'Grey Heron': {'pd': 0.4983191919597433},
'Hooded Merganser': {'pd': 0.15361574839833744},
'Horse': {'pd': 1.0398314617667486},
'Iceland Gull': {'pd': 0.5585970770247295},
'Japanese Quail': {'pd': 0.4123314279477942},
'Laughing Gull': {'pd': 0.5538684305736757},
'Least Sandpiper': {'pd': 0.5538078786053702},
'Lesser Scaup': {'pd': 0.12529268690501263},
'Little Egret': {'pd': 0.45458216513256133},
'Little Grebe': {'pd': 0.435422329525462},
'Long-Tailed Duck': {'pd': 0.13113589994386673},
'Magpie': {'pd': 0.5184911095605531},
'Mallard': {'pd': 0.0},
'Muscovy Duck': {'pd': 0.17436409315181028},
'Mute Swan': {'pd': 0.1909289111215217},
'Northern Shoveler': {'pd': 0.0944879302701608},
'Ostrich': {'pd': 0.6269793620310405},
'Panda': {'pd': 1.0143060249254658},
'Peregrine Falcon': {'pd': 0.5833636672043966},
'Pink-Footed Goose': {'pd': 0.16125627242943497},
'Red Knot': {'pd': 0.514820128052132},
'Red-Crested Pochard': {'pd': 0.19320986169215124},
'Red-Necked Stint': {'pd': 0.5528505340557295},
'Ring-Billed Gull': {'pd': 0.5619518993833915},
'Ring-Necked Duck': {'pd': 0.12738246219395677},
'Rosy-Billed Pochard': {'pd': 0.13739837307773234},
'Ruddy Turnstone': {'pd': 0.49730290885553485},
'Saker Falcon': {'pd': 0.6160120296332262},
'Sanderling': {'pd': 0.556583392075901},
'Semipalmated Sandpiper': {'pd': 0.5321150884894232},
'Sharp-Tailed Sandpiper': {'pd': 0.5191468062037256},
'Sloth Bear': {'pd': 1.0620796615274277},
'Snow Goose': {'pd': 0.1604615363043253},
'Swine': {'pd': 0.7635753681642896},
'Tufted Duck': {'pd': 0.12898171178406181},
'Tundra Swan': {'pd': 0.19277471442754854},
'Turkey': {'pd': 0.4339315353387678},
'Whiskered Tern': {'pd': 0.5655726881069701},
'White-Rumped Sandpiper': {'pd': 0.5635020165658157},
'Whooper Swan': {'pd': 0.18976871641150603},
'Wood Duck': {'pd': 0.16351074763269205}}
In [ ]:
Content source: ericmjl/influenza-global-reassortment
Similar notebooks: