In the reviews provided, Reviewer 1 requested an analysis that showed that reassortment is favoured between different wild bird species. In the review, one specific example brought up was about genome transfer (clonal & reassortment) between gulls and mallards. In this notebook, I attempt this analysis.
In [100]:
import networkx as nx
import custom_funcs as cf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('paper')
sns.set_style('white')
%matplotlib inline
from joblib import Parallel, delayed
from time import time
from tqdm import tqdm
In [58]:
# Read in the graph data and clean it.
G = nx.read_gpickle('20150902_all_ird Final Graph.pkl')
G = cf.clean_host_species_names(G)
G = cf.impute_reassortant_status(G)
G = cf.impute_weights(G)
G = cf.remove_zero_weighted_edges(G)
In [59]:
G.nodes(data=True)[3]
Out[59]:
In [62]:
# Encode any gull species as "Gull" in the "host_label" field.
for n, d in G.nodes(data=True):
if d['host_species'] == 'Gull' or 'Gull' in d['host_species']:
G.node[n]['host_label'] = 'Gull'
elif d['host_species'] == 'Mallard':
G.node[n]['host_label'] = 'Mallard'
else:
G.node[n]['host_label'] = 'Unknown'
In [63]:
[n for n, d in G.nodes(data=True) if d['host_label'] == 'Gull']
Out[63]:
In [64]:
'Gull' in 'Herring Gull'
Out[64]:
In [65]:
set([d['host_species'] for n, d in G.nodes(data=True)])
Out[65]:
In [66]:
def counts_reassortant_domain_graph(G, node_attr):
"""
Computes the weighted counts of reassortant edges when going between different node attributes.
Returns a "domain graph" with counts of weighted reasosrtant edges and sum weighted edges.
"""
hg_graph = nx.DiGraph()
for n, node_d in G.nodes(data=True):
in_edges = G.in_edges(n, data=True)
total_edges = len(in_edges)
is_reassortant = node_d['reassortant']
sk_hg = G.node[n][node_attr]
if sk_hg not in hg_graph.nodes():
hg_graph.add_node(sk_hg)
for sc, _, edge_d in in_edges:
sc_hg = G.node[sc][node_attr]
if sc_hg not in hg_graph.nodes():
hg_graph.add_node(sc_hg)
if (sc_hg, sk_hg) not in hg_graph.edges():
hg_graph.add_edge(sc_hg, sk_hg, total=edge_d['weight'], reassortant=0)
if (sc_hg, sk_hg) in hg_graph.edges():
hg_graph.edge[sc_hg][sk_hg]['total'] += edge_d['weight']
if is_reassortant:
hg_graph.edge[sc_hg][sk_hg]['reassortant'] += edge_d['weight']
for sc, sk, d in hg_graph.edges(data=True):
hg_graph.edge[sc][sk]['p_reassortant'] = d['reassortant'] / d['total']
return hg_graph
In [67]:
# Compute the proportion reassortant across different host class pairs.
hg = counts_reassortant_domain_graph(G, 'host_label')
hg.edges(data=True)
Out[67]:
In [68]:
# A helper function for computing the null distribution.
def null_proportion_domain_graph_reassortant(G, node_attr, equally=False):
G_shuffled = cf.shuffle_node_attribute_label(G, node_attr, equally)
hg_graph_shuf = counts_reassortant_domain_graph(G_shuffled, node_attr)
return hg_graph_shuf
In [72]:
# Compute the null distribution.
### CAUTION! YOU WILL HAVE TO WAIT 3 MINUTES FOR THIS TO FINISH!
start = time()
results = Parallel(n_jobs=-1)(delayed(null_proportion_domain_graph_reassortant)(G, 'host_label', equally=True) for i in range(100))
len(results)
end = time()
print(end - start)
As shown in the analysis, there are insufficient numbers to compare reassortment between Mallards and Gulls (of any species). Within species, it is possible to do so for Mallards but not Gulls. At p_reassortant = 0.23, I would expect this (intuitively) to be greater than the null.
In [86]:
# Summarize the proportion reassortant distribution under null.
def distr_null_p_reassortant(list_of_hg_graphs):
hg_graph = nx.DiGraph()
for g in tqdm(list_of_hg_graphs):
hg_graph.add_nodes_from(g.nodes())
for sc, sk, d in g.edges(data=True):
if (sc, sk) not in hg_graph.edges():
hg_graph.add_edge(sc, sk, p_reassortant=[d['p_reassortant']])
else:
hg_graph.edge[sc][sk]['p_reassortant'].append(d['p_reassortant'])
return hg_graph
In [89]:
summaryG = distr_null_p_reassortant(results)
In [91]:
# Remove "unknowns" from consideration
for n, d in summaryG.nodes(data=True):
if 'Unknown' in n:
summaryG.remove_node(n)
summaryG.edges(data=True)
Out[91]:
In [95]:
# Grab out the "null" model statistics.
means = [] # mean of the distribution under null.
stds = [] # standard deviation of distribution under null.
names = [] # names
# grab out the 1st, 5th, 95th and 99th percentile of null distribution
percs = dict()
for p in [0.5, 5, 95, 99.5]:
percs[p] = []
name_map = {'Gull': 'Gull',
'Mallard': 'Mallard'}
# Reverse name_map for convenience
key_map = {v:k for k, v in name_map.items()}
for sc, sk, d in sorted(summaryG.edges(data=True), key=lambda x:(x[0], x[1])):
mean = np.mean(d['p_reassortant'])
std = np.std(d['p_reassortant'])
names.append('{0}:{1}'.format(name_map[sc], name_map[sk]))
means.append(mean)
stds.append(std)
for p in [0.5, 5, 95, 99.5]:
percs[p].append(np.percentile(d['p_reassortant'], p))
In [96]:
# Compile the "data" statistics.
data = []
names_data = []
log10weights = []
log10reassort = []
log10clonal = []
for sc, sk, d in sorted(hg.edges(data=True), key=lambda x:(x[0], x[1])):
if sc == 'Unknown' or sk == 'Unknown':
pass
else:
names_data.append('{0}:{1}'.format(name_map[sc], name_map[sk]))
data.append(d['p_reassortant'])
log10weights.append(np.log10(d['total']))
log10reassort.append(np.log10(d['reassortant']))
log10clonal.append(np.log10(d['total'] - d['reassortant']))
data
Out[96]:
In [115]:
# Plot data vs. null model.
fig = plt.figure(figsize=(4,3))
ind = np.arange(len(means))
width = 0.35
ax = fig.add_subplot(1,1,1)
ax.bar(ind, means, width=width,
color='blue',
label='Null',
#yerr=np.array(stds)*3,
yerr=[np.array(means) - percs[0.5],
percs[99.5] - np.array(means)],
alpha=0.3)
ax.bar(ind+width, data, width=width, color='blue', label='Data')
ax.set_xticks(ind+width)
ax.set_xticklabels(names, rotation=45, ha='right')
ax.set_ylabel('Proportion Reassortant')
ax.set_xlabel('Bird Transition')
for i, label in enumerate(ax.get_xaxis().get_ticklabels()):
if log10weights[i] > 3 or log10reassort[i] > 1:
label.set_weight('bold')
ax2 = ax.twinx()
ax2.scatter(ind+width, log10weights, color='orange', label='Total', alpha=0.3)
ax2.scatter(ind+width, log10reassort, color='green', label='Reassortant', alpha=0.3)
ax2.set_ylabel('log10(Num. Events)')
ax.legend(loc='upper left')
ax2.legend(loc='upper center')
ax2.axhline(y=1, color='green', alpha=0.3, linestyle='--')
ax2.axhline(y=3, color='orange', alpha=0.3, linestyle='--')
# ax.annotate('B', xy=(0,1), xycoords='figure fraction', va='top', ha='left')
# plt.legend()
plt.subplots_adjust(left=0.10, right=0.92, bottom=0.23)
plt.savefig('figures/Proportion Reassortant Gull Mallard.pdf', bbox_inches='tight')
In [ ]: