In [1]:
from collections import Counter
from itertools import product
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
%matplotlib inline
In [2]:
def golden_figsize(height):
"""
Assuming height dimension is the shorter one, the width should be:
(1 + 5**0.5)/2
This function thus returns the (width, height) tuple which is
computed to be in a golden ratio.
"""
width = height * (1 + 5**0.5) / 2
return (width, height)
In [3]:
def is_reassortant(G, node):
return G.node[node]['reassortant']
In [4]:
G = nx.read_gpickle('20141103 All IRD Final Graph.pkl')
In [5]:
# How many edges are 'whole genome' and how many are 'reassortment'?
edge_type_counts = Counter()
for sc, sk, d in G.edges(data=True):
edge_type_counts[d['edge_type']] += 1
1 - edge_type_counts['reassortant'] / float(sum(edge_type_counts.values()))
Out[5]:
In [6]:
for n, d in G.nodes(data=True):
in_edge_types = set([d['edge_type'] for sc, sk, d in G.in_edges(n, data=True)])
if 'reassortant' in in_edge_types:
G.node[n]['reassortant'] = True
elif 'full_complement' in in_edge_types:
G.node[n]['reassortant'] = False
else:
G.node[n]['reassortant'] = False
In [7]:
n_reassortants = 0
for n in G.nodes():
if is_reassortant(G, n):
n_reassortants += 1
n_reassortants
Out[7]:
In [8]:
G.edges(data=True)[0]
Out[8]:
In [9]:
countries = set([d['country'] for n, d in G.nodes(data=True)])
In [10]:
countryG = nx.DiGraph()
for c1, c2 in product(countries, countries):
countryG.add_edge(c1, c2, weight=0)
for source, sink, data in G.edges(data=True):
source_country = G.node[source]['country']
sink_country = G.node[sink]['country']
countryG.edge[source_country][sink_country]['weight'] += 1
In [11]:
host_types = ['Human', 'Avian', 'Swine']
flows = [i for i in product(host_types, host_types)]
flows
Out[11]:
In [12]:
domainG = nx.DiGraph()
for f in flows:
domainG.add_edge(f[0], f[1], weight=0)
domainG.edges()
Out[12]:
In [13]:
for source, sink, data in G.edges(data=True):
source_host = G.node[source]['host_species']
sink_host = G.node[sink]['host_species']
for f in flows:
if f[0] in source_host and f[1] in sink_host:
domainG.edge[f[0]][f[1]]['weight'] += 1
In [14]:
domainG.edges(data=True)
Out[14]:
In [15]:
# Histogram of Edges
# What is the distribution of PWIs represented on the edges?
pwis = [d['pwi'] for _, _, d in G.edges(data=True)]
fig = plt.figure(figsize=golden_figsize(2))
plt.hist(pwis, bins=100)
plt.xlabel('Sum PWI')
plt.ylabel('Counts')
plt.title('With Reassortment Edges')
plt.vlines(np.percentile(pwis, 5), 0, 10000, 'green', label='5th Percentile')
plt.vlines(np.percentile(pwis, 50), 0, 10000, 'red', label='50th Percentile')
plt.legend(loc='upper left')
plt.subplots_adjust(left=0.17, right=0.95, top=0.88, bottom=0.18)
plt.savefig('Final Graph PWI Histogram.pdf')
In [16]:
np.percentile(pwis, 5)
Out[16]:
In [17]:
subgraphs = [g for g in nx.connected_component_subgraphs(G.to_undirected())]
len(subgraphs)
Out[17]:
In [18]:
# Subgraph Nodes
subgraph_size = [len(g.nodes()) for g in subgraphs]
fig = plt.figure(figsize=golden_figsize(2))
ax = fig.add_subplot(111)
ax.hist(subgraph_size, bins=30)
ax.set_xlabel('Number of Nodes in Subgraph', fontsize=7)
ax.set_ylabel('Counts', fontsize=7)
ax.get_xaxis().set_ticks(np.arange(0, 18000, 4000))
ax.set_xticklabels(np.arange(0, 18000, 4000), fontsize=7)
ax.set_title('Subgraph Size', fontsize=9)
ax.annotate('b.', xy=(0,1), xycoords='figure fraction', va='top', ha='left')
yticklabels = ax.get_yticklabels()
plt.setp(yticklabels, fontsize=7)
plt.subplots_adjust(bottom=0.2)
plt.savefig('Final Graph Subgraph Sizes.pdf')
In [43]:
# Let's try making a scatter plot version of this graph.
# y-axis corresponds to the collection dates represented, x-axis corresponds to the size of the subgraph.
from matplotlib import dates as mdts
from collections import defaultdict
import math
import numpy as np
from matplotlib.colors import LogNorm
xs = []
ys = []
for g in subgraphs:
for n, d in g.nodes(data=True):
cdate = d['collection_date']
ys.append(mdts.date2num(cdate))
xs.append(len(g.nodes()))
#if len(g.nodes()) < 10:
#print(d['subtype'])
yearsFmt = mdts.DateFormatter('%Y')
fig = plt.figure(figsize=golden_figsize(2))
ax = fig.add_subplot(111)
# ax.scatter(xs, ys, alpha=0.05, color='green')
# ax.hexbin(xs, ys, gridsize=(30, 20), cmap='binary', bins='log')
from datetime import datetime as dt
min_date = dt(1980, 1, 1)
max_date = dt(2015, 1, 1)
xbins = np.arange(0, 17001, 500)
ybins = np.arange(mdts.date2num(min_date), mdts.date2num(max_date), 100)
ax.hist2d(xs, ys, norm=LogNorm(), cmap='Blues', bins=[xbins, ybins])
ax.yaxis.set_major_formatter(yearsFmt)
ax.set_xlabel('Number of Nodes in Subgraph (x1000)')
ax.set_ylabel('Dates of Isolation')
ax.xaxis.set_ticklabels([0, 0, 5, 10, 15, 20])
ax.set_xlim(-1000, 20000)
# set min_date
ax.set_ylim(mdts.date2num(min_date), mdts.date2num(max_date))
ax.set_yticks(ybins[::25])
ax.set_title('Subgraph Date and Size Distribution')
ax.annotate('b.', xy=(0,1), xycoords='figure fraction', ha='left', va='top')
#plt.locator_params(axis='y', nbins=5)
plt.subplots_adjust(left=0.15, right=0.95, bottom=0.18, top=0.90)
plt.savefig('Final Graph Subgraph Sizes and Date Range.pdf')
In [20]:
# Subgraph diameters
# NOTE: This cell takes quite a bit of time to run!
# diameters = [nx.diameter(g) for g in subgraphs]
In [21]:
# diameter_counter = Counter(diameters)
# fig = plt.figure(figsize=golden_figsize(3))
# ax = fig.add_subplot(111)
# ax.bar(diameter_counter.keys(), diameter_counter.values())
# ax.set_xlabel('Graph Diameter')
# ax.set_ylabel('Count')
# ax.set_title('Histogram of Subgraph Diameters \n in Final Graph')
# plt.savefig('Final Graph Diameter Distribution.pdf', bbox_inches='tight')
In [22]:
fullG = nx.read_gpickle('20141103 All IRD Full Complement Only Graph.pkl')
In [23]:
# Histogram of Edges
# What is the distribution of PWIs represented on the edges?
pwis = [d['pwi'] for _, _, d in fullG.edges(data=True)]
fig = plt.figure(figsize=golden_figsize(2))
plt.hist(pwis, bins=100)
plt.xlabel('Sum PWI')
plt.ylabel('Counts')
plt.title('Without Reassortant Edges')
plt.vlines(np.percentile(pwis, 5), 0, 10000, 'green', label='5th Percentile')
plt.vlines(np.percentile(pwis, 50), 0, 10000, 'red', label='50th Percentile')
plt.legend(loc='upper left')
plt.subplots_adjust(left=0.17, right=0.95, top=0.88, bottom=0.18)
plt.savefig('Full Complement Graph PWI Histogram.pdf')
In [24]:
# Identify the number of nodes in the largest two subgraphs.
fullGsubgraphs = [g for g in nx.connected_component_subgraphs(fullG.to_undirected())]
fullGsubgraphs = sorted(fullGsubgraphs, key=lambda x: len(x.nodes()), reverse=True)
In [25]:
# Query the number of nodes
set([d['host_species'] for n, d in fullGsubgraphs[1].nodes(data=True)])
# len(fullGsubgraphs[1].nodes())
Out[25]:
In [26]:
# Subgraph Nodes
fullGsubgraph_size = [len(g.nodes()) for g in fullGsubgraphs]
fig = plt.figure(figsize=golden_figsize(2))
ax1 = fig.add_subplot(111)
ax2 = fig.add_subplot(211)
ax3 = fig.add_subplot(212)
ax2.hist(fullGsubgraph_size, bins=30)
ax3.hist(fullGsubgraph_size, bins=30)
# Set the y-limits
ax3.set_ylim(0,6)
ax2.set_ylim(1500, 2100)
# Set the y-ticks intervals.
ax3.set_yticks(np.arange(0, 7, 2))
ax2.set_yticks(np.arange(1500, 2101, 300))
ytklabels2 = ax2.get_yticklabels()
plt.setp(ytklabels2, fontsize=7)
ytklabels3 = ax3.get_yticklabels()
plt.setp(ytklabels3, fontsize=7)
# Remove unnecessary spines.
ax1.spines['left'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax2.spines['bottom'].set_visible(False)
ax3.spines['top'].set_visible(False)
# Remove any unnecessary axes
ax1.get_yaxis().set_ticklabels([])
ax1.get_xaxis().set_visible(False)
ax2.get_xaxis().set_visible(False)
ax3.get_xaxis().set_ticks_position('bottom')
# Add in diagonal lines.
d = 0.015
kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False)
ax2.plot((-d, +d), (-d, +d), **kwargs)
ax2.plot((1-d, 1+d), (-d, +d), **kwargs)
kwargs.update(transform=ax3.transAxes)
ax3.plot((-d, +d), (1-d, 1+d), **kwargs)
ax3.plot((1-d, 1+d), (1-d, 1+d), **kwargs)
# Set other plot parameters
# ax3.set_xlabel('Number of Nodes in Subgraph')
ax3.set_ylabel('Counts', fontsize=7)
ax2.set_title('Subgraph Size', fontsize=9)
ax3.get_xaxis().set_ticks(np.arange(0, 5000, 1000))
ax3.get_xaxis().set_ticklabels(np.arange(0, 5000, 1000), fontsize=7)
ax1.annotate('a.', xy=(0,1), xycoords='figure fraction', ha='left', va='top')
plt.subplots_adjust(bottom=0.2)
plt.savefig('Full Complement Graph Subgraph Sizes.pdf')
In [44]:
# Let's try making a scatter plot version of this graph.
# y-axis corresponds to the collection dates represented, x-axis corresponds to the size of the subgraph.
from matplotlib import dates as mdts
from collections import defaultdict
import math
import numpy as np
xs = []
ys = []
for g in fullGsubgraphs:
for n, d in g.nodes(data=True):
cdate = d['collection_date']
ys.append(mdts.date2num(cdate))
xs.append(len(g.nodes()))
yearsFmt = mdts.DateFormatter('%Y')
fig = plt.figure(figsize=golden_figsize(2))
ax = fig.add_subplot(111)
# ax.scatter(xs, ys, alpha=0.05, color='blue')
# ax.hexbin(xs, ys, gridsize=(10,20), cmap='Blues', bins='log')
from datetime import datetime as dt
min_date = dt(1980, 1, 1)
max_date = dt(2015, 1, 1)
xbins = np.arange(0, 17001, 500)
ybins = np.arange(mdts.date2num(min_date), mdts.date2num(max_date), 100)
ax.hist2d(xs, ys, norm=LogNorm(), cmap='Greens', bins=[xbins, ybins])
ax.yaxis.set_major_formatter(yearsFmt)
ax.set_xlabel('Number of Nodes in Subgraph (x1000)')
ax.xaxis.set_ticklabels([0, 0, 5, 10, 15, 20])
ax.set_ylabel('Dates of Isolation')
ax.set_xlim(-1000, 20000)
ax.set_ylim(mdts.date2num(min_date), mdts.date2num(max_date))
ax.set_yticks(ybins[::25])
ax.set_title('Subgraph Date and Size Distribution')
ax.annotate('a.', xy=(0,1), xycoords='figure fraction', ha='left', va='top')
# plt.locator_params(axis='y', nbins=4)
plt.subplots_adjust(left=0.15, right=0.95, bottom=0.18, top=0.90)
plt.savefig('Full Complement Subgraph Sizes and Date Range.pdf')
In [28]:
# What are the isolates that are not conneced in the big massive graph?
for g in subgraphs:
if len(g.nodes()) < 5:
for n in g.nodes():
print(n, g.node[n])
In [29]:
[n for n in G.nodes() if G.node[n]['subtype'] == 'H11N7']
Out[29]:
In [30]:
# Data Dump
G.nodes(data=True)[0]
Out[30]:
In [31]:
import pandas as pd
# Node List Dump
node_list = []
for n, d in G.nodes(data=True):
data = dict()
data['strain_name'] = n
for k, v in d.items():
data[k] = v
node_list.append(data)
pd.DataFrame(node_list).to_csv('20141103 All IRD Node List.csv')
In [32]:
# Edge List Dump
G.edges(data=True)[0]
Out[32]:
In [33]:
edge_list = []
for sc, sk, d in G.edges(data=True):
data = dict()
data['source'] = sc
data['sink'] = sk
for k, v in d.items():
if isinstance(v, dict):
for s, p in v.items():
data['segment{0}'.format(s)] = p
else:
data[k] = v
edge_list.append(data)
pd.DataFrame(edge_list).to_csv('20141103 All IRD Edge List.csv')
In [34]:
# Subgraph diameters
# NOTE: This cell takes quite a bit of time to run!
# fullGdiameters = [nx.diameter(g) for g in fullGsubgraphs]
In [35]:
# fullGdiameter_counter = Counter(fullGdiameters)
# plt.bar(fullGdiameter_counter.keys(), fullGdiameter_counter.values())
# plt.xlabel('Graph Diameter')
# plt.ylabel('Count')
# plt.title('Histogram of Subgraph Diameters \n in Full Complement Graph')
# plt.savefig('Full Complement Graph Diameter Distribution.pdf', bbox_inches='tight')
In [36]:
# For further analysis, write the subgraphs to disk
# if os.getcwd().split('/')[-1] == 'All_IRD_Analysis_Run2':
# os.chdir('subgraph_pkl')
# for i, g in enumerate(subgraphs):
# nx.write_gpickle(G.subgraph(g.nodes()), 'subgraph{0}.pkl'.format(i))
# if os.getcwd().split('/')[-1] == 'subgraph_pkl':
# os.chdir('..')
In [ ]: