In [1]:
%run setup.ipynb
%matplotlib inline
rcParams['figure.dpi'] = 120
rcParams['figure.facecolor'] = 'w'
import hapclust
import cartopy; print('cartopy', cartopy.__version__)
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import pandas as pd
import numpy as np
In [2]:
df_samples = phase1_ar3.df_samples
df_samples.columns
Out[2]:
In [3]:
df_haplotypes = phase1_ar31.df_haplotypes
df_haplotypes.head()
Out[3]:
In [4]:
# use the network membership to define haplotype groups
vgsc_clusters = np.load('../data/median_joining_network_membership.npy').astype('U')
clust_dict = {(l if l else 'wt'): set(np.nonzero(vgsc_clusters == l)[0])
              for l in np.unique(vgsc_clusters)}
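The array loaded above holds one cluster label per haplotype (an empty string for wild type), so clust_dict maps each label to the set of haplotype column indices carrying it. A minimal sketch of the same grouping on toy data (the labels here are invented purely for illustration):

toy = np.array(['F1', '', 'F1', 'S2'])
{(l if l else 'wt'): set(np.nonzero(toy == l)[0]) for l in np.unique(toy)}
# -> {'wt': {1}, 'F1': {0, 2}, 'S2': {3}}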
In [5]:
clust_dict.keys()
Out[5]:
In [6]:
# merge the "other resistant" groups
clust_dict['other_resistant'] = clust_dict['FX'] | clust_dict['SX']
del clust_dict['FX']
del clust_dict['SX']
In [7]:
clust_dict.keys()
Out[7]:
In [8]:
# alternative clusterings/files, kept for reference (commented out)
# with open('../data/clust_dict.pickle', 'rb') as handle:
#     clust_dict = pickle.load(handle)
# vgsc_clusters = np.load('../data/vgsc_cluster_membership.npy').astype('U')
# clust_dict = {(l if l else 'wt'): set(np.nonzero(vgsc_clusters == l)[0]) for l in np.unique(vgsc_clusters)}
# with open('../data/core_haps.pkl', mode='rb') as f:
#     clust_dict = pickle.load(f)
# core_haps_simple = dict()
# fx = set()
# sx = set()
# lx = set()
# wt = set()
# for k, v in clust_dict.items():
#     if k.startswith('FX'):
#         fx |= v
#     elif k.startswith('SX'):
#         sx |= v
#     elif k.startswith('LX'):
#         lx |= v
#     elif k.startswith('WT'):
#         wt |= v
#     else:
#         core_haps_simple[k] = v
# core_haps_simple['FX'] = fx
# core_haps_simple['SX'] = sx
# core_haps_simple['LX'] = lx
# core_haps_simple['WT'] = wt
# clust_dict = core_haps_simple
In [9]:
hap_labels = sorted(clust_dict)
# reorder for aesthetics
hap_labels = (
    [l for l in hap_labels if l.startswith('F')] +
    [l for l in hap_labels if l.startswith('S')] +
    [l for l in hap_labels if l.startswith('L')] +
    ['other_resistant', 'wt']
)
hap_labels
Out[9]:
In [10]:
def make_df_pops():
    global df_pops
    tbl_pops = (
        etl
        .wrap([
            ['pop', 'long_label', 'short_label', 'query'],
            ['AOM', 'Angola $coluzzii$', 'AO$Ac$', 'population == "AOM"'],
            ['BFM', 'Burkina Faso $coluzzii$', 'BF$Ac$', 'population == "BFM"'],
            ['BFS', 'Burkina Faso $gambiae$', 'BF$Ag$', 'population == "BFS"'],
            ['GNS', 'Guinea $gambiae$', 'GN$Ag$', 'population == "GNS"'],
            ['CMS', 'Cameroon $gambiae$', 'CM$Ag$', 'population == "CMS"'],
            ['CMS_savanna', 'Cameroon (savanna) $gambiae$', 'CM$Ag$', 'population == "CMS" and (region == "Gado-Badzere" or region == "Zembe-Borongo")'],
            ['CMS_transition', 'Cameroon (transition) $gambiae$', '', 'population == "CMS" and region == "Daiguene"'],
            ['CMS_forest', 'Cameroon (forest) $gambiae$', '', 'population == "CMS" and region == "Mayos"'],
            ['GAS', 'Gabon $gambiae$', 'GA$Ag$', 'population == "GAS"'],
            ['UGS', 'Uganda $gambiae$', 'UG$Ag$', 'population == "UGS"'],
            ['KES', 'Kenya', 'KE', 'population == "KES"'],
            ['GWA', 'Guinea-Bissau', 'GW', 'population == "GWA"'],
        ])
        .addfield('latitude', lambda row: df_samples.query(row.query).latitude.mean())
        .addfield('longitude', lambda row: df_samples.query(row.query).longitude.mean())
        .addfield('n_haps', lambda row: len(df_haplotypes.query(row.query)))
    )
    df_pops = tbl_pops.todataframe()
    df_pops = df_pops.set_index('pop')
make_df_pops()
df_pops
Out[10]:
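For reference, the petl .addfield calls above derive each new column row by row from the 'query' string. Roughly the same columns could be recomputed directly in pandas, e.g. (illustrative sketch only, assuming df_samples and df_haplotypes carry the same metadata columns used in the queries):

df_pops['latitude'] = [df_samples.query(q).latitude.mean() for q in df_pops['query']]
df_pops['n_haps'] = [len(df_haplotypes.query(q)) for q in df_pops['query']]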
In [11]:
crs_lonlat = ccrs.PlateCarree()
ratios = np.asarray([0.5, 0.5])
sum(ratios)
Out[11]:
In [12]:
def compute_hap_freqs():
    global df_freqs
    n_pops = len(df_pops)
    n_haps = len(hap_labels)
    hap_frequencies = np.zeros([n_pops, n_haps], dtype=int)
    # loop through populations, then through haplotype groups
    for i, pop in enumerate(df_pops.index):
        pop_query = df_pops.loc[pop, 'query']
        pop_hap_ixs = set(df_haplotypes.query(pop_query).index.values)
        for j, label in enumerate(hap_labels):
            core_hap_ixs = clust_dict[label]
            isec = pop_hap_ixs.intersection(core_hap_ixs)
            hap_frequencies[i, j] = len(isec)
    counts = df_pops.n_haps
    # make df for plotting
    df_freqs = pd.DataFrame(data=hap_frequencies, index=df_pops.index, columns=hap_labels)
    df_freqs['other'] = counts - df_freqs.sum(axis=1).values
    df_freqs['total'] = counts
compute_hap_freqs()
df_freqs
Out[12]:
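As a quick sanity check (not part of the original workflow), each population's group counts plus the 'other' remainder should add up to its haplotype total:

assert (df_freqs[hap_labels + ['other']].sum(axis=1) == df_freqs['total']).all()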
In [13]:
# test CMS breakdown is significant
def test_cms_breakdown():
    arr = np.asarray(df_freqs
                     .loc[['CMS_savanna', 'CMS_transition', 'CMS_forest']]
                     .iloc[:, :len(hap_labels) + 1])
    # drop all-zero columns (groups absent from all three CMS sub-populations)
    arr = arr.compress(arr.sum(axis=0) > 0, axis=1)
    return scipy.stats.chi2_contingency(arr)
test_cms_breakdown()
Out[13]:
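scipy.stats.chi2_contingency returns the test statistic, the p-value, the degrees of freedom and the table of expected counts; if the individual values are wanted they can be unpacked, e.g.:

chi2, p, dof, expected = test_cms_breakdown()
p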
In [14]:
palette = sns.color_palette('nipy_spectral', n_colors=len(hap_labels) - 2, desat=0.8)
# add a colour for other_resistant
palette.append((0, 0, 0))
# add a colour for wt
palette.append((0.9, 0.9, 0.9))
# check
sns.palplot(palette)
plt.gca().set_xticklabels(hap_labels);
In [15]:
df_pops
Out[15]:
In [16]:
def make_df_lonlat():
    global df_lonlat
    df_lonlat = df_pops[['latitude', 'longitude']].copy()
    df_lonlat['offset_latitude'] = np.zeros(len(df_lonlat))
    df_lonlat['offset_longitude'] = np.zeros(len(df_lonlat))
    # nudge overlapping pies apart; assign via .loc[row, col] so the values are
    # written into df_lonlat itself rather than a temporary copy
    df_lonlat.loc['BFS', 'offset_latitude'] = 2
    df_lonlat.loc['BFS', 'offset_longitude'] = 2
    df_lonlat.loc['BFM', 'offset_latitude'] = 2
    df_lonlat.loc['BFM', 'offset_longitude'] = -2
    df_lonlat.loc['CMS_savanna', 'offset_latitude'] = 3.5
    df_lonlat.loc['CMS_savanna', 'offset_longitude'] = 0.5
    df_lonlat.loc['CMS_transition', 'offset_latitude'] = 0.5
    df_lonlat.loc['CMS_transition', 'offset_longitude'] = 3
    df_lonlat.loc['CMS_forest', 'offset_latitude'] = -3
    df_lonlat.loc['CMS_forest', 'offset_longitude'] = 1
    df_lonlat.loc['GAS', 'offset_latitude'] = 0.5
    df_lonlat.loc['GAS', 'offset_longitude'] = -3
make_df_lonlat()
df_lonlat
Out[16]:
In [17]:
# for legend
hap_colors = {l: c for l, c in zip(hap_labels, palette)}
hap_colors
Out[17]:
In [18]:
df_lonlat.loc[['BFS', 'BFM']]
Out[18]:
In [19]:
pops_cms_whole = [p for p in df_pops.index if p not in {'CMS_savanna', 'CMS_transition', 'CMS_forest'}]
pops_cms_whole
Out[19]:
In [20]:
pops_cms_breakdown = [p for p in df_pops.index if p != 'CMS']
pops_cms_breakdown
Out[20]:
In [21]:
human_hap_labels = {k: k for k in hap_labels}
human_hap_labels['other_resistant'] = 'other $kdr$'
human_hap_labels['wt'] = '$wt$'
In [22]:
def plot_map(pops, fn=None, dpi=150):
    # map frame (lon_min, lon_max, lat_min, lat_max)
    extent_lonlat = (-20, 45, -13, 20)
    # base map
    subplot_kw = dict(projection=crs_lonlat)
    fig, ax = plt.subplots(figsize=(7, 7), dpi=120, subplot_kw=subplot_kw)
    ax.coastlines(resolution='50m', linewidth=1, zorder=1)
    ax.stock_img()
    ax.add_feature(cfeature.BORDERS, lw=1, zorder=4)
    ax.add_feature(cfeature.LAKES)
    # ax.margins(0)
    # add pies
    for pop, row in df_lonlat.loc[pops].iterrows():
        freqs = df_freqs.loc[pop]
        ratios = np.asarray([freqs[k] / freqs['total'] for k in hap_labels])
        ratios = np.append(ratios, 1 - np.sum(ratios))
        # wedgeprops is used here just to pass the zorder
        center = (row.longitude + row.offset_longitude, row.latitude + row.offset_latitude)
        radius = np.sqrt(df_freqs.loc[pop].total * .02)
        ax.pie(ratios, wedgeprops=dict(zorder=7), colors=palette,
               center=center, radius=radius, shadow=True)
        ax.add_patch(plt.Circle(xy=center, radius=radius, facecolor='none', edgecolor='k', zorder=8, lw=1))
        # draw a connector from the true sampling location to the offset pie centre
        if row.offset_latitude != 0 or row.offset_longitude != 0:
            ax.plot([row.longitude, row.longitude + row.offset_longitude],
                    [row.latitude, row.latitude + row.offset_latitude], 'k-', lw=2)
        lbl = df_pops.loc[pop].short_label
        ax.text(center[0], center[1] + radius, lbl, ha='center', va='bottom', fontsize=8, fontweight='bold',
                bbox=dict(edgecolor='w', facecolor='w', pad=1, alpha=.8), zorder=6)
    ax.set_extent(extent_lonlat, crs=crs_lonlat)
    # legend
    handles = [mpl.patches.Patch(facecolor=hap_colors[k], edgecolor='k', label=human_hap_labels[k])
               for k in hap_labels]
    leg = ax.legend(handles=handles, bbox_to_anchor=(1, 1), loc='upper left', title='Haplotype\ngroup')
    leg._legend_box.align = "left"
    # save
    if fn:
        fig.savefig(fn, dpi=dpi, bbox_inches='tight')
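A note on the pie sizing above: with radius = sqrt(0.02 * n), the area of each pie (pi * r**2 = 0.02 * pi * n) is proportional to the number of haplotypes sampled, so a population with four times as many haplotypes gets a pie of twice the radius. A quick illustration:

np.sqrt(0.02 * np.array([100, 400]))
# -> array([1.414..., 2.828...]): 4x haplotypes -> 2x radius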
In [23]:
plot_map(pops=pops_cms_whole, fn='../artwork/outbreak_map_base.pdf')
In [24]:
plot_map(pops=pops_cms_breakdown, fn='../artwork/outbreak_map_base_cms_breakdown.pdf')
In [ ]: