In [1]:
%run ../../shared_setup.ipynb
In [2]:
# load variation data
sample_exclusions = dup_samples.copy()
for cross in excessive_recomb_samples:
sample_exclusions[cross] += excessive_recomb_samples[cross]
callsets = load_callsets(COMBINED_CALLSET_FN_TEMPLATE,
sample_exclusions=sample_exclusions,
variant_filter='FILTER_PASS',
call_filter=combined_conf_calls)
samples = {cross: callsets[cross]['calldata'].dtype.names
for cross in CROSSES}
progeny = {cross: samples[cross][2:] for cross in CROSSES}
n_progeny = {cross: len(progeny[cross]) for cross in CROSSES}
print(n_progeny)
print(np.sum(list(n_progeny.values())))
In [3]:
tbl_co = etl.frompickle(os.path.join(PUBLIC_DIR, 'tbl_co.pickle'))
display_with_nrows(tbl_co, caption='CO events')
In [4]:
(tbl_co
.valuecounts('cross')
.addfield('count_per_meiosis',
lambda row: row['count'] / n_progeny[row['cross']])
.display(caption='CO events by cross')
)
In [5]:
# the simple method...
X = tbl_co.valuecounts('sample').values('count').list()
n = len(X)
assert np.sum(list(n_progeny.values())) == n
print('meioses:', n)
print('crossovers:', np.sum(X))
mu_hat = np.mean(X)
print('total map length: %.2f' % mu_hat, 'Morgan')
mu_stderr = np.sqrt(mu_hat / n)
print('map length stderr:', mu_stderr)
mu_95ci = 1.96 * mu_stderr
print('map length 95%% CI: %.2f - %.2f' % (mu_hat - mu_95ci, mu_hat + mu_95ci))
In [6]:
# calculate marker span using accessibility regions, it's fairer and simpler
tbl_marker_span = (
tbl_regions_1b
.eq('region_type', 'Core')
.aggregate(key='region_chrom',
aggregation={'start': ('region_start', min), 'stop': ('region_stop', max)})
.rename('region_chrom', 'chrom')
.addfield('span', lambda row: row.stop - row.start)
)
total_marker_span = tbl_marker_span.values('span').sum()
tbl_marker_span.displayall(caption='Core genome span (total = %.2f)' % (total_marker_span/1e6))
In [7]:
co_hat = (mu_hat / (total_marker_span / 1e6))
print('CO rate: %.4f Morgan/Mb' % co_hat)
co_95ci_lower = (mu_hat - mu_95ci) / (total_marker_span / 1e6)
co_95ci_upper = (mu_hat + mu_95ci) / (total_marker_span / 1e6)
print('CO rate 95%% CI: %.4f - %.4f Morgan/Mb' % (co_95ci_lower, co_95ci_upper))
print()
print('CO rate: %.1f kb/cM' % (10/co_hat))
print('CO rate 95%% CI: %.1f - %.1f kb/cM' % (10/co_95ci_upper, 10/co_95ci_lower))
In [8]:
tbl_co.valuecounts('sample', 'cross').head(5).display()
tbl_co.valuecounts('sample', 'cross').tail(5).display()
In [9]:
# the simple method...
for cross in CROSSES:
print()
print(LABELS[cross])
X = tbl_co.eq('cross', cross).valuecounts('sample').values('count').list()
n = len(X)
assert n_progeny[cross] == n
print('meioses:', n)
print('crossovers:', np.sum(X))
mu_hat = np.mean(X)
print('total map length: %.2f' % mu_hat, 'Morgan')
mu_stderr = np.sqrt(mu_hat / n)
print('map length stderr:', mu_stderr)
mu_95ci = 1.96 * mu_stderr
print('map length 95%% CI: %.2f - %.2f' % (mu_hat - mu_95ci, mu_hat + mu_95ci))
In [10]:
df_co_by_sample = (
tbl_co
.valuecounts('cross', 'sample')
.sort(key=('cross', 'sample'))
.rename('count', 'n_co')
.todataframe()
)
df_co_by_sample.head()
Out[10]:
In [11]:
vals = [df_co_by_sample[df_co_by_sample.cross == cross]['n_co'].values for cross in CROSSES]
f_val, p_val = stats.f_oneway(*vals)
print('ANOVA', f_val, p_val)
h_val, p_val = stats.kruskal(*vals)
print('Kruskal', h_val, p_val)
In [12]:
len(df_co_by_sample)
Out[12]:
In [14]:
plt.boxplot?
In [26]:
def plot_co_rate_by_cross(ax):
X = df_co_by_sample.n_co
F = df_co_by_sample.cross
vals = [X[F == cross] for cross in CROSSES]
sns.despine(ax=ax)
sns.despine(ax=ax, offset=5)
#sns.violinplot(vals, ax=ax, linewidth=.5, )
#sns.boxplot(vals, ax=ax)
ax.boxplot(vals, notch=True, bootstrap=10000, medianprops=dict(linewidth=1))
ax.set_xticklabels([LABELS[cross] for cross in CROSSES], fontsize=8)
ax.set_ylim(0, 30)
ax.set_ylabel('genetic map length (Morgan)')
In [27]:
width = 8/3
height = width
fig, ax = plt.subplots(figsize=(width, height))
ax.set_xlabel('cross', color='w')
plot_co_rate_by_cross(ax)
ax.set_title('A', fontweight='bold')
fig.tight_layout()
fig.savefig('../../artwork/main/fig3A.jpg', dpi=900, jpeg_quality=100)
In [15]:
lkp_span = tbl_marker_span.lookupone('chrom', 'span')
lkp_span
Out[15]:
In [16]:
tbl_co_by_chrom = [['cross', 'sample', 'chrom', 'count']]
for cross, sample in sorted(tbl_co.values(('cross', 'sample')).set()):
for chrom in CHROMOSOMES:
n = tbl_co.select(lambda row: row.cross == cross and row.sample == sample and row.chrom == chrom).nrows()
tbl_co_by_chrom.append([cross, sample, chrom, n])
tbl_co_by_chrom = (etl
.wrap(tbl_co_by_chrom)
.addfield('span', lambda row: lkp_span[str(row.chrom, 'ascii')])
)
tbl_co_by_chrom.display()
In [17]:
df_co_by_chrom = tbl_co_by_chrom.todataframe()
In [18]:
plot = sns.lmplot('span', 'count', df_co_by_chrom, x_estimator=np.mean, col='cross')
plot.set(ylim=(0, 4));
In [19]:
def plot_co_rate_by_chrom(ax, scatter_kws=dict()):
sns.despine(ax=ax)
sns.offset_spines(ax=ax)
sns.regplot('span', 'count', df_co_by_chrom, x_estimator=np.mean, scatter_kws=scatter_kws, ax=ax)
ax.set_xlabel('chromosome marker span (bp)')
ax.set_ylabel('genetic map length (Morgan)')
ax.set_ylim(0, 3)
In [20]:
width = 8/3
height = width
fig, ax = plt.subplots(figsize=(width, height))
plot_co_rate_by_chrom(ax, scatter_kws=dict(s=12))
ax.set_xticks(range(0, 3500000, 1000000))
ax.set_xticklabels(range(0, 4, 1))
ax.set_xlabel('chromosome marker span (Mbp)')
ax.set_title('B', fontweight='bold')
fig.tight_layout()
fig.savefig('../../artwork/main/fig3B.jpg', dpi=900, jpeg_quality=100)
TODO redo with newer genome annotations to get an extra centromere
In [21]:
def distance_to_centromere(row):
cen_id = 'PF3D7_CEN' + str(row.chrom[6:8], 'ascii')
cen = lkp_feature[cen_id]
cen_pos = (cen['feature_start'] + cen['feature_stop'])/2
return abs(row.co_pos_mid - cen_pos)
df_co_cen = (
tbl_co
.lt('co_pos_range', 10000) # require 10kb certainty
.ne('chrom', b'Pf3D7_10_v3') # no centromere
.todataframe()
)
df_co_cen['cen_dist'] = df_co_cen.apply(distance_to_centromere, axis=1)
print(len(df_co_cen))
df_co_cen.head()
Out[21]:
In [22]:
def get_cen_pos(chrom):
cen_id = 'PF3D7_CEN' + chrom[6:8]
cen = lkp_feature[cen_id]
return cen['feature_start'], cen['feature_stop']
tbl_chrom_cen = (
tbl_regions_1b
.ne('region_chrom', 'Pf3D7_10_v3')
.eq('region_type', 'Core')
.aggregate(key='region_chrom',
aggregation={'core_start': ('region_start', min), 'core_stop': ('region_stop', max)})
.rename('region_chrom', 'chrom')
.addfield('cen', lambda row: get_cen_pos(row.chrom))
.unpack('cen', ['cen_start', 'cen_stop'])
.addfield('cen_pos', lambda row: (row.cen_start + row.cen_stop)/2)
)
tbl_chrom_cen
Out[22]:
In [23]:
def plot_co_rate_cen(window_size=20000, ax=None, scatter_kws=dict()):
XX = list() # distance to centromere
YY = list() # CO recombination rate
# N.B., we need to account for the fact that some chromosomes are bigger than others.
# Consider each chromosome separately, move upstream and downstream of centromere separately.
# Gather what data there are.
# Should mean that estimates of recombination rate further from centromere have greater uncertainty.
# However should not mean there is any other effect of chromosome size.
for row in tbl_chrom_cen.records():
# downstream
max_dist = row.core_stop - row.cen_pos
bins = np.arange(0, max_dist, window_size)
X = (bins[:-1] + bins[1:])/2.
# recombination rate
D = df_co_cen[(df_co_cen.chrom == row.chrom.encode('ascii')) & (df_co_cen.co_pos_min > row.cen_pos)].cen_dist
Y, _ = np.histogram(D, bins=bins)
XX.append(X)
YY.append(Y)
# upstream
min_dist = row.cen_pos - row.core_start
bins = np.arange(0, min_dist, window_size)
X = (bins[:-1] + bins[1:])/2.
# recombination rate
D = df_co_cen[(df_co_cen.chrom == row.chrom.encode('ascii')) & (df_co_cen.co_pos_max < row.cen_pos)].cen_dist
Y, _ = np.histogram(D, bins=bins)
XX.append(X)
YY.append(Y)
XX = np.concatenate(XX)
YY = np.concatenate(YY)
YY = YY / np.sum(list(n_progeny.values())) # Morgan
YY = YY / (window_size/1e6) # Morgan/Mbp
if ax is None:
fig, ax = plt.subplots()
sns.despine(ax=ax, offset=5)
sns.regplot(XX, YY, x_estimator=np.mean, x_ci=95, ax=ax, fit_reg=False, scatter_kws=scatter_kws)
ax.set_xlim(0, 700000)
ax.set_ylim(0, 2)
ax.set_ylabel('CO recombination rate (Morgan/Mbp)')
ax.set_xlabel('distance from centromere (bp)')
ax.axhline(np.mean(YY), linestyle=':', linewidth=.5)
In [24]:
width = 8/3
height = width
fig, ax = plt.subplots(figsize=(width, height))
plot_co_rate_cen(30000, ax=ax, scatter_kws=dict(s=12))
ax.set_xlim(0, 510000)
ax.set_xticks(range(0, 600000, 100000))
ax.set_xticklabels(range(0, 600, 100))
ax.set_xlabel('distance from centromere (kbp)')
ax.set_title('C', fontweight='bold')
fig.tight_layout()
fig.savefig('../../artwork/main/fig3C.jpg', dpi=900, jpeg_quality=100)
In [25]:
8 * 2.54
Out[25]:
In [26]:
(16/3)*2.54
Out[26]:
In [ ]: