``````

In [1]:

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import matplotlib
import seaborn as sns
import scipy

from IPython.display import HTML

``````

# Assessing the relative impacts of constraints and calibrations on dating accuracy

``````

In [2]:

HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
\$('div.input').hide();
} else {
\$('div.input').show();
}
code_show = !code_show
}
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

``````
``````

Out[2]:

code_show=true;
function code_toggle() {
if (code_show){
\$('div.input').hide();
} else {
\$('div.input').show();
}
code_show = !code_show
}

The raw code for this IPython notebook is by default hidden for easier reading.

``````

## Analysis of 800 MCMC runs on simulated data.

The simulation was performed on 10 different trees, used to generate alignments. For each tree, we gathered up to 5 calibration points (0, 1, 3, 5), and up to 5 constraints (0, 1, 3, 5). We ran the MCMC in each combination of conditions (=16 combinations), in 5 replicates, changing the nature of the calibrations and constraints. So in total we have \$10~trees*16~conditions*5~replicates=800~points\$.

``````

In [52]:

colNames = ["treeId","numCalib","numCons","numReplicate","correlation", "rmsd", "correlation_bl", "rmsd_bl", "numNodes","numInHPD","fracInHPD","percent0","percent25","percent50","percent75","percent100"]

``````
``````

In [53]:

d.describe()

``````
``````

Out[53]:

text-align: right;
}

text-align: left;
}

.dataframe tbody tr th {
vertical-align: top;
}

treeId
numCalib
numCons
numReplicate
correlation
rmsd
correlation_bl
rmsd_bl
numNodes
numInHPD
fracInHPD
percent0
percent25
percent50
percent75
percent100

count
807.000000
807.000000
807.000000
807.000000
807.000000
807.000000
807.000000
807.000000
807.0
807.000000
807.000000
807.000000
807.000000
807.000000
807.000000
807.000000

mean
5.493185
2.221809
3.453532
2.964064
0.879407
3.173555
0.663324
2.508961
29.0
21.944238
75.669786
0.685938
2.922134
4.667950
6.361106
8.722972

std
2.873623
1.928401
10.988151
1.446292
0.075069
1.896199
0.145286
1.116477
0.0
4.278530
14.753552
0.222735
1.001510
1.496446
2.064386
3.387293

min
1.000000
0.000000
0.000000
0.000000
0.606824
0.444335
0.169987
0.556378
29.0
4.000000
13.793103
0.008700
0.263751
0.403635
0.514055
0.664912

25%
3.000000
0.000000
1.000000
2.000000
0.844627
1.902160
0.572730
1.810623
29.0
19.000000
65.517241
0.560650
2.234251
3.583128
5.017914
6.525724

50%
5.000000
1.000000
3.000000
3.000000
0.890185
2.885526
0.668358
2.377053
29.0
21.000000
72.413793
0.669000
2.772384
4.246994
5.798840
7.728945

75%
8.000000
3.000000
5.000000
4.000000
0.940326
3.686664
0.782085
2.994718
29.0
26.000000
89.655172
0.766550
3.523032
5.610462
7.233969
9.542786

max
10.000000
5.000000
100.000000
5.000000
0.995693
12.070103
0.961315
7.265291
29.0
29.000000
100.000000
1.245100
6.773768
11.375833
14.228750
20.732230

``````

## Analysis of the impact of constraints

In the following 6 plots, we investigate the impact of constraints, not controlling for the number of calibrations.

``````

In [54]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Correlation in branch lengths')

``````
``````

Out[54]:

[<matplotlib.text.Text at 0x7f309898e9e8>,
<matplotlib.text.Text at 0x7f309a6931d0>]

``````
``````

In [55]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='RMSD in branch lengths')

``````
``````

Out[55]:

[<matplotlib.text.Text at 0x7f3099164a20>,
<matplotlib.text.Text at 0x7f309a2ff9e8>]

``````
``````

In [56]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='RMSD in node ages')

``````
``````

Out[56]:

[<matplotlib.text.Text at 0x7f30989824a8>,
<matplotlib.text.Text at 0x7f30989cb588>]

``````
``````

In [57]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["correlation"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Correlation with true dates')

``````
``````

Out[57]:

[<matplotlib.text.Text at 0x7f3098743780>,
<matplotlib.text.Text at 0x7f3098790080>]

``````
``````

In [58]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["fracInHPD"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["fracInHPD"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Fraction of dates in 95% HPD')

``````
``````

Out[58]:

[<matplotlib.text.Text at 0x7f30986ebf28>,
<matplotlib.text.Text at 0x7f3098733fd0>]

``````
``````

In [59]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCons"], y=d["percent50"], inner=None)
ax = sns.swarmplot(d["numCons"], y=d["percent50"], color="white", edgecolor="gray")
ax.set(xlabel='Number of constraints', ylabel='Median size of the 95% HPD')

``````
``````

Out[59]:

[<matplotlib.text.Text at 0x7f3098678dd8>,
<matplotlib.text.Text at 0x7f30987aebe0>]

``````

### Partial conclusion on the above

As the number of constraints increases from 0 to 5, the accuracy of the reconstruction improves a little bit, despite smaller 95%HPD. Using all constraints available, we get usually better results, and smaller 95%HOD.

## Analysis of the impact of calibrations

In the following 6 plots, we investigate the impact of calibrations, not controlling for the number of constraints.

``````

In [60]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='RMSD in branch lengths')

``````
``````

Out[60]:

[<matplotlib.text.Text at 0x7f309855c278>,
<matplotlib.text.Text at 0x7f30985441d0>]

``````
``````

In [61]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Correlation in branch lengths')

``````
``````

Out[61]:

[<matplotlib.text.Text at 0x7f30984f2f98>,
<matplotlib.text.Text at 0x7f30984c7e10>]

``````
``````

In [62]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='RMSD in node ages')

``````
``````

Out[62]:

[<matplotlib.text.Text at 0x7f3098401b70>,
<matplotlib.text.Text at 0x7f30984f29e8>]

``````
``````

In [63]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["correlation"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Correlation with true dates')

``````
``````

Out[63]:

[<matplotlib.text.Text at 0x7f30983b0d68>,
<matplotlib.text.Text at 0x7f30985c3668>]

``````
``````

In [64]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["fracInHPD"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["fracInHPD"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Fraction in 95% HPD')

``````
``````

Out[64]:

[<matplotlib.text.Text at 0x7f30982dddd8>,
<matplotlib.text.Text at 0x7f30982c9278>]

``````
``````

In [65]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["numCalib"], y=d["percent50"], inner=None)
ax = sns.swarmplot(d["numCalib"], y=d["percent50"], color="white", edgecolor="gray")
ax.set(xlabel='Number of calibrations', ylabel='Median size of the 95% HPD')

``````
``````

Out[65]:

[<matplotlib.text.Text at 0x7f30982f6630>,
<matplotlib.text.Text at 0x7f30982d4470>]

``````

### Partial conclusion on the above

As the number of calibrations increases from 0 to 5, the accuracy of the reconstruction improves, despite smaller 95%HPD. The effect of calibrations seems stronger than that of constraints.

## Let's look at constraints, controlling for calibrations

In the following 10 plots, we separate the impacts of constraints and calibrations.

``````

In [66]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd_bl", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in branch lengths')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[66]:

<matplotlib.legend.Legend at 0x7f3098564fd0>

``````
``````

In [67]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation_bl", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in branch lengths')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[67]:

<matplotlib.legend.Legend at 0x7f30980718d0>

``````
``````

In [68]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="rmsd", x="numCalib", data=d)
#ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average rmsd in node ages')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[68]:

<matplotlib.legend.Legend at 0x7f30980550b8>

``````
``````

In [69]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="correlation", x="numCalib", data=d)
ax.set_ylim(0.8, 1.0)
ax.set(xlabel='Number of calibrations', ylabel='Average correlation in node ages')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[69]:

<matplotlib.legend.Legend at 0x7f3097d9d518>

``````
``````

In [70]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="fracInHPD", x="numCalib", data=d)
ax.set_ylim(50, 90)
ax.set(xlabel='Number of calibrations', ylabel='Average fraction in 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[70]:

<matplotlib.legend.Legend at 0x7f3097c0c240>

``````
``````

In [71]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent0", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average smallest 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[71]:

<matplotlib.legend.Legend at 0x7f3097c7d6a0>

``````
``````

In [72]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent25", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average 1st quartile 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[72]:

<matplotlib.legend.Legend at 0x7f3098889668>

``````
``````

In [73]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent50", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average median 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[73]:

<matplotlib.legend.Legend at 0x7f3097835eb8>

``````
``````

In [74]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent75", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average 3rd quartile 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[74]:

<matplotlib.legend.Legend at 0x7f309821bbe0>

``````
``````

In [75]:

%matplotlib inline
ax = sns.barplot(hue="numCons", y="percent100", x="numCalib", data=d)
ax.set(xlabel='Number of calibrations', ylabel='Average maximum 95% HPD')
# Put the legend out of the figure
handles, labels = ax.get_legend_handles_labels()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., handles=handles, labels=["0 constraint", "1 constraint", "3 constraints", "5 constraints", ])

``````
``````

Out[75]:

<matplotlib.legend.Legend at 0x7f309756e978>

``````

### Partial conclusion on the above

Constraints and calibrations both improve accuracy. The number of constraints does not seem to have much of an effect on the size of the 95% HPD, contrary to calibrations. However, when using all constraints, resulyts are much improved, although with a lot of variance. It is possible that on some trees convergence was difficult in the fully constrained runs.

## Comparison between trees

In the following 6 plots, we look at the impact of the 10 different trees.

First let's order the trees from good to bad according to the correlation in node ages when all constraints are used:

``````

In [106]:

d_sel = d.loc[d['numCons'] == 100]
print(d_sel.sort_values(['correlation'], ascending=False))

``````
``````

treeId  numCalib  numCons  numReplicate  correlation       rmsd  \
242       3         0      100             0     0.995693   0.444335
646       8         0      100             0     0.995115   0.630101
403       5         0      100             0     0.992704   1.054818
725       9         0      100             0     0.992442   1.380285
80        1         0      100             0     0.980810   1.329130
565       7         0      100             0     0.962720   2.407002
322       4         0      100             0     0.951231   3.174121
161       2         0      100             0     0.948844   6.296218
806      10         0      100             0     0.936484   3.525262
484       6         0      100             0     0.814335  12.070103

correlation_bl   rmsd_bl  numNodes  numInHPD   fracInHPD  percent0  \
242        0.951570  0.556378        29      29.0  100.000000    0.6445
646        0.961315  0.794275        29      29.0  100.000000    0.6151
403        0.920911  1.166022        29      29.0  100.000000    0.6388
725        0.958851  1.222497        29      18.0   62.068966    0.3134
80         0.902159  1.405855        29      29.0  100.000000    0.5935
565        0.730545  1.891770        29      26.0   89.655172    0.8208
322        0.786032  1.546851        29       9.0   31.034483    0.5556
161        0.547477  3.272952        29       4.0   13.793103    1.1645
806        0.721978  1.879132        29       7.0   24.137931    0.7202
484        0.169987  6.737178        29       5.0   17.241379    0.9128

percent25  percent50  percent75  percent100
242   2.155174   2.774802   3.745086    4.811160
646   1.940732   2.896196   3.912038    5.611904
403   2.282609   4.070992   5.582668    6.995048
725   4.632578   5.838926   6.922800    7.756348
80    2.068649   3.810289   5.801742    9.077728
565   2.004180   3.297923   4.570120    6.951470
322   1.263370   2.140089   3.283129    4.846636
161   1.445156   3.048088   7.295572   10.404520
806   0.987030   3.940387   8.096464    8.620304
484   1.236568   2.319953   3.905656    5.692279

``````
``````

In [76]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["treeId"], y=d["rmsd"], inner=None)
ax = sns.swarmplot(d["treeId"], y=d["rmsd"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Rmsd in node ages')

``````
``````

Out[76]:

[<matplotlib.text.Text at 0x7f30973f8780>,
<matplotlib.text.Text at 0x7f309741cbe0>]

``````
``````

In [28]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["treeId"], y=d["correlation"], inner=None)
ax = sns.swarmplot(d["treeId"], y=d["correlation"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Correlation in node ages')

``````
``````

Out[28]:

[<matplotlib.text.Text at 0x7f309b3a8dd8>,
<matplotlib.text.Text at 0x7f309b2ca320>]

``````
``````

In [29]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["treeId"], y=d["rmsd_bl"], inner=None)
ax = sns.swarmplot(d["treeId"], y=d["rmsd_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Rmsd in branch lengths')

``````
``````

Out[29]:

[<matplotlib.text.Text at 0x7f309b1c4e48>,
<matplotlib.text.Text at 0x7f309b3095f8>]

``````
``````

In [30]:

%matplotlib inline
sns.set_style("whitegrid")
ax = sns.violinplot(d["treeId"], y=d["correlation_bl"], inner=None)
ax = sns.swarmplot(d["treeId"], y=d["correlation_bl"], color="white", edgecolor="gray")
ax.set(xlabel='Tree ID', ylabel='Correlation in branch lengths')

``````
``````

Out[30]:

[<matplotlib.text.Text at 0x7f309b0b53c8>,
<matplotlib.text.Text at 0x7f309b164e48>]

``````