Here, we present the analysis to detect the compartments in Mouse B and iPS cells. In this example, we will use the GC-content (guanine-cytosine content) to identify which bins belong to the A or B compartments. The percentage of bases that are either guanine or cytosine on a DNA strand correlates directly with gene density and is a good measure to identify open and close chromatine.
Note: Compartments are detected on the full genome matrix.
In [1]:
from pytadbit.parsers.hic_parser import load_hic_data_from_bam
from pytadbit.parsers.genome_parser import get_gc_content, parse_fasta
from pickle import load
In [2]:
reso = 200000
base_path = 'results/fragment/{0}_both/03_filtering/valid_reads12_{0}.bam'
bias_path = 'results/fragment/{0}_both/04_normalizing/biases_{0}_both_{1}kb.biases'
In [3]:
rich_in_A = get_gc_content(parse_fasta('genome/Mus_musculus-GRCm38.p6/Mus_musculus-GRCm38.p6.fa'),
by_chrom=True ,resolution=reso, n_cpus=8)
In [4]:
cell = 'mouse_B'
In [5]:
hic_data = load_hic_data_from_bam(base_path.format(cell),
resolution=reso,
biases=bias_path.format(cell, reso // 1000),
ncpus=8)
In [6]:
! mkdir -p results/fragment/$cell\_both/05_segmenting
In [7]:
chrname = 'chr3'
corr = hic_data.find_compartments(show_compartment_labels=True,
show=True, crms=[chrname], vmin='auto', vmax='auto', rich_in_A=rich_in_A,
savedata='results/fragment/{0}_both/05_segmenting/compartments_{1}_{2}.tsv'.format(cell, chrname, reso),
savedir='results/fragment/{0}_both/05_segmenting/eigenvectors_{1}_{2}'.format(cell, chrname, reso))
In [8]:
cell = 'mouse_PSC'
In [9]:
hic_data = load_hic_data_from_bam(base_path.format(cell),
resolution=reso,
biases=bias_path.format(cell, reso // 1000),
ncpus=8)
In [10]:
! mkdir -p results/fragment/$cell\_both/05_segmenting
In [11]:
chrname = 'chr3'
corr = hic_data.find_compartments(show_compartment_labels=True,
show=True, crms=[chrname], vmin='auto', vmax='auto', rich_in_A=rich_in_A,
savedata='results/fragment/{0}_both/05_segmenting/compartments_{1}_{2}.tsv'.format(cell, chrname, reso),
savedir='results/fragment/{0}_both/05_segmenting/eigenvectors_{1}_{2}'.format(cell, chrname, reso))
The assignments of the compartments for the two cell types are stored in two different files:
In [12]:
! head -n 20 results/fragment/mouse_B_both/05_segmenting/compartments_chr3_200000.tsv
In [13]:
! head -n 20 results/fragment/mouse_PSC_both/05_segmenting/compartments_chr3_200000.tsv
In another folder, we also saved the coordinates of each computed eigenvector:
In [14]:
! ls -lh results/fragment/mouse_B_both/05_segmenting/eigenvectors_chr3_200000
Note: In this file the first line shows the eigenvector index with it's corresponding eigenvalue (the first column should always be the one you selected, even if it is not the first eigenvector).
Then there are the coordinates of the eigenvectors. In the first column, the coordinates correspond to the assignment of the A and B compartments, positive values for A compartments and negative values for B compartments.
In [15]:
! head -n 20 results/fragment/mouse_B_both/05_segmenting/eigenvectors_chr3_200000/chr3_EigVect1.tsv
Load eigenvectors coordinates from files:
In [16]:
from builtins import next
fh = open('results/fragment/mouse_B_both/05_segmenting/eigenvectors_chr3_200000/chr3_EigVect1.tsv')
header = next(fh)
ev1_B = []
for line in fh:
evc1, evc2, evc3 = line.split()
ev1_B.append(float(evc1))
fh = open('results/fragment/mouse_PSC_both/05_segmenting/eigenvectors_chr3_200000/chr3_EigVect1.tsv')
header = next(fh)
ev1_PSC = []
for line in fh:
evc1, evc2, evc3 = line.split()
ev1_PSC.append(float(evc1))
diff = []
for i in range(len(ev1_B)):
diff.append(ev1_B[i] - ev1_PSC[i])
Plot the difference between each eigenvector along chromosome
In [17]:
from matplotlib import pyplot as plt
In [18]:
plt.figure(figsize=(12, 2))
plt.text(10, 0.07, 'More active in B cell')
plt.fill_between(range(len(diff)), diff, 0, where=[i>0 for i in diff])
plt.text(10, -0.09, 'More active in PSC cell')
plt.fill_between(range(len(diff)), diff, 0, where=[i<0 for i in diff])
plt.xlim(0, len(diff))
plt.ylim(-0.1, 0.1)
plt.ylabel('difference of EV')
_ = plt.xlabel('Genomic coordinate (%s kb)' % (reso / 1000))
In [19]:
plt.figure(figsize=(6, 6))
for i in range(len(ev1_B)):
if ev1_B[i] > 0 and ev1_PSC[i] > 0:
plt.plot(ev1_B[i], ev1_PSC[i], 'ro', alpha=0.5)
elif ev1_B[i] < 0 and ev1_PSC[i] < 0:
plt.plot(ev1_B[i], ev1_PSC[i], 'bo', alpha=0.5)
else:
plt.plot(ev1_B[i], ev1_PSC[i], 'o', color='grey', alpha=0.5)
plt.axhline(0, color='k', alpha=0.2)
plt.axvline(0, color='k', alpha=0.2)
plt.xlabel('B cell EV1')
_ = plt.ylabel('PSC cell EV1')
Now, we move to the TADs detection. In this notebook we will detect TAD borders at 100kbp resolution.
In [20]:
from pytadbit import Chromosome
from pytadbit.parsers.hic_parser import load_hic_data_from_bam
In [21]:
base_path = 'results/fragment/{0}_both/03_filtering/valid_reads12_{0}.bam'
bias_path = 'results/fragment/{0}_both/04_normalizing/biases_{0}_both_{1}kb.biases'
reso = 100000
cel1 = 'mouse_B'
cel2 = 'mouse_PSC'
In [22]:
hic_data1 = load_hic_data_from_bam(base_path.format(cel1),
resolution=reso,
region='chr3',
biases=bias_path.format(cel1, reso // 1000),
ncpus=8)
hic_data2 = load_hic_data_from_bam(base_path.format(cel2),
resolution=reso,
region='chr3',
biases=bias_path.format(cel2, reso // 1000),
ncpus=8)
In [23]:
chrname = 'chr3'
crm = Chromosome(chrname)
crm.add_experiment('mouse_B',
hic_data=[hic_data1.get_matrix(focus='chr3')],
norm_data=[hic_data1.get_matrix(focus='chr3',normalized=True)],
resolution=reso)
crm.add_experiment('mouse_PSC',
hic_data=[hic_data2.get_matrix(focus='chr3')],
norm_data=[hic_data2.get_matrix(focus='chr3',normalized=True)],
resolution=reso)
In [24]:
crm.visualize([('mouse_B', 'mouse_PSC')])
TADbit is the original TAD caller algorithm TADbit is a breakpoint detection algorithm that returns the optimal segmentation of the chromosome under BIC-penalized likelihood. The model assumes that counts have a Poisson distribution and that the expected value of the counts decreases like a power-law with the linear distance on the chromosome.
In [25]:
crm.find_tad(['mouse_B', 'mouse_PSC'], n_cpus=8)
In [26]:
crm.visualize([('mouse_B', 'mouse_PSC')], normalized=True, paint_tads=True)
In [27]:
crm.visualize([('mouse_B', 'mouse_PSC')], normalized=True, paint_tads=True, focus=(300, 360))
In [28]:
B = crm.experiments['mouse_B']
PSC = crm.experiments['mouse_PSC']
In [29]:
crm.experiments
Out[29]:
TopDom identifies TAD borders based on the assumption that contact frequencies between regions upstream and downstream of a border are lower than those between two regions within a TAD. The algorithm only depends on a single parameter corresponding to the window size. The algorithm provides a measure (from 0 to 10) of confidence on the accuracy of the border detection (https://www.ncbi.nlm.nih.gov/pubmed/26704975).
In [30]:
crm.find_tad(['mouse_B', 'mouse_PSC'], n_cpus=8, use_topdom=True)
In [31]:
crm.visualize([('mouse_B', 'mouse_PSC')], normalized=True, paint_tads=True)
In [32]:
crm.visualize([('mouse_B', 'mouse_PSC')], normalized=True, paint_tads=True, focus=(300, 360))
Insulation score (Crane et al. 2015 https://doi.org/10.1038/nature14450) can be used to build an insulation profile of the genome and, with a simple transformation, to identify TAD borders.
In [33]:
from pytadbit.tadbit import insulation_score, insulation_to_borders
First we need to normalize the matrices by visibility and by decay:
In [34]:
hic_data1.normalize_hic()
hic_data1.normalize_expected()
hic_data2.normalize_hic()
hic_data2.normalize_expected()
The two important parameter to define are the window size, the distance from the diagonal and the delta.
In [35]:
wsize = (1, 4)
In [36]:
insc1, delta1 = insulation_score(hic_data1, [wsize], resolution=100000, normalize=True, delta=2)
insc2, delta2 = insulation_score(hic_data2, [wsize], resolution=100000, normalize=True, delta=2)
Once defined the insulation score and the values of delta can be used to search for borders.
In [37]:
borders1 = insulation_to_borders(insc1[wsize], delta1[wsize], min_strength=0.1)
borders2 = insulation_to_borders(insc2[wsize], delta2[wsize], min_strength=0.1)
Currently the representation is not available in TADbit as for the other methods, but we can easily plot it:
In [38]:
plt.figure(figsize=(10, 4))
plt.subplot(2, 1, 1)
plt.title('B cell')
l1 = plt.plot([insc1[(wsize)].get(i, float('nan')) for i in range(max(insc1[(wsize)]))], label='Insulation score')
l2 = plt.plot([delta1[(wsize)].get(i, float('nan')) for i in range(max(insc1[(wsize)]))],
alpha=0.3, label='Delta value')
for b, c in borders1:
l3 = plt.plot([b] * 2, [-2, -2.3], color='darkgreen', alpha=c, lw=4, label='Border strength')
plt.grid()
plt.axhline(0, color='k')
plt.ylim(-2.5, 2)
plt.xlim(300, 360)
plt.legend(l1 + l2 + l3, [l.get_label() for l in l1 + l2 + l3], frameon=False, bbox_to_anchor=(1.3, 0.6))
plt.subplot(2, 1, 2)
plt.title('PSC')
plt.plot([insc2[(wsize)].get(i, float('nan')) for i in range(max(insc2[(wsize)]))])
plt.plot([delta2[(wsize)].get(i, float('nan')) for i in range(max(insc2[(wsize)]))], alpha=0.3)
for b, c in borders2:
plt.plot([b] * 2, [-2, -2.3], color='darkgreen', alpha=c, lw=4)
plt.grid()
plt.axhline(0, color='k')
plt.ylim(-2.5, 2)
_ = plt.xlim(300, 360)
plt.tight_layout()
The TAD borders can be aligned, using a simple reciprocal best hit strategy:
In [39]:
ali = crm.align_experiments(['mouse_B', 'mouse_PSC'], max_dist=reso)
In the plots below, each arc represents a TAD. Between two consecutive arcs the triangle mark the border. This triangle is colored depending on the confidence of the TAD border call.
In [40]:
ali.draw(ymax=3)
In [41]:
ali.draw(focus=(1, 350), ymax=3)
In order to asses how well two experiments align, or how conserved are the TAD borders between two experiments, we can compare the overlap between our experiments with the overlap of simulated random distributions of TAD borders.
In [42]:
ali, stats = crm.align_experiments(['mouse_B', 'mouse_PSC'], randomize=True)
In [43]:
print(ali)
This analysis returns an alignment score between 0 and 1 (0: no match, 1: all borders aligned), a p-value (usually equal zero), the proportion of borders conserved in the second experiment, and the proportion of borders conserved in the first experiment:
In [44]:
stats
Out[44]:
In [45]:
print('Alignment score: %.3f, p-value: %.4f\n proportion of borders of T0 found in T60: %.3f, of T60 in T0 %.3f' % stats)
In [46]:
crm.save_chromosome('results/fragment/chr3.tdb')
TopDom and the methodology based on insulation score are relatively fast computationally and allow to call TAD borders at very high resolution. However being able to call TAD borders at high resolution does not mean that we should do it. As the resolution increases, so does the noise.
In order to assess which is the best resolution in order to call TAD borders, a good strategy is to test the consistency of several resolutions.
In the example below, we use the methodology based on the insulation score as it is the fastest (tadbit strategy is almost not usable bellow 20 kb). We compare the number of TAD borders that are shared (plus-minus one bin) between the two replicates (iPS and B cells).
In [47]:
from pytadbit.parsers.hic_parser import load_hic_data_from_bam
from pytadbit.tadbit import insulation_score, insulation_to_borders
In [48]:
base_path = 'results/fragment/{0}_both/03_filtering/valid_reads12_{0}.bam'
cel1 = 'mouse_B'
cel2 = 'mouse_PSC'
borders = {}
resos = [5000, 10000, 20000, 30000, 40000, 50000, 60000, 80000, 100000, 150000, 200000]
In [49]:
for c in ['chr%d' % cs for cs in range(1, 20)] + ['chrX']:
borders[c] = {}
for reso in resos:
hic_data1 = load_hic_data_from_bam(base_path.format(cel1),
resolution=reso,
region=c,
ncpus=8, verbose=False)
hic_data2 = load_hic_data_from_bam(base_path.format(cel2),
resolution=reso,
region=c,
ncpus=8, verbose=False)
hic_data1.normalize_hic(silent=True)
hic_data1.normalize_expected()
hic_data2.normalize_hic(silent=True)
hic_data2.normalize_expected()
wsize = (1, max(2, 500000 // reso))
delta = max(1, 100000 // reso)
insc1, delta1 = insulation_score(hic_data1, [wsize], resolution=100000, normalize=True, delta=delta)
insc2, delta2 = insulation_score(hic_data2, [wsize], resolution=100000, normalize=True, delta=delta)
borders1 = insulation_to_borders(insc1[wsize], delta1[wsize], min_strength=0.1)
borders2 = insulation_to_borders(insc2[wsize], delta2[wsize], min_strength=0.1)
borders[c][reso] = borders1, borders2
We align TAD borders and keep the proportion of borders aligned between cell types
In [50]:
from pytadbit.alignment import align
In [51]:
scores = {}
props1 = {}
props2 = {}
for c in borders:
scores[c] = []
props1[c] = []
props2[c] = []
for reso in resos:
ali = align([[t[0] for t in b] for b in borders[c][reso]], max_dist=1)
props1[c].append(ali[0][2])
props2[c].append(ali[0][3])
scores[c].append(ali[0][1])
We plot the results, as boxplots to group the values of each chromosomes:
In [52]:
from functools import reduce
plt.figure(figsize=(12, 5))
bp = plt.boxplot([reduce(lambda x, y: x+ y,
[(props1[c][i], props2[c][i]) for c in props1])
for i in range(len(resos))], positions=resos, widths=2000)
plt.plot([sum(m.get_xdata()) / 2 for m in bp['medians']],
[m.get_ydata()[0] for m in bp['medians']], color='grey', alpha=0.5)
plt.xlim(0, 205000)
plt.xticks(resos, [str(reso / 1000) + 'kb' for reso in resos], rotation=90)
plt.ylabel('Proportion of aligned borders\nPSC vs B cell (all chromosomes)')
plt.xlabel('Resolution at which TAD borders are called')
plt.grid()
plt.tight_layout()
plt.show()
In the case of these experiments it seems that looking for TAD borders bellow 50 kb is dangerous, as very few are reproducible. Calling TAD borders at 100 kb resolution might be here a good idea.