Single-cell Hi-C data analysis

Welcome to the second part of our analysis. Here we will work specifically with single-cell data.

The outline:

  1. Reads mapping
  2. Data filtering and binning
  3. Hi-C data visualisation
  4. Compartments and TADs

If you have any questions, please contact Aleksandra Galitsyna (agalitzina@gmail.com).

1. Reads mapping


In this notebook, we will be working with three datasets from Flyamer et al. 2017 (GEO ID GSE80006), located in the DATA/FASTQ1/ directory. In contrast to the previous analysis, you will now work with single-cell Hi-C data.
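
Before mapping, let's check that the expected FASTQ files are in place (a quick sanity check, assuming the /home/jovyan/DATA/FASTQ1 layout used below):


In [ ]:
import glob

# List the paired-end FASTQ files for the three single cells
sorted(glob.glob('/home/jovyan/DATA/FASTQ1/K562_*.fastq'))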


In [ ]:
import os

from hiclib import mapping
from mirnylib import h5dict, genome

In [ ]:
bowtie_path       = '/opt/conda/bin/bowtie2'

enzyme            = 'DpnII'   # restriction enzyme used in the experiment

bowtie_index_path = '/home/jovyan/GENOMES/HG19_IND/hg19_chr1'
fasta_path        = '/home/jovyan/GENOMES/HG19_FASTA/'
chrms             = ['1']     # restrict the analysis to chromosome 1

genome_db         = genome.Genome(fasta_path, readChrms=chrms)

In [ ]:
min_seq_len       = 120   # initial truncation length for iterative mapping
len_step          = 5     # extend the truncation by 5 bp at each iteration
nthreads          = 2
temp_dir          = 'tmp'
bowtie_flags      = '--very-sensitive'

Since we now have multiple datasets, we iterate over the list of files.


In [ ]:
experiment_ids = ['72-sc-1', '54-sc-1', '58-sc-1']

In [ ]:
for exp_id in experiment_ids:
    
    infile1           = '/home/jovyan/DATA/FASTQ1/K562_{}_R1.fastq'.format(exp_id)
    infile2           = '/home/jovyan/DATA/FASTQ1/K562_{}_R2.fastq'.format(exp_id)
    out1              = '/home/jovyan/DATA/SAM/K562_{}_R1.chr1.sam'.format(exp_id)
    out2              = '/home/jovyan/DATA/SAM/K562_{}_R2.chr1.sam'.format(exp_id)

    mapping.iterative_mapping(
        bowtie_path       = bowtie_path,
        bowtie_index_path = bowtie_index_path,
        fastq_path        = infile1,
        out_sam_path      = out1,
        min_seq_len       = min_seq_len,
        len_step          = len_step,
        nthreads          = nthreads,
        temp_dir          = temp_dir, 
        bowtie_flags      = bowtie_flags)

    mapping.iterative_mapping(
        bowtie_path       = bowtie_path,
        bowtie_index_path = bowtie_index_path,
        fastq_path        = infile2,
        out_sam_path      = out2,
        min_seq_len       = min_seq_len,
        len_step          = len_step,
        nthreads          = nthreads,
        temp_dir          = temp_dir, 
        bowtie_flags      = bowtie_flags)
    
    out = '/home/jovyan/DATA/HDF5/K562_{}.fragments.hdf5'

    mapped_reads = h5dict.h5dict(out.format(exp_id))

    mapping.parse_sam(
        sam_basename1  =  out1,
        sam_basename2  =  out2,
        out_dict       =  mapped_reads,
        genome_db      =  genome_db,
        enzyme_name    =  enzyme, 
        save_seqs      =  False,
        keep_ids       =  False)
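
Each resulting .fragments.hdf5 file stores the parsed read pairs in a dict-like h5dict. For a quick look at what was saved (a sketch relying on h5dict's dict-like interface; this shows the last processed cell only):


In [ ]:
# Keys of the parsed output, e.g. cut positions, chromosomes and strands of both read sides
sorted(mapped_reads.keys())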

2. Data filtering and binning


In the case of single-cell analysis, contact filtering is much more sophisticated. For example, the standard PCR duplicate filter is replaced by "filterLessThanDistance". The code below is based on the hiclib single-cell scripts for Flyamer et al. 2017.


In [ ]:
import h5py
import numpy as np

from hiclib import fragmentHiC
from hiclib.fragmentHiC import HiCdataset as HiCdatasetorig
from mirnylib.numutils import uniqueIndex

class HiCdataset(HiCdatasetorig):
    "Modification of HiCdataset to include all filters"

    def filterLessThanDistance(self):
        # This is the old function used to filter "duplicates".
        # After the final submission of the manuscript, it was replaced by a better
        # function that does the same, but at bp resolution rather than 100 bp.
        M = self.N
        for i in range(5):
            for j in range(5):
                # Encode the chromosome/strand combination of both read sides into a single ID
                chrStrandID = 10000000 * 10000000 * (np.array(self.chrms1 * (self.strands1 + 1), dtype=np.int64) * 100 + self.chrms2 * (self.strands2 + 1))
                print(len(np.unique(chrStrandID)))
                # Round the cut positions to 500-bp windows, shifted by i*100 and j*100 bp
                posid = np.array((self.cuts1 + i * 100) // 500, dtype=np.int64) * 10000000 + (self.cuts2 + j * 100) // 500
                N = self.N
                self.maskFilter(uniqueIndex(posid + chrStrandID))
                print(N, "filtered to", self.N)
        self.metadata["321_quasiDuplicatesRemoved"] = M - self.N
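
To see why the shifted 500-bp roundings catch near-duplicates, consider two contacts whose cut positions differ by less than ~100 bp: for at least one shift both fall into the same window, and uniqueIndex drops one of them. A toy illustration with hypothetical positions:


In [ ]:
# Toy example: two hypothetical cut positions 50 bp apart
cut_a, cut_b = 1990, 2040

for i in range(5):
    # Same shifted 500-bp rounding as in filterLessThanDistance
    print(i, (cut_a + i * 100) // 500, (cut_b + i * 100) // 500)
# At shift i=1 both positions fall into window 4, so such a pair
# would be flagged as a quasi-duplicate and one contact removed.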

In [ ]:
output = []

for exp_id in experiment_ids:
    
    inp = '/home/jovyan/DATA/HDF5/K562_{}.fragments.hdf5'.format(exp_id)
    
    out = '/home/jovyan/DATA/HDF5/K562_{}.tmp.hdf5'.format(exp_id)
    outstat = '/home/jovyan/DATA/HDF5/K562_{}.stat.txt'.format(exp_id)
    
    fragments = HiCdataset(
        filename             = out,
        genome               = genome_db,
        maximumMoleculeLength= 500,
        enzymeName           = 1000,  # NB: an integer instead of the enzyme name, as in the original single-cell scripts
        mode                 = 'w')

    fragments.parseInputData(
        dictLike=inp)

    fragments.filterLessThanDistance()  # remove quasi-duplicates (defined above)
    fs = fragments.fragmentSum()
    fragments.fragmentFilter(fs < 9)    # keep only fragments with fewer than 9 contacts

    output.append(list(fragments.metadata.items()))
    
    out_bin = '/home/jovyan/DATA/HDF5/K562_{}.binned_{}.hdf5'

    res_kb = [100, 20]
    
    for res in res_kb:
        print(res)
        outmap = out_bin.format(exp_id, str(res)+'kb')

        fragments.saveHeatmap(outmap, res*1000)
    
    del fragments

In [ ]:
output
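
The raw metadata is easier to read as a table, with one column per cell (a convenience sketch, assuming pandas is available in the environment):


In [ ]:
import pandas as pd

# One column per cell, one row per filtering statistic
pd.DataFrame({exp_id: dict(stats) for exp_id, stats in zip(experiment_ids, output)})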

3. Hi-C data visualisation



In [ ]:
from hiclib.binnedData import binnedDataAnalysis

In [ ]:
res = 100
data_hic = binnedDataAnalysis(resolution=res*1000, genome=genome_db)

In [ ]:
for exp_id in experiment_ids:
    data_hic.simpleLoad('/home/jovyan/DATA/HDF5/K562_{}.binned_{}.hdf5'.format(exp_id, str(res)+'kb'), exp_id)

In [ ]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('ticks')

In [ ]:
%matplotlib inline

In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(data_hic.dataDict['54-sc-1'][200:500, 200:500], cmap='jet', interpolation='None')
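
Single-cell maps are sparse, so the raw-count view is dominated by a handful of pixels. A log-scaled view often reads better (a quick sketch; log1p keeps zero bins finite):


In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(np.log1p(data_hic.dataDict['54-sc-1'][200:500, 200:500]), cmap='jet', interpolation='None')
plt.colorbar()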

Comparison of single-cell and bulk datasets

Below we load a bulk Hi-C map and build a composite matrix for comparison: the upper triangle holds the bulk data and the lower triangle the single-cell data, each normalised by its mean.


In [ ]:
data_hic.simpleLoad('/home/jovyan/DATA/HDF5/K562_B-bulk.binned_{}.hdf5'.format(str(res)+'kb'),'bulk')
data_hic.removeDiagonal()

In [ ]:
mtx1 = data_hic.dataDict['bulk']
mtx2 = data_hic.dataDict['54-sc-1']
mtx_tmp = np.triu(mtx1)/np.mean(mtx1) + np.tril(mtx2)/np.mean(mtx2)

In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(mtx_tmp[200:500, 200:500], cmap='Blues', interpolation='None', vmax=900)

In [ ]:
mtx_merged = sum([data_hic.dataDict[exp_id] for exp_id in experiment_ids])

In [ ]:
mtx1 = data_hic.dataDict['bulk']
mtx2 = mtx_merged
mtx_tmp = np.triu(mtx1)/np.mean(mtx1) + np.tril(mtx2)/np.mean(mtx2)

In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(mtx_tmp[200:500, 200:500], cmap='Blues', interpolation='None', vmax=800)

4. Compartments and TADs



In [ ]:
from matplotlib import gridspec

In [ ]:
eig =  np.loadtxt('/home/jovyan/DATA/ANNOT/comp_K562_100Kb_chr1.tsv')
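
A quick sanity check: the eigenvector track should contain one value per 100-kb bin of chromosome 1, so its length should match the heatmap dimension.


In [ ]:
eig.shape, data_hic.dataDict['bulk'].shape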

In [ ]:
bgn = 0
end = 500

fig = plt.figure(figsize=(10,10))

gs = gridspec.GridSpec(2, 1, height_ratios=[20,2])
gs.update(wspace=0.0, hspace=0.0)

ax = plt.subplot(gs[0,0])

ax.matshow(mtx_tmp[bgn:end, bgn:end], cmap='jet', origin='lower', aspect='auto')
ax.set_xticks([])
ax.set_yticks([])

axl = plt.subplot(gs[1,0])
plt.plot(range(end-bgn), eig[bgn:end])
plt.xlim(0, end-bgn)
plt.ylabel('Eigenvector values')

# Bins are 100 kb wide, so convert bin indices to genomic positions
ticks = range(bgn, end+1, 100)
ticklabels = ['{:.0f} Mb'.format(x*res/1000) for x in ticks]
plt.xticks(ticks, ticklabels)

Topologically associating domains (TADs)


In [ ]:
import lavaburst

In [ ]:
mtx = data_hic.dataDict['54-sc-1']

In [ ]:
good_bins = mtx.astype(bool).sum(axis=0) > 1 # mask rows/cols where data is missing

gammas = [0.1, 1.0, 5.0, 10.0] # set of gamma parameters for TAD calling

segments_dict = {}

for gam_current in gammas:
    print(gam_current)
    
    S = lavaburst.scoring.armatus_score(mtx, gamma=gam_current, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation() # Positions of TADs for input matrix

    segments_dict[gam_current] = segments.copy()
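
Larger gamma favours finer segmentation; a quick check of how many segments each gamma produced:


In [ ]:
for gam_current in gammas:
    print(gam_current, len(segments_dict[gam_current]), 'segments')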

In [ ]:
# TADs at different gammas for a particular cell (54-sc-1)

At = lavaburst.utils.tilt_heatmap(mtx, n_diags=100)

start_tmp = 0
end_tmp = 500

f = plt.figure(figsize=(20, 6))

ax = f.add_subplot(111)
blues = sns.cubehelix_palette(start=0.4, gamma=0.5, rot=-0.3, dark=0.1, light=0.9, as_cmap=True)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap=blues)

cmap = mpl.cm.get_cmap('brg')

gammas = segments_dict.keys()
for n, gamma in enumerate(gammas):

    segments = segments_dict[gamma]

    # Draw each TAD as a triangle over the tilted heatmap;
    # the last segment is drawn separately to carry the legend label
    for a in segments[:-1]:
        if a[1]<start_tmp or a[0]>end_tmp:
            continue
        ax.plot([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp], [0, -(a[1]-a[0])], c=cmap(n/len(gammas)), alpha=0.5)
        ax.plot([a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [-(a[1]-a[0]), 0], c=cmap(n/len(gammas)), alpha=0.5)

    a = segments[-1]
    ax.plot([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp], [0, -(a[1]-a[0])], c=cmap(n/len(gammas)), alpha=0.5, label=gamma)
    ax.plot([a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [-(a[1]-a[0]), 0], c=cmap(n/len(gammas)), alpha=0.5)
    
ax.set_xlim([0,end_tmp-start_tmp])
ax.set_ylim([100,-100])
        
ax.legend(bbox_to_anchor=(1.1, 1.05))
ax.set_aspect(0.5)

The problem of choosing the gamma parameter for TAD calling can be resolved via parameter optimization. For example, the best gamma can be selected so that the mean TAD size matches expectations (~500 Kb in this case, i.e. 500/res = 5 bins at 100-kb resolution).


In [ ]:
optimal_gammas = {}

for exp_id in experiment_ids:
    
    mtx = data_hic.dataDict[exp_id][0:1000, 0:1000]
    
    good_bins = mtx.astype(bool).sum(axis=0) > 1 # We have to mask rows/cols if data is missing

    gammas = np.arange(2, 24, 1)*1000/3250 # Desired set of gammas for testing

    means = []
    
    for gam_current in gammas:

        S = lavaburst.scoring.armatus_score(mtx, gamma=gam_current, binmask=good_bins)
        model = lavaburst.model.SegModel(S)
        segments = model.optimal_segmentation() # Positions of TADs for input matrix

        tad_lens = segments[:,1]-segments[:,0]
        good_lens = (tad_lens>=200/res)&(tad_lens<900/res) # exclude segments too small or too large to be TADs (200-900 kb)
        
        means.append(np.mean(tad_lens[good_lens]))
        
    # Pick the gamma whose mean TAD length is closest to the 500-kb target
    idx = np.argmin(np.abs(np.array(means)-500/res))
    opt_mean, opt_gamma = means[idx], gammas[idx]
    
    print(exp_id, opt_mean*res, opt_gamma)
    
    optimal_gammas[exp_id] = opt_gamma

In [ ]:
# TADs in single cells compared with merged single-cell data

At = lavaburst.utils.tilt_heatmap(mtx_merged, n_diags=100)

start_tmp = 0
end_tmp = 500

f = plt.figure(figsize=(20, 6))

ax = f.add_subplot(111)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap='Reds')

for n, exp in enumerate(experiment_ids):
    A = data_hic.dataDict[exp][bgn:end, bgn:end].copy()

    good_bins = A.astype(bool).sum(axis=0) > 0

    gamma = optimal_gammas[exp]
    
    S = lavaburst.scoring.modularity_score(A, gamma=gamma, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation()

    for a in segments[:-1]:
        if a[1]<start_tmp or a[0]>end_tmp:
            continue
        tad_len = a[1]-a[0]
        if (tad_len<200/res)|(tad_len>=900/res):  # keep only 200-900 kb segments
            continue
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [0, -(a[1]-a[0]), 0], 0, 
                        facecolor='#6100FF', interpolate=True, alpha=0.2)

    a = segments[-1]
    tad_len = a[1]-a[0]
    if (tad_len<200/res)|(tad_len>=900/res):
        continue
    ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [0, -(a[1]-a[0]), 0], 0, 
                    facecolor='#6100FF', interpolate=True, alpha=0.2)

ax.set_xlim([start_tmp,end_tmp])
ax.set_ylim([100,-100])

ax.set_aspect(0.5)

In [ ]:
# TADs in single cells compared with bulk Hi-C data

At = lavaburst.utils.tilt_heatmap(data_hic.dataDict['bulk'], n_diags=100)

start_tmp = 0
end_tmp = 300

f = plt.figure(figsize=(20, 6))

ax = f.add_subplot(111)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap='Reds')

for n, exp in enumerate(experiment_ids):
    A = data_hic.dataDict[exp][bgn:end, bgn:end].copy()

    good_bins = A.astype(bool).sum(axis=0) > 0

    gamma = optimal_gammas[exp]
    
    S = lavaburst.scoring.modularity_score(A, gamma=gamma, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation()

    for a in segments[:-1]:
        if a[1]<start_tmp or a[0]>end_tmp:
            continue
        tad_len = a[1]-a[0]
        if (tad_len<200/res)|(tad_len>=900/res):
            continue
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [0, -(a[1]-a[0]), 0], 0, 
                        facecolor='#6100FF', interpolate=True, alpha=0.2)

    a = segments[-1]
    tad_len = a[1]-a[0]
    if (tad_len<200/res)|(tad_len>=900/res):
        continue
    ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [0, -(a[1]-a[0]), 0], 0, 
                    facecolor='#6100FF', interpolate=True, alpha=0.2)

ax.set_xlim([start_tmp,end_tmp])
ax.set_ylim([100,-100])

ax.set_aspect(0.5)