In this notebook we will work with three datasets from Flyamer et al. 2017 (GEO ID GSE80006), located in the DATA/FASTQ1/ directory. In contrast to the previous analysis, this time we are working with single-cell Hi-C data.
In [ ]:
import os
from hiclib import mapping
from mirnylib import h5dict, genome
In [ ]:
bowtie_path = '/opt/conda/bin/bowtie2'
enzyme = 'DpnII'
bowtie_index_path = '/home/jovyan/GENOMES/HG19_IND/hg19_chr1'
fasta_path = '/home/jovyan/GENOMES/HG19_FASTA/'
chrms = ['1']
genome_db = genome.Genome(fasta_path, readChrms=chrms)
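As a quick sanity check, we can confirm that only chromosome 1 was loaded. This is a minimal sketch; it assumes mirnylib's Genome exposes the chrmCount, chrmLabels and chrmLens attributes:
In [ ]:
# Sanity check of the genome object restricted to chr1
print(genome_db.chrmCount)   # expected: 1
print(genome_db.chrmLabels)  # expected: ['chr1']
print(genome_db.chrmLens)    # chromosome length(s) in bp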
In [ ]:
min_seq_len = 120
len_step = 5
nthreads = 2
temp_dir = 'tmp'
bowtie_flags = '--very-sensitive'
Since we now have multiple datasets, we iterate over the list of files.
In [ ]:
experiment_ids = ['72-sc-1', '54-sc-1', '58-sc-1']
In [ ]:
for exp_id in experiment_ids:
    infile1 = '/home/jovyan/DATA/FASTQ1/K562_{}_R1.fastq'.format(exp_id)
    infile2 = '/home/jovyan/DATA/FASTQ1/K562_{}_R2.fastq'.format(exp_id)
    out1 = '/home/jovyan/DATA/SAM/K562_{}_R1.chr1.sam'.format(exp_id)
    out2 = '/home/jovyan/DATA/SAM/K562_{}_R2.chr1.sam'.format(exp_id)
    # Iteratively map both sides of the read pairs
    mapping.iterative_mapping(
        bowtie_path = bowtie_path,
        bowtie_index_path = bowtie_index_path,
        fastq_path = infile1,
        out_sam_path = out1,
        min_seq_len = min_seq_len,
        len_step = len_step,
        nthreads = nthreads,
        temp_dir = temp_dir,
        bowtie_flags = bowtie_flags)
    mapping.iterative_mapping(
        bowtie_path = bowtie_path,
        bowtie_index_path = bowtie_index_path,
        fastq_path = infile2,
        out_sam_path = out2,
        min_seq_len = min_seq_len,
        len_step = len_step,
        nthreads = nthreads,
        temp_dir = temp_dir,
        bowtie_flags = bowtie_flags)
    # Parse the mapped reads into an HDF5 dictionary of fragment-level contacts
    out = '/home/jovyan/DATA/HDF5/K562_{}.fragments.hdf5'
    mapped_reads = h5dict.h5dict(out.format(exp_id))
    mapping.parse_sam(
        sam_basename1 = out1,
        sam_basename2 = out2,
        out_dict = mapped_reads,
        genome_db = genome_db,
        enzyme_name = enzyme,
        save_seqs = False,
        keep_ids = False)
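Before moving on, it can be helpful to peek at one of the parsed-fragments files. A minimal sketch, assuming h5dict can be opened read-only with mode='r'; the exact dataset names depend on the hiclib version:
In [ ]:
# List the datasets parse_sam produced for one cell (names like 'chrms1',
# 'cuts1', 'strands1' are typical but version-dependent)
parsed = h5dict.h5dict('/home/jovyan/DATA/HDF5/K562_72-sc-1.fragments.hdf5', mode='r')
print(list(parsed.keys()))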
For single-cell analysis, the contact filtering is more sophisticated. For example, the standard PCR-duplicate filter is replaced by filterLessThanDistance, which removes "quasi-duplicates": read pairs whose cut sites fall into the same small window on both sides. The code below is based on the hiclib single-cell scripts for Flyamer et al. 2017.
In [ ]:
import h5py
import numpy as np
from hiclib import fragmentHiC
from hiclib.fragmentHiC import HiCdataset as HiCdatasetorig
from mirnylib.numutils import uniqueIndex
class HiCdataset(HiCdatasetorig):
    """Modification of HiCdataset to include all single-cell filters."""

    def filterLessThanDistance(self):
        # This is the old function that was used to filter "duplicates".
        # After the final submission of the manuscript it was replaced by a better
        # function that does the same, but at bp resolution rather than 100 bp.
        M = self.N
        for i in range(5):
            for j in range(5):
                # Encode chromosome and strand of both read sides in one integer ID
                chrStrandID = 10000000 * 10000000 * (
                    np.array(self.chrms1 * (self.strands1 + 1), dtype=np.int64) * 100
                    + self.chrms2 * (self.strands2 + 1))
                print(len(np.unique(chrStrandID)))
                # 500-bp windows, shifted by i*100 and j*100 bp on the two sides;
                # pairs landing in the same window pair are quasi-duplicates
                posid = (np.array((self.cuts1 + i * 100) // 500, dtype=np.int64) * 10000000
                         + (self.cuts2 + j * 100) // 500)
                N = self.N
                self.maskFilter(uniqueIndex(posid + chrStrandID))
                print(N, "filtered to", self.N)
        self.metadata["321_quasiDuplicatesRemoved"] = M - self.N
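To see what the uniqueIndex call is doing, here is a toy illustration with plain numpy (mirnylib's uniqueIndex returns a boolean mask of first occurrences, which we emulate with np.unique); the positions and window size below are made up:
In [ ]:
# Two reads whose cuts fall into the same 500-bp window collapse to one ID;
# all but the first occurrence are dropped as quasi-duplicates
cuts = np.array([1000, 1040, 1900, 5000])   # fake cut positions, bp
posid = cuts // 500                         # window IDs: [2, 2, 3, 10]
_, first = np.unique(posid, return_index=True)
keep = np.zeros(len(posid), dtype=bool)
keep[first] = True                          # emulates mirnylib.numutils.uniqueIndex
print(keep)                                 # [ True False  True  True]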
In [ ]:
output = []
for exp_id in experiment_ids:
    inp = '/home/jovyan/DATA/HDF5/K562_{}.fragments.hdf5'.format(exp_id)
    out = '/home/jovyan/DATA/HDF5/K562_{}.tmp.hdf5'.format(exp_id)
    outstat = '/home/jovyan/DATA/HDF5/K562_{}.stat.txt'.format(exp_id)
    fragments = HiCdataset(
        filename = out,
        genome = genome_db,
        maximumMoleculeLength = 500,
        enzymeName = 1000,  # an integer makes hiclib use uniform pseudo-fragments of this size (bp)
        mode = 'w')
    fragments.parseInputData(dictLike=inp)
    fragments.filterLessThanDistance()
    fs = fragments.fragmentSum()
    # In a single cell a fragment can support only a few true contacts,
    # so fragments with many contacts are treated as artifacts
    fragments.fragmentFilter(fs < 9)
    output.append(list(fragments.metadata.items()))
    # Bin the filtered contacts at 100-Kb and 20-Kb resolution
    out_bin = '/home/jovyan/DATA/HDF5/K562_{}.binned_{}.hdf5'
    res_kb = [100, 20]
    for res in res_kb:
        print(res)
        outmap = out_bin.format(exp_id, str(res)+'kb')
        fragments.saveHeatmap(outmap, res*1000)
    del fragments
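To double-check what was written to disk, the binned maps can be opened directly with h5py. A minimal sketch; the 'heatmap' key is our assumption about hiclib's saveHeatmap layout (it is also what simpleLoad reads below):
In [ ]:
# Hypothetical inspection of one saved 100-Kb heatmap
with h5py.File('/home/jovyan/DATA/HDF5/K562_72-sc-1.binned_100kb.hdf5', 'r') as f:
    print(list(f.keys()))
    print(f['heatmap'].shape)   # expected: (n_bins, n_bins) for chr1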
In [ ]:
output
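The nested metadata lists are easier to compare side by side as a table. A sketch, assuming pandas is available in this environment:
In [ ]:
import pandas as pd
# One column of filtering statistics per cell; rows are metadata fields
stats = pd.DataFrame({exp_id: dict(meta) for exp_id, meta in zip(experiment_ids, output)})
stats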
In [ ]:
from hiclib.binnedData import binnedDataAnalysis
In [ ]:
res = 100 # resolution in Kb per bin
data_hic = binnedDataAnalysis(resolution=res*1000, genome=genome_db)
In [ ]:
for exp_id in experiment_ids:
    data_hic.simpleLoad('/home/jovyan/DATA/HDF5/K562_{}.binned_{}.hdf5'.format(exp_id, str(res)+'kb'), exp_id)
In [ ]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('ticks')
In [ ]:
%matplotlib inline
In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(data_hic.dataDict['54-sc-1'][200:500, 200:500], cmap='jet', interpolation='none')
In [ ]:
data_hic.simpleLoad('/home/jovyan/DATA/HDF5/K562_B-bulk.binned_{}.hdf5'.format(str(res)+'kb'),'bulk')
data_hic.removeDiagonal()
In [ ]:
# Composite map: bulk above the diagonal, single cell 54-sc-1 below
mtx1 = data_hic.dataDict['bulk']
mtx2 = data_hic.dataDict['54-sc-1']
mtx_tmp = np.triu(mtx1)/np.mean(mtx1) + np.tril(mtx2)/np.mean(mtx2)
In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(mtx_tmp[200:500, 200:500], cmap='Blues', interpolation='none', vmax=900)
In [ ]:
# Merge the three single cells into one pseudo-bulk map
mtx_merged = sum([data_hic.dataDict[exp_id] for exp_id in experiment_ids])
In [ ]:
# The same composite, now with the merged single-cell map below the diagonal
mtx1 = data_hic.dataDict['bulk']
mtx2 = mtx_merged
mtx_tmp = np.triu(mtx1)/np.mean(mtx1) + np.tril(mtx2)/np.mean(mtx2)
In [ ]:
plt.figure(figsize=[10,10])
plt.imshow(mtx_tmp[200:500, 200:500], cmap='Blues', interpolation='none', vmax=800)
In [ ]:
from matplotlib import gridspec
In [ ]:
eig = np.loadtxt('/home/jovyan/DATA/ANNOT/comp_K562_100Kb_chr1.tsv')
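Here we load a precomputed compartment eigenvector for chr1. If you wanted to derive one from the bulk map yourself, the standard observed/expected correlation procedure can be done with plain numpy. This is a simplified sketch, not hiclib's exact pipeline; the sign and scaling of EV1 are arbitrary:
In [ ]:
# Observed/expected normalization -> correlation matrix -> first eigenvector
bulk = data_hic.dataDict['bulk'].copy()
n = bulk.shape[0]
oe = np.zeros((n, n))
for d in range(1, n):                      # normalize each diagonal by its mean
    diag = np.diagonal(bulk, d)
    m = diag.mean()
    if m > 0:
        idx = np.arange(n - d)
        oe[idx, idx + d] = diag / m
oe = oe + oe.T
mask = oe.sum(axis=0) > 0                  # drop empty bins before correlating
corr = np.corrcoef(oe[np.ix_(mask, mask)])
vals, vecs = np.linalg.eigh(corr)          # eigenvalues in ascending order
ev1 = vecs[:, -1] * np.sqrt(vals[-1])      # eigenvector of the largest eigenvalue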
In [ ]:
bgn = 0
end = 500
fig = plt.figure(figsize=(10,10))
gs = gridspec.GridSpec(2, 1, height_ratios=[20,2])
gs.update(wspace=0.0, hspace=0.0)
ax = plt.subplot(gs[0,0])
ax.matshow(mtx_tmp[bgn:end, bgn:end], cmap='jet', origin='lower', aspect='auto')
ax.set_xticks([])
ax.set_yticks([])
# Eigenvector track below the heatmap
axl = plt.subplot(gs[1,0])
plt.plot(range(end-bgn), eig[bgn:end])
plt.xlim(0, end-bgn)
plt.ylabel('EV1')
plt.xlabel('Genomic position')
ticks = range(bgn, end+1, 100)
ticklabels = ['{} Mb'.format(int(x*res/1000)) for x in ticks]  # res is in Kb per bin
plt.xticks(ticks, ticklabels)
In [ ]:
import lavaburst
In [ ]:
mtx = data_hic.dataDict['54-sc-1']
In [ ]:
good_bins = mtx.astype(bool).sum(axis=0) > 1 # Mask rows/cols where data is missing
gammas = [0.1, 1.0, 5.0, 10.0] # Set of gamma parameters for TAD calling
segments_dict = {}
for gam_current in gammas:
    print(gam_current)
    S = lavaburst.scoring.armatus_score(mtx, gamma=gam_current, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation() # Positions of TADs for the input matrix
    segments_dict[gam_current] = segments.copy()
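The segments returned by lavaburst are (start, end) pairs in bin units, and optimal_segmentation returns a full tiling of the region, so inter-TAD segments are included too. Converting to genomic coordinates makes them easier to read (recall that res is the bin size in Kb); a small sketch:
In [ ]:
# Print the first few segments for the largest gamma in genomic coordinates
for start, end in segments_dict[10.0][:5]:
    print('chr1:{:.1f}-{:.1f} Mb ({} bins)'.format(start*res/1000, end*res/1000, end - start))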
In [ ]:
# TADs at different gammas for a particular cell (54-sc-1)
At = lavaburst.utils.tilt_heatmap(mtx, n_diags=100)
start_tmp = 0
end_tmp = 500
f = plt.figure(figsize=(20, 6))
ax = f.add_subplot(111)
blues = sns.cubehelix_palette(0.4, gamma=0.5, rot=-0.3, dark=0.1, light=0.9, as_cmap=True)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap=blues)
cmap = mpl.cm.get_cmap('brg')
gammas = list(segments_dict.keys())
for n, gamma in enumerate(gammas):
    segments = segments_dict[gamma]
    # Draw each segment as a triangle over the tilted heatmap
    for a in segments[:-1]:
        if a[1] < start_tmp or a[0] > end_tmp:
            continue
        ax.plot([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp], [0, -(a[1]-a[0])], c=cmap(n/len(gammas)), alpha=0.5)
        ax.plot([a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [-(a[1]-a[0]), 0], c=cmap(n/len(gammas)), alpha=0.5)
    # The last segment carries the legend label, so each gamma appears once
    a = segments[-1]
    ax.plot([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp], [0, -(a[1]-a[0])], c=cmap(n/len(gammas)), alpha=0.5, label=gamma)
    ax.plot([a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp], [-(a[1]-a[0]), 0], c=cmap(n/len(gammas)), alpha=0.5)
ax.set_xlim([0, end_tmp-start_tmp])
ax.set_ylim([100, -100])
ax.legend(bbox_to_anchor=(1.1, 1.05))
ax.set_aspect(0.5)
The problem of choosing the free parameter for TAD calling can be addressed by parameter optimization. For example, the best gamma can be selected so that the mean TAD size matches expectations (~500 Kb in this case).
In [ ]:
optimal_gammas = {}
for exp_id in experiment_ids:
    mtx = data_hic.dataDict[exp_id][0:1000, 0:1000]
    good_bins = mtx.astype(bool).sum(axis=0) > 1 # Mask rows/cols where data is missing
    gammas = np.arange(2, 24, 1)*1000/3250 # Desired set of gammas for testing
    means = []
    for gam_current in gammas:
        S = lavaburst.scoring.armatus_score(mtx, gamma=gam_current, binmask=good_bins)
        model = lavaburst.model.SegModel(S)
        segments = model.optimal_segmentation() # Positions of TADs for the input matrix
        tad_lens = segments[:,1] - segments[:,0]
        # Segments shorter than 200 Kb or longer than 900 Kb are not considered TADs
        good_lens = (tad_lens >= 200/res) & (tad_lens < 900/res)
        means.append(np.mean(tad_lens[good_lens]))
    # Pick the gamma whose mean TAD size is closest to the expected ~500 Kb
    idx = np.argmin(np.abs(np.array(means) - 500/res))
    opt_mean, opt_gamma = means[idx], gammas[idx]
    print(exp_id, opt_mean*res, opt_gamma)
    optimal_gammas[exp_id] = opt_gamma
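It is also useful to see how the mean TAD size responds to gamma. The sketch below re-plots the curve for the last processed cell (means and gammas are left over from the final loop iteration), together with the 500-Kb target:
In [ ]:
# Mean TAD size versus gamma for the last processed cell; gammas with no
# segments in the accepted length range produce NaN and leave gaps in the curve
plt.figure(figsize=(6, 4))
plt.plot(gammas, np.array(means)*res, 'o-')
plt.axhline(500, color='grey', ls='--', label='target: 500 Kb')
plt.xlabel('gamma')
plt.ylabel('Mean TAD size, Kb')
plt.legend()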
In [ ]:
# TADs in single cells compared with merged single-cell data
At = lavaburst.utils.tilt_heatmap(mtx_merged, n_diags=100)
start_tmp = 0
end_tmp = 500
f = plt.figure(figsize=(20, 6))
ax = f.add_subplot(111)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap='Reds')
for n, exp in enumerate(experiment_ids):
    # bgn, end are reused from the eigenvector plot above (0 and 500)
    A = data_hic.dataDict[exp][bgn:end, bgn:end].copy()
    good_bins = A.astype(bool).sum(axis=0) > 0
    gamma = optimal_gammas[exp]
    S = lavaburst.scoring.modularity_score(A, gamma=gamma, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation()
    for a in segments[:-1]:
        if a[1] < start_tmp or a[0] > end_tmp:
            continue
        tad_len = a[1] - a[0]
        if (tad_len < 200/res) | (tad_len >= 900/res):
            continue
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp],
                        [0, -(a[1]-a[0]), 0], 0,
                        facecolor='#6100FF', interpolate=True, alpha=0.2)
    a = segments[-1]
    tad_len = a[1] - a[0]
    if (tad_len >= 200/res) and (tad_len < 900/res):
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp],
                        [0, -(a[1]-a[0]), 0], 0,
                        facecolor='#6100FF', interpolate=True, alpha=0.2)
ax.set_xlim([start_tmp, end_tmp])
ax.set_ylim([100, -100])
ax.set_aspect(0.5)
In [ ]:
# TADs in single cells compared with bulk Hi-C data
At = lavaburst.utils.tilt_heatmap(data_hic.dataDict['bulk'], n_diags=100)
start_tmp = 0
end_tmp = 300
f = plt.figure(figsize=(20, 6))
ax = f.add_subplot(111)
ax.matshow(np.log(At[start_tmp: end_tmp]), cmap='Reds')
for n, exp in enumerate(experiment_ids):
    # bgn, end are reused from the eigenvector plot above (0 and 500)
    A = data_hic.dataDict[exp][bgn:end, bgn:end].copy()
    good_bins = A.astype(bool).sum(axis=0) > 0
    gamma = optimal_gammas[exp]
    S = lavaburst.scoring.modularity_score(A, gamma=gamma, binmask=good_bins)
    model = lavaburst.model.SegModel(S)
    segments = model.optimal_segmentation()
    for a in segments[:-1]:
        if a[1] < start_tmp or a[0] > end_tmp:
            continue
        tad_len = a[1] - a[0]
        if (tad_len < 200/res) | (tad_len >= 900/res):
            continue
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp],
                        [0, -(a[1]-a[0]), 0], 0,
                        facecolor='#6100FF', interpolate=True, alpha=0.2)
    a = segments[-1]
    tad_len = a[1] - a[0]
    if (tad_len >= 200/res) and (tad_len < 900/res):
        ax.fill_between([a[0]-start_tmp, a[0]+(a[1]-a[0])/2-start_tmp, a[1]-start_tmp],
                        [0, -(a[1]-a[0]), 0], 0,
                        facecolor='#6100FF', interpolate=True, alpha=0.2)
ax.set_xlim([start_tmp, end_tmp])
ax.set_ylim([100, -100])
ax.set_aspect(0.5)