In [14]:
from pytadbit.parsers.genome_parser import parse_fasta
In [15]:
genome_seq = parse_fasta(
"/media/storage/db/reference_genome/Homo_sapiens/hg38/hg38.fa")
In [16]:
maps2= ['results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_full_1-end.map',
'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_frag_1-end.map']
In [17]:
maps1= ['results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_full_1-end.map',
'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_frag_1-end.map']
In [18]:
! mkdir -p results/HindIII/02_parsing
In [19]:
from pytadbit.parsers.map_parser import parse_map
In [20]:
parse_map(maps1, maps2,
"results/HindIII/02_parsing/reads1.tsv",
"results/HindIII/02_parsing/reads2.tsv",
genome_seq=genome_seq, re_name="HindIII",
verbose=True)
Out[20]:
In [21]:
! head -n 50 results/HindIII/02_parsing/reads1.tsv
In [23]:
from pytadbit.mapping import get_intersection
In [25]:
! mkdir -p results/HindIII/03_filtering
In [27]:
get_intersection("results/HindIII/02_parsing/reads1.tsv",
"results/HindIII/02_parsing/reads2.tsv",
"results/HindIII/03_filtering/reads12.tsv", verbose=True)
Out[27]:
In [28]:
! head -n50 results/HindIII/03_filtering/reads12.tsv
In [6]:
from pytadbit.mapping.analyze import plot_distance_vs_interactions
In [8]:
plot_distance_vs_interactions("results/HindIII/03_filtering/reads12.tsv", resolution=100000, max_diff=1000, show=True)
Out[8]:
the orange slope is the most important and you expect a slope of around -1 unless you work with cells badly synchronized or with weird karyotype
resolution= size of bins
In [10]:
from pytadbit.mapping.analyze import plot_genomic_distribution
In [11]:
plot_genomic_distribution("results/HindIII/03_filtering/reads12.tsv", resolution=500000, show=True)
In Y we have number of reads per 500kb bins
It reflects different number of copies, for example in chromosome 2 the last part of the chromosome has less copies than the rest, therefore the lower number of reads.
Big peaks correspond to PCR duplicates
In [12]:
from pytadbit.mapping.analyze import hic_map
In [13]:
hic_map("results/HindIII/03_filtering/reads12.tsv", resolution=1000000, show=True)
In [14]:
from pytadbit.mapping.analyze import insert_sizes
In [15]:
insert_sizes("results/HindIII/03_filtering/reads12.tsv", show=True, nreads=100000)
Out[15]:
In [16]:
from pytadbit.mapping.filter import filter_reads
In [17]:
filter_reads("results/HindIII/03_filtering/reads12.tsv", max_molecule_length=750, min_dist_to_re=500)
Out[17]:
In [18]:
masked = {1: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_self-circle.tsv',
'name': 'self-circle',
'reads': 37383},
2: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_dangling-end.tsv',
'name': 'dangling-end',
'reads': 660146},
3: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_error.tsv',
'name': 'error',
'reads': 37395},
4: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_extra_dangling-end.tsv',
'name': 'extra dangling-end',
'reads': 3773498},
5: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_close_from_RES.tsv',
'name': 'too close from RES',
'reads': 3277369},
6: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_short.tsv',
'name': 'too short',
'reads': 296853},
7: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_large.tsv',
'name': 'too large',
'reads': 1843},
8: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_over-represented.tsv',
'name': 'over-represented',
'reads': 411157},
9: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_duplicated.tsv',
'name': 'duplicated',
'reads': 324490},
10: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_random_breaks.tsv',
'name': 'random breaks',
'reads': 968492}}
In [22]:
from pytadbit.mapping.filter import apply_filter
In [23]:
apply_filter ("results/HindIII/03_filtering/reads12.tsv",
"results/HindIII/03_filtering/reads12_valid.tsv",
masked, filters=[1, 2, 3, 4, 9, 10])
Out[23]:
In [ ]: