In [1]:
from pytadbit.parsers.genome_parser import parse_fasta
In [5]:
genome_seq = parse_fasta('/media/storage/db/reference_genome/Homo_sapiens/hg38/hg38.fa')
In [6]:
maps1 = [
'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_full_1-end.map',
'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_frag_1-end.map']
In [7]:
maps2 = [
'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_full_1-end.map',
'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_frag_1-end.map']
In [8]:
! mkdir -p results/HindIII/02_parsing
In [9]:
from pytadbit.parsers.map_parser import parse_map
In [11]:
parse_map(maps1,
maps2,
'results/HindIII/02_parsing/reads1.tsv',
'results/HindIII/02_parsing/reads2.tsv',
genome_seq=genome_seq, re_name='HindIII',
verbose=True)
Out[11]:
The output are the tables with the reads as ID, chrm, pos, lenght, and start-end fragment
In [12]:
! head -n 50 results/HindIII/02_parsing/reads1.tsv
In [14]:
from pytadbit.mapping import get_intersection
This function will filter all reads that are not mapped in both sides
In [15]:
! mkdir -p results/HindIII/03_filtering
In [16]:
get_intersection('results/HindIII/02_parsing/reads1.tsv',
'results/HindIII/02_parsing/reads2.tsv',
'results/HindIII/03_filtering/reads12.tsv',
verbose=True)
Out[16]:
In [17]:
! head -n50 results/HindIII/03_filtering/reads12.tsv
In [18]:
from pytadbit.mapping.analyze import plot_distance_vs_interactions
This function plots the number of interactions as fiunction of distance Each dot is a bin of 100K interactions, in Y axis is the interaction counts and in X axis is the distance. Hence interactions between close fragments should show the higher counts and interactions from fragments far away should be empty (no read counts)
In [20]:
plot_distance_vs_interactions('results/HindIII/03_filtering/reads12.tsv',
resolution=100000, max_diff=1000, show=True)
Out[20]:
In [22]:
from pytadbit.mapping.analyze import plot_genomic_distribution
This function will plot the genomic distribution, per chromosome of reads counts in 50kb bins (* files can be saved as csv, tsv or images by 'save' and specify the file format with the extension)
In [23]:
plot_genomic_distribution (
'results/HindIII/03_filtering/reads12.tsv',
resolution=500000, show=True)
In [24]:
from pytadbit.mapping.analyze import hic_map
In [25]:
hic_map('results/HindIII/03_filtering/reads12.tsv',
resolution=1000000, show=True)
Plot the interaction matrix and print a summary of the interactions: Cis interactions
In [26]:
from pytadbit.mapping.analyze import insert_sizes
In [27]:
insert_sizes('results/HindIII/03_filtering/reads12.tsv',show=True,nreads=100000)
Out[27]:
In [28]:
from pytadbit.mapping.filter import filter_reads
In [29]:
filter_reads('results/HindIII/03_filtering/reads12.tsv',max_molecule_length=750,min_dist_to_re=500)
Out[29]:
This function will identify all type of reads that can be filtered
In [30]:
masked = {1: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_self-circle.tsv',
'name': 'self-circle',
'reads': 37383},
2: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_dangling-end.tsv',
'name': 'dangling-end',
'reads': 660146},
3: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_error.tsv',
'name': 'error',
'reads': 37395},
4: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_extra_dangling-end.tsv',
'name': 'extra dangling-end',
'reads': 3773498},
5: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_close_from_RES.tsv',
'name': 'too close from RES',
'reads': 3277369},
6: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_short.tsv',
'name': 'too short',
'reads': 296853},
7: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_large.tsv',
'name': 'too large',
'reads': 1843},
8: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_over-represented.tsv',
'name': 'over-represented',
'reads': 411157},
9: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_duplicated.tsv',
'name': 'duplicated',
'reads': 324490},
10: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_random_breaks.tsv',
'name': 'random breaks',
'reads': 968492}}
In [31]:
from pytadbit.mapping.filter import apply_filter
This will filter the reads based in
In [32]:
apply_filter('results/HindIII/03_filtering/reads12.tsv',
'results/HindIII/03_filtering/reads12_valid.tsv',
masked,filters=[1,2,3,4,9,10])
Out[32]:
In [ ]: