In [1]:
    
from pytadbit.parsers.genome_parser import parse_fasta
    
In [5]:
    
genome_seq = parse_fasta('/media/storage/db/reference_genome/Homo_sapiens/hg38/hg38.fa')
    
    
In [6]:
    
maps1 = [
    'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_full_1-end.map',
    'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_frag_1-end.map']
    
In [7]:
    
maps2 = [
    'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_full_1-end.map',
    'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_frag_1-end.map']
    
In [8]:
    
! mkdir -p results/HindIII/02_parsing
    
In [9]:
    
from pytadbit.parsers.map_parser import parse_map
    
In [11]:
    
parse_map(maps1,
          maps2,
          'results/HindIII/02_parsing/reads1.tsv',
          'results/HindIII/02_parsing/reads2.tsv',
          genome_seq=genome_seq, re_name='HindIII',
          verbose=True)
    
    
    Out[11]:
The output are the tables with the reads as ID, chrm, pos, lenght, and start-end fragment
In [12]:
    
! head -n 50 results/HindIII/02_parsing/reads1.tsv
    
    
In [14]:
    
from pytadbit.mapping import get_intersection
    
This function will filter all reads that are not mapped in both sides
In [15]:
    
! mkdir -p results/HindIII/03_filtering
    
In [16]:
    
get_intersection('results/HindIII/02_parsing/reads1.tsv',
                'results/HindIII/02_parsing/reads2.tsv',
                'results/HindIII/03_filtering/reads12.tsv',
                verbose=True)
    
    
    Out[16]:
In [17]:
    
! head -n50 results/HindIII/03_filtering/reads12.tsv
    
    
In [18]:
    
from pytadbit.mapping.analyze import plot_distance_vs_interactions
    
This function plots the number of interactions as fiunction of distance Each dot is a bin of 100K interactions, in Y axis is the interaction counts and in X axis is the distance. Hence interactions between close fragments should show the higher counts and interactions from fragments far away should be empty (no read counts)
In [20]:
    
plot_distance_vs_interactions('results/HindIII/03_filtering/reads12.tsv',
                             resolution=100000, max_diff=1000, show=True)
    
    
    Out[20]:
In [22]:
    
from pytadbit.mapping.analyze import  plot_genomic_distribution
    
This function will plot the genomic distribution, per chromosome of reads counts in 50kb bins (* files can be saved as csv, tsv or images by 'save' and specify the file format with the extension)
In [23]:
    
plot_genomic_distribution (
'results/HindIII/03_filtering/reads12.tsv',
resolution=500000, show=True)
    
    
In [24]:
    
from pytadbit.mapping.analyze import hic_map
    
In [25]:
    
hic_map('results/HindIII/03_filtering/reads12.tsv',
       resolution=1000000, show=True)
    
    
Plot the interaction matrix and print a summary of the interactions: Cis interactions
In [26]:
    
from pytadbit.mapping.analyze import insert_sizes
    
In [27]:
    
insert_sizes('results/HindIII/03_filtering/reads12.tsv',show=True,nreads=100000)
    
    
    Out[27]:
In [28]:
    
from pytadbit.mapping.filter import filter_reads
    
In [29]:
    
filter_reads('results/HindIII/03_filtering/reads12.tsv',max_molecule_length=750,min_dist_to_re=500)
    
    
    Out[29]:
This function will identify all type of reads that can be filtered
In [30]:
    
masked = {1: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_self-circle.tsv',
  'name': 'self-circle',
  'reads': 37383},
 2: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_dangling-end.tsv',
  'name': 'dangling-end',
  'reads': 660146},
 3: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_error.tsv',
  'name': 'error',
  'reads': 37395},
 4: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_extra_dangling-end.tsv',
  'name': 'extra dangling-end',
  'reads': 3773498},
 5: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_close_from_RES.tsv',
  'name': 'too close from RES',
  'reads': 3277369},
 6: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_short.tsv',
  'name': 'too short',
  'reads': 296853},
 7: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_large.tsv',
  'name': 'too large',
  'reads': 1843},
 8: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_over-represented.tsv',
  'name': 'over-represented',
  'reads': 411157},
 9: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_duplicated.tsv',
  'name': 'duplicated',
  'reads': 324490},
 10: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_random_breaks.tsv',
  'name': 'random breaks',
  'reads': 968492}}
    
In [31]:
    
from pytadbit.mapping.filter import apply_filter
    
This will filter the reads based in
In [32]:
    
apply_filter('results/HindIII/03_filtering/reads12.tsv',
            'results/HindIII/03_filtering/reads12_valid.tsv',
            masked,filters=[1,2,3,4,9,10])
    
    
    Out[32]:
In [ ]: