Parsing mapped reads


In [1]:
from pytadbit.parsers.genome_parser import parse_fasta

In [5]:
genome_seq = parse_fasta('/media/storage/db/reference_genome/Homo_sapiens/hg38/hg38.fa')


Parsing chr1
Parsing chr2
Parsing chr3
Parsing chr4
Parsing chr5
Parsing chr6
Parsing chr7
Parsing chr8
Parsing chr9
Parsing chr10
Parsing chr11
Parsing chr12
Parsing chr13
Parsing chr14
Parsing chr15
Parsing chr16
Parsing chr17
Parsing chr18
Parsing chr19
Parsing chr20
Parsing chr21
Parsing chr22
Parsing chrX
Parsing chrY
Parsing chrM

In [6]:
maps1 = [
    'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_full_1-end.map',
    'results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_frag_1-end.map']

In [7]:
maps2 = [
    'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_full_1-end.map',
    'results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_frag_1-end.map']

In [8]:
! mkdir -p results/HindIII/02_parsing

In [9]:
from pytadbit.parsers.map_parser import parse_map

In [11]:
parse_map(maps1,
          maps2,
          'results/HindIII/02_parsing/reads1.tsv',
          'results/HindIII/02_parsing/reads2.tsv',
          genome_seq=genome_seq, re_name='HindIII',
          verbose=True)


Searching and mapping RE sites to the reference genome
Found 860368 RE sites
Loading read1
loading file: results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_full_1-end.map
loading file: results/HindIII/01_mapping/mapHindIII_r1/K562_HindIII_1_frag_1-end.map
Merge sort...........
Getting Multiple contacts
Loading read2
loading file: results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_full_1-end.map
loading file: results/HindIII/01_mapping/mapHindIII_r2/K562_HindIII_2_frag_1-end.map
Merge sort..........
Getting Multiple contacts
Out[11]:
({0: {1: 6047543, 2: 4605716}, 1: {1: 5940550, 2: 4357315}},
 {0: 1622735, 1: 1521698})

The output are the tables with the reads as ID, chrm, pos, lenght, and start-end fragment


In [12]:
! head -n 50 results/HindIII/02_parsing/reads1.tsv


# Chromosome lengths (order matters):
# CRM chr1	248956422
# CRM chr2	242193529
# CRM chr3	198295559
# CRM chr4	190214555
# CRM chr5	181538259
# CRM chr6	170805979
# CRM chr7	159345973
# CRM chr8	145138636
# CRM chr9	138394717
# CRM chr10	133797422
# CRM chr11	135086622
# CRM chr12	133275309
# CRM chr13	114364328
# CRM chr14	107043718
# CRM chr15	101991189
# CRM chr16	90338345
# CRM chr17	83257441
# CRM chr18	80373285
# CRM chr19	58617616
# CRM chr20	64444167
# CRM chr21	46709983
# CRM chr22	50818468
# CRM chrX	156040895
# CRM chrY	57227415
# CRM chrM	16569
# Mapped	reads count by iteration
# MAPPED 1 6047543
# MAPPED 2 4605716
NS500645:59:HCL32BGXY:1:11101:10000:16279	chr10	88637181	1	75	88636785	88638533
NS500645:59:HCL32BGXY:1:11101:10000:8112	chr3	36464588	1	75	36463181	36464653
NS500645:59:HCL32BGXY:1:11101:10002:9093	chr6	137108858	1	40	137107790	137108899
NS500645:59:HCL32BGXY:1:11101:10002:9922	chrX	69197335	0	75	69197264	69206899
NS500645:59:HCL32BGXY:1:11101:10003:18293	chrX	96723437	0	75	96723337	96725474
NS500645:59:HCL32BGXY:1:11101:10003:3068	chr5	94858963	0	75	94858631	94861895
NS500645:59:HCL32BGXY:1:11101:10003:5939	chr10	75729057	0	75	75728768	75736209
NS500645:59:HCL32BGXY:1:11101:10004:10267	chr5	174875986	1	75	174874079	174881635
NS500645:59:HCL32BGXY:1:11101:10004:12135	chr2	154442567	0	56	154442513	154447893
NS500645:59:HCL32BGXY:1:11101:10004:12614	chr15	69367526	1	75	69363863	69367659
NS500645:59:HCL32BGXY:1:11101:10004:16728	chr13	49978227	0	75	49977970	49982560
NS500645:59:HCL32BGXY:1:11101:10004:8658	chr2	61959866	1	75	61955591	61960015
NS500645:59:HCL32BGXY:1:11101:10005:11853	chr1	3606245	1	75	3582994	3606371
NS500645:59:HCL32BGXY:1:11101:10005:12121	chrX	30942292	0	16	30942272	30947764|||NS500645:59:HCL32BGXY:1:11101:10005:12121~2~	chrX	29671970	0	49	29671038	29672844
NS500645:59:HCL32BGXY:1:11101:10005:12688	chr1	48791365	0	75	48791057	48792686
NS500645:59:HCL32BGXY:1:11101:10006:2865	chr18	58191714	1	75	58191377	58191881
NS500645:59:HCL32BGXY:1:11101:10006:5162	chr1	53918296	0	75	53918108	53927122
NS500645:59:HCL32BGXY:1:11101:10007:3506	chr1	73124718	1	51	73118712	73124764|||NS500645:59:HCL32BGXY:1:11101:10007:3506~2~	chr1	73128562	0	26	73128558	73142904
NS500645:59:HCL32BGXY:1:11101:10007:9400	chr10	92229239	0	75	92222156	92235741
NS500645:59:HCL32BGXY:1:11101:10009:12190	chr19	51860119	1	75	51855867	51860307
NS500645:59:HCL32BGXY:1:11101:10009:19353	chr11	39650137	0	23	39650116	39662545|||NS500645:59:HCL32BGXY:1:11101:10009:19353~2~	chr11	20788481	0	54	20788477	20789439

In [14]:
from pytadbit.mapping import get_intersection

This function will filter all reads that are not mapped in both sides


In [15]:
! mkdir -p results/HindIII/03_filtering

In [16]:
get_intersection('results/HindIII/02_parsing/reads1.tsv',
                'results/HindIII/02_parsing/reads2.tsv',
                'results/HindIII/03_filtering/reads12.tsv',
                verbose=True)


Getting intersection of reads 1 and reads 2:
 
  .........
Found 8178944 pair of reads mapping uniquely
Sorting each temporary file by genomic coordinate
    1025/1025 sorted files
Removing temporary files...
Out[16]:
(8178944, {2: 2416873, 3: 139115, 4: 2})

In [17]:
! head -n50 results/HindIII/03_filtering/reads12.tsv


# CRM chr1	248956422
# CRM chr2	242193529
# CRM chr3	198295559
# CRM chr4	190214555
# CRM chr5	181538259
# CRM chr6	170805979
# CRM chr7	159345973
# CRM chr8	145138636
# CRM chr9	138394717
# CRM chr10	133797422
# CRM chr11	135086622
# CRM chr12	133275309
# CRM chr13	114364328
# CRM chr14	107043718
# CRM chr15	101991189
# CRM chr16	90338345
# CRM chr17	83257441
# CRM chr18	80373285
# CRM chr19	58617616
# CRM chr20	64444167
# CRM chr21	46709983
# CRM chr22	50818468
# CRM chrX	156040895
# CRM chrY	57227415
# CRM chrM	16569
NS500645:59:HCL32BGXY:1:11102:26481:1379	chr1	1001140	1	75	992015	1001575	chr1	1001256	0	75	992015	1001575
NS500645:59:HCL32BGXY:1:12110:26115:5745	chr1	1001324	1	75	992015	1001575	chr21	46033968	0	75	46030463	46041407
NS500645:59:HCL32BGXY:1:21206:21324:3203	chr1	1001334	1	75	992015	1001575	chr1	1001579	0	59	1001575	1007038
NS500645:59:HCL32BGXY:1:13307:21816:7609	chr1	1001500	1	75	992015	1001575	chr1	1451683	1	52	1450886	1451736
NS500645:59:HCL32BGXY:1:12106:1847:14122~2~	chr1	1001574	1	63	992015	1001575	chr1	1001731	0	75	1001575	1007038
NS500645:59:HCL32BGXY:1:13212:9523:18419#2/3	chr1	1001611	0	38	1001575	1007038	chr3	97654421	1	39	97653179	97654422
NS500645:59:HCL32BGXY:1:13212:9523:18419#3/3	chr1	1001611	0	38	1001575	1007038	chr3	97654615	0	75	97654422	97654693
NS500645:59:HCL32BGXY:1:12102:17848:18856	chr1	1001646	0	75	1001575	1007038	chr5	108685381	1	75	108682775	108685485
NS500645:59:HCL32BGXY:1:21101:17830:3039	chr1	1001665	0	75	1001575	1007038	chr19	48009760	1	75	48007445	48009872
NS500645:59:HCL32BGXY:1:21102:25611:8238	chr1	1001682	0	75	1001575	1007038	chr2	108230867	0	52	108230811	108232481
NS500645:59:HCL32BGXY:1:12210:22653:8408	chr1	1001751	0	75	1001575	1007038	chr12	50121562	1	75	50119202	50137467
NS500645:59:HCL32BGXY:1:13103:19543:7770	chr1	1002744	1	75	1001575	1007038	chr1	1002823	0	75	1001575	1007038
NS500645:59:HCL32BGXY:1:12306:8051:11287	chr1	1003513	1	75	1001575	1007038	chr1	1003788	0	75	1001575	1007038
NS500645:59:HCL32BGXY:1:21307:22446:2179#3/3	chr1	1006908	1	75	1001575	1007038	chr1	1007042	0	56	1007038	1008297
NS500645:59:HCL32BGXY:1:21307:22446:2179#2/3	chr1	1006908	1	75	1001575	1007038	chr1	1021804	1	21	1008297	1021820
NS500645:59:HCL32BGXY:1:21112:3188:12607#3/3	chr1	1006916	1	75	1001575	1007038	chr1	1007042	0	27	1007038	1008297
NS500645:59:HCL32BGXY:1:21112:3188:12607#1/3	chr1	1006916	1	75	1001575	1007038	chr1	2209300	0	50	2209252	2210621
NS500645:59:HCL32BGXY:1:21210:1526:15354	chr1	1006919	1	75	1001575	1007038	chr1	1008107	1	75	1007038	1008297
NS500645:59:HCL32BGXY:1:11106:7682:19670	chr1	1006967	1	75	1001575	1007038	chr1	1549073	1	75	1546944	1549172
NS500645:59:HCL32BGXY:1:11312:10176:12285#3/3	chr1	1006974	1	75	1001575	1007038	chr1	1007042	0	39	1007038	1008297
NS500645:59:HCL32BGXY:1:11312:10176:12285#2/3	chr1	1006974	1	75	1001575	1007038	chr2	168076008	0	38	168075972	168076695
NS500645:59:HCL32BGXY:1:12308:21624:12192	chr1	1007004	1	39	1001575	1007038	chr1	1065431	0	75	1065324	1069747
NS500645:59:HCL32BGXY:1:12301:7949:2088~2~	chr1	1007037	1	59	1001575	1007038	chr1	1007187	0	75	1007038	1008297
NS500645:59:HCL32BGXY:1:11305:16699:18604~2~#1/3	chr1	1007037	1	38	1001575	1007038	chr1	1007200	0	75	1007038	1008297
NS500645:59:HCL32BGXY:1:13206:7009:15464~2~	chr1	1007037	1	27	1001575	1007038	chr1	1007222	0	75	1007038	1008297

In [18]:
from pytadbit.mapping.analyze import plot_distance_vs_interactions

This function plots the number of interactions as fiunction of distance Each dot is a bin of 100K interactions, in Y axis is the interaction counts and in X axis is the distance. Hence interactions between close fragments should show the higher counts and interactions from fragments far away should be empty (no read counts)


In [20]:
plot_distance_vs_interactions('results/HindIII/03_filtering/reads12.tsv',
                             resolution=100000, max_diff=1000, show=True)


Out[20]:
((-0.88265018662076211, 5.9164938010086949, -0.99863993019479413),
 (-1.3501559560452454, 6.9658028500495099, -0.99882570064420828),
 (-1.0587521964219113, 5.9115998331601851, -0.97867093398859228))

In [22]:
from pytadbit.mapping.analyze import  plot_genomic_distribution

This function will plot the genomic distribution, per chromosome of reads counts in 50kb bins (* files can be saved as csv, tsv or images by 'save' and specify the file format with the extension)


In [23]:
plot_genomic_distribution (
'results/HindIII/03_filtering/reads12.tsv',
resolution=500000, show=True)



In [24]:
from pytadbit.mapping.analyze import hic_map

In [25]:
hic_map('results/HindIII/03_filtering/reads12.tsv',
       resolution=1000000, show=True)


Plot the interaction matrix and print a summary of the interactions: Cis interactions


In [26]:
from pytadbit.mapping.analyze import insert_sizes

In [27]:
insert_sizes('results/HindIII/03_filtering/reads12.tsv',show=True,nreads=100000)


Out[27]:
[174.0, 617.01000000000931]

In [28]:
from pytadbit.mapping.filter import filter_reads

In [29]:
filter_reads('results/HindIII/03_filtering/reads12.tsv',max_molecule_length=750,min_dist_to_re=500)


Filtered reads (and percentage of total):

     Mapped both                :     13708282 (100.00%)
  -----------------------------------------------------
   1- self-circle               :        37383 (  0.27%)
   2- dangling-end              :       660146 (  4.82%)
   3- error                     :        37395 (  0.27%)
   4- extra dangling-end        :      3773498 ( 27.53%)
   5- too close from RES        :      3277369 ( 23.91%)
   6- too short                 :       296853 (  2.17%)
   7- too large                 :         1843 (  0.01%)
   8- over-represented          :       411157 (  3.00%)
   9- duplicated                :       324490 (  2.37%)
  10- random breaks             :       968492 (  7.07%)
Out[29]:
{1: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_self-circle.tsv',
  'name': 'self-circle',
  'reads': 37383},
 2: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_dangling-end.tsv',
  'name': 'dangling-end',
  'reads': 660146},
 3: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_error.tsv',
  'name': 'error',
  'reads': 37395},
 4: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_extra_dangling-end.tsv',
  'name': 'extra dangling-end',
  'reads': 3773498},
 5: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_close_from_RES.tsv',
  'name': 'too close from RES',
  'reads': 3277369},
 6: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_short.tsv',
  'name': 'too short',
  'reads': 296853},
 7: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_large.tsv',
  'name': 'too large',
  'reads': 1843},
 8: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_over-represented.tsv',
  'name': 'over-represented',
  'reads': 411157},
 9: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_duplicated.tsv',
  'name': 'duplicated',
  'reads': 324490},
 10: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_random_breaks.tsv',
  'name': 'random breaks',
  'reads': 968492}}

This function will identify all type of reads that can be filtered


In [30]:
masked = {1: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_self-circle.tsv',
  'name': 'self-circle',
  'reads': 37383},
 2: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_dangling-end.tsv',
  'name': 'dangling-end',
  'reads': 660146},
 3: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_error.tsv',
  'name': 'error',
  'reads': 37395},
 4: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_extra_dangling-end.tsv',
  'name': 'extra dangling-end',
  'reads': 3773498},
 5: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_close_from_RES.tsv',
  'name': 'too close from RES',
  'reads': 3277369},
 6: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_short.tsv',
  'name': 'too short',
  'reads': 296853},
 7: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_too_large.tsv',
  'name': 'too large',
  'reads': 1843},
 8: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_over-represented.tsv',
  'name': 'over-represented',
  'reads': 411157},
 9: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_duplicated.tsv',
  'name': 'duplicated',
  'reads': 324490},
 10: {'fnam': 'results/HindIII/03_filtering/reads12.tsv_random_breaks.tsv',
  'name': 'random breaks',
  'reads': 968492}}

In [31]:
from pytadbit.mapping.filter import apply_filter

This will filter the reads based in


In [32]:
apply_filter('results/HindIII/03_filtering/reads12.tsv',
            'results/HindIII/03_filtering/reads12_valid.tsv',
            masked,filters=[1,2,3,4,9,10])


    saving to file 8482570 reads without .
Out[32]:
8482570

In [ ]: