Filtering


In [1]:
from pytadbit.parsers.hic_parser import load_hic_data_from_reads

In [2]:
r_enz_1 = 'HindIII'
r_enz_2 = 'MboI'
reso = 1000000

In [4]:
hic_data_1 = load_hic_data_from_reads(
    'results/{0}/03_filtering/valid_reads12_{0}.tsv'.format(r_enz_1),
    reso)
hic_data_2 = load_hic_data_from_reads(
    'results/{0}/03_filtering/valid_reads12_{0}.tsv'.format(r_enz_2),
    reso)

In [5]:
hic_data_1.filter_columns(draw_hist=True, min_count=10, by_mean=True)
hic_data_2.filter_columns(draw_hist=True, min_count=10, by_mean=True)


WARNING: removing columns having less than 20 counts:
   124   127   128   129   130   131   132   133   134   135   136   137   138   139   140   141   142   143   742   931
   932  1295  1563  1567  1571  1572  1586  1587  1588  1589  1590  1591  1592  1593  1594  1595  1596  1597  1598  1599
  1600  1601  1721  1867  1868  2085  2086  2087  2088  2089  2090  2091  2092  2093  2094  2095  2096  2097  2098  2099
  2100  2101  2102  2200  2201  2202  2203  2204  2205  2206  2207  2208  2209  2210  2211  2212  2213  2214  2215  2216
  2217  2307  2308  2309  2310  2311  2312  2313  2314  2315  2316  2317  2318  2319  2320  2321  2322  2323  2324  2447
  2448  2449  2450  2451  2452  2453  2454  2455  2601  2602  2603  2604  2691  2692  2752  2790  2791  2792  2793  2794
  2801  2837  2838  2839  2840  2841  2842  2843  2844  2845  2846  2850  2851  2888  2889  2947  2948  2949  3044  3045
  3046  3047  3052  3053  3054  3059  3060  3061  3062  3063  3067  3068  3069  3070  3072  3073  3074  3075  3076  3077
  3078  3079  3080  3081  3082  3083  3084  3085  3086  3087  3088  3089  3090  3091  3092  3093  3094  3095  3096  3097
  3098  3099  3100  3102
WARNING: removing columns having less than 944.794 counts:
     1   121   123   124   125   126   127   128   129   130   131   132   133   134   135   136   137   138   139   140
   141   142   143   144   145   147   149   337   340   341   342   492   584   585   691   742   882   930   931   932
   953  1064  1124  1294  1295  1296  1403  1440  1441  1541  1563  1564  1565  1566  1567  1571  1572  1581  1582  1583
  1585  1586  1587  1588  1589  1590  1591  1592  1593  1594  1595  1596  1597  1598  1599  1600  1601  1602  1603  1604
  1605  1606  1607  1608  1609  1680  1721  1866  1867  1868  1950  1986  1987  2085  2086  2087  2088  2089  2090  2091
  2092  2093  2094  2095  2096  2097  2098  2099  2100  2101  2102  2199  2200  2201  2202  2203  2204  2205  2206  2207
  2208  2209  2210  2211  2212  2213  2214  2215  2216  2217  2218  2307  2308  2309  2310  2311  2312  2313  2314  2315
  2316  2317  2318  2319  2320  2321  2322  2323  2324  2325  2326  2327  2329  2446  2447  2448  2449  2450  2451  2452
  2453  2454  2455  2500  2524  2525  2584  2601  2602  2603  2604  2605  2665  2666  2667  2691  2692  2752  2789  2790
  2791  2792  2793  2794  2795  2796  2797  2801  2802  2837  2838  2839  2840  2841  2842  2843  2844  2845  2846  2847
  2850  2851  2852  2887  2888  2889  2890  2947  2948  2949  3044  3045  3046  3047  3048  3049  3050  3051  3052  3053
  3054  3055  3056  3057  3058  3059  3060  3061  3062  3063  3064  3065  3066  3067  3068  3069  3070  3071  3072  3073
  3074  3075  3076  3077  3078  3079  3080  3081  3082  3083  3084  3085  3086  3087  3088  3089  3090  3091  3092  3093
  3094  3095  3096  3097  3098  3099  3100  3101  3102

WARNING: removing columns having less than 20 counts:
   123   124   127   128   129   130   131   132   133   134   135   136   137   138   139   140   141   142   143   585
   742   931   932  1124  1295  1296  1440  1563  1564  1565  1566  1567  1571  1572  1586  1587  1588  1589  1590  1591
  1592  1593  1594  1595  1596  1597  1598  1599  1600  1601  1602  1721  1866  1867  1868  1986  1987  2085  2086  2087
  2088  2089  2090  2091  2092  2093  2094  2095  2096  2097  2098  2099  2100  2101  2102  2200  2201  2202  2203  2204
  2205  2206  2207  2208  2209  2210  2211  2212  2213  2214  2215  2216  2217  2307  2308  2309  2310  2311  2312  2313
  2314  2315  2316  2317  2318  2319  2320  2321  2322  2323  2324  2326  2447  2448  2449  2450  2451  2452  2453  2454
  2455  2601  2602  2603  2604  2691  2692  2752  2790  2791  2792  2793  2794  2801  2802  2837  2838  2839  2840  2841
  2842  2843  2844  2845  2846  2850  2851  2888  2889  2947  2948  2949  3044  3045  3046  3047  3049  3050  3051  3052
  3053  3054  3057  3058  3059  3060  3061  3062  3063  3064  3065  3066  3067  3068  3069  3070  3071  3072  3073  3074
  3075  3076  3077  3078  3079  3080  3081  3082  3083  3084  3085  3086  3087  3088  3089  3090  3091  3092  3093  3094
  3095  3096  3097  3098  3099  3100  3102
Found 269 of 3102 columns with poor signal
Found 266 of 3102 columns with poor signal
WARNING: removing columns having less than 135.178 counts:
     1   123   124   125   126   127   128   129   130   131   132   133   134   135   136   137   138   139   140   141
   142   143   145   340   341   342   343   492   584   585   691   742   882   930   931   932   953  1124  1294  1295
  1296  1297  1403  1440  1441  1541  1563  1564  1565  1566  1567  1571  1572  1581  1582  1585  1586  1587  1588  1589
  1590  1591  1592  1593  1594  1595  1596  1597  1598  1599  1600  1601  1602  1603  1604  1605  1606  1607  1608  1609
  1680  1721  1722  1866  1867  1868  1950  1986  1987  2085  2086  2087  2088  2089  2090  2091  2092  2093  2094  2095
  2096  2097  2098  2099  2100  2101  2102  2199  2200  2201  2202  2203  2204  2205  2206  2207  2208  2209  2210  2211
  2212  2213  2214  2215  2216  2217  2218  2307  2308  2309  2310  2311  2312  2313  2314  2315  2316  2317  2318  2319
  2320  2321  2322  2323  2324  2325  2326  2327  2329  2444  2446  2447  2448  2449  2450  2451  2452  2453  2454  2455
  2500  2524  2525  2526  2584  2601  2602  2603  2604  2605  2665  2666  2691  2692  2751  2752  2789  2790  2791  2792
  2793  2794  2795  2796  2797  2801  2802  2837  2838  2839  2840  2841  2842  2843  2844  2845  2846  2847  2850  2851
  2852  2888  2889  2890  2946  2947  2948  2949  3044  3045  3046  3047  3048  3049  3050  3051  3052  3053  3054  3055
  3057  3058  3059  3060  3061  3062  3063  3064  3065  3066  3067  3068  3069  3070  3071  3072  3073  3074  3075  3076
  3077  3078  3079  3080  3081  3082  3083  3084  3085  3086  3087  3088  3089  3090  3091  3092  3093  3094  3095  3096
  3097  3098  3099  3100  3101  3102

In [6]:
print len(hic_data_1)
print len(hic_data_1.bads)
print len(hic_data_2)
print len(hic_data_2.bads)


3102
269
3102
266

Normalization (ICE)


In [7]:
hic_data_1.normalize_hic(iterations=10, max_dev=0.1)
hic_data_2.normalize_hic(iterations=10, max_dev=0.1)


iterative correction
  - copying matrix
  - computing baises
           925.000        6068.627       36651.000    0   5.03942
          2590.992        6890.690       22954.353    1   2.33121
          2565.382        7406.752       11323.117    2   0.65364
          5564.614        7714.378       17646.667    3   1.28750
          3907.979        7920.873        9984.402    4   0.50662
          6575.573        8053.562       14122.857    5   0.75362
          5078.471        8144.699        9498.490    6   0.37647
          7182.960        8205.508       11988.885    7   0.46108
          6012.410        8247.864        9191.391    8   0.27103
          7526.569        8276.798       10683.760    9   0.29081
          6714.214        8297.160        8971.088   10   0.19078
rescaling to factor 1
  - getting the sum of the matrix
    => 2837.829
  - rescaling biases
iterative correction
  - copying matrix
  - computing baises
           138.000         790.629        4403.000    0   4.56898
           399.372         861.843        2175.062    1   1.52373
           436.749         893.671        1220.721    2   0.51129
           739.963         908.526        1522.920    3   0.67625
           613.184         916.395        1058.900    4   0.33087
           823.897         920.596        1246.005    5   0.35348
           728.392         923.008         999.354    6   0.21085
           870.724         924.391        1109.283    7   0.20001
           801.121         925.223         966.047    8   0.13413
           896.220         925.721        1036.769    9   0.11996
           846.201         926.030         949.790   10   0.08620
rescaling to factor 1
  - getting the sum of the matrix
    => 2836.582
  - rescaling biases

In [9]:
from pytadbit.mapping.analyze import hic_map

hic_map(hic_data_1, normalized=True, focus='chr18', show=True)
hic_map(hic_data_2, normalized=True, focus='chr18', show=True)


Save biases and bin filtering


In [11]:
from cPickle import dump

In [12]:
! mkdir -p results/$r_enz_1/04_normalizing
! mkdir -p results/$r_enz_2/04_normalizing

In [13]:
out = open('results/{1}/04_normalizing/biases_{0}_{1}.pick'.format(reso, r_enz_1), 'w')
dump(hic_data_1.bias, out)
out.close()
out = open('results/{1}/04_normalizing/biases_{0}_{1}.pick'.format(reso, r_enz_2), 'w')
dump(hic_data_2.bias, out)
out.close()

Dryhic normalization


In [15]:
! dryhic3.r results/HindIII/03_filtering/valid_reads12_HindIII.tsv \
       results/HindIII/04_normalizing/bad_columns_1000000_HindIII.pick \
       HindIII hg38 1000000 results/HindIII/04_normalizing/biases_dryhic_1000000_HindIII.pick


Getting contacts
Read 8626149 rows and 2 (of 13) columns from 0.969 GB file in 00:00:05
Getting genomic features
Merging info
Getting bad bins
Traceback (most recent call last):
  File "/home/student/R/x86_64-pc-linux-gnu-library/3.2/dryhic/src/cpickle2tsv.py", line 10, in <module>
    dat = pickle.load(open(infile, "r"))
IOError: [Errno 2] No such file or directory: 'results/HindIII/04_normalizing/bad_columns_1000000_HindIII.pick'
Error in file(file, "rt") : cannot open the connection
Calls: read_cpickle -> read.delim -> read.table -> file
In addition: Warning message:
In file(file, "rt") :
  cannot open file '/home/student/tmp/RtmpGCk963/file261568fae774': No such file or directory
Execution halted

In [17]:
! dryhic3.r results/MboI/03_filtering/valid_reads12_MboI.tsv \
       results/MboI/04_normalizing/bad_columns_1000000_MboI.pick \
       MboI hg38 1000000 results/MboI/04_normalizing/biases_dryhic_1000000_MboI.pick


Getting contacts
Getting genomic features
Merging info
Getting bad bins
Traceback (most recent call last):
  File "/home/student/R/x86_64-pc-linux-gnu-library/3.2/dryhic/src/cpickle2tsv.py", line 10, in <module>
    dat = pickle.load(open(infile, "r"))
IOError: [Errno 2] No such file or directory: 'results/MboI/04_normalizing/bad_columns_1000000_MboI.pick'
Error in file(file, "rt") : cannot open the connection
Calls: read_cpickle -> read.delim -> read.table -> file
In addition: Warning message:
In file(file, "rt") :
  cannot open file '/home/student/tmp/Rtmply5P3G/file26c651f9c81b': No such file or directory
Execution halted

In [21]:
bias_dry_path = 'results/{1}/04_normalizing/biases_dryhic_{0}_{1}.tsv'

hic_data_1.bias = dict([(int(l.split()[0]), float(l.split()[1])) for l in open(bias_dry_path.format(reso, 'HindIII'))])
hic_data_2.bias = dict([(int(l.split()[0]), float(l.split()[1])) for l in open(bias_dry_path.format(reso, 'MboI'))])


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-21-d8774fef90a6> in <module>()
      1 bias_dry_path = 'results/{1}/04_normalizing/biases_dryhic_{0}_{1}.tsv'
      2 
----> 3 hic_data_1.bias = dict([(int(l.split()[0]), float(l.split()[1])) for l in open(bias_dry_path.format(reso, 'HindIII'))])
      4 hic_data_2.bias = dict([(int(l.split()[0]), float(l.split()[1])) for l in open(bias_dry_path.format(reso, 'MboI'))])

IOError: [Errno 2] No such file or directory: 'results/HindIII/04_normalizing/biases_dryhic_1000000_HindIII.tsv'

In [22]:
hic_map(hic_data_1, normalized=True, focus='chr18', show=True)
hic_map(hic_data_2, normalized=True, focus='chr18', show=True)



In [ ]: