%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir('/Users/evanbiederstedt/Downloads/RRBS_data_files')

import statsmodels.api as sm

In [3]:
"""

CD19cell_regions.csv
cw154_regions.csv
Normal_B_regions.csv
trito_regions.csv
pcell_regions.csv
mcell_regions.csv

"""


Out[3]:
'\n\nCD19cell_regions.csv\ncw154_regions.csv\nNormal_B_regions.csv\ntrito_regions.csv\npcell_regions.csv\nmcell_regions.csv\n\n'

In [4]:
trito = pd.read_csv("trito_regions.csv")
normal = pd.read_csv("Normal_B_regions.csv")
pcell = pd.read_csv("pcell_regions.csv")
mcell = pd.read_csv("mcell_regions.csv")
cw154 = pd.read_csv("cw154_regions.csv")
cd19cell = pd.read_csv("CD19cell_regions.csv")

In [5]:
print(trito.shape)
print(normal.shape)   # remove 2cell files
print(pcell.shape)
print(mcell.shape)
print(cw154.shape)
print(cd19cell.shape)


(44, 39)
(136, 39)
(90, 39)
(88, 39)
(66, 39)
(89, 39)

In [6]:
trito["filename"] = trito["filename"].str[:33]

In [7]:
trito.head()


Out[7]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.0 0.570721 0.404589 0.594704 0.146371 0.169386 0.199815 0.0 0.199815 0.342986 0.444522 0.680480 0.305206 0.145730 0.254209 0.877142 0.820339 0.607671 0.803716 0.0 0.343014 0.371890 0.339596 0.347748 0.388709 0.385297 0.0 0.385297 0.401254 0.509555 0.386285 0.324236 0.367271 0.354046 0.240359 0.386809 0.377907 0.359143
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.0 0.545781 0.383371 0.568638 0.141545 0.161519 0.191404 0.0 0.191404 0.326140 0.589834 0.670559 0.290196 0.140779 0.240221 0.809942 0.816166 0.573089 0.795932 0.0 0.348110 0.381251 0.341950 0.349891 0.398898 0.415058 0.0 0.415058 0.408417 0.548192 0.382172 0.332749 0.373615 0.359217 0.364148 0.391925 0.386808 0.354120
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.0 0.564547 0.401760 0.588136 0.148529 0.174413 0.209041 0.0 0.209041 0.346473 0.553062 0.696068 0.296809 0.148360 0.255392 0.795883 0.832812 0.609544 0.812564 0.0 0.338412 0.371890 0.332321 0.351391 0.393829 0.392313 0.0 0.392313 0.412311 0.471703 0.378630 0.327488 0.370494 0.338321 0.334783 0.378580 0.378799 0.353949
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG 0.0 0.567309 0.399934 0.592890 0.143897 0.168936 0.200661 0.0 0.200661 0.342257 0.665920 0.661426 0.308680 0.141673 0.242236 0.787966 0.824659 0.602995 0.799836 0.0 0.342724 0.374419 0.337654 0.346109 0.389718 0.399153 0.0 0.399153 0.405627 0.359189 0.391002 0.324431 0.360431 0.343730 0.304035 0.380413 0.373345 0.347372
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.0 0.529224 0.367743 0.555131 0.136090 0.156827 0.175426 0.0 0.175426 0.307402 0.479145 0.644411 0.273473 0.134137 0.220729 0.815944 0.808981 0.575050 0.788587 0.0 0.349254 0.376307 0.342617 0.343348 0.388623 0.403861 0.0 0.403861 0.390288 0.471324 0.392438 0.332882 0.358450 0.319824 0.401641 0.398275 0.373236 0.363320

In [8]:
normal["filename"] = normal["filename"].str[:40]

In [9]:
normal.tail()


Out[9]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf
131 RRBS_normal_B_cell_H1_22_TAGGCATG.GTGAGG 0.0 0.476010 0.319477 0.501080 0.112156 0.125920 0.140172 0.0 0.140172 0.266191 0.520634 0.568097 0.250866 0.108529 0.199670 0.743346 0.709026 0.512304 0.723009 0.0 0.413186 0.417787 0.411922 0.371247 0.397676 0.361732 0.0 0.361732 0.449309 0.538398 0.431550 0.378186 0.390782 0.390614 0.433456 0.526395 0.438160 0.468283
132 RRBS_normal_B_cell_H1_22_TAGGCATG.GTTGAG 0.0 0.561826 0.421027 0.581238 0.194894 0.241158 0.227289 0.0 0.227289 0.355498 0.652030 0.675360 0.320042 0.195983 0.239473 0.848333 0.775009 0.603093 0.765059 0.0 0.378380 0.397910 0.375394 0.389046 0.412999 0.390730 0.0 0.390730 0.430454 0.628598 0.406830 0.371516 0.395340 0.389081 0.268990 0.455415 0.396795 0.385031
133 RRBS_normal_B_cell_H1_22_TAGGCATG.TAGCGG 0.0 0.403834 0.204255 0.435666 0.094118 0.093855 0.072948 0.0 0.072948 0.253112 0.333333 0.591837 0.223590 0.096000 0.223404 0.818182 0.820471 0.605528 0.801020 0.0 0.320128 0.375319 0.314522 0.375959 0.383838 0.258359 0.0 0.258359 0.396266 1.000000 0.346939 0.411282 0.409333 0.386525 0.818182 0.311942 0.381910 0.306122
134 RRBS_normal_B_cell_H1_22_TAGGCATG.TATCTC 0.0 0.601704 0.464233 0.621206 0.137723 0.148328 0.140719 0.0 0.140719 0.266187 0.400000 0.706186 0.334928 0.135086 0.218656 1.000000 0.878793 0.656104 0.840000 0.0 0.286919 0.323835 0.287354 0.384461 0.407314 0.347305 0.0 0.347305 0.350719 0.000000 0.368557 0.362440 0.391198 0.270812 0.000000 0.243974 0.306151 0.357500
135 RRBS_normal_B_cell_H1_22_TAGGCATG.TCTCTG 0.0 0.560233 0.366567 0.591090 0.120401 0.128137 0.182218 0.0 0.182218 0.329330 0.558873 0.703213 0.278496 0.111726 0.241188 0.930269 0.887614 0.628778 0.865630 0.0 0.276382 0.338157 0.268336 0.360595 0.380319 0.401676 0.0 0.401676 0.369472 0.293523 0.278300 0.314054 0.366833 0.346789 0.255155 0.244022 0.313503 0.248764

In [10]:
pcell["protocol"] = pcell["filename"].str[:31]

In [11]:
pcell["filename"][pcell["protocol"]=='RRBS_NormalBCD19pCD27pcell1_22_'] = pcell["filename"].str[:46]
pcell["filename"][pcell["protocol"]=='RRBS_NormalBCD19pCD27pcell23_44'] = pcell["filename"].str[:47]
pcell["filename"][pcell["protocol"]=='RRBS_NormalBCD19pCD27pcell45_66'] = pcell["filename"].str[:47]
pcell["filename"][pcell["protocol"]=='RRBS_NormalBCD19pCD27pcell67_88'] = pcell["filename"].str[:47]


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [12]:
pcell.tail()


Out[12]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf protocol
85 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.GTTGAG 0.0 0.488591 0.320965 0.515461 0.090361 0.105259 0.121025 0.0 0.121025 0.263410 0.516635 0.618186 0.232372 0.087691 0.179349 0.855401 0.786055 0.541282 0.773497 0.0 0.281265 0.250894 0.285832 0.152622 0.193308 0.192678 0.0 0.192678 0.276641 0.645913 0.369431 0.183830 0.159538 0.183626 0.488850 0.429191 0.356704 0.371891 RRBS_NormalBCD19pCD27pcell67_88
86 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TAGCGG 0.0 0.482284 0.313186 0.509034 0.085526 0.094721 0.126048 0.0 0.126048 0.258379 0.331160 0.603295 0.229917 0.074569 0.173552 0.891344 0.781317 0.519032 0.765147 0.0 0.275113 0.244301 0.281934 0.159508 0.186889 0.213556 0.0 0.213556 0.284529 0.364407 0.374666 0.194492 0.150189 0.202991 0.220994 0.416715 0.372722 0.386118 RRBS_NormalBCD19pCD27pcell67_88
87 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TATCTC 0.0 0.486434 0.337771 0.509877 0.104169 0.131287 0.145222 0.0 0.145222 0.275499 0.528938 0.594743 0.239506 0.108890 0.177779 0.812648 0.734134 0.536591 0.741314 0.0 0.317291 0.273234 0.322569 0.168070 0.214958 0.197274 0.0 0.197274 0.298599 0.433302 0.385105 0.203708 0.181556 0.193979 0.399823 0.492227 0.388837 0.420926 RRBS_NormalBCD19pCD27pcell67_88
88 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TCTCTG 0.0 0.504985 0.354574 0.527785 0.126590 0.162859 0.148021 0.0 0.148021 0.298728 0.675342 0.654135 0.247424 0.123843 0.198148 0.793358 0.753129 0.541053 0.740954 0.0 0.308032 0.268731 0.311831 0.174724 0.213951 0.206692 0.0 0.206692 0.310745 0.335160 0.390648 0.196147 0.168175 0.209597 0.402829 0.469717 0.383694 0.375906 RRBS_NormalBCD19pCD27pcell67_88
89 RRBS_NormalBCD19pCD27pcell67_88_GCTACGCT.TGCTGC 0.0 0.461103 0.285016 0.490647 0.061652 0.073687 0.091623 0.0 0.091623 0.224748 0.583333 0.622024 0.196912 0.058647 0.151447 1.000000 0.882502 0.575498 0.874932 0.0 0.166791 0.159804 0.165718 0.099319 0.130943 0.126309 0.0 0.126309 0.179091 0.083333 0.245536 0.094161 0.100645 0.120579 0.000000 0.205599 0.257229 0.231186 RRBS_NormalBCD19pCD27pcell67_88

In [13]:
mcell["protocol"] = mcell["filename"].str[:31]

In [14]:
mcell["filename"][mcell["protocol"]=='RRBS_NormalBCD19pCD27mcell1_22_'] = mcell["filename"].str[:46]
mcell["filename"][mcell["protocol"]=='RRBS_NormalBCD19pCD27mcell23_44'] = mcell["filename"].str[:47]
mcell["filename"][mcell["protocol"]=='RRBS_NormalBCD19pCD27mcell45_66'] = mcell["filename"].str[:47]
mcell["filename"][mcell["protocol"]=='RRBS_NormalBCD19pCD27mcell67_88'] = mcell["filename"].str[:47]


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [15]:
mcell.tail()


Out[15]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf protocol
83 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.GTGAGG 0.0 0.472029 0.279894 0.507985 0.062184 0.068979 0.099081 0.0 0.099081 0.254673 0.414678 0.646722 0.202133 0.052964 0.180330 0.825413 0.880407 0.567798 0.868663 0.0 0.162036 0.157993 0.162275 0.105846 0.134560 0.144832 0.0 0.144832 0.210529 0.343079 0.229286 0.102900 0.115445 0.160677 0.270661 0.208397 0.252791 0.195733 RRBS_NormalBCD19pCD27mcell67_88
84 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.GTTGAG 0.0 0.475509 0.284108 0.507053 0.067481 0.074098 0.107121 0.0 0.107121 0.252745 0.376858 0.661143 0.200471 0.058478 0.178968 0.852344 0.884411 0.593522 0.879363 0.0 0.157373 0.152483 0.156989 0.105841 0.133327 0.140077 0.0 0.140077 0.195673 0.159236 0.214858 0.102166 0.103687 0.136311 0.139063 0.208999 0.275148 0.201310 RRBS_NormalBCD19pCD27mcell67_88
85 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.TAGCGG 0.0 0.465384 0.270854 0.497950 0.070138 0.075311 0.113918 0.0 0.113918 0.265460 0.515006 0.626513 0.197863 0.054605 0.191544 0.873469 0.874056 0.568923 0.844443 0.0 0.165017 0.151922 0.166105 0.117741 0.146015 0.161420 0.0 0.161420 0.211532 0.650660 0.257800 0.104910 0.104068 0.157382 0.273469 0.210667 0.270156 0.202981 RRBS_NormalBCD19pCD27mcell67_88
86 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.TATCTC 0.0 0.475148 0.283056 0.509076 0.067527 0.076951 0.117951 0.0 0.117951 0.258988 0.298264 0.644925 0.206166 0.057636 0.178186 0.948181 0.881155 0.586620 0.867445 0.0 0.160243 0.152014 0.161938 0.115076 0.146481 0.159352 0.0 0.159352 0.204830 0.377937 0.193244 0.110812 0.107856 0.135752 0.152150 0.215250 0.266874 0.199644 RRBS_NormalBCD19pCD27mcell67_88
87 RRBS_NormalBCD19pCD27mcell67_88_CGTACTAG.TCTCTG 0.0 0.428225 0.251018 0.461970 0.061715 0.068326 0.092568 0.0 0.092568 0.211492 0.450980 0.630726 0.171360 0.056249 0.140668 0.892283 0.877888 0.557980 0.854488 0.0 0.159956 0.156530 0.159670 0.108323 0.135232 0.156855 0.0 0.156855 0.199040 0.287582 0.263901 0.103231 0.117227 0.144726 0.075563 0.210384 0.254628 0.219492 RRBS_NormalBCD19pCD27mcell67_88

In [16]:
len("RRBS_NormalBCD19pcell1_22_")


Out[16]:
26

In [17]:
cd19cell["protocol"] = cd19cell["filename"].str[:26]

In [18]:
len('RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC')


Out[18]:
41

In [19]:
cd19cell["filename"][cd19cell["protocol"]=='RRBS_NormalBCD19pcell1_22_'] = cd19cell["filename"].str[:41]
cd19cell["filename"][cd19cell["protocol"]=='RRBS_NormalBCD19pcell23_44'] = cd19cell["filename"].str[:42]
cd19cell["filename"][cd19cell["protocol"]=='RRBS_NormalBCD19pcell45_66'] = cd19cell["filename"].str[:42]
cd19cell["filename"][cd19cell["protocol"]=='RRBS_NormalBCD19pcell67_88'] = cd19cell["filename"].str[:42]


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [20]:
cd19cell.tail()


Out[20]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf protocol
84 RRBS_NormalBCD19pcell67_88_TCCTGAGC.GTTGAG 0.0 0.365003 0.198154 0.403464 0.045692 0.049036 0.067145 0.0 0.067145 0.175783 0.170213 0.603492 0.133213 0.039203 0.124610 0.990164 0.857586 0.529461 0.830424 0.0 0.151630 0.135355 0.155304 0.086338 0.115382 0.124663 0.0 0.124663 0.173466 0.045593 0.233981 0.088479 0.087266 0.128772 0.000000 0.238726 0.295577 0.208543 RRBS_NormalBCD19pcell67_88
85 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TAGCGG 0.0 0.427826 0.249504 0.457398 0.062149 0.072737 0.087771 0.0 0.087771 0.207407 0.428877 0.564334 0.176146 0.050720 0.137381 0.662519 0.804080 0.516266 0.779332 0.0 0.224285 0.181686 0.230307 0.109675 0.134092 0.141826 0.0 0.141826 0.200233 0.200000 0.300089 0.122736 0.101102 0.129729 0.395023 0.371977 0.332454 0.371777 RRBS_NormalBCD19pcell67_88
86 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TATCTC 0.0 0.424818 0.246150 0.456708 0.058898 0.070105 0.105839 0.0 0.105839 0.221580 0.363380 0.632963 0.176894 0.050543 0.149377 0.885320 0.825866 0.539768 0.819517 0.0 0.195755 0.165491 0.200067 0.101588 0.130197 0.151039 0.0 0.151039 0.215342 0.302535 0.281348 0.112925 0.096018 0.148490 0.410017 0.335954 0.310619 0.307484 RRBS_NormalBCD19pcell67_88
87 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TCTCTG 0.0 0.472135 0.286358 0.503827 0.066952 0.077275 0.107918 0.0 0.107918 0.244037 0.413347 0.619841 0.199015 0.058952 0.168218 0.894972 0.879557 0.591964 0.864421 0.0 0.157952 0.150639 0.158441 0.108958 0.132628 0.156623 0.0 0.156623 0.199528 0.254980 0.243718 0.096941 0.102134 0.130035 0.138547 0.206872 0.276050 0.196075 RRBS_NormalBCD19pcell67_88
88 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TGCTGC 0.0 0.386120 0.188577 0.424230 0.057304 0.069063 0.110272 0.0 0.110272 0.226848 0.000000 0.601695 0.166264 0.051560 0.174515 0.933333 0.866931 0.560803 0.818182 0.0 0.167532 0.149592 0.180484 0.103296 0.120248 0.193353 0.0 0.193353 0.230246 0.000000 0.152542 0.106332 0.105665 0.193906 0.466667 0.299404 0.284534 0.258182 RRBS_NormalBCD19pcell67_88

In [21]:
len("RRBS_cw154_Tris_protease_GR")


Out[21]:
27

In [22]:
cw154["protocol"] = cw154["filename"].str[:27]

In [23]:
cw154.head()  # RRBS_cw154_CutSmart_protein   # RRBS_cw154_Tris_protease_CT   # RRBS_cw154_Tris_protease_GR


Out[23]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf protocol
0 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAA... 0.0 0.557445 0.387638 0.583685 0.148984 0.173551 0.197546 0.0 0.197546 0.330378 0.534917 0.659609 0.300969 0.142909 0.246735 0.805699 0.820512 0.590658 0.806973 0.0 0.370149 0.420504 0.359511 0.408136 0.450911 0.436709 0.0 0.436709 0.446102 0.393463 0.409348 0.376109 0.438725 0.388377 0.397393 0.379858 0.378600 0.345222 RRBS_cw154_CutSmart_protein
1 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCG... 0.0 0.495537 0.324467 0.521993 0.140566 0.157658 0.175575 0.0 0.175575 0.296219 0.366917 0.587535 0.261936 0.137202 0.237040 0.690698 0.796567 0.530634 0.780923 0.0 0.388093 0.435418 0.383256 0.418755 0.458484 0.442186 0.0 0.442186 0.440632 0.840602 0.376294 0.386659 0.445932 0.394716 0.346512 0.418598 0.404386 0.362456 RRBS_cw154_CutSmart_protein
2 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGT... 0.0 0.520409 0.357209 0.544602 0.138190 0.161856 0.202110 0.0 0.202110 0.314849 0.463004 0.656101 0.275719 0.138640 0.225783 0.761359 0.792970 0.551861 0.785731 0.0 0.383971 0.430167 0.374988 0.410647 0.455464 0.453277 0.0 0.453277 0.450694 0.324865 0.443011 0.387172 0.439933 0.414011 0.280260 0.411414 0.389000 0.363575 RRBS_cw154_CutSmart_protein
3 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTC... 0.0 0.569906 0.398614 0.595372 0.158028 0.178889 0.196911 0.0 0.196911 0.340942 0.608100 0.665446 0.310607 0.154927 0.247978 0.867355 0.831256 0.609382 0.812445 0.0 0.363173 0.416487 0.354378 0.421272 0.454977 0.438265 0.0 0.438265 0.452257 0.477332 0.376957 0.382624 0.449764 0.400321 0.171406 0.369736 0.385435 0.345905 RRBS_cw154_CutSmart_protein
4 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGA... 0.0 0.554293 0.390193 0.576205 0.148946 0.173784 0.194668 0.0 0.194668 0.339245 0.494894 0.652036 0.291318 0.145185 0.250083 0.894932 0.825017 0.592558 0.807747 0.0 0.365658 0.414198 0.357211 0.407138 0.447926 0.433596 0.0 0.433596 0.428716 0.406863 0.397655 0.376272 0.425466 0.383864 0.274772 0.382212 0.378352 0.346732 RRBS_cw154_CutSmart_protein

In [24]:
cw154["filename"][cw154["protocol"] == "RRBS_cw154_CutSmart_protein"] = cw154["filename"].str[:48]
cw154["filename"][cw154["protocol"] == "RRBS_cw154_Tris_protease_CT"] = cw154["filename"].str[:40]
cw154["filename"][cw154["protocol"] == "RRBS_cw154_Tris_protease_GR"] = cw154["filename"].str[:43]


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [25]:
files = [trito, normal, pcell, mcell, cw154, cd19cell]

In [26]:
total_region_files = pd.concat([trito, normal, pcell, mcell, cw154, cd19cell])

In [27]:
total_region_files.shape


Out[27]:
(513, 40)

In [28]:
total_region_files = total_region_files[["filename", "methylation_tssDistance","methylation_genesDistance","methylation_exonsDistance",
                 "methylation_intronsDistance", "methylation_promoterDistance","methylation_cgiDistance",
                 "methylation_ctcfDistance","methylation_ctcfUpDistance","methylation_ctcfDownDistance",
                 "methylation_geneDistalRegulatoryModulesDistance","methylation_vistaEnhancersDistance",
                 "methylation_3PrimeUTRDistance","methylation_5PrimeUTRDistance",
                 "methylation_firstExonDistance","methylation_geneDistalRegulatoryModulesK562Distance",
                 "methylation_hypoInHues64Distance","methylation_intergenic",
                 "methylation_shore","methylation_shelf","PDR_tssDistance",
                 "PDR_genesDistance","PDR_exonsDistance","PDR_intronsDistance", "PDR_promoterDistance",
                 "PDR_cgiDistance","PDR_ctcfDistance","PDR_ctcfUpDistance","PDR_ctcfDownDistance",
                 "PDR_geneDistalRegulatoryModulesDistance","PDR_vistaEnhancersDistance","PDR_3PrimeUTRDistance",
                 "PDR_5PrimeUTRDistance","PDR_firstExonDistance","PDR_geneDistalRegulatoryModulesK562Distance",
                 "PDR_hypoInHues64Distance","PDR_intergenic","PDR_shore","PDR_shelf"]]

In [29]:
total_region_files = total_region_files.reset_index(drop=True)

In [30]:
total_region_files[:40]


Out[30]:
filename methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf
0 RRBS_trito_pool_1_TAAGGCGA.ACAACC 0.0 0.570721 0.404589 0.594704 0.146371 0.169386 0.199815 0.0 0.199815 0.342986 0.444522 0.680480 0.305206 0.145730 0.254209 0.877142 0.820339 0.607671 0.803716 0.0 0.343014 0.371890 0.339596 0.347748 0.388709 0.385297 0.0 0.385297 0.401254 0.509555 0.386285 0.324236 0.367271 0.354046 0.240359 0.386809 0.377907 0.359143
1 RRBS_trito_pool_1_TAAGGCGA.ACGTGG 0.0 0.545781 0.383371 0.568638 0.141545 0.161519 0.191404 0.0 0.191404 0.326140 0.589834 0.670559 0.290196 0.140779 0.240221 0.809942 0.816166 0.573089 0.795932 0.0 0.348110 0.381251 0.341950 0.349891 0.398898 0.415058 0.0 0.415058 0.408417 0.548192 0.382172 0.332749 0.373615 0.359217 0.364148 0.391925 0.386808 0.354120
2 RRBS_trito_pool_1_TAAGGCGA.ACTCAC 0.0 0.564547 0.401760 0.588136 0.148529 0.174413 0.209041 0.0 0.209041 0.346473 0.553062 0.696068 0.296809 0.148360 0.255392 0.795883 0.832812 0.609544 0.812564 0.0 0.338412 0.371890 0.332321 0.351391 0.393829 0.392313 0.0 0.392313 0.412311 0.471703 0.378630 0.327488 0.370494 0.338321 0.334783 0.378580 0.378799 0.353949
3 RRBS_trito_pool_1_TAAGGCGA.AGGATG 0.0 0.567309 0.399934 0.592890 0.143897 0.168936 0.200661 0.0 0.200661 0.342257 0.665920 0.661426 0.308680 0.141673 0.242236 0.787966 0.824659 0.602995 0.799836 0.0 0.342724 0.374419 0.337654 0.346109 0.389718 0.399153 0.0 0.399153 0.405627 0.359189 0.391002 0.324431 0.360431 0.343730 0.304035 0.380413 0.373345 0.347372
4 RRBS_trito_pool_1_TAAGGCGA.ATAGCG 0.0 0.529224 0.367743 0.555131 0.136090 0.156827 0.175426 0.0 0.175426 0.307402 0.479145 0.644411 0.273473 0.134137 0.220729 0.815944 0.808981 0.575050 0.788587 0.0 0.349254 0.376307 0.342617 0.343348 0.388623 0.403861 0.0 0.403861 0.390288 0.471324 0.392438 0.332882 0.358450 0.319824 0.401641 0.398275 0.373236 0.363320
5 RRBS_trito_pool_1_TAAGGCGA.ATCGAC 0.0 0.566031 0.393281 0.591518 0.145246 0.162973 0.199969 0.0 0.199969 0.333682 0.490492 0.667285 0.304792 0.138075 0.248255 0.853356 0.817065 0.601387 0.800064 0.0 0.343104 0.371148 0.338015 0.350522 0.390895 0.405243 0.0 0.405243 0.397617 0.586487 0.367571 0.325332 0.362838 0.333114 0.328968 0.389536 0.375595 0.350909
6 RRBS_trito_pool_1_TAAGGCGA.CAAGAG 0.0 0.566742 0.402345 0.590378 0.152305 0.173906 0.195518 0.0 0.195518 0.346568 0.533040 0.656980 0.304390 0.149536 0.253048 0.805577 0.816897 0.603907 0.803092 0.0 0.350942 0.386671 0.344248 0.364405 0.406190 0.400039 0.0 0.400039 0.421197 0.545560 0.392756 0.338890 0.381441 0.356823 0.307240 0.394851 0.384458 0.357089
7 RRBS_trito_pool_1_TAAGGCGA.CATGAC 0.0 0.566995 0.407400 0.589923 0.148278 0.174522 0.209616 0.0 0.209616 0.337079 0.476748 0.678752 0.300825 0.146025 0.243253 0.863080 0.824824 0.602455 0.801551 0.0 0.345160 0.379410 0.338777 0.353197 0.399885 0.399392 0.0 0.399392 0.398004 0.482805 0.394881 0.334431 0.375372 0.332435 0.345932 0.381070 0.373821 0.350244
8 RRBS_trito_pool_1_TAAGGCGA.CCTTCG 0.0 0.544137 0.377628 0.568732 0.136603 0.163191 0.191971 0.0 0.191971 0.312724 0.428419 0.644483 0.284865 0.139631 0.229110 0.812046 0.810739 0.567608 0.784795 0.0 0.345857 0.376641 0.339524 0.349919 0.393570 0.413810 0.0 0.413810 0.407238 0.334977 0.388436 0.324229 0.367053 0.354882 0.307155 0.397849 0.390108 0.346731
9 RRBS_trito_pool_1_TAAGGCGA.CGGTAG 0.0 0.540051 0.369671 0.565375 0.139633 0.163692 0.198751 0.0 0.198751 0.312778 0.531099 0.642033 0.284431 0.138257 0.223869 0.851775 0.809572 0.567265 0.792353 0.0 0.354063 0.391736 0.346245 0.357297 0.403435 0.414073 0.0 0.414073 0.409215 0.462336 0.404820 0.330088 0.374945 0.334288 0.296450 0.398979 0.390262 0.360787
10 RRBS_trito_pool_1_TAAGGCGA.CTATTG 0.0 0.579098 0.413628 0.602551 0.149996 0.174348 0.206507 0.0 0.206507 0.340917 0.479200 0.681367 0.314310 0.147705 0.250151 0.842639 0.825690 0.610104 0.808571 0.0 0.334826 0.363741 0.331545 0.340101 0.389592 0.384261 0.0 0.384261 0.399187 0.295595 0.361252 0.316965 0.356730 0.331182 0.275036 0.384923 0.370680 0.338867
11 RRBS_trito_pool_1_TAAGGCGA.GACACG 0.0 0.549829 0.379579 0.572968 0.142349 0.163858 0.194206 0.0 0.194206 0.320273 0.511181 0.628813 0.297707 0.137945 0.231191 0.913175 0.808944 0.575022 0.795743 0.0 0.347308 0.375613 0.343617 0.356474 0.398687 0.405318 0.0 0.405318 0.403924 0.414471 0.390100 0.335772 0.368552 0.331528 0.181846 0.396579 0.382919 0.352013
12 RRBS_trito_pool_1_TAAGGCGA.GCATTC 0.0 0.577097 0.411205 0.599875 0.149844 0.176529 0.209171 0.0 0.209171 0.340387 0.512575 0.667962 0.312442 0.150562 0.251066 0.868216 0.824084 0.608142 0.803492 0.0 0.340215 0.377824 0.332149 0.354279 0.402852 0.387265 0.0 0.387265 0.416357 0.373845 0.376155 0.331282 0.370933 0.363017 0.378312 0.390375 0.366399 0.364220
13 RRBS_trito_pool_1_TAAGGCGA.GCTGCC 0.0 0.544259 0.384439 0.564795 0.139736 0.162409 0.176248 0.0 0.176248 0.318136 0.574486 0.634002 0.289043 0.139641 0.223191 0.856793 0.800237 0.554718 0.776976 0.0 0.354565 0.388414 0.347581 0.360744 0.402607 0.390989 0.0 0.390989 0.431375 0.409462 0.385777 0.340581 0.375888 0.372911 0.350551 0.400626 0.404546 0.364278
14 RRBS_trito_pool_1_TAAGGCGA.GGCATC 0.0 0.555539 0.392705 0.579332 0.145652 0.164935 0.184172 0.0 0.184172 0.333482 0.421168 0.661779 0.296269 0.147555 0.241141 0.825601 0.813275 0.589382 0.790102 0.0 0.349269 0.378454 0.343167 0.352282 0.398483 0.383984 0.0 0.383984 0.394837 0.570345 0.396641 0.337457 0.375483 0.341387 0.228666 0.393968 0.384816 0.358044
15 RRBS_trito_pool_1_TAAGGCGA.GTGAGG 0.0 0.530569 0.367508 0.554714 0.134121 0.155524 0.173857 0.0 0.173857 0.315881 0.463581 0.615143 0.280369 0.133060 0.232378 0.794342 0.804363 0.553666 0.777878 0.0 0.358688 0.390493 0.351879 0.361701 0.404384 0.398420 0.0 0.398420 0.422339 0.427769 0.417731 0.341625 0.380159 0.367890 0.326313 0.400965 0.391422 0.373351
16 RRBS_trito_pool_1_TAAGGCGA.GTTGAG 0.0 0.557958 0.396493 0.580842 0.143581 0.165943 0.198223 0.0 0.198223 0.335718 0.511515 0.653486 0.294972 0.142042 0.251258 0.808442 0.821536 0.592957 0.795827 0.0 0.344146 0.371379 0.337804 0.354805 0.397635 0.404505 0.0 0.404505 0.420580 0.415804 0.359175 0.330021 0.365134 0.366449 0.413398 0.375629 0.381035 0.346115
17 RRBS_trito_pool_1_TAAGGCGA.TAGCGG 0.0 0.540878 0.368191 0.564637 0.133724 0.153977 0.166738 0.0 0.166738 0.307876 0.486900 0.632573 0.286827 0.128397 0.230826 0.856077 0.814219 0.577274 0.793247 0.0 0.346260 0.378683 0.339097 0.356597 0.396187 0.381571 0.0 0.381571 0.400928 0.570090 0.362619 0.339981 0.379723 0.344153 0.377799 0.385887 0.384042 0.349956
18 RRBS_trito_pool_1_TAAGGCGA.TATCTC 0.0 0.575676 0.409355 0.600599 0.148591 0.174386 0.194566 0.0 0.194566 0.336279 0.554679 0.662141 0.309579 0.142574 0.240360 0.844196 0.822712 0.607978 0.806099 0.0 0.341342 0.367808 0.337694 0.339362 0.386849 0.380734 0.0 0.380734 0.394130 0.514109 0.395076 0.319727 0.356904 0.318327 0.309309 0.390633 0.376339 0.360353
19 RRBS_trito_pool_1_TAAGGCGA.TCTCTG 0.0 0.573932 0.402771 0.597337 0.153348 0.175088 0.200765 0.0 0.200765 0.347808 0.610443 0.676611 0.311039 0.144939 0.256872 0.854733 0.826905 0.607012 0.802475 0.0 0.342377 0.370775 0.337035 0.356039 0.402495 0.412892 0.0 0.412892 0.408307 0.464521 0.389068 0.330017 0.366829 0.337809 0.241373 0.383198 0.378137 0.345602
20 RRBS_trito_pool_1_TAAGGCGA.TGACAG 0.0 0.567869 0.399077 0.591561 0.148038 0.176237 0.202154 0.0 0.202154 0.343355 0.522213 0.665558 0.303592 0.146534 0.252288 0.796484 0.830742 0.598186 0.806577 0.0 0.345410 0.381678 0.340699 0.363333 0.411799 0.398106 0.0 0.398106 0.406224 0.468663 0.352417 0.345042 0.391606 0.339766 0.370686 0.369564 0.369690 0.345702
21 RRBS_trito_pool_1_TAAGGCGA.TGCTGC 0.0 0.549810 0.381938 0.576183 0.143244 0.162055 0.182065 0.0 0.182065 0.322882 0.522536 0.636949 0.297776 0.142530 0.244198 0.808005 0.807068 0.566512 0.789342 0.0 0.361772 0.392851 0.356422 0.365182 0.413081 0.408349 0.0 0.408349 0.431775 0.422301 0.386494 0.344547 0.390997 0.368241 0.364905 0.392520 0.386162 0.373449
22 RRBS_trito_pool_2_CGTACTAG.ACAACC 0.0 0.571434 0.403317 0.595518 0.150534 0.172412 0.197820 0.0 0.197820 0.335046 0.542060 0.674691 0.311226 0.145333 0.251228 0.782911 0.825876 0.601882 0.804140 0.0 0.356830 0.396188 0.349903 0.388446 0.425970 0.423147 0.0 0.423147 0.414028 0.406933 0.353024 0.363997 0.410352 0.353075 0.404312 0.376888 0.385532 0.352623
23 RRBS_trito_pool_2_CGTACTAG.ACGTGG 0.0 0.547223 0.381861 0.572699 0.142991 0.167937 0.187875 0.0 0.187875 0.322376 0.494384 0.655945 0.290386 0.138317 0.228471 0.824135 0.809657 0.575763 0.782319 0.0 0.367896 0.410155 0.360025 0.392362 0.439690 0.423503 0.0 0.423503 0.424893 0.371724 0.396436 0.364995 0.415006 0.358457 0.266702 0.393868 0.396684 0.361810
24 RRBS_trito_pool_2_CGTACTAG.ACTCAC 0.0 0.574078 0.410369 0.599662 0.154807 0.180722 0.204649 0.0 0.204649 0.346719 0.484916 0.687740 0.314513 0.155080 0.260803 0.812211 0.828639 0.608617 0.812883 0.0 0.360783 0.413536 0.352271 0.396930 0.438502 0.437218 0.0 0.437218 0.434233 0.243128 0.391187 0.371005 0.424675 0.381663 0.401646 0.380932 0.388726 0.364671
25 RRBS_trito_pool_2_CGTACTAG.AGGATG 0.0 0.574464 0.411157 0.598317 0.153244 0.179052 0.205426 0.0 0.205426 0.347422 0.549476 0.674730 0.318179 0.150916 0.255366 0.825079 0.819016 0.600086 0.803518 0.0 0.361346 0.394711 0.356405 0.385541 0.430138 0.435397 0.0 0.435397 0.430261 0.509647 0.397243 0.354870 0.405106 0.376955 0.349346 0.390618 0.387689 0.355591
26 RRBS_trito_pool_2_CGTACTAG.ATAGCG 0.0 0.530317 0.372598 0.555500 0.139056 0.162002 0.188584 0.0 0.188584 0.314593 0.662600 0.651363 0.276445 0.139170 0.226507 0.782103 0.805115 0.577048 0.796431 0.0 0.368334 0.399350 0.363279 0.375962 0.421184 0.422536 0.0 0.422536 0.423652 0.422062 0.390546 0.352241 0.394821 0.362621 0.349869 0.400868 0.397587 0.375378
27 RRBS_trito_pool_2_CGTACTAG.ATCGAC 0.0 0.579706 0.417754 0.601652 0.154766 0.179131 0.214474 0.0 0.214474 0.347695 0.592746 0.693724 0.318722 0.152873 0.251772 0.829067 0.818502 0.603463 0.804070 0.0 0.365534 0.410172 0.359211 0.392262 0.438666 0.432635 0.0 0.432635 0.437134 0.550723 0.406531 0.367066 0.412751 0.381165 0.291126 0.388891 0.385826 0.368804
28 RRBS_trito_pool_2_CGTACTAG.CAAGAG 0.0 0.562102 0.395170 0.587195 0.146861 0.171793 0.193740 0.0 0.193740 0.336475 0.643228 0.644603 0.303090 0.145796 0.249175 0.867900 0.811712 0.594372 0.798705 0.0 0.371383 0.417443 0.364244 0.392824 0.439427 0.424017 0.0 0.424017 0.430895 0.419896 0.406159 0.358254 0.421774 0.375869 0.285465 0.405000 0.385932 0.366543
29 RRBS_trito_pool_2_CGTACTAG.CATGAC 0.0 0.572795 0.406324 0.595511 0.149580 0.173139 0.207084 0.0 0.207084 0.341760 0.582188 0.686507 0.311526 0.145188 0.255092 0.898922 0.822096 0.602356 0.802416 0.0 0.363315 0.403748 0.356727 0.385169 0.426971 0.428521 0.0 0.428521 0.437559 0.666602 0.399967 0.360502 0.402785 0.382317 0.213772 0.390465 0.395524 0.364513
30 RRBS_trito_pool_2_CGTACTAG.CCTTCG 0.0 0.544424 0.372864 0.566449 0.141793 0.161766 0.188699 0.0 0.188699 0.320217 0.520051 0.648292 0.287153 0.134590 0.231480 0.918757 0.808219 0.556881 0.786660 0.0 0.372677 0.409840 0.367423 0.386818 0.428182 0.410043 0.0 0.410043 0.437956 0.431314 0.417184 0.370701 0.403697 0.379120 0.132574 0.402506 0.408092 0.392411
31 RRBS_trito_pool_2_CGTACTAG.CGGTAG 0.0 0.541543 0.377491 0.566615 0.144500 0.166361 0.187541 0.0 0.187541 0.323718 0.570565 0.649134 0.290349 0.139695 0.231105 0.875553 0.805793 0.574939 0.781107 0.0 0.366749 0.406201 0.359577 0.382439 0.427216 0.412182 0.0 0.412182 0.424271 0.326364 0.387866 0.358136 0.400989 0.367082 0.233944 0.405535 0.387747 0.372620
32 RRBS_trito_pool_2_CGTACTAG.CTATTG 0.0 0.571516 0.403810 0.595260 0.147111 0.169791 0.199903 0.0 0.199903 0.345774 0.461720 0.665058 0.308691 0.144474 0.257436 0.833091 0.825356 0.606750 0.808127 0.0 0.352923 0.389431 0.346998 0.367228 0.410854 0.408465 0.0 0.408465 0.420192 0.435325 0.384775 0.344882 0.388064 0.354390 0.322671 0.384292 0.385328 0.353731
33 RRBS_trito_pool_2_CGTACTAG.GACACG 0.0 0.547611 0.384121 0.571736 0.141024 0.160152 0.183884 0.0 0.183884 0.324428 0.516251 0.658554 0.296090 0.142236 0.243037 0.840847 0.810208 0.580500 0.780899 0.0 0.362036 0.396972 0.355362 0.379360 0.423028 0.407819 0.0 0.407819 0.430720 0.413402 0.375936 0.353105 0.395058 0.377523 0.239142 0.397595 0.385070 0.369290
34 RRBS_trito_pool_2_CGTACTAG.GCATTC 0.0 0.574308 0.399926 0.599169 0.146626 0.171425 0.202222 0.0 0.202222 0.339589 0.601239 0.673941 0.311423 0.142003 0.251027 0.900174 0.822302 0.607857 0.804399 0.0 0.350723 0.388557 0.343231 0.379437 0.423172 0.413884 0.0 0.413884 0.430536 0.484843 0.385927 0.351172 0.395395 0.377211 0.260417 0.387313 0.379687 0.352217
35 RRBS_trito_pool_2_CGTACTAG.GCTGCC 0.0 0.557827 0.389318 0.583295 0.146131 0.170323 0.216651 0.0 0.216651 0.332222 0.465542 0.685189 0.303248 0.142679 0.253152 0.925723 0.812384 0.566898 0.789918 0.0 0.364344 0.411073 0.357567 0.386643 0.436352 0.455234 0.0 0.455234 0.427594 0.441989 0.410031 0.357034 0.414984 0.377580 0.122134 0.390103 0.391888 0.356613
36 RRBS_trito_pool_2_CGTACTAG.GGCATC 0.0 0.558443 0.393582 0.581411 0.149267 0.173254 0.202211 0.0 0.202211 0.336075 0.497985 0.658613 0.307422 0.145063 0.253500 0.822348 0.810850 0.579463 0.788819 0.0 0.369847 0.416267 0.362248 0.393373 0.438305 0.432106 0.0 0.432106 0.447392 0.283823 0.414114 0.369362 0.423329 0.390531 0.294508 0.403624 0.394708 0.370958
37 RRBS_trito_pool_2_CGTACTAG.GTGAGG 0.0 0.548763 0.378005 0.573823 0.146951 0.166034 0.189554 0.0 0.189554 0.329531 0.417063 0.656924 0.298588 0.143700 0.237655 0.837979 0.818187 0.578995 0.795407 0.0 0.369274 0.411458 0.362212 0.399234 0.439421 0.426244 0.0 0.426244 0.437661 0.387644 0.379763 0.378844 0.416184 0.393381 0.327526 0.390663 0.404334 0.359301
38 RRBS_trito_pool_2_CGTACTAG.GTTGAG 0.0 0.560388 0.392703 0.585744 0.148877 0.173148 0.191432 0.0 0.191432 0.337372 0.531839 0.657189 0.308189 0.144341 0.245608 0.856433 0.820191 0.594452 0.790093 0.0 0.357508 0.399570 0.348983 0.382424 0.428961 0.426739 0.0 0.426739 0.436677 0.623022 0.378906 0.357031 0.402640 0.380493 0.320648 0.385596 0.378942 0.359409
39 RRBS_trito_pool_2_CGTACTAG.TAGCGG 0.0 0.542448 0.379673 0.565934 0.141666 0.161357 0.190254 0.0 0.190254 0.322077 0.564060 0.661459 0.291782 0.141023 0.236705 0.775427 0.812801 0.584395 0.788413 0.0 0.373111 0.419487 0.363822 0.398524 0.443218 0.429929 0.0 0.429929 0.428916 0.508213 0.389595 0.371629 0.425177 0.379944 0.390322 0.401379 0.390136 0.365038

In [31]:
stats = pd.read_csv("RRBS_anno_statistics_full_446files_filter50K.csv")

In [32]:
stats.shape


Out[32]:
(446, 18)

In [33]:
stats_files = stats.filename

In [34]:
merged = stats.merge(total_region_files, on='filename')

In [35]:
merged = merged.drop(['thisMeth', 'mixedReadCount', 'total_reads', 'total_cpg_no_filter', 'total_cpg_gtrthan1',
       'total_cpg_gtrthan38', 'avgReadCpgs_nofilter','avgReadCpgs_lessthan1CpG', 'avgReadCpgs_gtreql3.8CpG', 'bsRate',], axis=1)

In [36]:
merged


Out[36]:
filename methylation PDR_total methylation_unweighted PDR_unweighted type bio protocol methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf
0 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACAACC 0.591346 0.259001 0.691996 0.254835 normal normal_B normal_B_cell_A1_24 0.0 0.572922 0.388607 0.597003 0.127235 0.135145 0.178152 0.0 0.178152 0.313712 0.354954 0.728744 0.282305 0.125011 0.230720 0.926937 0.902930 0.637052 0.896802 0.0 0.249665 0.320241 0.237896 0.363833 0.388255 0.419154 0.0 0.419154 0.364421 0.489309 0.214752 0.295134 0.379389 0.352525 0.020851 0.183311 0.285294 0.169240
1 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACCGCG 0.531169 0.411448 0.620106 0.390562 normal normal_B normal_B_cell_A1_24 0.0 0.505145 0.359230 0.526446 0.134970 0.171891 0.198475 0.0 0.198475 0.352037 0.516644 0.625871 0.266546 0.123009 0.233159 0.865658 0.777870 0.528428 0.750640 0.0 0.389613 0.436141 0.383879 0.426575 0.439299 0.424045 0.0 0.424045 0.468612 0.632490 0.413364 0.384389 0.446504 0.426731 0.016904 0.426067 0.413426 0.430125
2 RRBS_normal_B_cell_A1_24_TAAGGCGA.ACGTGG 0.586403 0.278568 0.699736 0.266418 normal normal_B normal_B_cell_A1_24 0.0 0.553568 0.359975 0.583731 0.117959 0.125268 0.176109 0.0 0.176109 0.309668 0.712070 0.718911 0.277957 0.101747 0.227526 0.942223 0.895722 0.618027 0.868715 0.0 0.276292 0.353180 0.264764 0.381896 0.403526 0.431742 0.0 0.431742 0.362307 0.148328 0.312959 0.323942 0.389617 0.349141 0.150021 0.199184 0.311731 0.206646
3 RRBS_normal_B_cell_A1_24_TAAGGCGA.AGGATG 0.628623 0.248006 0.732036 0.240201 normal normal_B normal_B_cell_A1_24 0.0 0.600840 0.392730 0.633467 0.130846 0.134532 0.228113 0.0 0.228113 0.344969 0.738883 0.721101 0.315410 0.110561 0.254307 0.955880 0.916222 0.672012 0.891286 0.0 0.242686 0.329575 0.226780 0.380465 0.395003 0.428452 0.0 0.428452 0.378543 0.293278 0.226409 0.306003 0.391967 0.380095 0.156554 0.167441 0.275408 0.184916
4 RRBS_normal_B_cell_A1_24_TAAGGCGA.ATAGCG 0.568354 0.434929 0.648127 0.425702 normal normal_B normal_B_cell_A1_24 0.0 0.553723 0.424441 0.572004 0.201759 0.254062 0.243405 0.0 0.243405 0.365090 0.703915 0.690610 0.324217 0.225215 0.250693 0.628902 0.744224 0.591154 0.764444 0.0 0.396151 0.413207 0.393634 0.414282 0.434310 0.404922 0.0 0.404922 0.454559 0.520165 0.398875 0.398097 0.431908 0.403162 0.649024 0.500065 0.432751 0.402732
5 RRBS_normal_B_cell_A1_24_TAAGGCGA.ATCGAC 0.622386 0.272543 0.716552 0.270967 normal normal_B normal_B_cell_A1_24 0.0 0.598241 0.400137 0.625556 0.135943 0.142543 0.197461 0.0 0.197461 0.328983 0.561384 0.724674 0.319542 0.123392 0.245854 0.957901 0.896857 0.631902 0.876983 0.0 0.263207 0.329618 0.253472 0.378263 0.392845 0.387199 0.0 0.387199 0.376833 0.406406 0.247637 0.313278 0.389120 0.356423 0.008145 0.210394 0.305067 0.196524
6 RRBS_normal_B_cell_A1_24_TAAGGCGA.CAAGAG 0.580746 0.358441 0.670718 0.348679 normal normal_B normal_B_cell_A1_24 0.0 0.557673 0.393037 0.581964 0.136496 0.154840 0.185085 0.0 0.185085 0.336342 0.548913 0.724639 0.289102 0.134083 0.242662 0.914303 0.839622 0.609142 0.828499 0.0 0.346284 0.400937 0.334616 0.403498 0.421695 0.446341 0.0 0.446341 0.415259 0.457161 0.375382 0.381060 0.431724 0.366805 0.323921 0.349326 0.361848 0.326459
7 RRBS_normal_B_cell_A1_24_TAAGGCGA.CATGAC 0.579873 0.374401 0.668592 0.364613 normal normal_B normal_B_cell_A1_24 0.0 0.555496 0.372118 0.585104 0.130767 0.149758 0.174434 0.0 0.174434 0.325693 0.489904 0.691227 0.303725 0.117398 0.252648 0.877326 0.830718 0.612466 0.819436 0.0 0.352702 0.397442 0.342892 0.398493 0.420841 0.398592 0.0 0.398592 0.433256 0.586659 0.322179 0.351247 0.405937 0.401367 0.420977 0.381797 0.388051 0.361306
8 RRBS_normal_B_cell_A1_24_TAAGGCGA.CGGTAG 0.580833 0.285978 0.701418 0.271634 normal normal_B normal_B_cell_A1_24 0.0 0.547152 0.343528 0.574055 0.117823 0.121103 0.155409 0.0 0.155409 0.304843 0.593398 0.648511 0.280638 0.096895 0.228697 0.986942 0.887179 0.600183 0.863762 0.0 0.280334 0.358519 0.267715 0.392784 0.396120 0.372254 0.0 0.372254 0.399106 0.174794 0.363332 0.337774 0.402650 0.366518 0.060128 0.231404 0.321895 0.210547
9 RRBS_normal_B_cell_A1_24_TAAGGCGA.CTATTG 0.582590 0.427069 0.650146 0.424804 normal normal_B normal_B_cell_A1_24 0.0 0.569780 0.422603 0.591459 0.171423 0.213337 0.222178 0.0 0.222178 0.366234 0.698103 0.685389 0.327695 0.170005 0.263775 0.864895 0.775154 0.610494 0.773103 0.0 0.393549 0.428448 0.389104 0.421122 0.442946 0.424529 0.0 0.424529 0.465571 0.501746 0.424570 0.388460 0.445857 0.430022 0.365360 0.478388 0.410114 0.419770
10 RRBS_normal_B_cell_A1_24_TAAGGCGA.CTCAGC 0.577931 0.441120 0.640678 0.433420 normal normal_B normal_B_cell_A1_24 0.0 0.565528 0.437396 0.585564 0.207370 0.274443 0.235145 0.0 0.235145 0.377706 0.636860 0.669217 0.337121 0.216485 0.247102 0.909352 0.737474 0.578117 0.735609 0.0 0.409112 0.426369 0.406486 0.421935 0.442577 0.428453 0.0 0.428453 0.502348 0.678498 0.373862 0.418496 0.439376 0.451782 0.296798 0.502111 0.430704 0.404902
11 RRBS_normal_B_cell_A1_24_TAAGGCGA.GACACG 0.603615 0.259780 0.705313 0.246132 normal normal_B normal_B_cell_A1_24 0.0 0.568850 0.361969 0.599111 0.122396 0.127940 0.159862 0.0 0.159862 0.320582 0.725028 0.694968 0.304324 0.117228 0.238581 0.971203 0.898648 0.616594 0.874494 0.0 0.259987 0.346939 0.247972 0.376738 0.384038 0.371028 0.0 0.371028 0.370863 0.041421 0.276435 0.305850 0.396597 0.374203 0.122058 0.193916 0.294751 0.178923
12 RRBS_normal_B_cell_A1_24_TAAGGCGA.GCTGCC 0.602191 0.274941 0.696495 0.261564 normal normal_B normal_B_cell_A1_24 0.0 0.569876 0.362369 0.602931 0.119385 0.123399 0.166912 0.0 0.166912 0.312508 0.483566 0.664175 0.295046 0.104887 0.227071 0.888816 0.894996 0.621557 0.873106 0.0 0.270732 0.350990 0.255018 0.379740 0.403785 0.393961 0.0 0.393961 0.374419 0.151892 0.287812 0.327504 0.390677 0.347520 0.373774 0.199908 0.307455 0.208747
13 RRBS_normal_B_cell_A1_24_TAAGGCGA.GGCATC 0.592896 0.309917 0.688324 0.293166 normal normal_B normal_B_cell_A1_24 0.0 0.568737 0.375639 0.597084 0.126210 0.134600 0.173529 0.0 0.173529 0.311749 0.609371 0.670949 0.303669 0.104746 0.230130 0.933383 0.874730 0.619164 0.858757 0.0 0.302826 0.370106 0.290653 0.398204 0.419242 0.411253 0.0 0.411253 0.405264 0.556019 0.334586 0.327526 0.392053 0.379147 0.134353 0.259203 0.328873 0.271502
14 RRBS_normal_B_cell_A1_24_TAAGGCGA.GTGAGG 0.576342 0.269746 0.692209 0.255067 normal normal_B normal_B_cell_A1_24 0.0 0.543913 0.345264 0.578228 0.119943 0.117022 0.186957 0.0 0.186957 0.304374 0.275972 0.686426 0.258585 0.103817 0.243962 0.973117 0.893765 0.601321 0.862469 0.0 0.269974 0.351114 0.255608 0.384957 0.398879 0.421579 0.0 0.421579 0.369955 0.324316 0.242306 0.326748 0.397716 0.360025 0.077235 0.189490 0.295756 0.182461
15 RRBS_normal_B_cell_A1_24_TAAGGCGA.GTTGAG 0.573082 0.434043 0.645717 0.421790 normal normal_B normal_B_cell_A1_24 0.0 0.567695 0.412468 0.587454 0.152377 0.169104 0.213248 0.0 0.213248 0.325752 0.478594 0.672947 0.300417 0.138325 0.267002 0.718321 0.755050 0.585532 0.768680 0.0 0.394381 0.413518 0.390813 0.422574 0.436592 0.393126 0.0 0.393126 0.423319 0.545367 0.442718 0.413598 0.426612 0.399205 0.439942 0.496714 0.422975 0.392376
16 RRBS_normal_B_cell_A1_24_TAAGGCGA.TAGCGG 0.563537 0.344400 0.671286 0.324550 normal normal_B normal_B_cell_A1_24 0.0 0.537509 0.353348 0.562371 0.118371 0.130877 0.170794 0.0 0.170794 0.303084 0.460163 0.660977 0.281453 0.108285 0.226112 0.777019 0.856849 0.597661 0.836136 0.0 0.334426 0.403578 0.325768 0.395605 0.424208 0.433552 0.0 0.433552 0.385425 0.504680 0.365191 0.353273 0.424782 0.364329 0.507417 0.312744 0.359828 0.303938
17 RRBS_normal_B_cell_A1_24_TAAGGCGA.TATCTC 0.592870 0.383162 0.663549 0.384798 normal normal_B normal_B_cell_A1_24 0.0 0.574181 0.422529 0.596448 0.157292 0.185985 0.197790 0.0 0.197790 0.333505 0.549745 0.695153 0.306368 0.162632 0.244439 0.958932 0.817648 0.615515 0.796791 0.0 0.359375 0.390368 0.355666 0.395329 0.415846 0.400304 0.0 0.400304 0.420615 0.631283 0.386431 0.353516 0.402680 0.372923 0.120302 0.406370 0.387130 0.361715
18 RRBS_normal_B_cell_A1_24_TAAGGCGA.TCTCTG 0.566829 0.459303 0.621230 0.456004 normal normal_B normal_B_cell_A1_24 0.0 0.561735 0.445770 0.575198 0.219102 0.270627 0.262653 0.0 0.262653 0.388225 0.425444 0.658489 0.339626 0.221938 0.268746 0.693445 0.707242 0.582715 0.734725 0.0 0.424692 0.428625 0.423983 0.416932 0.438851 0.435457 0.0 0.435457 0.458630 0.506799 0.461580 0.424516 0.410753 0.402906 0.446482 0.535373 0.447916 0.449113
19 RRBS_normal_B_cell_A1_24_TAAGGCGA.TGACAG 0.572760 0.339617 0.670456 0.330762 normal normal_B normal_B_cell_A1_24 0.0 0.545209 0.366002 0.571545 0.129616 0.136915 0.184688 0.0 0.184688 0.302508 0.516610 0.667369 0.283649 0.109472 0.226291 0.815052 0.858191 0.595584 0.824065 0.0 0.328561 0.362106 0.322815 0.385218 0.409109 0.420144 0.0 0.420144 0.416980 0.540203 0.342566 0.359376 0.387962 0.391970 0.426135 0.310284 0.363570 0.329527
20 RRBS_normal_B_cell_B1_24_CGTACTAG.ACAACC 0.626281 0.266847 0.723172 0.246680 normal normal_B normal_B_cell_B1_24 0.0 0.600708 0.393908 0.630897 0.130491 0.132780 0.202374 0.0 0.202374 0.352837 0.713675 0.725713 0.320354 0.106622 0.268270 0.927542 0.898352 0.640993 0.877308 0.0 0.259897 0.334198 0.248877 0.382296 0.401650 0.425809 0.0 0.425809 0.382371 0.255144 0.252366 0.309392 0.380196 0.367431 0.180845 0.201513 0.301175 0.213756
21 RRBS_normal_B_cell_B1_24_CGTACTAG.ACCGCG 0.537494 0.432684 0.620718 0.409340 normal normal_B normal_B_cell_B1_24 0.0 0.519674 0.378490 0.541383 0.159885 0.190980 0.207475 0.0 0.207475 0.318937 0.704963 0.615859 0.290296 0.155550 0.229599 0.820416 0.733765 0.532819 0.736470 0.0 0.398962 0.428732 0.396038 0.410177 0.437335 0.420623 0.0 0.420623 0.464244 0.680829 0.429482 0.377868 0.416697 0.435796 0.262760 0.487506 0.420668 0.395321
22 RRBS_normal_B_cell_B1_24_CGTACTAG.ACTCAC 0.641663 0.246022 0.731753 0.227309 normal normal_B normal_B_cell_B1_24 0.0 0.612460 0.404062 0.641619 0.138321 0.134888 0.200057 0.0 0.200057 0.365257 0.464153 0.743855 0.331324 0.115728 0.271711 0.888724 0.914558 0.655884 0.902320 0.0 0.242492 0.329096 0.230329 0.387293 0.406756 0.409925 0.0 0.409925 0.381945 0.411628 0.222693 0.307246 0.402911 0.373759 0.088080 0.165203 0.278304 0.167580
23 RRBS_normal_B_cell_B1_24_CGTACTAG.ATAGCG 0.589376 0.261165 0.710628 0.230766 normal normal_B normal_B_cell_B1_24 0.0 0.564847 0.359529 0.595146 0.118704 0.114490 0.173252 0.0 0.173252 0.310655 0.503806 0.691586 0.285765 0.103678 0.228965 0.970596 0.901415 0.619064 0.889049 0.0 0.258756 0.349989 0.242980 0.378504 0.394998 0.419274 0.0 0.419274 0.350099 0.564988 0.274140 0.311913 0.397878 0.335510 0.049096 0.181269 0.296877 0.184334
24 RRBS_normal_B_cell_B1_24_CGTACTAG.CAAGAG 0.573636 0.410016 0.649119 0.394524 normal normal_B normal_B_cell_B1_24 0.0 0.558144 0.410151 0.580173 0.160099 0.195220 0.200590 0.0 0.200590 0.349321 0.549857 0.668894 0.309413 0.163017 0.243278 0.919356 0.784926 0.568173 0.761311 0.0 0.386441 0.419907 0.381549 0.411113 0.441415 0.406298 0.0 0.406298 0.448031 0.547251 0.410927 0.394507 0.435718 0.393209 0.308112 0.437090 0.407888 0.396319
25 RRBS_normal_B_cell_B1_24_CGTACTAG.CATGAC 0.624309 0.250108 0.726315 0.228687 normal normal_B normal_B_cell_B1_24 0.0 0.596158 0.388236 0.627931 0.130745 0.131519 0.172315 0.0 0.172315 0.338750 0.456898 0.722962 0.315504 0.112215 0.257224 0.969047 0.914006 0.639293 0.894982 0.0 0.248352 0.341477 0.232416 0.376812 0.406716 0.399623 0.0 0.399623 0.368001 0.332834 0.242602 0.312927 0.398130 0.356175 0.101891 0.168523 0.281624 0.169076
26 RRBS_normal_B_cell_B1_24_CGTACTAG.CCTTCG 0.596452 0.329267 0.698844 0.306790 normal normal_B normal_B_cell_B1_24 0.0 0.571273 0.377748 0.599836 0.130880 0.142904 0.178021 0.0 0.178021 0.331112 0.590600 0.684578 0.300336 0.120516 0.255226 0.910927 0.858774 0.607147 0.851038 0.0 0.313604 0.385012 0.301194 0.409911 0.429555 0.440444 0.0 0.440444 0.411064 0.403216 0.356594 0.351884 0.427722 0.393187 0.197940 0.299706 0.348462 0.263281
27 RRBS_normal_B_cell_B1_24_CGTACTAG.CGGTAG 0.525054 0.433851 0.609985 0.410107 normal normal_B normal_B_cell_B1_24 0.0 0.507179 0.348300 0.529348 0.131065 0.146562 0.159320 0.0 0.159320 0.291927 0.585224 0.598317 0.271001 0.122086 0.204311 0.831863 0.748557 0.519638 0.731533 0.0 0.407883 0.431943 0.402297 0.405059 0.427618 0.414400 0.0 0.414400 0.426879 0.483459 0.421985 0.395431 0.423924 0.373547 0.410587 0.484246 0.438821 0.430328
28 RRBS_normal_B_cell_B1_24_CGTACTAG.CTATTG 0.593262 0.424328 0.660035 0.416186 normal normal_B normal_B_cell_B1_24 0.0 0.579434 0.434890 0.600699 0.188624 0.233725 0.230526 0.0 0.230526 0.366878 0.612085 0.672638 0.334540 0.186837 0.249289 0.849944 0.774006 0.594751 0.778275 0.0 0.393161 0.420478 0.388782 0.413085 0.445061 0.400803 0.0 0.400803 0.437013 0.527192 0.421103 0.396852 0.430744 0.381902 0.351360 0.469681 0.420649 0.397929
29 RRBS_normal_B_cell_B1_24_CGTACTAG.CTCAGC 0.615582 0.270868 0.718351 0.245807 normal normal_B normal_B_cell_B1_24 0.0 0.584390 0.381342 0.614639 0.128080 0.134331 0.181411 0.0 0.181411 0.326547 0.521315 0.699421 0.307323 0.114798 0.250506 0.958901 0.900439 0.617142 0.875827 0.0 0.267871 0.351355 0.252870 0.389412 0.415080 0.433702 0.0 0.433702 0.390515 0.388477 0.260598 0.320489 0.397438 0.368093 0.046859 0.194196 0.307630 0.192950
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
416 RRBS_trito_pool_1_TAAGGCGA.GCTGCC 0.565787 0.379120 0.636582 0.361267 CLL CLL trito_pool_1 0.0 0.544259 0.384439 0.564795 0.139736 0.162409 0.176248 0.0 0.176248 0.318136 0.574486 0.634002 0.289043 0.139641 0.223191 0.856793 0.800237 0.554718 0.776976 0.0 0.354565 0.388414 0.347581 0.360744 0.402607 0.390989 0.0 0.390989 0.431375 0.409462 0.385777 0.340581 0.375888 0.372911 0.350551 0.400626 0.404546 0.364278
417 RRBS_trito_pool_1_TAAGGCGA.GGCATC 0.578365 0.372803 0.639499 0.362328 CLL CLL trito_pool_1 0.0 0.555539 0.392705 0.579332 0.145652 0.164935 0.184172 0.0 0.184172 0.333482 0.421168 0.661779 0.296269 0.147555 0.241141 0.825601 0.813275 0.589382 0.790102 0.0 0.349269 0.378454 0.343167 0.352282 0.398483 0.383984 0.0 0.383984 0.394837 0.570345 0.396641 0.337457 0.375483 0.341387 0.228666 0.393968 0.384816 0.358044
418 RRBS_trito_pool_1_TAAGGCGA.GTGAGG 0.554707 0.379959 0.630143 0.365870 CLL CLL trito_pool_1 0.0 0.530569 0.367508 0.554714 0.134121 0.155524 0.173857 0.0 0.173857 0.315881 0.463581 0.615143 0.280369 0.133060 0.232378 0.794342 0.804363 0.553666 0.777878 0.0 0.358688 0.390493 0.351879 0.361701 0.404384 0.398420 0.0 0.398420 0.422339 0.427769 0.417731 0.341625 0.380159 0.367890 0.326313 0.400965 0.391422 0.373351
419 RRBS_trito_pool_1_TAAGGCGA.GTTGAG 0.581650 0.365969 0.651666 0.354063 CLL CLL trito_pool_1 0.0 0.557958 0.396493 0.580842 0.143581 0.165943 0.198223 0.0 0.198223 0.335718 0.511515 0.653486 0.294972 0.142042 0.251258 0.808442 0.821536 0.592957 0.795827 0.0 0.344146 0.371379 0.337804 0.354805 0.397635 0.404505 0.0 0.404505 0.420580 0.415804 0.359175 0.330021 0.365134 0.366449 0.413398 0.375629 0.381035 0.346115
420 RRBS_trito_pool_1_TAAGGCGA.TAGCGG 0.564165 0.368605 0.635522 0.354078 CLL CLL trito_pool_1 0.0 0.540878 0.368191 0.564637 0.133724 0.153977 0.166738 0.0 0.166738 0.307876 0.486900 0.632573 0.286827 0.128397 0.230826 0.856077 0.814219 0.577274 0.793247 0.0 0.346260 0.378683 0.339097 0.356597 0.396187 0.381571 0.0 0.381571 0.400928 0.570090 0.362619 0.339981 0.379723 0.344153 0.377799 0.385887 0.384042 0.349956
421 RRBS_trito_pool_1_TAAGGCGA.TATCTC 0.598086 0.365747 0.640908 0.360269 CLL CLL trito_pool_1 0.0 0.575676 0.409355 0.600599 0.148591 0.174386 0.194566 0.0 0.194566 0.336279 0.554679 0.662141 0.309579 0.142574 0.240360 0.844196 0.822712 0.607978 0.806099 0.0 0.341342 0.367808 0.337694 0.339362 0.386849 0.380734 0.0 0.380734 0.394130 0.514109 0.395076 0.319727 0.356904 0.318327 0.309309 0.390633 0.376339 0.360353
422 RRBS_trito_pool_1_TAAGGCGA.TCTCTG 0.598637 0.367210 0.649959 0.358410 CLL CLL trito_pool_1 0.0 0.573932 0.402771 0.597337 0.153348 0.175088 0.200765 0.0 0.200765 0.347808 0.610443 0.676611 0.311039 0.144939 0.256872 0.854733 0.826905 0.607012 0.802475 0.0 0.342377 0.370775 0.337035 0.356039 0.402495 0.412892 0.0 0.412892 0.408307 0.464521 0.389068 0.330017 0.366829 0.337809 0.241373 0.383198 0.378137 0.345602
423 RRBS_trito_pool_1_TAAGGCGA.TGACAG 0.592655 0.364070 0.651817 0.350381 CLL CLL trito_pool_1 0.0 0.567869 0.399077 0.591561 0.148038 0.176237 0.202154 0.0 0.202154 0.343355 0.522213 0.665558 0.303592 0.146534 0.252288 0.796484 0.830742 0.598186 0.806577 0.0 0.345410 0.381678 0.340699 0.363333 0.411799 0.398106 0.0 0.398106 0.406224 0.468663 0.352417 0.345042 0.391606 0.339766 0.370686 0.369564 0.369690 0.345702
424 RRBS_trito_pool_1_TAAGGCGA.TGCTGC 0.572118 0.380804 0.638604 0.366845 CLL CLL trito_pool_1 0.0 0.549810 0.381938 0.576183 0.143244 0.162055 0.182065 0.0 0.182065 0.322882 0.522536 0.636949 0.297776 0.142530 0.244198 0.808005 0.807068 0.566512 0.789342 0.0 0.361772 0.392851 0.356422 0.365182 0.413081 0.408349 0.0 0.408349 0.431775 0.422301 0.386494 0.344547 0.390997 0.368241 0.364905 0.392520 0.386162 0.373449
425 RRBS_trito_pool_2_CGTACTAG.ACAACC 0.593766 0.374765 0.655123 0.357956 CLL CLL trito_pool_2 0.0 0.571434 0.403317 0.595518 0.150534 0.172412 0.197820 0.0 0.197820 0.335046 0.542060 0.674691 0.311226 0.145333 0.251228 0.782911 0.825876 0.601882 0.804140 0.0 0.356830 0.396188 0.349903 0.388446 0.425970 0.423147 0.0 0.423147 0.414028 0.406933 0.353024 0.363997 0.410352 0.353075 0.404312 0.376888 0.385532 0.352623
426 RRBS_trito_pool_2_CGTACTAG.ACGTGG 0.569259 0.388716 0.648626 0.369196 CLL CLL trito_pool_2 0.0 0.547223 0.381861 0.572699 0.142991 0.167937 0.187875 0.0 0.187875 0.322376 0.494384 0.655945 0.290386 0.138317 0.228471 0.824135 0.809657 0.575763 0.782319 0.0 0.367896 0.410155 0.360025 0.392362 0.439690 0.423503 0.0 0.423503 0.424893 0.371724 0.396436 0.364995 0.415006 0.358457 0.266702 0.393868 0.396684 0.361810
427 RRBS_trito_pool_2_CGTACTAG.ACTCAC 0.595647 0.380070 0.652962 0.364404 CLL CLL trito_pool_2 0.0 0.574078 0.410369 0.599662 0.154807 0.180722 0.204649 0.0 0.204649 0.346719 0.484916 0.687740 0.314513 0.155080 0.260803 0.812211 0.828639 0.608617 0.812883 0.0 0.360783 0.413536 0.352271 0.396930 0.438502 0.437218 0.0 0.437218 0.434233 0.243128 0.391187 0.371005 0.424675 0.381663 0.401646 0.380932 0.388726 0.364671
428 RRBS_trito_pool_2_CGTACTAG.AGGATG 0.595616 0.384004 0.658358 0.369689 CLL CLL trito_pool_2 0.0 0.574464 0.411157 0.598317 0.153244 0.179052 0.205426 0.0 0.205426 0.347422 0.549476 0.674730 0.318179 0.150916 0.255366 0.825079 0.819016 0.600086 0.803518 0.0 0.361346 0.394711 0.356405 0.385541 0.430138 0.435397 0.0 0.435397 0.430261 0.509647 0.397243 0.354870 0.405106 0.376955 0.349346 0.390618 0.387689 0.355591
429 RRBS_trito_pool_2_CGTACTAG.ATAGCG 0.554055 0.387759 0.635741 0.364157 CLL CLL trito_pool_2 0.0 0.530317 0.372598 0.555500 0.139056 0.162002 0.188584 0.0 0.188584 0.314593 0.662600 0.651363 0.276445 0.139170 0.226507 0.782103 0.805115 0.577048 0.796431 0.0 0.368334 0.399350 0.363279 0.375962 0.421184 0.422536 0.0 0.422536 0.423652 0.422062 0.390546 0.352241 0.394821 0.362621 0.349869 0.400868 0.397587 0.375378
430 RRBS_trito_pool_2_CGTACTAG.ATCGAC 0.598233 0.384663 0.661048 0.366513 CLL CLL trito_pool_2 0.0 0.579706 0.417754 0.601652 0.154766 0.179131 0.214474 0.0 0.214474 0.347695 0.592746 0.693724 0.318722 0.152873 0.251772 0.829067 0.818502 0.603463 0.804070 0.0 0.365534 0.410172 0.359211 0.392262 0.438666 0.432635 0.0 0.432635 0.437134 0.550723 0.406531 0.367066 0.412751 0.381165 0.291126 0.388891 0.385826 0.368804
431 RRBS_trito_pool_2_CGTACTAG.CAAGAG 0.583609 0.393008 0.649029 0.373233 CLL CLL trito_pool_2 0.0 0.562102 0.395170 0.587195 0.146861 0.171793 0.193740 0.0 0.193740 0.336475 0.643228 0.644603 0.303090 0.145796 0.249175 0.867900 0.811712 0.594372 0.798705 0.0 0.371383 0.417443 0.364244 0.392824 0.439427 0.424017 0.0 0.424017 0.430895 0.419896 0.406159 0.358254 0.421774 0.375869 0.285465 0.405000 0.385932 0.366543
432 RRBS_trito_pool_2_CGTACTAG.CATGAC 0.593843 0.383170 0.659853 0.366034 CLL CLL trito_pool_2 0.0 0.572795 0.406324 0.595511 0.149580 0.173139 0.207084 0.0 0.207084 0.341760 0.582188 0.686507 0.311526 0.145188 0.255092 0.898922 0.822096 0.602356 0.802416 0.0 0.363315 0.403748 0.356727 0.385169 0.426971 0.428521 0.0 0.428521 0.437559 0.666602 0.399967 0.360502 0.402785 0.382317 0.213772 0.390465 0.395524 0.364513
433 RRBS_trito_pool_2_CGTACTAG.CCTTCG 0.568050 0.392622 0.647402 0.371530 CLL CLL trito_pool_2 0.0 0.544424 0.372864 0.566449 0.141793 0.161766 0.188699 0.0 0.188699 0.320217 0.520051 0.648292 0.287153 0.134590 0.231480 0.918757 0.808219 0.556881 0.786660 0.0 0.372677 0.409840 0.367423 0.386818 0.428182 0.410043 0.0 0.410043 0.437956 0.431314 0.417184 0.370701 0.403697 0.379120 0.132574 0.402506 0.408092 0.392411
434 RRBS_trito_pool_2_CGTACTAG.CGGTAG 0.569010 0.390108 0.645339 0.372348 CLL CLL trito_pool_2 0.0 0.541543 0.377491 0.566615 0.144500 0.166361 0.187541 0.0 0.187541 0.323718 0.570565 0.649134 0.290349 0.139695 0.231105 0.875553 0.805793 0.574939 0.781107 0.0 0.366749 0.406201 0.359577 0.382439 0.427216 0.412182 0.0 0.412182 0.424271 0.326364 0.387866 0.358136 0.400989 0.367082 0.233944 0.405535 0.387747 0.372620
435 RRBS_trito_pool_2_CGTACTAG.CTATTG 0.595037 0.373701 0.659609 0.357034 CLL CLL trito_pool_2 0.0 0.571516 0.403810 0.595260 0.147111 0.169791 0.199903 0.0 0.199903 0.345774 0.461720 0.665058 0.308691 0.144474 0.257436 0.833091 0.825356 0.606750 0.808127 0.0 0.352923 0.389431 0.346998 0.367228 0.410854 0.408465 0.0 0.408465 0.420192 0.435325 0.384775 0.344882 0.388064 0.354390 0.322671 0.384292 0.385328 0.353731
436 RRBS_trito_pool_2_CGTACTAG.GACACG 0.570886 0.384283 0.645343 0.367303 CLL CLL trito_pool_2 0.0 0.547611 0.384121 0.571736 0.141024 0.160152 0.183884 0.0 0.183884 0.324428 0.516251 0.658554 0.296090 0.142236 0.243037 0.840847 0.810208 0.580500 0.780899 0.0 0.362036 0.396972 0.355362 0.379360 0.423028 0.407819 0.0 0.407819 0.430720 0.413402 0.375936 0.353105 0.395058 0.377523 0.239142 0.397595 0.385070 0.369290
437 RRBS_trito_pool_2_CGTACTAG.GCATTC 0.596410 0.375766 0.656557 0.357551 CLL CLL trito_pool_2 0.0 0.574308 0.399926 0.599169 0.146626 0.171425 0.202222 0.0 0.202222 0.339589 0.601239 0.673941 0.311423 0.142003 0.251027 0.900174 0.822302 0.607857 0.804399 0.0 0.350723 0.388557 0.343231 0.379437 0.423172 0.413884 0.0 0.413884 0.430536 0.484843 0.385927 0.351172 0.395395 0.377211 0.260417 0.387313 0.379687 0.352217
438 RRBS_trito_pool_2_CGTACTAG.GCTGCC 0.583331 0.384057 0.650519 0.361400 CLL CLL trito_pool_2 0.0 0.557827 0.389318 0.583295 0.146131 0.170323 0.216651 0.0 0.216651 0.332222 0.465542 0.685189 0.303248 0.142679 0.253152 0.925723 0.812384 0.566898 0.789918 0.0 0.364344 0.411073 0.357567 0.386643 0.436352 0.455234 0.0 0.455234 0.427594 0.441989 0.410031 0.357034 0.414984 0.377580 0.122134 0.390103 0.391888 0.356613
439 RRBS_trito_pool_2_CGTACTAG.GGCATC 0.583463 0.392946 0.648295 0.372514 CLL CLL trito_pool_2 0.0 0.558443 0.393582 0.581411 0.149267 0.173254 0.202211 0.0 0.202211 0.336075 0.497985 0.658613 0.307422 0.145063 0.253500 0.822348 0.810850 0.579463 0.788819 0.0 0.369847 0.416267 0.362248 0.393373 0.438305 0.432106 0.0 0.432106 0.447392 0.283823 0.414114 0.369362 0.423329 0.390531 0.294508 0.403624 0.394708 0.370958
440 RRBS_trito_pool_2_CGTACTAG.GTGAGG 0.572670 0.387705 0.655986 0.366706 CLL CLL trito_pool_2 0.0 0.548763 0.378005 0.573823 0.146951 0.166034 0.189554 0.0 0.189554 0.329531 0.417063 0.656924 0.298588 0.143700 0.237655 0.837979 0.818187 0.578995 0.795407 0.0 0.369274 0.411458 0.362212 0.399234 0.439421 0.426244 0.0 0.426244 0.437661 0.387644 0.379763 0.378844 0.416184 0.393381 0.327526 0.390663 0.404334 0.359301
441 RRBS_trito_pool_2_CGTACTAG.GTTGAG 0.584506 0.379222 0.659633 0.362664 CLL CLL trito_pool_2 0.0 0.560388 0.392703 0.585744 0.148877 0.173148 0.191432 0.0 0.191432 0.337372 0.531839 0.657189 0.308189 0.144341 0.245608 0.856433 0.820191 0.594452 0.790093 0.0 0.357508 0.399570 0.348983 0.382424 0.428961 0.426739 0.0 0.426739 0.436677 0.623022 0.378906 0.357031 0.402640 0.380493 0.320648 0.385596 0.378942 0.359409
442 RRBS_trito_pool_2_CGTACTAG.TAGCGG 0.567804 0.392930 0.650606 0.371967 CLL CLL trito_pool_2 0.0 0.542448 0.379673 0.565934 0.141666 0.161357 0.190254 0.0 0.190254 0.322077 0.564060 0.661459 0.291782 0.141023 0.236705 0.775427 0.812801 0.584395 0.788413 0.0 0.373111 0.419487 0.363822 0.398524 0.443218 0.429929 0.0 0.429929 0.428916 0.508213 0.389595 0.371629 0.425177 0.379944 0.390322 0.401379 0.390136 0.365038
443 RRBS_trito_pool_2_CGTACTAG.TATCTC 0.599881 0.371286 0.647639 0.360518 CLL CLL trito_pool_2 0.0 0.575018 0.407772 0.599556 0.150032 0.172842 0.207183 0.0 0.207183 0.348191 0.524038 0.668196 0.314785 0.147052 0.263346 0.860790 0.826304 0.608059 0.807089 0.0 0.350740 0.387199 0.345054 0.369375 0.416003 0.412018 0.0 0.412018 0.418236 0.488627 0.380793 0.343215 0.395603 0.371997 0.304058 0.381193 0.377228 0.343486
444 RRBS_trito_pool_2_CGTACTAG.TCTCTG 0.597122 0.387602 0.651297 0.369044 CLL CLL trito_pool_2 0.0 0.574766 0.414443 0.597902 0.153941 0.177219 0.200535 0.0 0.200535 0.344312 0.593102 0.687200 0.318141 0.152659 0.260521 0.857143 0.820577 0.596070 0.810128 0.0 0.364954 0.410597 0.357541 0.390213 0.434403 0.418302 0.0 0.418302 0.436308 0.430239 0.403836 0.358261 0.411722 0.383838 0.307817 0.397070 0.395306 0.363937
445 RRBS_trito_pool_2_CGTACTAG.TGACAG 0.588475 0.377043 0.654169 0.362253 CLL CLL trito_pool_2 0.0 0.566762 0.398915 0.590485 0.146940 0.169929 0.200901 0.0 0.200901 0.330337 0.614493 0.668077 0.309691 0.140807 0.241382 0.877043 0.819644 0.588788 0.797851 0.0 0.352289 0.386013 0.345792 0.381209 0.425696 0.436417 0.0 0.436417 0.417572 0.441868 0.379378 0.356776 0.394481 0.368598 0.322557 0.387040 0.387626 0.360044

446 rows × 46 columns


In [37]:
print(np.unique(merged.protocol))  # there are 23 'protocol' fields


['NormalBCD19pCD27mcell1_22_' 'NormalBCD19pCD27mcell23_44'
 'NormalBCD19pCD27mcell45_66' 'NormalBCD19pCD27mcell67_88'
 'NormalBCD19pCD27pcell1_22_' 'NormalBCD19pCD27pcell23_44'
 'NormalBCD19pCD27pcell45_66' 'NormalBCD19pCD27pcell67_88'
 'RRBS_NormalBCD19pcell1_22_' 'RRBS_NormalBCD19pcell23_44'
 'RRBS_NormalBCD19pcell45_66' 'RRBS_NormalBCD19pcell67_88'
 'cw154_CutSmart_proteinase_K' 'cw154_Tris_protease'
 'cw154_Tris_protease_GR' 'normal_B_cell_A1_24' 'normal_B_cell_B1_24'
 'normal_B_cell_C1_24' 'normal_B_cell_D1_24' 'normal_B_cell_G1_22'
 'normal_B_cell_H1_22' 'trito_pool_1' 'trito_pool_2']

In [ ]:


In [ ]:


In [ ]:


In [38]:
# merged.to_csv("total_genomic_region.csv", index=False)

In [39]:
merged.shape


Out[39]:
(446, 46)

In [40]:
merged.columns


Out[40]:
Index(['filename', 'methylation', 'PDR_total', 'methylation_unweighted',
       'PDR_unweighted', 'type', 'bio', 'protocol', 'methylation_tssDistance',
       'methylation_genesDistance', 'methylation_exonsDistance',
       'methylation_intronsDistance', 'methylation_promoterDistance',
       'methylation_cgiDistance', 'methylation_ctcfDistance',
       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',
       'methylation_geneDistalRegulatoryModulesDistance',
       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',
       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',
       'methylation_geneDistalRegulatoryModulesK562Distance',
       'methylation_hypoInHues64Distance', 'methylation_intergenic',
       'methylation_shore', 'methylation_shelf', 'PDR_tssDistance',
       'PDR_genesDistance', 'PDR_exonsDistance', 'PDR_intronsDistance',
       'PDR_promoterDistance', 'PDR_cgiDistance', 'PDR_ctcfDistance',
       'PDR_ctcfUpDistance', 'PDR_ctcfDownDistance',
       'PDR_geneDistalRegulatoryModulesDistance', 'PDR_vistaEnhancersDistance',
       'PDR_3PrimeUTRDistance', 'PDR_5PrimeUTRDistance',
       'PDR_firstExonDistance', 'PDR_geneDistalRegulatoryModulesK562Distance',
       'PDR_hypoInHues64Distance', 'PDR_intergenic', 'PDR_shore', 'PDR_shelf'],
      dtype='object')

In [41]:
#
# First do pairs by CLL vs Normal B;    We could discuss protocols at a later point
#
normal = merged[merged["type"]=="normal"]
CLL = merged[merged["type"]=="CLL"]

In [42]:
print(len(normal))
print(len(CLL))


342
104

In [43]:
#CLL_pairs = CLL
normal_pairs = normal

In [44]:
normal_pairs.columns


Out[44]:
Index(['filename', 'methylation', 'PDR_total', 'methylation_unweighted',
       'PDR_unweighted', 'type', 'bio', 'protocol', 'methylation_tssDistance',
       'methylation_genesDistance', 'methylation_exonsDistance',
       'methylation_intronsDistance', 'methylation_promoterDistance',
       'methylation_cgiDistance', 'methylation_ctcfDistance',
       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',
       'methylation_geneDistalRegulatoryModulesDistance',
       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',
       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',
       'methylation_geneDistalRegulatoryModulesK562Distance',
       'methylation_hypoInHues64Distance', 'methylation_intergenic',
       'methylation_shore', 'methylation_shelf', 'PDR_tssDistance',
       'PDR_genesDistance', 'PDR_exonsDistance', 'PDR_intronsDistance',
       'PDR_promoterDistance', 'PDR_cgiDistance', 'PDR_ctcfDistance',
       'PDR_ctcfUpDistance', 'PDR_ctcfDownDistance',
       'PDR_geneDistalRegulatoryModulesDistance', 'PDR_vistaEnhancersDistance',
       'PDR_3PrimeUTRDistance', 'PDR_5PrimeUTRDistance',
       'PDR_firstExonDistance', 'PDR_geneDistalRegulatoryModulesK562Distance',
       'PDR_hypoInHues64Distance', 'PDR_intergenic', 'PDR_shore', 'PDR_shelf'],
      dtype='object')

In [45]:
print(np.unique(normal_pairs.protocol))


['NormalBCD19pCD27mcell1_22_' 'NormalBCD19pCD27mcell23_44'
 'NormalBCD19pCD27mcell45_66' 'NormalBCD19pCD27mcell67_88'
 'NormalBCD19pCD27pcell1_22_' 'NormalBCD19pCD27pcell23_44'
 'NormalBCD19pCD27pcell45_66' 'NormalBCD19pCD27pcell67_88'
 'RRBS_NormalBCD19pcell1_22_' 'RRBS_NormalBCD19pcell23_44'
 'RRBS_NormalBCD19pcell45_66' 'RRBS_NormalBCD19pcell67_88'
 'normal_B_cell_A1_24' 'normal_B_cell_B1_24' 'normal_B_cell_C1_24'
 'normal_B_cell_D1_24' 'normal_B_cell_G1_22' 'normal_B_cell_H1_22']

In [46]:
protocol = normal_pairs[normal_pairs["protocol"] == "NormalBCD19pCD27mcell1_22_"]

In [47]:
protocol.shape


Out[47]:
(18, 46)

In [48]:
normal_pairs.shape


Out[48]:
(342, 46)

In [49]:
protocol = protocol.reset_index(drop=True)

In [50]:
"""
DANGER!!!!
"""

# Falsely named variable!!! I simply do this to not modify the code below

normal_pairs = protocol

In [ ]:


In [ ]:


In [51]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation, normal_pairs.methylation)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_difference': stacked})[['filename', 'methylation_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs1 = pd.merge(out, methylation_differences, how='inner')
print(pairs1.shape)


(153, 44)

In [52]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_total, normal_pairs.PDR_total)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_difference': stacked})[['filename', 'PDR_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs2 = pd.merge(out, PDR_differences, how='inner')
print(pairs2.shape)


(153, 44)

In [53]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_unweighted, normal_pairs.methylation_unweighted)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_unweighted_difference': stacked})[['filename', 'methylation_unweighted_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs3 = pd.merge(out, methylation_differences, how='inner')
print(pairs3.shape)


(153, 44)

In [54]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_unweighted, normal_pairs.PDR_unweighted)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
PDR_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_unweighted_difference': stacked})[['filename', 'PDR_unweighted_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs4 = pd.merge(out, PDR_differences, how='inner')
print(pairs4.shape)


(153, 44)

In [55]:
"""
  'methylation_tssDistance',
       'methylation_genesDistance', 'methylation_exonsDistance',
       'methylation_intronsDistance', 'methylation_promoterDistance',
       'methylation_cgiDistance', 'methylation_ctcfDistance',
       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',
       'methylation_geneDistalRegulatoryModulesDistance',
       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',
       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',
       'methylation_geneDistalRegulatoryModulesK562Distance',
       'methylation_hypoInHues64Distance', 'methylation_intergenic',
       'methylation_shore', 'methylation_shelf'

"""


Out[55]:
"\n  'methylation_tssDistance',\n       'methylation_genesDistance', 'methylation_exonsDistance',\n       'methylation_intronsDistance', 'methylation_promoterDistance',\n       'methylation_cgiDistance', 'methylation_ctcfDistance',\n       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',\n       'methylation_geneDistalRegulatoryModulesDistance',\n       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',\n       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',\n       'methylation_geneDistalRegulatoryModulesK562Distance',\n       'methylation_hypoInHues64Distance', 'methylation_intergenic',\n       'methylation_shore', 'methylation_shelf'\n\n"

In [56]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_tssDistance, normal_pairs.methylation_tssDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_tssDistance_difference': stacked})[['filename', 'methylation_tssDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs5 = pd.merge(out, methylation_differences, how='inner')
print(pairs5.shape)


(153, 44)

In [57]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_genesDistance, normal_pairs.methylation_genesDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_genesDistance_difference': stacked})[['filename', 'methylation_genesDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs6 = pd.merge(out, methylation_differences, how='inner')
print(pairs6.shape)


(153, 44)

In [58]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_exonsDistance, normal_pairs.methylation_exonsDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_exonsDistance_difference': stacked})[['filename', 'methylation_exonsDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs7 = pd.merge(out, methylation_differences, how='inner')
print(pairs7.shape)


(153, 44)

In [59]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_intronsDistance, normal_pairs.methylation_intronsDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_intronsDistance_difference': stacked})[['filename', 'methylation_intronsDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs8 = pd.merge(out, methylation_differences, how='inner')
print(pairs8.shape)


(153, 44)

In [60]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_promoterDistance, normal_pairs.methylation_promoterDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_promoterDistance_difference': stacked})[['filename', 'methylation_promoterDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs9 = pd.merge(out, methylation_differences, how='inner')
print(pairs9.shape)


(153, 44)

In [61]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_cgiDistance, normal_pairs.methylation_cgiDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_cgiDistance_difference': stacked})[['filename', 'methylation_cgiDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs10 = pd.merge(out, methylation_differences, how='inner')
print(pairs10.shape)


(153, 44)

In [62]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_ctcfDistance, normal_pairs.methylation_ctcfDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_ctcfDistance_difference': stacked})[['filename', 'methylation_ctcfDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs11 = pd.merge(out, methylation_differences, how='inner')
print(pairs11.shape)


(153, 44)

In [63]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_ctcfUpDistance, normal_pairs.methylation_ctcfUpDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_ctcfUpDistance_difference': stacked})[['filename', 'methylation_ctcfUpDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs12 = pd.merge(out, methylation_differences, how='inner')
print(pairs12.shape)


(153, 44)

In [64]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_ctcfDownDistance, normal_pairs.methylation_ctcfDownDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_ctcfDownDistance_difference': stacked})[['filename', 'methylation_ctcfDownDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs13 = pd.merge(out, methylation_differences, how='inner')
print(pairs13.shape)


(153, 44)

In [65]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_geneDistalRegulatoryModulesDistance, normal_pairs.methylation_geneDistalRegulatoryModulesDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_geneDistalRegulatoryModulesDistance_difference': stacked})[['filename', 'methylation_geneDistalRegulatoryModulesDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs14 = pd.merge(out, methylation_differences, how='inner')
print(pairs14.shape)


(153, 44)

In [66]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_vistaEnhancersDistance, normal_pairs.methylation_vistaEnhancersDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_vistaEnhancersDistance_difference': stacked})[['filename', 'methylation_vistaEnhancersDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs15 = pd.merge(out, methylation_differences, how='inner')
print(pairs15.shape)


(153, 44)

In [67]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_3PrimeUTRDistance, normal_pairs.methylation_3PrimeUTRDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_3PrimeUTRDistance_difference': stacked})[['filename', 'methylation_3PrimeUTRDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs16 = pd.merge(out, methylation_differences, how='inner')
print(pairs16.shape)


(153, 44)

In [68]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_5PrimeUTRDistance, normal_pairs.methylation_5PrimeUTRDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_5PrimeUTRDistance_difference': stacked})[['filename', 'methylation_5PrimeUTRDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs17 = pd.merge(out, methylation_differences, how='inner')
print(pairs17.shape)


(153, 44)

In [69]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_firstExonDistance, normal_pairs.methylation_firstExonDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_firstExonDistance_difference': stacked})[['filename', 'methylation_firstExonDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs18 = pd.merge(out, methylation_differences, how='inner')
print(pairs18.shape)


(153, 44)

In [70]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_geneDistalRegulatoryModulesK562Distance, normal_pairs.methylation_geneDistalRegulatoryModulesK562Distance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_geneDistalRegulatoryModulesK562Distance_difference': stacked})[['filename', 'methylation_geneDistalRegulatoryModulesK562Distance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs19 = pd.merge(out, methylation_differences, how='inner')
print(pairs19.shape)


(153, 44)

In [71]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_hypoInHues64Distance, normal_pairs.methylation_hypoInHues64Distance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_hypoInHues64Distance_difference': stacked})[['filename', 'methylation_hypoInHues64Distance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs20 = pd.merge(out, methylation_differences, how='inner')
print(pairs20.shape)


(153, 44)

In [72]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_intergenic, normal_pairs.methylation_intergenic)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_intergenic_difference': stacked})[['filename', 'methylation_intergenic_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs21 = pd.merge(out, methylation_differences, how='inner')
print(pairs21.shape)


(153, 44)

In [73]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_shore, normal_pairs.methylation_shore)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_shore_difference': stacked})[['filename', 'methylation_shore_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs22 = pd.merge(out, methylation_differences, how='inner')
print(pairs22.shape)


(153, 44)

In [74]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.methylation_shelf, normal_pairs.methylation_shelf)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'methylation_shelf_difference': stacked})[['filename', 'methylation_shelf_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs23 = pd.merge(out, methylation_differences, how='inner')
print(pairs23.shape)


(153, 44)

In [ ]:


In [75]:
"""
###
PDR by genomic regions
###
"""


Out[75]:
'\n###\nPDR by genomic regions\n###\n'

In [76]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_tssDistance, normal_pairs.PDR_tssDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_tssDistance_difference': stacked})[['filename', 'PDR_tssDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs24 = pd.merge(out, methylation_differences, how='inner')
print(pairs24.shape)


(153, 44)

In [77]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_genesDistance, normal_pairs.PDR_genesDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_genesDistance_difference': stacked})[['filename', 'PDR_genesDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs25 = pd.merge(out, methylation_differences, how='inner')
print(pairs25.shape)


(153, 44)

In [78]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_exonsDistance, normal_pairs.PDR_exonsDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_exonsDistance_difference': stacked})[['filename', 'PDR_exonsDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs26 = pd.merge(out, methylation_differences, how='inner')
print(pairs26.shape)


(153, 44)

In [79]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_intronsDistance, normal_pairs.PDR_intronsDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_intronsDistance_difference': stacked})[['filename', 'PDR_intronsDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs27 = pd.merge(out, methylation_differences, how='inner')
print(pairs27.shape)


(153, 44)

In [80]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_promoterDistance, normal_pairs.PDR_promoterDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_promoterDistance_difference': stacked})[['filename', 'PDR_promoterDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs28 = pd.merge(out, methylation_differences, how='inner')
print(pairs28.shape)


(153, 44)

In [81]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_cgiDistance, normal_pairs.PDR_cgiDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_cgiDistance_difference': stacked})[['filename', 'PDR_cgiDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs29 = pd.merge(out, methylation_differences, how='inner')
print(pairs29.shape)


(153, 44)

In [82]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_ctcfDistance, normal_pairs.PDR_ctcfDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_ctcfDistance_difference': stacked})[['filename', 'PDR_ctcfDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs30 = pd.merge(out, methylation_differences, how='inner')
print(pairs30.shape)


(153, 44)

In [83]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_ctcfUpDistance, normal_pairs.PDR_ctcfUpDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_ctcfUpDistance_difference': stacked})[['filename', 'PDR_ctcfUpDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs31 = pd.merge(out, methylation_differences, how='inner')
print(pairs31.shape)


(153, 44)

In [84]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_ctcfDownDistance, normal_pairs.PDR_ctcfDownDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_ctcfDownDistance_difference': stacked})[['filename', 'PDR_ctcfDownDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs32 = pd.merge(out, methylation_differences, how='inner')
print(pairs32.shape)


(153, 44)

In [85]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_geneDistalRegulatoryModulesDistance, normal_pairs.PDR_geneDistalRegulatoryModulesDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_geneDistalRegulatoryModulesDistance_difference': stacked})[['filename', 'PDR_geneDistalRegulatoryModulesDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs33 = pd.merge(out, methylation_differences, how='inner')
print(pairs33.shape)


(153, 44)

In [86]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_vistaEnhancersDistance, normal_pairs.PDR_vistaEnhancersDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_vistaEnhancersDistance_difference': stacked})[['filename', 'PDR_vistaEnhancersDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs34 = pd.merge(out, methylation_differences, how='inner')
print(pairs34.shape)


(153, 44)

In [87]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_3PrimeUTRDistance, normal_pairs.PDR_3PrimeUTRDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_3PrimeUTRDistance_difference': stacked})[['filename', 'PDR_3PrimeUTRDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs35 = pd.merge(out, methylation_differences, how='inner')
print(pairs35.shape)


(153, 44)

In [88]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_5PrimeUTRDistance, normal_pairs.PDR_5PrimeUTRDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_5PrimeUTRDistance_difference': stacked})[['filename', 'PDR_5PrimeUTRDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs36 = pd.merge(out, methylation_differences, how='inner')
print(pairs36.shape)


(153, 44)

In [89]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_firstExonDistance, normal_pairs.PDR_firstExonDistance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_firstExonDistance_difference': stacked})[['filename', 'PDR_firstExonDistance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs37 = pd.merge(out, methylation_differences, how='inner')
print(pairs37.shape)


(153, 44)

In [90]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_geneDistalRegulatoryModulesK562Distance, normal_pairs.PDR_geneDistalRegulatoryModulesK562Distance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_geneDistalRegulatoryModulesK562Distance_difference': stacked})[['filename', 'PDR_geneDistalRegulatoryModulesK562Distance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs38 = pd.merge(out, methylation_differences, how='inner')
print(pairs38.shape)


(153, 44)

In [91]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_hypoInHues64Distance, normal_pairs.PDR_hypoInHues64Distance)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_hypoInHues64Distance_difference': stacked})[['filename', 'PDR_hypoInHues64Distance_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs39 = pd.merge(out, methylation_differences, how='inner')
print(pairs39.shape)


(153, 44)

In [92]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_intergenic, normal_pairs.PDR_intergenic)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_intergenic_difference': stacked})[['filename', 'PDR_intergenic_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs40 = pd.merge(out, methylation_differences, how='inner')
print(pairs40.shape)


(153, 44)

In [93]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_shore, normal_pairs.PDR_shore)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_shore_difference': stacked})[['filename', 'PDR_shore_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs41 = pd.merge(out, methylation_differences, how='inner')
print(pairs41.shape)


(153, 44)

In [94]:
normal_pairsA = normal_pairs.set_index("filename")
from itertools import combinations
cc = list(combinations(normal_pairs.filename, 2)) # combines into all pairs
out = pd.DataFrame([normal_pairsA.loc[c,:].mean() for c in cc], index=cc)  # covariates between pairs == mean
df_ex = pd.DataFrame(np.abs(np.subtract.outer(normal_pairs.PDR_shelf, normal_pairs.PDR_shelf)), normal_pairs.filename, normal_pairs.filename)
stacked = df_ex.stack()
methylation_differences = pd.DataFrame({'filename': stacked.index.to_series(), 'PDR_shelf_difference': stacked})[['filename', 'PDR_shelf_difference']].reset_index(drop=True)
out['filename'] = out.index
out = out.reset_index(drop=True)
pairs42 = pd.merge(out, methylation_differences, how='inner')
print(pairs42.shape)


(153, 44)

In [ ]:


In [95]:
pairs42.head()


Out[95]:
methylation PDR_total methylation_unweighted PDR_unweighted methylation_tssDistance methylation_genesDistance methylation_exonsDistance methylation_intronsDistance methylation_promoterDistance methylation_cgiDistance methylation_ctcfDistance methylation_ctcfUpDistance methylation_ctcfDownDistance methylation_geneDistalRegulatoryModulesDistance methylation_vistaEnhancersDistance methylation_3PrimeUTRDistance methylation_5PrimeUTRDistance methylation_firstExonDistance methylation_geneDistalRegulatoryModulesK562Distance methylation_hypoInHues64Distance methylation_intergenic methylation_shore methylation_shelf PDR_tssDistance PDR_genesDistance PDR_exonsDistance PDR_intronsDistance PDR_promoterDistance PDR_cgiDistance PDR_ctcfDistance PDR_ctcfUpDistance PDR_ctcfDownDistance PDR_geneDistalRegulatoryModulesDistance PDR_vistaEnhancersDistance PDR_3PrimeUTRDistance PDR_5PrimeUTRDistance PDR_firstExonDistance PDR_geneDistalRegulatoryModulesK562Distance PDR_hypoInHues64Distance PDR_intergenic PDR_shore PDR_shelf filename PDR_shelf_difference
0 0.473770 0.174196 0.592972 0.168053 0.0 0.439020 0.251511 0.473456 0.062676 0.067859 0.096186 0.0 0.096186 0.230526 0.279602 0.597652 0.187685 0.052747 0.155892 0.933200 0.881066 0.564774 0.850167 0.0 0.161878 0.158834 0.161558 0.110189 0.134403 0.155025 0.0 0.155025 0.192356 0.223016 0.227512 0.106567 0.111315 0.143271 0.129509 0.210553 0.263209 0.198912 (RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGC... 0.005388
1 0.507427 0.179332 0.616451 0.174348 0.0 0.474232 0.282827 0.507760 0.072478 0.079000 0.112518 0.0 0.112518 0.247994 0.343567 0.629403 0.212029 0.061188 0.169622 0.960708 0.881922 0.591675 0.860776 0.0 0.164630 0.166698 0.163264 0.116660 0.143141 0.164916 0.0 0.164916 0.194369 0.229280 0.235802 0.108010 0.116753 0.145322 0.015046 0.215633 0.266775 0.194113 (RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGC... 0.004211
2 0.463842 0.172783 0.585348 0.169032 0.0 0.431786 0.251167 0.463718 0.058619 0.067260 0.091884 0.0 0.091884 0.226540 0.302709 0.584201 0.178963 0.050666 0.154567 0.934139 0.872425 0.559783 0.847683 0.0 0.159780 0.160545 0.158929 0.104871 0.131125 0.144074 0.0 0.144074 0.192018 0.269763 0.243117 0.100516 0.109569 0.135953 0.083990 0.215604 0.262332 0.206296 (RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGC... 0.020155
3 0.500470 0.177295 0.612618 0.171571 0.0 0.468641 0.280236 0.500422 0.069698 0.076488 0.107499 0.0 0.107499 0.245738 0.374935 0.628940 0.203058 0.058201 0.168129 0.945877 0.881177 0.581597 0.855125 0.0 0.163682 0.164810 0.161913 0.113170 0.140879 0.154921 0.0 0.154921 0.196574 0.188357 0.225318 0.103181 0.116674 0.147162 0.076044 0.211780 0.266563 0.200849 (RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGC... 0.009262
4 0.490417 0.174153 0.610419 0.170070 0.0 0.458582 0.273938 0.490682 0.066518 0.074388 0.107055 0.0 0.107055 0.240915 0.406132 0.606864 0.194877 0.057550 0.163871 0.969048 0.880833 0.573865 0.857818 0.0 0.161688 0.163494 0.159640 0.112759 0.137111 0.161020 0.0 0.161020 0.190680 0.167057 0.251890 0.104574 0.115887 0.145551 0.073908 0.208859 0.263075 0.197193 (RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGC... 0.001948

In [96]:
"""
  'methylation_tssDistance',
       'methylation_genesDistance', 'methylation_exonsDistance',
       'methylation_intronsDistance', 'methylation_promoterDistance',
       'methylation_cgiDistance', 'methylation_ctcfDistance',
       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',
       'methylation_geneDistalRegulatoryModulesDistance',
       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',
       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',
       'methylation_geneDistalRegulatoryModulesK562Distance',
       'methylation_hypoInHues64Distance', 'methylation_intergenic',
       'methylation_shore', 'methylation_shelf'

"""


Out[96]:
"\n  'methylation_tssDistance',\n       'methylation_genesDistance', 'methylation_exonsDistance',\n       'methylation_intronsDistance', 'methylation_promoterDistance',\n       'methylation_cgiDistance', 'methylation_ctcfDistance',\n       'methylation_ctcfUpDistance', 'methylation_ctcfDownDistance',\n       'methylation_geneDistalRegulatoryModulesDistance',\n       'methylation_vistaEnhancersDistance', 'methylation_3PrimeUTRDistance',\n       'methylation_5PrimeUTRDistance', 'methylation_firstExonDistance',\n       'methylation_geneDistalRegulatoryModulesK562Distance',\n       'methylation_hypoInHues64Distance', 'methylation_intergenic',\n       'methylation_shore', 'methylation_shelf'\n\n"

In [97]:
pairs1 = pairs1[["filename", "methylation_difference"]]
pairs2 = pairs2[["filename", "PDR_difference"]]
pairs3 = pairs3[["filename", "methylation_unweighted_difference"]]
pairs4 = pairs4[["filename", "PDR_unweighted_difference"]]
pairs5 = pairs5[["filename", "methylation_tssDistance_difference"]]
pairs6 = pairs6[["filename", "methylation_genesDistance_difference"]]
pairs7 = pairs7[["filename", "methylation_exonsDistance_difference"]]
pairs8 = pairs8[["filename", "methylation_intronsDistance_difference"]]
pairs9 = pairs9[["filename", "methylation_promoterDistance_difference"]]
pairs10 = pairs10[["filename", "methylation_cgiDistance_difference"]]
pairs11 = pairs11[["filename", "methylation_ctcfDistance_difference"]]
pairs12 = pairs12[["filename", "methylation_ctcfUpDistance_difference"]]
pairs13 = pairs13[["filename", "methylation_ctcfDownDistance_difference"]]
pairs14 = pairs14[["filename", "methylation_geneDistalRegulatoryModulesDistance"]]
pairs15 = pairs15[["filename", "methylation_vistaEnhancersDistance_difference"]]
pairs16 = pairs16[["filename", "methylation_3PrimeUTRDistance_difference"]]
pairs17 = pairs17[["filename", "methylation_5PrimeUTRDistance_difference"]]
pairs18 = pairs18[["filename", "methylation_firstExonDistance_difference"]]
pairs19 = pairs19[["filename", "methylation_geneDistalRegulatoryModulesK562Distance_difference"]]
pairs20 = pairs20[["filename", "methylation_hypoInHues64Distance_difference"]]
pairs21 = pairs21[["filename", "methylation_intergenic_difference"]]
pairs22 = pairs22[["filename", "methylation_shore_difference"]]
pairs23 = pairs23[["filename", "methylation_shelf_difference"]]
pairs24 = pairs24[["filename", "PDR_tssDistance_difference"]]
pairs25 = pairs25[["filename", "PDR_genesDistance_difference"]]
pairs26 = pairs26[["filename", "PDR_exonsDistance_difference"]]
pairs27 = pairs27[["filename", "PDR_intronsDistance_difference"]]
pairs28 = pairs28[["filename", "PDR_promoterDistance_difference"]]
pairs29 = pairs29[["filename", "PDR_cgiDistance_difference"]]
pairs30 = pairs30[["filename", "PDR_ctcfDistance_difference"]]
pairs31 = pairs31[["filename", "PDR_ctcfUpDistance_difference"]]
pairs32 = pairs32[["filename", "PDR_ctcfDownDistance_difference"]]
pairs33 = pairs33[["filename", "PDR_geneDistalRegulatoryModulesDistance"]]
pairs34 = pairs34[["filename", "PDR_vistaEnhancersDistance_difference"]]
pairs35 = pairs35[["filename", "PDR_3PrimeUTRDistance_difference"]]
pairs36 = pairs36[["filename", "PDR_5PrimeUTRDistance_difference"]]
pairs37 = pairs37[["filename", "PDR_firstExonDistance_difference"]]
pairs38 = pairs38[["filename", "PDR_geneDistalRegulatoryModulesK562Distance_difference"]]
pairs39 = pairs39[["filename", "PDR_hypoInHues64Distance_difference"]]
pairs40 = pairs40[["filename", "PDR_intergenic_difference"]]
pairs41 = pairs41[["filename", "PDR_shore_difference"]]
pairs42 = pairs42[["filename", "PDR_shelf_difference"]]

In [98]:
pairs_total = [pairs1, pairs2, pairs3, pairs4, pairs5, pairs6, pairs7, pairs8, pairs9, pairs10,
               pairs11, pairs12, pairs13, pairs14, pairs15, pairs16, pairs17, pairs18, pairs19, pairs20,
               pairs21, pairs22, pairs23, pairs24, pairs25, pairs26, pairs27, pairs28, pairs29, pairs30,
               pairs31, pairs32, pairs33, pairs34, pairs35, pairs36, pairs37, pairs38, pairs39, pairs40,
               pairs41, pairs42]

In [99]:
total_normal_pairs = pd.concat([df.set_index("filename") for df in pairs_total], axis=1).reset_index()

In [100]:
total_normal_pairs.shape


Out[100]:
(153, 43)

In [101]:
total_normal_pairs.to_csv("total_normal_pairs_NormalBCD19pCD27mcell1_22.csv", index=False)

In [ ]:


In [ ]: