notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir("/Users/evanbiederstedt/Downloads/RRBS_data_files")



In [3]:

    
"""
allStats.csv
cd19_cpg3.csv
cd19_cpg2.csv
cd19_cpg1.csv
all_files450.csv
pcell_cpg3.csv
pcell_cpg2.csv
mcell_cpg3.csv
mcell_cpg2.csv
pcell_cpg1.csv
mcell_cpg1.csv
trito3.csv
trito2.csv
cw154_cpgs_test3.csv
cw154_cpgs_test2.csv
trito1.csv
cw154_cpgs_test1.csv

"""









    Out[3]:





'\nallStats.csv\ncd19_cpg3.csv\ncd19_cpg2.csv\ncd19_cpg1.csv\nall_files450.csv\npcell_cpg3.csv\npcell_cpg2.csv\nmcell_cpg3.csv\nmcell_cpg2.csv\npcell_cpg1.csv\nmcell_cpg1.csv\ntrito3.csv\ntrito2.csv\ncw154_cpgs_test3.csv\ncw154_cpgs_test2.csv\ntrito1.csv\ncw154_cpgs_test1.csv\n\n'



In [4]:

    
df1 = pd.read_csv("cd19_cpg1.csv")
df2 = pd.read_csv("cd19_cpg2.csv")
df3 = pd.read_csv("cd19_cpg3.csv")



In [5]:

    
df1 = df1.drop(["Unnamed: 0"], axis=1)
df2 = df2.drop(["Unnamed: 0"], axis=1)
df3 = df3.drop(["Unnamed: 0"], axis=1)



In [6]:

    
#df1 = df1.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})



In [7]:

    
df1.head()









    Out[7]:






  
    
      
      PDR_per_stack
      Unnamed: 0.1
      avgReadCpGs
      filename
      methReadCount
      methylation
      mixedReadCount
      read_stack_ID
      thisMeth
      thisUnmeth
      total_cpg_no_filter
      total_reads
      unmethReadCount
    
  
  
    
      0
      38535.155429
      1.226859e+10
      830087.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      2964863.0
      97458.161898
      1812703.0
      1.213268e+10
      4007620.0
      3247989.0
      830087.0
      7255609.0
      2478043.0
    
    
      1
      7775.252368
      7.026188e+08
      205444.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      433354.0
      21222.416634
      268399.0
      6.906199e+08
      576341.0
      676915.0
      205444.0
      1253256.0
      551503.0
    
    
      2
      12938.138525
      2.224412e+09
      362762.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      1141733.0
      40648.729308
      553762.0
      2.202570e+09
      1456949.0
      1358920.0
      362762.0
      2815869.0
      1120374.0
    
    
      3
      32980.046623
      8.821705e+09
      687749.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      2622515.0
      86538.210407
      1538888.0
      8.735350e+09
      3527738.0
      2536187.0
      687749.0
      6063925.0
      1902522.0
    
    
      4
      10115.626759
      1.773965e+09
      306597.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      1373800.0
      40266.419425
      500640.0
      1.758527e+09
      1655479.0
      1154831.0
      306597.0
      2810310.0
      935870.0



In [8]:

    
df1 = df1[["filename", "total_cpg_no_filter"]]



In [9]:

    
df2.head()









    Out[9]:






  
    
      
      PDR_per_stack
      Unnamed: 0.1
      avgReadCpGs
      filename
      methReadCount
      methylation
      mixedReadCount
      read_stack_ID
      thisMeth
      thisUnmeth
      total_cpg_gtrthan1
      total_reads
      unmethReadCount
    
  
  
    
      0
      38529.608673
      1.226108e+10
      829995.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      2964125.0
      97397.609858
      1812653.0
      1.212525e+10
      4006850.0
      3247545.0
      829995.0
      7254395.0
      2477617.0
    
    
      1
      7774.242844
      7.024402e+08
      205433.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      433314.0
      21214.416634
      268393.0
      6.904442e+08
      576296.0
      676896.0
      205433.0
      1253192.0
      551485.0
    
    
      2
      12936.341116
      2.223505e+09
      362736.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      1141560.0
      40631.229308
      553748.0
      2.201672e+09
      1456764.0
      1358856.0
      362736.0
      2815620.0
      1120312.0
    
    
      3
      32972.331551
      8.816508e+09
      687666.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      2621843.0
      86470.247293
      1538815.0
      8.730204e+09
      3527017.0
      2535995.0
      687666.0
      6063012.0
      1902354.0
    
    
      4
      10114.558267
      1.773191e+09
      306576.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      1373559.0
      40246.419425
      500629.0
      1.757760e+09
      1655227.0
      1154828.0
      306576.0
      2810055.0
      935867.0



In [10]:

    
df2 = df2[["filename", "total_cpg_gtrthan1"]]



In [11]:

    
df3.head()









    Out[11]:






  
    
      
      PDR_per_stack
      Unnamed: 0.1
      avgReadCpGs
      filename
      methReadCount
      methylation
      mixedReadCount
      read_stack_ID
      thisMeth
      thisUnmeth
      total_cpg_gtrthan38
      total_reads
      unmethReadCount
    
  
  
    
      0
      26748.683723
      7.763325e+09
      686683.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      1897380.0
      53835.887435
      1449102.0
      7.677330e+09
      2741967.0
      2855864.0
      686683.0
      5597831.0
      2251349.0
    
    
      1
      5394.725563
      4.543323e+08
      172625.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      278247.0
      11445.886689
      220605.0
      4.465749e+08
      395297.0
      612023.0
      172625.0
      1007320.0
      508468.0
    
    
      2
      9004.718202
      1.419903e+09
      302802.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      727876.0
      22018.044138
      448766.0
      1.405962e+09
      985480.0
      1222417.0
      302802.0
      2207897.0
      1031255.0
    
    
      3
      22614.174425
      5.478791e+09
      562701.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      1643931.0
      47156.358513
      1222277.0
      5.425162e+09
      2373699.0
      2230845.0
      562701.0
      4604544.0
      1738336.0
    
    
      4
      7245.550208
      1.099108e+09
      250348.0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      883336.0
      21680.113138
      413797.0
      1.089543e+09
      1117063.0
      1041187.0
      250348.0
      2158250.0
      861117.0



In [12]:

    
df3 = df3[["filename", "total_cpg_gtrthan38"]]



In [13]:

    
df1.head()









    Out[13]:






  
    
      
      filename
      total_cpg_no_filter
    
  
  
    
      0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      830087.0
    
    
      1
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      205444.0
    
    
      2
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      362762.0
    
    
      3
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      687749.0
    
    
      4
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      306597.0



In [14]:

    
df2.head()









    Out[14]:






  
    
      
      filename
      total_cpg_gtrthan1
    
  
  
    
      0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      829995.0
    
    
      1
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      205433.0
    
    
      2
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      362736.0
    
    
      3
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      687666.0
    
    
      4
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      306576.0



In [15]:

    
df3.head()









    Out[15]:






  
    
      
      filename
      total_cpg_gtrthan38
    
  
  
    
      0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      686683.0
    
    
      1
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      172625.0
    
    
      2
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      302802.0
    
    
      3
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      562701.0
    
    
      4
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      250348.0



In [16]:

    
first = df1.merge(df2, on="filename")



In [17]:

    
second= first.merge(df3, on="filename")



In [18]:

    
second.head()









    Out[18]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
    
  
  
    
      0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      830087.0
      829995.0
      686683.0
    
    
      1
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      205444.0
      205433.0
      172625.0
    
    
      2
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      362762.0
      362736.0
      302802.0
    
    
      3
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      687749.0
      687666.0
      562701.0
    
    
      4
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      306597.0
      306576.0
      250348.0



In [19]:

    
dfs = [df1, df2, df3]



In [20]:

    
df1.shape









    Out[20]:





(89, 2)



In [21]:

    
df2.shape









    Out[21]:





(89, 2)



In [22]:

    
df3.shape









    Out[22]:





(89, 2)



In [23]:

    
cd19_merged = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()



In [ ]:



In [24]:

    
cd19_merged.head()









    Out[24]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
    
  
  
    
      0
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.
      830087.0
      829995.0
      686683.0
    
    
      1
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.
      205444.0
      205433.0
      172625.0
    
    
      2
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.
      362762.0
      362736.0
      302802.0
    
    
      3
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.
      687749.0
      687666.0
      562701.0
    
    
      4
      RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.
      306597.0
      306576.0
      250348.0



In [25]:

    
cd19_merged["filename"] = cd19_merged["filename"].str.replace(r'.$', '')



In [26]:

    
cd19_merged.tail()









    Out[26]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
    
  
  
    
      84
      RRBS_NormalBCD19pcell67_88_TCCTGAGC.GTTGA
      91467.0
      91461.0
      78199.0
    
    
      85
      RRBS_NormalBCD19pcell67_88_TCCTGAGC.TAGCG
      215872.0
      215857.0
      178938.0
    
    
      86
      RRBS_NormalBCD19pcell67_88_TCCTGAGC.TATCT
      446362.0
      446317.0
      376497.0
    
    
      87
      RRBS_NormalBCD19pcell67_88_TCCTGAGC.TCTCT
      189736.0
      189729.0
      156738.0
    
    
      88
      RRBS_NormalBCD19pcell67_88_TCCTGAGC.TGCTG
      9121.0
      9121.0
      7691.0



In [27]:

    
cd19_merged.to_csv("CD19_cpgs.csv")



In [ ]:



In [28]:

    
df1 = pd.read_csv("mcell_cpg1.csv")
df2 = pd.read_csv("mcell_cpg2.csv")
df3 = pd.read_csv("mcell_cpg3.csv")



In [29]:

    
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})



In [30]:

    
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]



In [31]:

    
dfs = [df1, df2, df3]

mcell_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()



In [32]:

    
len('RRBS_NormalBCD19pCD27mcell1_22')









    Out[32]:





30



In [33]:

    
mcell_cpg['protocol'] = mcell_cpg['filename'].str[:30]



In [34]:

    
mcell_cpg["filename"][mcell_cpg["protocol"] == str('RRBS_NormalBCD19pCD27mcell1_22')] = mcell_cpg["filename"].str.replace(r'.$', '')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [35]:

    
mcell_cpg.head()









    Out[35]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      protocol
    
  
  
    
      0
      RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACAACC
      4704.0
      4704.0
      3846.0
      RRBS_NormalBCD19pCD27mcell1_22
    
    
      1
      RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGCG
      156456.0
      156451.0
      130998.0
      RRBS_NormalBCD19pCD27mcell1_22
    
    
      2
      RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACGTGG
      229814.0
      229799.0
      192524.0
      RRBS_NormalBCD19pCD27mcell1_22
    
    
      3
      RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACTCAC
      503087.0
      503037.0
      415806.0
      RRBS_NormalBCD19pCD27mcell1_22
    
    
      4
      RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.AGGATG
      4385.0
      4385.0
      3606.0
      RRBS_NormalBCD19pCD27mcell1_22



In [36]:

    
mcell_cpg = mcell_cpg.drop("protocol", axis=1)



In [37]:

    
mcell_cpg.shape









    Out[37]:





(88, 4)



In [38]:

    
mcell_cpg.to_csv("mcell_cpg.csv")



In [39]:

    
df1 = pd.read_csv("pcell_cpg1.csv")
df2 = pd.read_csv("pcell_cpg2.csv")
df3 = pd.read_csv("pcell_cpg3.csv")



In [40]:

    
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})



In [41]:

    
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]



In [42]:

    
dfs = [df1, df2, df3]

pcell_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()



In [43]:

    
pcell_cpg['protocol'] = pcell_cpg['filename'].str[:30]



In [44]:

    
pcell_cpg["filename"][pcell_cpg["protocol"] == str('RRBS_NormalBCD19pCD27pcell1_22')] = pcell_cpg["filename"].str.replace(r'.$', '')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [45]:

    
pcell_cpg.head()









    Out[45]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      protocol
    
  
  
    
      0
      RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC
      403150.0
      403090.0
      330131.0
      RRBS_NormalBCD19pCD27pcell1_22
    
    
      1
      RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG
      88590.0
      88584.0
      73436.0
      RRBS_NormalBCD19pCD27pcell1_22
    
    
      2
      RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG
      2012.0
      2012.0
      1636.0
      RRBS_NormalBCD19pCD27pcell1_22
    
    
      3
      RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC
      430377.0
      430330.0
      356147.0
      RRBS_NormalBCD19pCD27pcell1_22
    
    
      4
      RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG
      215258.0
      215241.0
      179398.0
      RRBS_NormalBCD19pCD27pcell1_22



In [46]:

    
pcell_cpg = pcell_cpg.drop("protocol", axis=1)



In [47]:

    
pcell_cpg.to_csv("pcell_cpg.csv")



In [ ]:



In [48]:

    
df1 = pd.read_csv("trito1.csv")
df2 = pd.read_csv("trito2.csv")
df3 = pd.read_csv("trito3.csv")



In [49]:

    
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})



In [50]:

    
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]



In [51]:

    
dfs = [df1, df2, df3]

trito_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()



In [52]:

    
trito_cpg["filename"] = trito_cpg["filename"].str[:33]



In [53]:

    
trito_cpg.shape









    Out[53]:





(42, 4)



In [54]:

    
trito_cpg.to_csv("trito_cpg.csv")



In [ ]:



In [ ]:



In [55]:

    
df1 = pd.read_csv("cw154_cpgs_test1.csv")
df2 = pd.read_csv("cw154_cpgs_test2.csv")
df3 = pd.read_csv("cw154_cpgs_test3.csv")



In [56]:

    
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})



In [57]:

    
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]



In [58]:

    
dfs = [df1, df2, df3]

cw154_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()



In [59]:

    
cw154_cpg.shape









    Out[59]:





(66, 4)



In [60]:

    
cw154_cpg["filename"].ix[0] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC")
cw154_cpg["filename"].ix[1] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [61]:

    
cw154_cpg["filename"].ix[2] = str('RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [62]:

    
cw154_cpg["filename"].ix[3] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [63]:

    
cw154_cpg["filename"].ix[4] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [64]:

    
cw154_cpg["filename"].ix[5] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [65]:

    
cw154_cpg["filename"].ix[6] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [66]:

    
cw154_cpg["filename"].ix[7] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [67]:

    
cw154_cpg["filename"].ix[8] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [68]:

    
cw154_cpg["filename"].ix[9] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [69]:

    
cw154_cpg["filename"].ix[10] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [70]:

    
cw154_cpg["filename"].ix[11] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [71]:

    
cw154_cpg["filename"].ix[12] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [72]:

    
cw154_cpg["filename"].ix[13] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [73]:

    
cw154_cpg["filename"].ix[14] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [74]:

    
cw154_cpg["filename"].ix[15] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [75]:

    
cw154_cpg["filename"].ix[16] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [76]:

    
cw154_cpg["filename"].ix[17] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [77]:

    
cw154_cpg["filename"].ix[18] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [78]:

    
cw154_cpg["filename"].ix[19] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [79]:

    
cw154_cpg["filename"].ix[20] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [80]:

    
cw154_cpg["filename"].ix[21] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG")









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)



In [81]:

    
cw154_cpg["filename"].ix[22]









    Out[81]:





'RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.dan.an'



In [82]:

    
len("RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC")









    Out[82]:





43



In [83]:

    
cw154_cpg["protocol"] = cw154_cpg["filename"].str[:28]



In [84]:

    
cw154_cpg.tail(25)









    Out[84]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      protocol
    
  
  
    
      41
      RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan.an
      77891.0
      77882.0
      65323.0
      RRBS_cw154_Tris_protease_CTC
    
    
      42
      RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan.an
      306155.0
      306092.0
      257092.0
      RRBS_cw154_Tris_protease_CTC
    
    
      43
      RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan.an
      615705.0
      615569.0
      519604.0
      RRBS_cw154_Tris_protease_CTC
    
    
      44
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan
      739320.0
      739104.0
      625224.0
      RRBS_cw154_Tris_protease_GR_
    
    
      45
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan
      56706.0
      56695.0
      46212.0
      RRBS_cw154_Tris_protease_GR_
    
    
      46
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan
      326834.0
      326784.0
      272926.0
      RRBS_cw154_Tris_protease_GR_
    
    
      47
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan
      753993.0
      753779.0
      635135.0
      RRBS_cw154_Tris_protease_GR_
    
    
      48
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan
      784636.0
      784492.0
      663005.0
      RRBS_cw154_Tris_protease_GR_
    
    
      49
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan
      287163.0
      287118.0
      239557.0
      RRBS_cw154_Tris_protease_GR_
    
    
      50
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan
      644063.0
      643915.0
      539497.0
      RRBS_cw154_Tris_protease_GR_
    
    
      51
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan
      9786.0
      9786.0
      7866.0
      RRBS_cw154_Tris_protease_GR_
    
    
      52
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan
      720869.0
      720763.0
      600463.0
      RRBS_cw154_Tris_protease_GR_
    
    
      53
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan
      259789.0
      259749.0
      216388.0
      RRBS_cw154_Tris_protease_GR_
    
    
      54
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan
      409451.0
      409407.0
      342160.0
      RRBS_cw154_Tris_protease_GR_
    
    
      55
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan
      393366.0
      393304.0
      339646.0
      RRBS_cw154_Tris_protease_GR_
    
    
      56
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan
      376935.0
      376888.0
      309490.0
      RRBS_cw154_Tris_protease_GR_
    
    
      57
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan
      260836.0
      260789.0
      215879.0
      RRBS_cw154_Tris_protease_GR_
    
    
      58
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan
      324252.0
      324200.0
      263836.0
      RRBS_cw154_Tris_protease_GR_
    
    
      59
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan
      224514.0
      224480.0
      183768.0
      RRBS_cw154_Tris_protease_GR_
    
    
      60
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan
      363506.0
      363446.0
      297360.0
      RRBS_cw154_Tris_protease_GR_
    
    
      61
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan
      446957.0
      446899.0
      370171.0
      RRBS_cw154_Tris_protease_GR_
    
    
      62
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan
      365218.0
      365172.0
      301151.0
      RRBS_cw154_Tris_protease_GR_
    
    
      63
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan
      174703.0
      174677.0
      141958.0
      RRBS_cw154_Tris_protease_GR_
    
    
      64
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan
      423104.0
      423019.0
      349305.0
      RRBS_cw154_Tris_protease_GR_
    
    
      65
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan
      569470.0
      569373.0
      471904.0
      RRBS_cw154_Tris_protease_GR_



In [85]:

    
len("RRBS_cw154_Tris_protease_CTC")









    Out[85]:





28



In [86]:

    
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_CTC")] = cw154_cpg["filename"].str.replace(r'.an$', '')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [87]:

    
cw154_cpg









    Out[87]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      protocol
    
  
  
    
      0
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC
      731145.0
      730971.0
      609788.0
      RRBS_cw154_CutSmart_proteina
    
    
      1
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG
      202674.0
      202653.0
      168418.0
      RRBS_cw154_CutSmart_proteina
    
    
      2
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG
      451053.0
      451000.0
      374185.0
      RRBS_cw154_CutSmart_proteina
    
    
      3
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC
      719630.0
      719469.0
      598731.0
      RRBS_cw154_CutSmart_proteina
    
    
      4
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG
      799182.0
      799040.0
      668842.0
      RRBS_cw154_CutSmart_proteina
    
    
      5
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG
      551512.0
      551428.0
      454895.0
      RRBS_cw154_CutSmart_proteina
    
    
      6
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC
      746691.0
      746542.0
      617118.0
      RRBS_cw154_CutSmart_proteina
    
    
      7
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG
      659749.0
      659662.0
      548621.0
      RRBS_cw154_CutSmart_proteina
    
    
      8
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC
      772618.0
      772511.0
      642159.0
      RRBS_cw154_CutSmart_proteina
    
    
      9
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG
      454537.0
      454458.0
      373943.0
      RRBS_cw154_CutSmart_proteina
    
    
      10
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG
      485483.0
      485419.0
      403532.0
      RRBS_cw154_CutSmart_proteina
    
    
      11
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG
      9834.0
      9834.0
      8007.0
      RRBS_cw154_CutSmart_proteina
    
    
      12
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC
      470082.0
      470009.0
      386951.0
      RRBS_cw154_CutSmart_proteina
    
    
      13
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG
      406942.0
      406885.0
      335484.0
      RRBS_cw154_CutSmart_proteina
    
    
      14
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC
      579844.0
      579771.0
      476413.0
      RRBS_cw154_CutSmart_proteina
    
    
      15
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC
      257369.0
      257331.0
      211519.0
      RRBS_cw154_CutSmart_proteina
    
    
      16
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC
      512469.0
      512391.0
      426033.0
      RRBS_cw154_CutSmart_proteina
    
    
      17
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG
      541784.0
      541712.0
      452376.0
      RRBS_cw154_CutSmart_proteina
    
    
      18
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG
      15152.0
      15150.0
      12450.0
      RRBS_cw154_CutSmart_proteina
    
    
      19
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG
      404298.0
      404249.0
      334974.0
      RRBS_cw154_CutSmart_proteina
    
    
      20
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC
      791044.0
      790894.0
      657494.0
      RRBS_cw154_CutSmart_proteina
    
    
      21
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG
      653387.0
      653298.0
      544721.0
      RRBS_cw154_CutSmart_proteina
    
    
      22
      RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.dan
      475766.0
      475596.0
      398721.0
      RRBS_cw154_Tris_protease_CTC
    
    
      23
      RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.dan
      118876.0
      118851.0
      100501.0
      RRBS_cw154_Tris_protease_CTC
    
    
      24
      RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.dan
      133116.0
      133102.0
      112979.0
      RRBS_cw154_Tris_protease_CTC
    
    
      25
      RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.dan
      422977.0
      422884.0
      349345.0
      RRBS_cw154_Tris_protease_CTC
    
    
      26
      RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.dan
      281432.0
      281365.0
      236281.0
      RRBS_cw154_Tris_protease_CTC
    
    
      27
      RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.dan
      283027.0
      282977.0
      237352.0
      RRBS_cw154_Tris_protease_CTC
    
    
      28
      RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.dan
      333368.0
      333284.0
      278192.0
      RRBS_cw154_Tris_protease_CTC
    
    
      29
      RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG.dan
      12665.0
      12665.0
      10385.0
      RRBS_cw154_Tris_protease_CTC
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      36
      RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.dan
      589697.0
      589548.0
      490794.0
      RRBS_cw154_Tris_protease_CTC
    
    
      37
      RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.dan
      146093.0
      146076.0
      119380.0
      RRBS_cw154_Tris_protease_CTC
    
    
      38
      RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.dan
      430967.0
      430875.0
      357048.0
      RRBS_cw154_Tris_protease_CTC
    
    
      39
      RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.dan
      339976.0
      339902.0
      285024.0
      RRBS_cw154_Tris_protease_CTC
    
    
      40
      RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.dan
      209790.0
      209741.0
      172699.0
      RRBS_cw154_Tris_protease_CTC
    
    
      41
      RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan
      77891.0
      77882.0
      65323.0
      RRBS_cw154_Tris_protease_CTC
    
    
      42
      RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan
      306155.0
      306092.0
      257092.0
      RRBS_cw154_Tris_protease_CTC
    
    
      43
      RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan
      615705.0
      615569.0
      519604.0
      RRBS_cw154_Tris_protease_CTC
    
    
      44
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan
      739320.0
      739104.0
      625224.0
      RRBS_cw154_Tris_protease_GR_
    
    
      45
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan
      56706.0
      56695.0
      46212.0
      RRBS_cw154_Tris_protease_GR_
    
    
      46
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan
      326834.0
      326784.0
      272926.0
      RRBS_cw154_Tris_protease_GR_
    
    
      47
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan
      753993.0
      753779.0
      635135.0
      RRBS_cw154_Tris_protease_GR_
    
    
      48
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan
      784636.0
      784492.0
      663005.0
      RRBS_cw154_Tris_protease_GR_
    
    
      49
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan
      287163.0
      287118.0
      239557.0
      RRBS_cw154_Tris_protease_GR_
    
    
      50
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan
      644063.0
      643915.0
      539497.0
      RRBS_cw154_Tris_protease_GR_
    
    
      51
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan
      9786.0
      9786.0
      7866.0
      RRBS_cw154_Tris_protease_GR_
    
    
      52
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan
      720869.0
      720763.0
      600463.0
      RRBS_cw154_Tris_protease_GR_
    
    
      53
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan
      259789.0
      259749.0
      216388.0
      RRBS_cw154_Tris_protease_GR_
    
    
      54
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan
      409451.0
      409407.0
      342160.0
      RRBS_cw154_Tris_protease_GR_
    
    
      55
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan
      393366.0
      393304.0
      339646.0
      RRBS_cw154_Tris_protease_GR_
    
    
      56
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan
      376935.0
      376888.0
      309490.0
      RRBS_cw154_Tris_protease_GR_
    
    
      57
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan
      260836.0
      260789.0
      215879.0
      RRBS_cw154_Tris_protease_GR_
    
    
      58
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan
      324252.0
      324200.0
      263836.0
      RRBS_cw154_Tris_protease_GR_
    
    
      59
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan
      224514.0
      224480.0
      183768.0
      RRBS_cw154_Tris_protease_GR_
    
    
      60
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan
      363506.0
      363446.0
      297360.0
      RRBS_cw154_Tris_protease_GR_
    
    
      61
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan
      446957.0
      446899.0
      370171.0
      RRBS_cw154_Tris_protease_GR_
    
    
      62
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan
      365218.0
      365172.0
      301151.0
      RRBS_cw154_Tris_protease_GR_
    
    
      63
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan
      174703.0
      174677.0
      141958.0
      RRBS_cw154_Tris_protease_GR_
    
    
      64
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan
      423104.0
      423019.0
      349305.0
      RRBS_cw154_Tris_protease_GR_
    
    
      65
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan
      569470.0
      569373.0
      471904.0
      RRBS_cw154_Tris_protease_GR_
    
  

66 rows × 5 columns



In [88]:

    
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_CTC")] = cw154_cpg["filename"].str.replace(r'.an$', '')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [89]:

    
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_GR_")] = cw154_cpg["filename"].str.replace(r'.dan$', '')









    



/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':



In [90]:

    
cw154_cpg.shape









    Out[90]:





(66, 5)



In [91]:

    
cw154_cpg









    Out[91]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
      protocol
    
  
  
    
      0
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC
      731145.0
      730971.0
      609788.0
      RRBS_cw154_CutSmart_proteina
    
    
      1
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG
      202674.0
      202653.0
      168418.0
      RRBS_cw154_CutSmart_proteina
    
    
      2
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG
      451053.0
      451000.0
      374185.0
      RRBS_cw154_CutSmart_proteina
    
    
      3
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC
      719630.0
      719469.0
      598731.0
      RRBS_cw154_CutSmart_proteina
    
    
      4
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG
      799182.0
      799040.0
      668842.0
      RRBS_cw154_CutSmart_proteina
    
    
      5
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG
      551512.0
      551428.0
      454895.0
      RRBS_cw154_CutSmart_proteina
    
    
      6
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC
      746691.0
      746542.0
      617118.0
      RRBS_cw154_CutSmart_proteina
    
    
      7
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG
      659749.0
      659662.0
      548621.0
      RRBS_cw154_CutSmart_proteina
    
    
      8
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC
      772618.0
      772511.0
      642159.0
      RRBS_cw154_CutSmart_proteina
    
    
      9
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG
      454537.0
      454458.0
      373943.0
      RRBS_cw154_CutSmart_proteina
    
    
      10
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG
      485483.0
      485419.0
      403532.0
      RRBS_cw154_CutSmart_proteina
    
    
      11
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG
      9834.0
      9834.0
      8007.0
      RRBS_cw154_CutSmart_proteina
    
    
      12
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC
      470082.0
      470009.0
      386951.0
      RRBS_cw154_CutSmart_proteina
    
    
      13
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG
      406942.0
      406885.0
      335484.0
      RRBS_cw154_CutSmart_proteina
    
    
      14
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC
      579844.0
      579771.0
      476413.0
      RRBS_cw154_CutSmart_proteina
    
    
      15
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC
      257369.0
      257331.0
      211519.0
      RRBS_cw154_CutSmart_proteina
    
    
      16
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC
      512469.0
      512391.0
      426033.0
      RRBS_cw154_CutSmart_proteina
    
    
      17
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG
      541784.0
      541712.0
      452376.0
      RRBS_cw154_CutSmart_proteina
    
    
      18
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG
      15152.0
      15150.0
      12450.0
      RRBS_cw154_CutSmart_proteina
    
    
      19
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG
      404298.0
      404249.0
      334974.0
      RRBS_cw154_CutSmart_proteina
    
    
      20
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC
      791044.0
      790894.0
      657494.0
      RRBS_cw154_CutSmart_proteina
    
    
      21
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG
      653387.0
      653298.0
      544721.0
      RRBS_cw154_CutSmart_proteina
    
    
      22
      RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.
      475766.0
      475596.0
      398721.0
      RRBS_cw154_Tris_protease_CTC
    
    
      23
      RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.
      118876.0
      118851.0
      100501.0
      RRBS_cw154_Tris_protease_CTC
    
    
      24
      RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.
      133116.0
      133102.0
      112979.0
      RRBS_cw154_Tris_protease_CTC
    
    
      25
      RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.
      422977.0
      422884.0
      349345.0
      RRBS_cw154_Tris_protease_CTC
    
    
      26
      RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.
      281432.0
      281365.0
      236281.0
      RRBS_cw154_Tris_protease_CTC
    
    
      27
      RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.
      283027.0
      282977.0
      237352.0
      RRBS_cw154_Tris_protease_CTC
    
    
      28
      RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.
      333368.0
      333284.0
      278192.0
      RRBS_cw154_Tris_protease_CTC
    
    
      29
      RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG.
      12665.0
      12665.0
      10385.0
      RRBS_cw154_Tris_protease_CTC
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      36
      RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.
      589697.0
      589548.0
      490794.0
      RRBS_cw154_Tris_protease_CTC
    
    
      37
      RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.
      146093.0
      146076.0
      119380.0
      RRBS_cw154_Tris_protease_CTC
    
    
      38
      RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.
      430967.0
      430875.0
      357048.0
      RRBS_cw154_Tris_protease_CTC
    
    
      39
      RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.
      339976.0
      339902.0
      285024.0
      RRBS_cw154_Tris_protease_CTC
    
    
      40
      RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.
      209790.0
      209741.0
      172699.0
      RRBS_cw154_Tris_protease_CTC
    
    
      41
      RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.
      77891.0
      77882.0
      65323.0
      RRBS_cw154_Tris_protease_CTC
    
    
      42
      RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.
      306155.0
      306092.0
      257092.0
      RRBS_cw154_Tris_protease_CTC
    
    
      43
      RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.
      615705.0
      615569.0
      519604.0
      RRBS_cw154_Tris_protease_CTC
    
    
      44
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC
      739320.0
      739104.0
      625224.0
      RRBS_cw154_Tris_protease_GR_
    
    
      45
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG
      56706.0
      56695.0
      46212.0
      RRBS_cw154_Tris_protease_GR_
    
    
      46
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG
      326834.0
      326784.0
      272926.0
      RRBS_cw154_Tris_protease_GR_
    
    
      47
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC
      753993.0
      753779.0
      635135.0
      RRBS_cw154_Tris_protease_GR_
    
    
      48
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG
      784636.0
      784492.0
      663005.0
      RRBS_cw154_Tris_protease_GR_
    
    
      49
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG
      287163.0
      287118.0
      239557.0
      RRBS_cw154_Tris_protease_GR_
    
    
      50
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC
      644063.0
      643915.0
      539497.0
      RRBS_cw154_Tris_protease_GR_
    
    
      51
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG
      9786.0
      9786.0
      7866.0
      RRBS_cw154_Tris_protease_GR_
    
    
      52
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC
      720869.0
      720763.0
      600463.0
      RRBS_cw154_Tris_protease_GR_
    
    
      53
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG
      259789.0
      259749.0
      216388.0
      RRBS_cw154_Tris_protease_GR_
    
    
      54
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG
      409451.0
      409407.0
      342160.0
      RRBS_cw154_Tris_protease_GR_
    
    
      55
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG
      393366.0
      393304.0
      339646.0
      RRBS_cw154_Tris_protease_GR_
    
    
      56
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC
      376935.0
      376888.0
      309490.0
      RRBS_cw154_Tris_protease_GR_
    
    
      57
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG
      260836.0
      260789.0
      215879.0
      RRBS_cw154_Tris_protease_GR_
    
    
      58
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC
      324252.0
      324200.0
      263836.0
      RRBS_cw154_Tris_protease_GR_
    
    
      59
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC
      224514.0
      224480.0
      183768.0
      RRBS_cw154_Tris_protease_GR_
    
    
      60
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC
      363506.0
      363446.0
      297360.0
      RRBS_cw154_Tris_protease_GR_
    
    
      61
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG
      446957.0
      446899.0
      370171.0
      RRBS_cw154_Tris_protease_GR_
    
    
      62
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG
      365218.0
      365172.0
      301151.0
      RRBS_cw154_Tris_protease_GR_
    
    
      63
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG
      174703.0
      174677.0
      141958.0
      RRBS_cw154_Tris_protease_GR_
    
    
      64
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC
      423104.0
      423019.0
      349305.0
      RRBS_cw154_Tris_protease_GR_
    
    
      65
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG
      569470.0
      569373.0
      471904.0
      RRBS_cw154_Tris_protease_GR_
    
  

66 rows × 5 columns



In [92]:

    
cw154_cpg.drop("protocol", axis=1)









    Out[92]:






  
    
      
      filename
      total_cpg_no_filter
      total_cpg_gtrthan1
      total_cpg_gtrthan38
    
  
  
    
      0
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC
      731145.0
      730971.0
      609788.0
    
    
      1
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG
      202674.0
      202653.0
      168418.0
    
    
      2
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG
      451053.0
      451000.0
      374185.0
    
    
      3
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC
      719630.0
      719469.0
      598731.0
    
    
      4
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG
      799182.0
      799040.0
      668842.0
    
    
      5
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG
      551512.0
      551428.0
      454895.0
    
    
      6
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC
      746691.0
      746542.0
      617118.0
    
    
      7
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG
      659749.0
      659662.0
      548621.0
    
    
      8
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC
      772618.0
      772511.0
      642159.0
    
    
      9
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG
      454537.0
      454458.0
      373943.0
    
    
      10
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG
      485483.0
      485419.0
      403532.0
    
    
      11
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG
      9834.0
      9834.0
      8007.0
    
    
      12
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC
      470082.0
      470009.0
      386951.0
    
    
      13
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG
      406942.0
      406885.0
      335484.0
    
    
      14
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC
      579844.0
      579771.0
      476413.0
    
    
      15
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC
      257369.0
      257331.0
      211519.0
    
    
      16
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC
      512469.0
      512391.0
      426033.0
    
    
      17
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG
      541784.0
      541712.0
      452376.0
    
    
      18
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG
      15152.0
      15150.0
      12450.0
    
    
      19
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG
      404298.0
      404249.0
      334974.0
    
    
      20
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC
      791044.0
      790894.0
      657494.0
    
    
      21
      RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG
      653387.0
      653298.0
      544721.0
    
    
      22
      RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.
      475766.0
      475596.0
      398721.0
    
    
      23
      RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.
      118876.0
      118851.0
      100501.0
    
    
      24
      RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.
      133116.0
      133102.0
      112979.0
    
    
      25
      RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.
      422977.0
      422884.0
      349345.0
    
    
      26
      RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.
      281432.0
      281365.0
      236281.0
    
    
      27
      RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.
      283027.0
      282977.0
      237352.0
    
    
      28
      RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.
      333368.0
      333284.0
      278192.0
    
    
      29
      RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG.
      12665.0
      12665.0
      10385.0
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      36
      RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.
      589697.0
      589548.0
      490794.0
    
    
      37
      RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.
      146093.0
      146076.0
      119380.0
    
    
      38
      RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.
      430967.0
      430875.0
      357048.0
    
    
      39
      RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.
      339976.0
      339902.0
      285024.0
    
    
      40
      RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.
      209790.0
      209741.0
      172699.0
    
    
      41
      RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.
      77891.0
      77882.0
      65323.0
    
    
      42
      RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.
      306155.0
      306092.0
      257092.0
    
    
      43
      RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.
      615705.0
      615569.0
      519604.0
    
    
      44
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC
      739320.0
      739104.0
      625224.0
    
    
      45
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG
      56706.0
      56695.0
      46212.0
    
    
      46
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG
      326834.0
      326784.0
      272926.0
    
    
      47
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC
      753993.0
      753779.0
      635135.0
    
    
      48
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG
      784636.0
      784492.0
      663005.0
    
    
      49
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG
      287163.0
      287118.0
      239557.0
    
    
      50
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC
      644063.0
      643915.0
      539497.0
    
    
      51
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG
      9786.0
      9786.0
      7866.0
    
    
      52
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC
      720869.0
      720763.0
      600463.0
    
    
      53
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG
      259789.0
      259749.0
      216388.0
    
    
      54
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG
      409451.0
      409407.0
      342160.0
    
    
      55
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG
      393366.0
      393304.0
      339646.0
    
    
      56
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC
      376935.0
      376888.0
      309490.0
    
    
      57
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG
      260836.0
      260789.0
      215879.0
    
    
      58
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC
      324252.0
      324200.0
      263836.0
    
    
      59
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC
      224514.0
      224480.0
      183768.0
    
    
      60
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC
      363506.0
      363446.0
      297360.0
    
    
      61
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG
      446957.0
      446899.0
      370171.0
    
    
      62
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG
      365218.0
      365172.0
      301151.0
    
    
      63
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG
      174703.0
      174677.0
      141958.0
    
    
      64
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC
      423104.0
      423019.0
      349305.0
    
    
      65
      RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG
      569470.0
      569373.0
      471904.0
    
  

66 rows × 4 columns



In [93]:

    
cw154_cpg.to_csv("cw154_cpg.csv")



In [94]:

    
print(cd19_merged.shape)
print(mcell_cpg.shape)
print(pcell_cpg.shape)
print(trito_cpg.shape)
print(cw154_cpg.shape)









    



(89, 4)
(88, 4)
(90, 4)
(42, 4)
(66, 5)



In [ ]:

    
files = [mcell_cpg, pcell_cpg, trito_cpg, cw154_cpg, cd19_merged]



In [97]:

    
len("stacked_RRBS_normal_B_cell_G1_22_GGACTCCT.ACCGCG")









    Out[97]:





48



In [98]:

    
totcpg = pd.read_csv('total_CpG_filename.csv')



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	PDR_per_stack	Unnamed: 0.1	avgReadCpGs	filename	methReadCount	methylation	mixedReadCount	read_stack_ID	thisMeth	thisUnmeth	total_cpg_no_filter	total_reads	unmethReadCount
0	38535.155429	1.226859e+10	830087.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.	2964863.0	97458.161898	1812703.0	1.213268e+10	4007620.0	3247989.0	830087.0	7255609.0	2478043.0
1	7775.252368	7.026188e+08	205444.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.	433354.0	21222.416634	268399.0	6.906199e+08	576341.0	676915.0	205444.0	1253256.0	551503.0
2	12938.138525	2.224412e+09	362762.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.	1141733.0	40648.729308	553762.0	2.202570e+09	1456949.0	1358920.0	362762.0	2815869.0	1120374.0
3	32980.046623	8.821705e+09	687749.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.	2622515.0	86538.210407	1538888.0	8.735350e+09	3527738.0	2536187.0	687749.0	6063925.0	1902522.0
4	10115.626759	1.773965e+09	306597.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.	1373800.0	40266.419425	500640.0	1.758527e+09	1655479.0	1154831.0	306597.0	2810310.0	935870.0

	PDR_per_stack	Unnamed: 0.1	avgReadCpGs	filename	methReadCount	methylation	mixedReadCount	read_stack_ID	thisMeth	thisUnmeth	total_cpg_gtrthan1	total_reads	unmethReadCount
0	38529.608673	1.226108e+10	829995.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.	2964125.0	97397.609858	1812653.0	1.212525e+10	4006850.0	3247545.0	829995.0	7254395.0	2477617.0
1	7774.242844	7.024402e+08	205433.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.	433314.0	21214.416634	268393.0	6.904442e+08	576296.0	676896.0	205433.0	1253192.0	551485.0
2	12936.341116	2.223505e+09	362736.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.	1141560.0	40631.229308	553748.0	2.201672e+09	1456764.0	1358856.0	362736.0	2815620.0	1120312.0
3	32972.331551	8.816508e+09	687666.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.	2621843.0	86470.247293	1538815.0	8.730204e+09	3527017.0	2535995.0	687666.0	6063012.0	1902354.0
4	10114.558267	1.773191e+09	306576.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.	1373559.0	40246.419425	500629.0	1.757760e+09	1655227.0	1154828.0	306576.0	2810055.0	935867.0

	PDR_per_stack	Unnamed: 0.1	avgReadCpGs	filename	methReadCount	methylation	mixedReadCount	read_stack_ID	thisMeth	thisUnmeth	total_cpg_gtrthan38	total_reads	unmethReadCount
0	26748.683723	7.763325e+09	686683.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC.	1897380.0	53835.887435	1449102.0	7.677330e+09	2741967.0	2855864.0	686683.0	5597831.0	2251349.0
1	5394.725563	4.543323e+08	172625.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG.	278247.0	11445.886689	220605.0	4.465749e+08	395297.0	612023.0	172625.0	1007320.0	508468.0
2	9004.718202	1.419903e+09	302802.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG.	727876.0	22018.044138	448766.0	1.405962e+09	985480.0	1222417.0	302802.0	2207897.0	1031255.0
3	22614.174425	5.478791e+09	562701.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC.	1643931.0	47156.358513	1222277.0	5.425162e+09	2373699.0	2230845.0	562701.0	4604544.0	1738336.0
4	7245.550208	1.099108e+09	250348.0	RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG.	883336.0	21680.113138	413797.0	1.089543e+09	1117063.0	1041187.0	250348.0	2158250.0	861117.0

	filename	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38
84	RRBS_NormalBCD19pcell67_88_TCCTGAGC.GTTGA	91467.0	91461.0	78199.0
85	RRBS_NormalBCD19pcell67_88_TCCTGAGC.TAGCG	215872.0	215857.0	178938.0
86	RRBS_NormalBCD19pcell67_88_TCCTGAGC.TATCT	446362.0	446317.0	376497.0
87	RRBS_NormalBCD19pcell67_88_TCCTGAGC.TCTCT	189736.0	189729.0	156738.0
88	RRBS_NormalBCD19pcell67_88_TCCTGAGC.TGCTG	9121.0	9121.0	7691.0

	filename	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	protocol
0	RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACAACC	4704.0	4704.0	3846.0	RRBS_NormalBCD19pCD27mcell1_22
1	RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGCG	156456.0	156451.0	130998.0	RRBS_NormalBCD19pCD27mcell1_22
2	RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACGTGG	229814.0	229799.0	192524.0	RRBS_NormalBCD19pCD27mcell1_22
3	RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACTCAC	503087.0	503037.0	415806.0	RRBS_NormalBCD19pCD27mcell1_22
4	RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.AGGATG	4385.0	4385.0	3606.0	RRBS_NormalBCD19pCD27mcell1_22

	filename	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	protocol
0	RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC	403150.0	403090.0	330131.0	RRBS_NormalBCD19pCD27pcell1_22
1	RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG	88590.0	88584.0	73436.0	RRBS_NormalBCD19pCD27pcell1_22
2	RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG	2012.0	2012.0	1636.0	RRBS_NormalBCD19pCD27pcell1_22
3	RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC	430377.0	430330.0	356147.0	RRBS_NormalBCD19pCD27pcell1_22
4	RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG	215258.0	215241.0	179398.0	RRBS_NormalBCD19pCD27pcell1_22

	filename	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	protocol
41	RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan.an	77891.0	77882.0	65323.0	RRBS_cw154_Tris_protease_CTC
42	RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan.an	306155.0	306092.0	257092.0	RRBS_cw154_Tris_protease_CTC
43	RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan.an	615705.0	615569.0	519604.0	RRBS_cw154_Tris_protease_CTC
44	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan	739320.0	739104.0	625224.0	RRBS_cw154_Tris_protease_GR_
45	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan	56706.0	56695.0	46212.0	RRBS_cw154_Tris_protease_GR_
46	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan	326834.0	326784.0	272926.0	RRBS_cw154_Tris_protease_GR_
47	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan	753993.0	753779.0	635135.0	RRBS_cw154_Tris_protease_GR_
48	RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan	784636.0	784492.0	663005.0	RRBS_cw154_Tris_protease_GR_
49	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan	287163.0	287118.0	239557.0	RRBS_cw154_Tris_protease_GR_
50	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan	644063.0	643915.0	539497.0	RRBS_cw154_Tris_protease_GR_
51	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan	9786.0	9786.0	7866.0	RRBS_cw154_Tris_protease_GR_
52	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan	720869.0	720763.0	600463.0	RRBS_cw154_Tris_protease_GR_
53	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan	259789.0	259749.0	216388.0	RRBS_cw154_Tris_protease_GR_
54	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan	409451.0	409407.0	342160.0	RRBS_cw154_Tris_protease_GR_
55	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan	393366.0	393304.0	339646.0	RRBS_cw154_Tris_protease_GR_
56	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan	376935.0	376888.0	309490.0	RRBS_cw154_Tris_protease_GR_
57	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan	260836.0	260789.0	215879.0	RRBS_cw154_Tris_protease_GR_
58	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan	324252.0	324200.0	263836.0	RRBS_cw154_Tris_protease_GR_
59	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan	224514.0	224480.0	183768.0	RRBS_cw154_Tris_protease_GR_
60	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan	363506.0	363446.0	297360.0	RRBS_cw154_Tris_protease_GR_
61	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan	446957.0	446899.0	370171.0	RRBS_cw154_Tris_protease_GR_
62	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan	365218.0	365172.0	301151.0	RRBS_cw154_Tris_protease_GR_
63	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan	174703.0	174677.0	141958.0	RRBS_cw154_Tris_protease_GR_
64	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan	423104.0	423019.0	349305.0	RRBS_cw154_Tris_protease_GR_
65	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan	569470.0	569373.0	471904.0	RRBS_cw154_Tris_protease_GR_

	filename	total_cpg_no_filter	total_cpg_gtrthan1	total_cpg_gtrthan38	protocol
0	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC	731145.0	730971.0	609788.0	RRBS_cw154_CutSmart_proteina
1	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG	202674.0	202653.0	168418.0	RRBS_cw154_CutSmart_proteina
2	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG	451053.0	451000.0	374185.0	RRBS_cw154_CutSmart_proteina
3	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC	719630.0	719469.0	598731.0	RRBS_cw154_CutSmart_proteina
4	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG	799182.0	799040.0	668842.0	RRBS_cw154_CutSmart_proteina
5	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG	551512.0	551428.0	454895.0	RRBS_cw154_CutSmart_proteina
6	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC	746691.0	746542.0	617118.0	RRBS_cw154_CutSmart_proteina
7	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG	659749.0	659662.0	548621.0	RRBS_cw154_CutSmart_proteina
8	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC	772618.0	772511.0	642159.0	RRBS_cw154_CutSmart_proteina
9	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG	454537.0	454458.0	373943.0	RRBS_cw154_CutSmart_proteina
10	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG	485483.0	485419.0	403532.0	RRBS_cw154_CutSmart_proteina
11	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG	9834.0	9834.0	8007.0	RRBS_cw154_CutSmart_proteina
12	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC	470082.0	470009.0	386951.0	RRBS_cw154_CutSmart_proteina
13	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG	406942.0	406885.0	335484.0	RRBS_cw154_CutSmart_proteina
14	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC	579844.0	579771.0	476413.0	RRBS_cw154_CutSmart_proteina
15	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC	257369.0	257331.0	211519.0	RRBS_cw154_CutSmart_proteina
16	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC	512469.0	512391.0	426033.0	RRBS_cw154_CutSmart_proteina
17	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG	541784.0	541712.0	452376.0	RRBS_cw154_CutSmart_proteina
18	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG	15152.0	15150.0	12450.0	RRBS_cw154_CutSmart_proteina
19	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG	404298.0	404249.0	334974.0	RRBS_cw154_CutSmart_proteina
20	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC	791044.0	790894.0	657494.0	RRBS_cw154_CutSmart_proteina
21	RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG	653387.0	653298.0	544721.0	RRBS_cw154_CutSmart_proteina
22	RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.dan	475766.0	475596.0	398721.0	RRBS_cw154_Tris_protease_CTC
23	RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.dan	118876.0	118851.0	100501.0	RRBS_cw154_Tris_protease_CTC
24	RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.dan	133116.0	133102.0	112979.0	RRBS_cw154_Tris_protease_CTC
25	RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.dan	422977.0	422884.0	349345.0	RRBS_cw154_Tris_protease_CTC
26	RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.dan	281432.0	281365.0	236281.0	RRBS_cw154_Tris_protease_CTC
27	RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.dan	283027.0	282977.0	237352.0	RRBS_cw154_Tris_protease_CTC
28	RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.dan	333368.0	333284.0	278192.0	RRBS_cw154_Tris_protease_CTC
29	RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG.dan	12665.0	12665.0	10385.0	RRBS_cw154_Tris_protease_CTC
...	...	...	...	...	...
36	RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.dan	589697.0	589548.0	490794.0	RRBS_cw154_Tris_protease_CTC
37	RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.dan	146093.0	146076.0	119380.0	RRBS_cw154_Tris_protease_CTC
38	RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.dan	430967.0	430875.0	357048.0	RRBS_cw154_Tris_protease_CTC
39	RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.dan	339976.0	339902.0	285024.0	RRBS_cw154_Tris_protease_CTC
40	RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.dan	209790.0	209741.0	172699.0	RRBS_cw154_Tris_protease_CTC
41	RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan	77891.0	77882.0	65323.0	RRBS_cw154_Tris_protease_CTC
42	RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan	306155.0	306092.0	257092.0	RRBS_cw154_Tris_protease_CTC
43	RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan	615705.0	615569.0	519604.0	RRBS_cw154_Tris_protease_CTC
44	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan	739320.0	739104.0	625224.0	RRBS_cw154_Tris_protease_GR_
45	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan	56706.0	56695.0	46212.0	RRBS_cw154_Tris_protease_GR_
46	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan	326834.0	326784.0	272926.0	RRBS_cw154_Tris_protease_GR_
47	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan	753993.0	753779.0	635135.0	RRBS_cw154_Tris_protease_GR_
48	RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan	784636.0	784492.0	663005.0	RRBS_cw154_Tris_protease_GR_
49	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan	287163.0	287118.0	239557.0	RRBS_cw154_Tris_protease_GR_
50	RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan	644063.0	643915.0	539497.0	RRBS_cw154_Tris_protease_GR_
51	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan	9786.0	9786.0	7866.0	RRBS_cw154_Tris_protease_GR_
52	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan	720869.0	720763.0	600463.0	RRBS_cw154_Tris_protease_GR_
53	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan	259789.0	259749.0	216388.0	RRBS_cw154_Tris_protease_GR_
54	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan	409451.0	409407.0	342160.0	RRBS_cw154_Tris_protease_GR_
55	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan	393366.0	393304.0	339646.0	RRBS_cw154_Tris_protease_GR_
56	RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan	376935.0	376888.0	309490.0	RRBS_cw154_Tris_protease_GR_
57	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan	260836.0	260789.0	215879.0	RRBS_cw154_Tris_protease_GR_
58	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan	324252.0	324200.0	263836.0	RRBS_cw154_Tris_protease_GR_
59	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan	224514.0	224480.0	183768.0	RRBS_cw154_Tris_protease_GR_
60	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan	363506.0	363446.0	297360.0	RRBS_cw154_Tris_protease_GR_
61	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan	446957.0	446899.0	370171.0	RRBS_cw154_Tris_protease_GR_
62	RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan	365218.0	365172.0	301151.0	RRBS_cw154_Tris_protease_GR_
63	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan	174703.0	174677.0	141958.0	RRBS_cw154_Tris_protease_GR_
64	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan	423104.0	423019.0	349305.0	RRBS_cw154_Tris_protease_GR_
65	RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan	569470.0	569373.0	471904.0	RRBS_cw154_Tris_protease_GR_