In [1]:
%matplotlib inline

In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # print all rows


import os
os.chdir("/Users/evanbiederstedt/Downloads/RRBS_data_files")

In [3]:
"""
allStats.csv
cd19_cpg3.csv
cd19_cpg2.csv
cd19_cpg1.csv
all_files450.csv
pcell_cpg3.csv
pcell_cpg2.csv
mcell_cpg3.csv
mcell_cpg2.csv
pcell_cpg1.csv
mcell_cpg1.csv
trito3.csv
trito2.csv
cw154_cpgs_test3.csv
cw154_cpgs_test2.csv
trito1.csv
cw154_cpgs_test1.csv

"""


Out[3]:
'\nallStats.csv\ncd19_cpg3.csv\ncd19_cpg2.csv\ncd19_cpg1.csv\nall_files450.csv\npcell_cpg3.csv\npcell_cpg2.csv\nmcell_cpg3.csv\nmcell_cpg2.csv\npcell_cpg1.csv\nmcell_cpg1.csv\ntrito3.csv\ntrito2.csv\ncw154_cpgs_test3.csv\ncw154_cpgs_test2.csv\ntrito1.csv\ncw154_cpgs_test1.csv\n\n'

In [4]:
df1 = pd.read_csv("cd19_cpg1.csv")
df2 = pd.read_csv("cd19_cpg2.csv")
df3 = pd.read_csv("cd19_cpg3.csv")

In [5]:
df1 = df1.drop(["Unnamed: 0"], axis=1)
df2 = df2.drop(["Unnamed: 0"], axis=1)
df3 = df3.drop(["Unnamed: 0"], axis=1)

In [6]:
#df1 = df1.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})

In [7]:
df1.head()


Out[7]:
PDR_per_stack Unnamed: 0.1 avgReadCpGs filename methReadCount methylation mixedReadCount read_stack_ID thisMeth thisUnmeth total_cpg_no_filter total_reads unmethReadCount
0 38535.155429 1.226859e+10 830087.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 2964863.0 97458.161898 1812703.0 1.213268e+10 4007620.0 3247989.0 830087.0 7255609.0 2478043.0
1 7775.252368 7.026188e+08 205444.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 433354.0 21222.416634 268399.0 6.906199e+08 576341.0 676915.0 205444.0 1253256.0 551503.0
2 12938.138525 2.224412e+09 362762.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 1141733.0 40648.729308 553762.0 2.202570e+09 1456949.0 1358920.0 362762.0 2815869.0 1120374.0
3 32980.046623 8.821705e+09 687749.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 2622515.0 86538.210407 1538888.0 8.735350e+09 3527738.0 2536187.0 687749.0 6063925.0 1902522.0
4 10115.626759 1.773965e+09 306597.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 1373800.0 40266.419425 500640.0 1.758527e+09 1655479.0 1154831.0 306597.0 2810310.0 935870.0

In [8]:
df1 = df1[["filename", "total_cpg_no_filter"]]

In [9]:
df2.head()


Out[9]:
PDR_per_stack Unnamed: 0.1 avgReadCpGs filename methReadCount methylation mixedReadCount read_stack_ID thisMeth thisUnmeth total_cpg_gtrthan1 total_reads unmethReadCount
0 38529.608673 1.226108e+10 829995.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 2964125.0 97397.609858 1812653.0 1.212525e+10 4006850.0 3247545.0 829995.0 7254395.0 2477617.0
1 7774.242844 7.024402e+08 205433.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 433314.0 21214.416634 268393.0 6.904442e+08 576296.0 676896.0 205433.0 1253192.0 551485.0
2 12936.341116 2.223505e+09 362736.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 1141560.0 40631.229308 553748.0 2.201672e+09 1456764.0 1358856.0 362736.0 2815620.0 1120312.0
3 32972.331551 8.816508e+09 687666.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 2621843.0 86470.247293 1538815.0 8.730204e+09 3527017.0 2535995.0 687666.0 6063012.0 1902354.0
4 10114.558267 1.773191e+09 306576.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 1373559.0 40246.419425 500629.0 1.757760e+09 1655227.0 1154828.0 306576.0 2810055.0 935867.0

In [10]:
df2 = df2[["filename", "total_cpg_gtrthan1"]]

In [11]:
df3.head()


Out[11]:
PDR_per_stack Unnamed: 0.1 avgReadCpGs filename methReadCount methylation mixedReadCount read_stack_ID thisMeth thisUnmeth total_cpg_gtrthan38 total_reads unmethReadCount
0 26748.683723 7.763325e+09 686683.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 1897380.0 53835.887435 1449102.0 7.677330e+09 2741967.0 2855864.0 686683.0 5597831.0 2251349.0
1 5394.725563 4.543323e+08 172625.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 278247.0 11445.886689 220605.0 4.465749e+08 395297.0 612023.0 172625.0 1007320.0 508468.0
2 9004.718202 1.419903e+09 302802.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 727876.0 22018.044138 448766.0 1.405962e+09 985480.0 1222417.0 302802.0 2207897.0 1031255.0
3 22614.174425 5.478791e+09 562701.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 1643931.0 47156.358513 1222277.0 5.425162e+09 2373699.0 2230845.0 562701.0 4604544.0 1738336.0
4 7245.550208 1.099108e+09 250348.0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 883336.0 21680.113138 413797.0 1.089543e+09 1117063.0 1041187.0 250348.0 2158250.0 861117.0

In [12]:
df3 = df3[["filename", "total_cpg_gtrthan38"]]

In [13]:
df1.head()


Out[13]:
filename total_cpg_no_filter
0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 830087.0
1 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 205444.0
2 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 362762.0
3 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 687749.0
4 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 306597.0

In [14]:
df2.head()


Out[14]:
filename total_cpg_gtrthan1
0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 829995.0
1 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 205433.0
2 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 362736.0
3 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 687666.0
4 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 306576.0

In [15]:
df3.head()


Out[15]:
filename total_cpg_gtrthan38
0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 686683.0
1 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 172625.0
2 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 302802.0
3 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 562701.0
4 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 250348.0

In [16]:
first = df1.merge(df2, on="filename")

In [17]:
second= first.merge(df3, on="filename")

In [18]:
second.head()


Out[18]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38
0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 830087.0 829995.0 686683.0
1 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 205444.0 205433.0 172625.0
2 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 362762.0 362736.0 302802.0
3 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 687749.0 687666.0 562701.0
4 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 306597.0 306576.0 250348.0

In [19]:
dfs = [df1, df2, df3]

In [20]:
df1.shape


Out[20]:
(89, 2)

In [21]:
df2.shape


Out[21]:
(89, 2)

In [22]:
df3.shape


Out[22]:
(89, 2)

In [23]:
cd19_merged = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()

In [ ]:


In [24]:
cd19_merged.head()


Out[24]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38
0 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACAACC. 830087.0 829995.0 686683.0
1 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACCGCG. 205444.0 205433.0 172625.0
2 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACGTGG. 362762.0 362736.0 302802.0
3 RRBS_NormalBCD19pcell1_22_TAAGGCGA.ACTCAC. 687749.0 687666.0 562701.0
4 RRBS_NormalBCD19pcell1_22_TAAGGCGA.AGGATG. 306597.0 306576.0 250348.0

In [25]:
cd19_merged["filename"] = cd19_merged["filename"].str.replace(r'.$', '')

In [26]:
cd19_merged.tail()


Out[26]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38
84 RRBS_NormalBCD19pcell67_88_TCCTGAGC.GTTGA 91467.0 91461.0 78199.0
85 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TAGCG 215872.0 215857.0 178938.0
86 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TATCT 446362.0 446317.0 376497.0
87 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TCTCT 189736.0 189729.0 156738.0
88 RRBS_NormalBCD19pcell67_88_TCCTGAGC.TGCTG 9121.0 9121.0 7691.0

In [27]:
cd19_merged.to_csv("CD19_cpgs.csv")

In [ ]:


In [28]:
df1 = pd.read_csv("mcell_cpg1.csv")
df2 = pd.read_csv("mcell_cpg2.csv")
df3 = pd.read_csv("mcell_cpg3.csv")

In [29]:
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})

In [30]:
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]

In [31]:
dfs = [df1, df2, df3]

mcell_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()

In [32]:
len('RRBS_NormalBCD19pCD27mcell1_22')


Out[32]:
30

In [33]:
mcell_cpg['protocol'] = mcell_cpg['filename'].str[:30]

In [34]:
mcell_cpg["filename"][mcell_cpg["protocol"] == str('RRBS_NormalBCD19pCD27mcell1_22')] = mcell_cpg["filename"].str.replace(r'.$', '')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [35]:
mcell_cpg.head()


Out[35]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 protocol
0 RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACAACC 4704.0 4704.0 3846.0 RRBS_NormalBCD19pCD27mcell1_22
1 RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACCGCG 156456.0 156451.0 130998.0 RRBS_NormalBCD19pCD27mcell1_22
2 RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACGTGG 229814.0 229799.0 192524.0 RRBS_NormalBCD19pCD27mcell1_22
3 RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.ACTCAC 503087.0 503037.0 415806.0 RRBS_NormalBCD19pCD27mcell1_22
4 RRBS_NormalBCD19pCD27mcell1_22_CGAGGCTG.AGGATG 4385.0 4385.0 3606.0 RRBS_NormalBCD19pCD27mcell1_22

In [36]:
mcell_cpg = mcell_cpg.drop("protocol", axis=1)

In [37]:
mcell_cpg.shape


Out[37]:
(88, 4)

In [38]:
mcell_cpg.to_csv("mcell_cpg.csv")

In [39]:
df1 = pd.read_csv("pcell_cpg1.csv")
df2 = pd.read_csv("pcell_cpg2.csv")
df3 = pd.read_csv("pcell_cpg3.csv")

In [40]:
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})

In [41]:
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]

In [42]:
dfs = [df1, df2, df3]

pcell_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()

In [43]:
pcell_cpg['protocol'] = pcell_cpg['filename'].str[:30]

In [44]:
pcell_cpg["filename"][pcell_cpg["protocol"] == str('RRBS_NormalBCD19pCD27pcell1_22')] = pcell_cpg["filename"].str.replace(r'.$', '')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [45]:
pcell_cpg.head()


Out[45]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 protocol
0 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACAACC 403150.0 403090.0 330131.0 RRBS_NormalBCD19pCD27pcell1_22
1 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACCGCG 88590.0 88584.0 73436.0 RRBS_NormalBCD19pCD27pcell1_22
2 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACGTGG 2012.0 2012.0 1636.0 RRBS_NormalBCD19pCD27pcell1_22
3 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.ACTCAC 430377.0 430330.0 356147.0 RRBS_NormalBCD19pCD27pcell1_22
4 RRBS_NormalBCD19pCD27pcell1_22_TAGGCATG.AGGATG 215258.0 215241.0 179398.0 RRBS_NormalBCD19pCD27pcell1_22

In [46]:
pcell_cpg = pcell_cpg.drop("protocol", axis=1)

In [47]:
pcell_cpg.to_csv("pcell_cpg.csv")

In [ ]:


In [48]:
df1 = pd.read_csv("trito1.csv")
df2 = pd.read_csv("trito2.csv")
df3 = pd.read_csv("trito3.csv")

In [49]:
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})

In [50]:
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]

In [51]:
dfs = [df1, df2, df3]

trito_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()

In [52]:
trito_cpg["filename"] = trito_cpg["filename"].str[:33]

In [53]:
trito_cpg.shape


Out[53]:
(42, 4)

In [54]:
trito_cpg.to_csv("trito_cpg.csv")

In [ ]:


In [ ]:


In [55]:
df1 = pd.read_csv("cw154_cpgs_test1.csv")
df2 = pd.read_csv("cw154_cpgs_test2.csv")
df3 = pd.read_csv("cw154_cpgs_test3.csv")

In [56]:
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})

In [57]:
df1 = df1[["filename", "total_cpg_no_filter"]]
df2 = df2[["filename", "total_cpg_gtrthan1"]]
df3 = df3[["filename", "total_cpg_gtrthan38"]]

In [58]:
dfs = [df1, df2, df3]

cw154_cpg = pd.concat([df.set_index('filename') for df in dfs], axis=1).reset_index()

In [59]:
cw154_cpg.shape


Out[59]:
(66, 4)

In [60]:
cw154_cpg["filename"].ix[0] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC")
cw154_cpg["filename"].ix[1] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [61]:
cw154_cpg["filename"].ix[2] = str('RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [62]:
cw154_cpg["filename"].ix[3] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [63]:
cw154_cpg["filename"].ix[4] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [64]:
cw154_cpg["filename"].ix[5] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [65]:
cw154_cpg["filename"].ix[6] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [66]:
cw154_cpg["filename"].ix[7] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [67]:
cw154_cpg["filename"].ix[8] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [68]:
cw154_cpg["filename"].ix[9] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [69]:
cw154_cpg["filename"].ix[10] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [70]:
cw154_cpg["filename"].ix[11] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [71]:
cw154_cpg["filename"].ix[12] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [72]:
cw154_cpg["filename"].ix[13] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [73]:
cw154_cpg["filename"].ix[14] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [74]:
cw154_cpg["filename"].ix[15] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [75]:
cw154_cpg["filename"].ix[16] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [76]:
cw154_cpg["filename"].ix[17] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [77]:
cw154_cpg["filename"].ix[18] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [78]:
cw154_cpg["filename"].ix[19] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [79]:
cw154_cpg["filename"].ix[20] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [80]:
cw154_cpg["filename"].ix[21] = str("RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG")


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/pandas/core/indexing.py:132: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [81]:
cw154_cpg["filename"].ix[22]


Out[81]:
'RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.dan.an'

In [82]:
len("RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC")


Out[82]:
43

In [83]:
cw154_cpg["protocol"] = cw154_cpg["filename"].str[:28]

In [84]:
cw154_cpg.tail(25)


Out[84]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 protocol
41 RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan.an 77891.0 77882.0 65323.0 RRBS_cw154_Tris_protease_CTC
42 RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan.an 306155.0 306092.0 257092.0 RRBS_cw154_Tris_protease_CTC
43 RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan.an 615705.0 615569.0 519604.0 RRBS_cw154_Tris_protease_CTC
44 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan 739320.0 739104.0 625224.0 RRBS_cw154_Tris_protease_GR_
45 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan 56706.0 56695.0 46212.0 RRBS_cw154_Tris_protease_GR_
46 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan 326834.0 326784.0 272926.0 RRBS_cw154_Tris_protease_GR_
47 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan 753993.0 753779.0 635135.0 RRBS_cw154_Tris_protease_GR_
48 RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan 784636.0 784492.0 663005.0 RRBS_cw154_Tris_protease_GR_
49 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan 287163.0 287118.0 239557.0 RRBS_cw154_Tris_protease_GR_
50 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan 644063.0 643915.0 539497.0 RRBS_cw154_Tris_protease_GR_
51 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan 9786.0 9786.0 7866.0 RRBS_cw154_Tris_protease_GR_
52 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan 720869.0 720763.0 600463.0 RRBS_cw154_Tris_protease_GR_
53 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan 259789.0 259749.0 216388.0 RRBS_cw154_Tris_protease_GR_
54 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan 409451.0 409407.0 342160.0 RRBS_cw154_Tris_protease_GR_
55 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan 393366.0 393304.0 339646.0 RRBS_cw154_Tris_protease_GR_
56 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan 376935.0 376888.0 309490.0 RRBS_cw154_Tris_protease_GR_
57 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan 260836.0 260789.0 215879.0 RRBS_cw154_Tris_protease_GR_
58 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan 324252.0 324200.0 263836.0 RRBS_cw154_Tris_protease_GR_
59 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan 224514.0 224480.0 183768.0 RRBS_cw154_Tris_protease_GR_
60 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan 363506.0 363446.0 297360.0 RRBS_cw154_Tris_protease_GR_
61 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan 446957.0 446899.0 370171.0 RRBS_cw154_Tris_protease_GR_
62 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan 365218.0 365172.0 301151.0 RRBS_cw154_Tris_protease_GR_
63 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan 174703.0 174677.0 141958.0 RRBS_cw154_Tris_protease_GR_
64 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan 423104.0 423019.0 349305.0 RRBS_cw154_Tris_protease_GR_
65 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan 569470.0 569373.0 471904.0 RRBS_cw154_Tris_protease_GR_

In [85]:
len("RRBS_cw154_Tris_protease_CTC")


Out[85]:
28

In [86]:
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_CTC")] = cw154_cpg["filename"].str.replace(r'.an$', '')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [87]:
cw154_cpg


Out[87]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 protocol
0 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC 731145.0 730971.0 609788.0 RRBS_cw154_CutSmart_proteina
1 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG 202674.0 202653.0 168418.0 RRBS_cw154_CutSmart_proteina
2 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG 451053.0 451000.0 374185.0 RRBS_cw154_CutSmart_proteina
3 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC 719630.0 719469.0 598731.0 RRBS_cw154_CutSmart_proteina
4 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG 799182.0 799040.0 668842.0 RRBS_cw154_CutSmart_proteina
5 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG 551512.0 551428.0 454895.0 RRBS_cw154_CutSmart_proteina
6 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC 746691.0 746542.0 617118.0 RRBS_cw154_CutSmart_proteina
7 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG 659749.0 659662.0 548621.0 RRBS_cw154_CutSmart_proteina
8 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC 772618.0 772511.0 642159.0 RRBS_cw154_CutSmart_proteina
9 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG 454537.0 454458.0 373943.0 RRBS_cw154_CutSmart_proteina
10 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG 485483.0 485419.0 403532.0 RRBS_cw154_CutSmart_proteina
11 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG 9834.0 9834.0 8007.0 RRBS_cw154_CutSmart_proteina
12 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC 470082.0 470009.0 386951.0 RRBS_cw154_CutSmart_proteina
13 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG 406942.0 406885.0 335484.0 RRBS_cw154_CutSmart_proteina
14 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC 579844.0 579771.0 476413.0 RRBS_cw154_CutSmart_proteina
15 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC 257369.0 257331.0 211519.0 RRBS_cw154_CutSmart_proteina
16 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC 512469.0 512391.0 426033.0 RRBS_cw154_CutSmart_proteina
17 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG 541784.0 541712.0 452376.0 RRBS_cw154_CutSmart_proteina
18 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG 15152.0 15150.0 12450.0 RRBS_cw154_CutSmart_proteina
19 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG 404298.0 404249.0 334974.0 RRBS_cw154_CutSmart_proteina
20 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC 791044.0 790894.0 657494.0 RRBS_cw154_CutSmart_proteina
21 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG 653387.0 653298.0 544721.0 RRBS_cw154_CutSmart_proteina
22 RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.dan 475766.0 475596.0 398721.0 RRBS_cw154_Tris_protease_CTC
23 RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.dan 118876.0 118851.0 100501.0 RRBS_cw154_Tris_protease_CTC
24 RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.dan 133116.0 133102.0 112979.0 RRBS_cw154_Tris_protease_CTC
25 RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.dan 422977.0 422884.0 349345.0 RRBS_cw154_Tris_protease_CTC
26 RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.dan 281432.0 281365.0 236281.0 RRBS_cw154_Tris_protease_CTC
27 RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.dan 283027.0 282977.0 237352.0 RRBS_cw154_Tris_protease_CTC
28 RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.dan 333368.0 333284.0 278192.0 RRBS_cw154_Tris_protease_CTC
29 RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG.dan 12665.0 12665.0 10385.0 RRBS_cw154_Tris_protease_CTC
... ... ... ... ... ...
36 RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.dan 589697.0 589548.0 490794.0 RRBS_cw154_Tris_protease_CTC
37 RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.dan 146093.0 146076.0 119380.0 RRBS_cw154_Tris_protease_CTC
38 RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.dan 430967.0 430875.0 357048.0 RRBS_cw154_Tris_protease_CTC
39 RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.dan 339976.0 339902.0 285024.0 RRBS_cw154_Tris_protease_CTC
40 RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.dan 209790.0 209741.0 172699.0 RRBS_cw154_Tris_protease_CTC
41 RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.dan 77891.0 77882.0 65323.0 RRBS_cw154_Tris_protease_CTC
42 RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.dan 306155.0 306092.0 257092.0 RRBS_cw154_Tris_protease_CTC
43 RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.dan 615705.0 615569.0 519604.0 RRBS_cw154_Tris_protease_CTC
44 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.dan 739320.0 739104.0 625224.0 RRBS_cw154_Tris_protease_GR_
45 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.dan 56706.0 56695.0 46212.0 RRBS_cw154_Tris_protease_GR_
46 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.dan 326834.0 326784.0 272926.0 RRBS_cw154_Tris_protease_GR_
47 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.dan 753993.0 753779.0 635135.0 RRBS_cw154_Tris_protease_GR_
48 RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.dan 784636.0 784492.0 663005.0 RRBS_cw154_Tris_protease_GR_
49 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.dan 287163.0 287118.0 239557.0 RRBS_cw154_Tris_protease_GR_
50 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.dan 644063.0 643915.0 539497.0 RRBS_cw154_Tris_protease_GR_
51 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG.dan 9786.0 9786.0 7866.0 RRBS_cw154_Tris_protease_GR_
52 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.dan 720869.0 720763.0 600463.0 RRBS_cw154_Tris_protease_GR_
53 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.dan 259789.0 259749.0 216388.0 RRBS_cw154_Tris_protease_GR_
54 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.dan 409451.0 409407.0 342160.0 RRBS_cw154_Tris_protease_GR_
55 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG.dan 393366.0 393304.0 339646.0 RRBS_cw154_Tris_protease_GR_
56 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.dan 376935.0 376888.0 309490.0 RRBS_cw154_Tris_protease_GR_
57 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.dan 260836.0 260789.0 215879.0 RRBS_cw154_Tris_protease_GR_
58 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.dan 324252.0 324200.0 263836.0 RRBS_cw154_Tris_protease_GR_
59 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.dan 224514.0 224480.0 183768.0 RRBS_cw154_Tris_protease_GR_
60 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.dan 363506.0 363446.0 297360.0 RRBS_cw154_Tris_protease_GR_
61 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.dan 446957.0 446899.0 370171.0 RRBS_cw154_Tris_protease_GR_
62 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG.dan 365218.0 365172.0 301151.0 RRBS_cw154_Tris_protease_GR_
63 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.dan 174703.0 174677.0 141958.0 RRBS_cw154_Tris_protease_GR_
64 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.dan 423104.0 423019.0 349305.0 RRBS_cw154_Tris_protease_GR_
65 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.dan 569470.0 569373.0 471904.0 RRBS_cw154_Tris_protease_GR_

66 rows × 5 columns


In [88]:
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_CTC")] = cw154_cpg["filename"].str.replace(r'.an$', '')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [89]:
cw154_cpg["filename"][cw154_cpg["protocol"] == str("RRBS_cw154_Tris_protease_GR_")] = cw154_cpg["filename"].str.replace(r'.dan$', '')


/opt/local/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [90]:
cw154_cpg.shape


Out[90]:
(66, 5)

In [91]:
cw154_cpg


Out[91]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38 protocol
0 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC 731145.0 730971.0 609788.0 RRBS_cw154_CutSmart_proteina
1 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG 202674.0 202653.0 168418.0 RRBS_cw154_CutSmart_proteina
2 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG 451053.0 451000.0 374185.0 RRBS_cw154_CutSmart_proteina
3 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC 719630.0 719469.0 598731.0 RRBS_cw154_CutSmart_proteina
4 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG 799182.0 799040.0 668842.0 RRBS_cw154_CutSmart_proteina
5 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG 551512.0 551428.0 454895.0 RRBS_cw154_CutSmart_proteina
6 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC 746691.0 746542.0 617118.0 RRBS_cw154_CutSmart_proteina
7 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG 659749.0 659662.0 548621.0 RRBS_cw154_CutSmart_proteina
8 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC 772618.0 772511.0 642159.0 RRBS_cw154_CutSmart_proteina
9 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG 454537.0 454458.0 373943.0 RRBS_cw154_CutSmart_proteina
10 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG 485483.0 485419.0 403532.0 RRBS_cw154_CutSmart_proteina
11 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG 9834.0 9834.0 8007.0 RRBS_cw154_CutSmart_proteina
12 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC 470082.0 470009.0 386951.0 RRBS_cw154_CutSmart_proteina
13 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG 406942.0 406885.0 335484.0 RRBS_cw154_CutSmart_proteina
14 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC 579844.0 579771.0 476413.0 RRBS_cw154_CutSmart_proteina
15 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC 257369.0 257331.0 211519.0 RRBS_cw154_CutSmart_proteina
16 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC 512469.0 512391.0 426033.0 RRBS_cw154_CutSmart_proteina
17 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG 541784.0 541712.0 452376.0 RRBS_cw154_CutSmart_proteina
18 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG 15152.0 15150.0 12450.0 RRBS_cw154_CutSmart_proteina
19 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG 404298.0 404249.0 334974.0 RRBS_cw154_CutSmart_proteina
20 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC 791044.0 790894.0 657494.0 RRBS_cw154_CutSmart_proteina
21 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG 653387.0 653298.0 544721.0 RRBS_cw154_CutSmart_proteina
22 RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC. 475766.0 475596.0 398721.0 RRBS_cw154_Tris_protease_CTC
23 RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG. 118876.0 118851.0 100501.0 RRBS_cw154_Tris_protease_CTC
24 RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG. 133116.0 133102.0 112979.0 RRBS_cw154_Tris_protease_CTC
25 RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC. 422977.0 422884.0 349345.0 RRBS_cw154_Tris_protease_CTC
26 RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG. 281432.0 281365.0 236281.0 RRBS_cw154_Tris_protease_CTC
27 RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG. 283027.0 282977.0 237352.0 RRBS_cw154_Tris_protease_CTC
28 RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC. 333368.0 333284.0 278192.0 RRBS_cw154_Tris_protease_CTC
29 RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG. 12665.0 12665.0 10385.0 RRBS_cw154_Tris_protease_CTC
... ... ... ... ... ...
36 RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC. 589697.0 589548.0 490794.0 RRBS_cw154_Tris_protease_CTC
37 RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC. 146093.0 146076.0 119380.0 RRBS_cw154_Tris_protease_CTC
38 RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC. 430967.0 430875.0 357048.0 RRBS_cw154_Tris_protease_CTC
39 RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG. 339976.0 339902.0 285024.0 RRBS_cw154_Tris_protease_CTC
40 RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG. 209790.0 209741.0 172699.0 RRBS_cw154_Tris_protease_CTC
41 RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG. 77891.0 77882.0 65323.0 RRBS_cw154_Tris_protease_CTC
42 RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC. 306155.0 306092.0 257092.0 RRBS_cw154_Tris_protease_CTC
43 RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG. 615705.0 615569.0 519604.0 RRBS_cw154_Tris_protease_CTC
44 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC 739320.0 739104.0 625224.0 RRBS_cw154_Tris_protease_GR_
45 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG 56706.0 56695.0 46212.0 RRBS_cw154_Tris_protease_GR_
46 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG 326834.0 326784.0 272926.0 RRBS_cw154_Tris_protease_GR_
47 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC 753993.0 753779.0 635135.0 RRBS_cw154_Tris_protease_GR_
48 RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG 784636.0 784492.0 663005.0 RRBS_cw154_Tris_protease_GR_
49 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG 287163.0 287118.0 239557.0 RRBS_cw154_Tris_protease_GR_
50 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC 644063.0 643915.0 539497.0 RRBS_cw154_Tris_protease_GR_
51 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG 9786.0 9786.0 7866.0 RRBS_cw154_Tris_protease_GR_
52 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC 720869.0 720763.0 600463.0 RRBS_cw154_Tris_protease_GR_
53 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG 259789.0 259749.0 216388.0 RRBS_cw154_Tris_protease_GR_
54 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG 409451.0 409407.0 342160.0 RRBS_cw154_Tris_protease_GR_
55 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG 393366.0 393304.0 339646.0 RRBS_cw154_Tris_protease_GR_
56 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC 376935.0 376888.0 309490.0 RRBS_cw154_Tris_protease_GR_
57 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG 260836.0 260789.0 215879.0 RRBS_cw154_Tris_protease_GR_
58 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC 324252.0 324200.0 263836.0 RRBS_cw154_Tris_protease_GR_
59 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC 224514.0 224480.0 183768.0 RRBS_cw154_Tris_protease_GR_
60 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC 363506.0 363446.0 297360.0 RRBS_cw154_Tris_protease_GR_
61 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG 446957.0 446899.0 370171.0 RRBS_cw154_Tris_protease_GR_
62 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG 365218.0 365172.0 301151.0 RRBS_cw154_Tris_protease_GR_
63 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG 174703.0 174677.0 141958.0 RRBS_cw154_Tris_protease_GR_
64 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC 423104.0 423019.0 349305.0 RRBS_cw154_Tris_protease_GR_
65 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG 569470.0 569373.0 471904.0 RRBS_cw154_Tris_protease_GR_

66 rows × 5 columns


In [92]:
cw154_cpg.drop("protocol", axis=1)


Out[92]:
filename total_cpg_no_filter total_cpg_gtrthan1 total_cpg_gtrthan38
0 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC 731145.0 730971.0 609788.0
1 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG 202674.0 202653.0 168418.0
2 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG 451053.0 451000.0 374185.0
3 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC 719630.0 719469.0 598731.0
4 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG 799182.0 799040.0 668842.0
5 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG 551512.0 551428.0 454895.0
6 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC 746691.0 746542.0 617118.0
7 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG 659749.0 659662.0 548621.0
8 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC 772618.0 772511.0 642159.0
9 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG 454537.0 454458.0 373943.0
10 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG 485483.0 485419.0 403532.0
11 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTATTG 9834.0 9834.0 8007.0
12 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC 470082.0 470009.0 386951.0
13 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG 406942.0 406885.0 335484.0
14 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC 579844.0 579771.0 476413.0
15 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC 257369.0 257331.0 211519.0
16 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC 512469.0 512391.0 426033.0
17 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG 541784.0 541712.0 452376.0
18 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTTGAG 15152.0 15150.0 12450.0
19 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG 404298.0 404249.0 334974.0
20 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC 791044.0 790894.0 657494.0
21 RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG 653387.0 653298.0 544721.0
22 RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC. 475766.0 475596.0 398721.0
23 RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG. 118876.0 118851.0 100501.0
24 RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG. 133116.0 133102.0 112979.0
25 RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC. 422977.0 422884.0 349345.0
26 RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG. 281432.0 281365.0 236281.0
27 RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG. 283027.0 282977.0 237352.0
28 RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC. 333368.0 333284.0 278192.0
29 RRBS_cw154_Tris_protease_CTCTCTAC.CAAGAG. 12665.0 12665.0 10385.0
... ... ... ... ...
36 RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC. 589697.0 589548.0 490794.0
37 RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC. 146093.0 146076.0 119380.0
38 RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC. 430967.0 430875.0 357048.0
39 RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG. 339976.0 339902.0 285024.0
40 RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG. 209790.0 209741.0 172699.0
41 RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG. 77891.0 77882.0 65323.0
42 RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC. 306155.0 306092.0 257092.0
43 RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG. 615705.0 615569.0 519604.0
44 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC 739320.0 739104.0 625224.0
45 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG 56706.0 56695.0 46212.0
46 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG 326834.0 326784.0 272926.0
47 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC 753993.0 753779.0 635135.0
48 RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG 784636.0 784492.0 663005.0
49 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG 287163.0 287118.0 239557.0
50 RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC 644063.0 643915.0 539497.0
51 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CAAGAG 9786.0 9786.0 7866.0
52 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC 720869.0 720763.0 600463.0
53 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG 259789.0 259749.0 216388.0
54 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG 409451.0 409407.0 342160.0
55 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTATTG 393366.0 393304.0 339646.0
56 RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC 376935.0 376888.0 309490.0
57 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG 260836.0 260789.0 215879.0
58 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC 324252.0 324200.0 263836.0
59 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC 224514.0 224480.0 183768.0
60 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC 363506.0 363446.0 297360.0
61 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG 446957.0 446899.0 370171.0
62 RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTTGAG 365218.0 365172.0 301151.0
63 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG 174703.0 174677.0 141958.0
64 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC 423104.0 423019.0 349305.0
65 RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG 569470.0 569373.0 471904.0

66 rows × 4 columns


In [93]:
cw154_cpg.to_csv("cw154_cpg.csv")

In [94]:
print(cd19_merged.shape)
print(mcell_cpg.shape)
print(pcell_cpg.shape)
print(trito_cpg.shape)
print(cw154_cpg.shape)


(89, 4)
(88, 4)
(90, 4)
(42, 4)
(66, 5)

In [ ]:
files = [mcell_cpg, pcell_cpg, trito_cpg, cw154_cpg, cd19_merged]

In [97]:
len("stacked_RRBS_normal_B_cell_G1_22_GGACTCCT.ACCGCG")


Out[97]:
48

In [98]:
totcpg = pd.read_csv('total_CpG_filename.csv')

In [ ]:


In [ ]:


In [ ]:


In [ ]: