In [1]:
%matplotlib inline
In [2]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns when a DataFrame is displayed
import os
# NOTE(review): machine-specific absolute path. Every relative read_csv/to_csv call
# below resolves against this directory, so the notebook only runs where it exists.
os.chdir("/Users/evanbiederstedt/Downloads/RRBS_data_files")
In [3]:
"""
allStats.csv
cd19_cpg3.csv
cd19_cpg2.csv
cd19_cpg1.csv
all_files450.csv
pcell_cpg3.csv
pcell_cpg2.csv
mcell_cpg3.csv
mcell_cpg2.csv
pcell_cpg1.csv
mcell_cpg1.csv
trito3.csv
trito2.csv
cw154_cpgs_test3.csv
cw154_cpgs_test2.csv
trito1.csv
cw154_cpgs_test1.csv
"""
Out[3]:
In [4]:
# Read the three cd19_cpg*.csv tables in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("cd19_cpg1.csv", "cd19_cpg2.csv", "cd19_cpg3.csv"),
)
In [5]:
# Discard the "Unnamed: 0" column from each of the three frames.
df1, df2, df3 = (
    frame.drop(labels="Unnamed: 0", axis=1) for frame in (df1, df2, df3)
)
In [6]:
#df1 = df1.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})
In [7]:
df1.head()
Out[7]:
In [8]:
# Keep only the filename key and the unfiltered CpG total.
df1 = df1.loc[:, ["filename", "total_cpg_no_filter"]]
In [9]:
df2.head()
Out[9]:
In [10]:
# Keep only the filename key and the "gtrthan1" CpG total.
df2 = df2.loc[:, ["filename", "total_cpg_gtrthan1"]]
In [11]:
df3.head()
Out[11]:
In [12]:
# Keep only the filename key and the "gtrthan38" CpG total.
df3 = df3.loc[:, ["filename", "total_cpg_gtrthan38"]]
In [13]:
df1.head()
Out[13]:
In [14]:
df2.head()
Out[14]:
In [15]:
df3.head()
Out[15]:
In [16]:
# Join the first two tables on their shared filename column.
first = pd.merge(df1, df2, on="filename")
In [17]:
# Join the third table onto the previous merge, again keyed on filename.
second = pd.merge(first, df3, on="filename")
In [18]:
second.head()
Out[18]:
In [19]:
dfs = [df1, df2, df3]
In [20]:
df1.shape
Out[20]:
In [21]:
df2.shape
Out[21]:
In [22]:
df3.shape
Out[22]:
In [23]:
# Index every table by filename, lay their columns side by side, then turn
# filename back into an ordinary column.
cd19_merged = pd.concat(
    [frame.set_index('filename') for frame in dfs],
    axis="columns",
).reset_index()
In [ ]:
In [24]:
cd19_merged.head()
Out[24]:
In [25]:
# Remove the final character of every filename. `regex=True` is explicit because
# Series.str.replace defaults to a literal match since pandas 2.0; under that
# default r'.$' is searched for verbatim and the names are left unchanged.
cd19_merged["filename"] = cd19_merged["filename"].str.replace(r'.$', '', regex=True)
In [26]:
cd19_merged.tail()
Out[26]:
In [27]:
# Save the merged CD19 table. to_csv writes the index by default, so the file
# gains a leading column with an empty header.
cd19_merged.to_csv("CD19_cpgs.csv")
In [ ]:
In [28]:
# Read the three mcell_cpg*.csv tables in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("mcell_cpg1.csv", "mcell_cpg2.csv", "mcell_cpg3.csv"),
)
In [29]:
# Relabel df3's 'total_cpg_gtrthan1' column as 'total_cpg_gtrthan38' so it does not
# clash with df2's column of the same name when the tables are joined. rename()
# ignores a missing key by default, so a wrong header only surfaces at the column
# selection in the next cell.
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})
In [30]:
# Reduce each table to the filename key plus the single count column it contributes.
df1 = df1.loc[:, ["filename", "total_cpg_no_filter"]]
df2 = df2.loc[:, ["filename", "total_cpg_gtrthan1"]]
df3 = df3.loc[:, ["filename", "total_cpg_gtrthan38"]]
In [31]:
# Align the three tables on filename and place their count columns side by side.
dfs = [df1, df2, df3]
mcell_cpg = pd.concat(
    [frame.set_index('filename') for frame in dfs],
    axis="columns",
).reset_index()
In [32]:
len('RRBS_NormalBCD19pCD27mcell1_22')
Out[32]:
In [33]:
# Helper column: the first 30 characters of each filename.
mcell_cpg['protocol'] = mcell_cpg['filename'].str.slice(stop=30)
In [34]:
# For rows whose "protocol" prefix is the mcell1_22 run, remove the final character
# of the filename.
# The write goes through one .loc[rows, column] call: the earlier
# frame["filename"][mask] = ... form is chained indexing, which can assign into a
# temporary copy (SettingWithCopyWarning) and does nothing under copy-on-write.
# `regex=True` is explicit because str.replace matches literally by default since
# pandas 2.0.
cell1_22_rows = mcell_cpg["protocol"] == "RRBS_NormalBCD19pCD27mcell1_22"
mcell_cpg.loc[cell1_22_rows, "filename"] = (
    mcell_cpg.loc[cell1_22_rows, "filename"].str.replace(r'.$', '', regex=True)
)
In [35]:
mcell_cpg.head()
Out[35]:
In [36]:
# The helper column has served its purpose; remove it before saving.
mcell_cpg = mcell_cpg.drop(labels="protocol", axis="columns")
In [37]:
mcell_cpg.shape
Out[37]:
In [38]:
mcell_cpg.to_csv("mcell_cpg.csv")
In [39]:
# Read the three pcell_cpg*.csv tables in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("pcell_cpg1.csv", "pcell_cpg2.csv", "pcell_cpg3.csv"),
)
In [40]:
# Relabel df3's 'total_cpg_gtrthan1' column as 'total_cpg_gtrthan38' so it does not
# clash with df2's column of the same name when the tables are joined. rename()
# ignores a missing key by default, so a wrong header only surfaces at the column
# selection in the next cell.
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})
In [41]:
# Reduce each table to the filename key plus the single count column it contributes.
df1 = df1.loc[:, ["filename", "total_cpg_no_filter"]]
df2 = df2.loc[:, ["filename", "total_cpg_gtrthan1"]]
df3 = df3.loc[:, ["filename", "total_cpg_gtrthan38"]]
In [42]:
# Align the three tables on filename and place their count columns side by side.
dfs = [df1, df2, df3]
pcell_cpg = pd.concat(
    [frame.set_index('filename') for frame in dfs],
    axis="columns",
).reset_index()
In [43]:
# Helper column: the first 30 characters of each filename.
pcell_cpg['protocol'] = pcell_cpg['filename'].str.slice(stop=30)
In [44]:
# For rows whose "protocol" prefix is the pcell1_22 run, remove the final character
# of the filename.
# The write goes through one .loc[rows, column] call: the earlier
# frame["filename"][mask] = ... form is chained indexing, which can assign into a
# temporary copy (SettingWithCopyWarning) and does nothing under copy-on-write.
# `regex=True` is explicit because str.replace matches literally by default since
# pandas 2.0.
cell1_22_rows = pcell_cpg["protocol"] == "RRBS_NormalBCD19pCD27pcell1_22"
pcell_cpg.loc[cell1_22_rows, "filename"] = (
    pcell_cpg.loc[cell1_22_rows, "filename"].str.replace(r'.$', '', regex=True)
)
In [45]:
pcell_cpg.head()
Out[45]:
In [46]:
# The helper column has served its purpose; remove it before saving.
pcell_cpg = pcell_cpg.drop(labels="protocol", axis="columns")
In [47]:
pcell_cpg.to_csv("pcell_cpg.csv")
In [ ]:
In [48]:
# Read the three trito*.csv tables in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("trito1.csv", "trito2.csv", "trito3.csv"),
)
In [49]:
# Relabel df3's 'total_cpg_gtrthan1' column as 'total_cpg_gtrthan38' so it does not
# clash with df2's column of the same name when the tables are joined. rename()
# ignores a missing key by default, so a wrong header only surfaces at the column
# selection in the next cell.
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})
In [50]:
# Reduce each table to the filename key plus the single count column it contributes.
df1 = df1.loc[:, ["filename", "total_cpg_no_filter"]]
df2 = df2.loc[:, ["filename", "total_cpg_gtrthan1"]]
df3 = df3.loc[:, ["filename", "total_cpg_gtrthan38"]]
In [51]:
# Align the three tables on filename and place their count columns side by side.
dfs = [df1, df2, df3]
trito_cpg = pd.concat(
    [frame.set_index('filename') for frame in dfs],
    axis="columns",
).reset_index()
In [52]:
# Cut every filename down to its first 33 characters.
trito_cpg["filename"] = trito_cpg["filename"].str.slice(stop=33)
In [53]:
trito_cpg.shape
Out[53]:
In [54]:
trito_cpg.to_csv("trito_cpg.csv")
In [ ]:
In [ ]:
In [55]:
# Read the three cw154_cpgs_test*.csv tables in one pass.
df1, df2, df3 = map(
    pd.read_csv,
    ("cw154_cpgs_test1.csv", "cw154_cpgs_test2.csv", "cw154_cpgs_test3.csv"),
)
In [56]:
# Relabel df3's 'total_cpg_gtrthan1' column as 'total_cpg_gtrthan38' so it does not
# clash with df2's column of the same name when the tables are joined. rename()
# ignores a missing key by default, so a wrong header only surfaces at the column
# selection in the next cell.
df3 = df3.rename(columns = {'total_cpg_gtrthan1':'total_cpg_gtrthan38'})
In [57]:
# Reduce each table to the filename key plus the single count column it contributes.
df1 = df1.loc[:, ["filename", "total_cpg_no_filter"]]
df2 = df2.loc[:, ["filename", "total_cpg_gtrthan1"]]
df3 = df3.loc[:, ["filename", "total_cpg_gtrthan38"]]
In [58]:
# Align the three tables on filename and place their count columns side by side.
dfs = [df1, df2, df3]
cw154_cpg = pd.concat(
    [frame.set_index('filename') for frame in dfs],
    axis="columns",
).reset_index()
In [59]:
cw154_cpg.shape
Out[59]:
In [60]:
# Overwrite the filenames of rows 0-21 with their CutSmart / proteinase-K sample names.
#
# This replaces 22 copy-pasted `cw154_cpg["filename"].ix[n] = ...` statements:
#   * `.ix` was removed in pandas 1.0, so those statements raise AttributeError today;
#   * assigning through frame["filename"].ix[n] is chained indexing, which may write
#     to a temporary copy instead of the frame.
# A single .loc[row, column] write avoids both problems. The row labels are the
# integers produced by the reset_index() call that built cw154_cpg, so label n is
# the same row the old `.ix[n]` addressed.
cutsmart_prefix = "RRBS_cw154_CutSmart_proteinase_K_TAGGCATG."
cutsmart_barcodes = [
    "ACAACC", "ACCGCG", "ACGTGG", "ACTCAC", "AGGATG", "ATAGCG",
    "ATCGAC", "CAAGAG", "CATGAC", "CCTTCG", "CGGTAG", "CTATTG",
    "CTCAGC", "GACACG", "GCATTC", "GCTGCC", "GGCATC", "GTGAGG",
    "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG",
]
for row_label, barcode in enumerate(cutsmart_barcodes):
    cw154_cpg.loc[row_label, "filename"] = cutsmart_prefix + barcode
In [81]:
# Show the filename at row label 22 (the first row the CutSmart renaming leaves untouched).
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
cw154_cpg["filename"].loc[22]
Out[81]:
In [82]:
len("RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC")
Out[82]:
In [83]:
# Helper column: the first 28 characters of each filename.
cw154_cpg["protocol"] = cw154_cpg["filename"].str.slice(stop=28)
In [84]:
cw154_cpg.tail(25)
Out[84]:
In [85]:
len("RRBS_cw154_Tris_protease_CTC")
Out[85]:
In [86]:
# For rows whose "protocol" prefix is "RRBS_cw154_Tris_protease_CTC", remove a
# trailing "<any character>an" from the filename.
# The write goes through one .loc[rows, column] call: frame["filename"][mask] = ...
# is chained indexing, which can assign into a temporary copy and does nothing
# under copy-on-write. `regex=True` is explicit because str.replace matches
# literally by default since pandas 2.0.
tris_ctc_rows = cw154_cpg["protocol"] == "RRBS_cw154_Tris_protease_CTC"
cw154_cpg.loc[tris_ctc_rows, "filename"] = (
    cw154_cpg.loc[tris_ctc_rows, "filename"].str.replace(r'.an$', '', regex=True)
)
In [87]:
cw154_cpg
Out[87]:
In [88]:
# Second pass of the same "<any character>an" suffix removal on the
# "RRBS_cw154_Tris_protease_CTC" rows.
# NOTE(review): this repeats the replacement two cells up; it only changes anything
# if a name still ends in "<char>an" after the first pass. Confirm it is intended.
# Written with .loc (instead of chained frame["filename"][mask] = ...) and an
# explicit regex=True so the assignment reaches the frame and the pattern is
# treated as a regular expression on current pandas.
tris_ctc_rows = cw154_cpg["protocol"] == "RRBS_cw154_Tris_protease_CTC"
cw154_cpg.loc[tris_ctc_rows, "filename"] = (
    cw154_cpg.loc[tris_ctc_rows, "filename"].str.replace(r'.an$', '', regex=True)
)
In [89]:
# For rows whose "protocol" prefix is "RRBS_cw154_Tris_protease_GR_", remove a
# trailing "<any character>dan" from the filename.
# The write goes through one .loc[rows, column] call: frame["filename"][mask] = ...
# is chained indexing, which can assign into a temporary copy and does nothing
# under copy-on-write. `regex=True` is explicit because str.replace matches
# literally by default since pandas 2.0.
tris_gr_rows = cw154_cpg["protocol"] == "RRBS_cw154_Tris_protease_GR_"
cw154_cpg.loc[tris_gr_rows, "filename"] = (
    cw154_cpg.loc[tris_gr_rows, "filename"].str.replace(r'.dan$', '', regex=True)
)
In [90]:
cw154_cpg.shape
Out[90]:
In [91]:
cw154_cpg
Out[91]:
In [92]:
# DataFrame.drop returns a new frame, so the result must be assigned back. Without
# the assignment the helper "protocol" column stayed in cw154_cpg and was written
# to cw154_cpg.csv, unlike the mcell/pcell tables, which drop it before saving.
cw154_cpg = cw154_cpg.drop("protocol", axis=1)
# Keep the frame as the cell's last expression so it is still displayed.
cw154_cpg
Out[92]:
In [93]:
cw154_cpg.to_csv("cw154_cpg.csv")
In [94]:
# Report the (rows, columns) shape of each merged table, one per line.
for merged_table in (cd19_merged, mcell_cpg, pcell_cpg, trito_cpg, cw154_cpg):
    print(merged_table.shape)
In [ ]:
files = [mcell_cpg, pcell_cpg, trito_cpg, cw154_cpg, cd19_merged]
In [97]:
len("stacked_RRBS_normal_B_cell_G1_22_GGACTCCT.ACCGCG")
Out[97]:
In [98]:
totcpg = pd.read_csv('total_CpG_filename.csv')
In [ ]:
In [ ]:
In [ ]:
In [ ]: