In [1]:
%matplotlib inline

In [41]:
import glob
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
pd.set_option('display.max_columns', 50) # show up to 50 columns when a frame is displayed

import os
# All relative paths below (the binary_position_*.csv inputs and the *.phy
# outputs) are resolved against this working directory.
# Set the RRBS_PHYLO_DIR environment variable to run on another machine;
# when it is unset the original location is used unchanged.
os.chdir(os.environ.get('RRBS_PHYLO_DIR',
                        '/Users/evanbiederstedt/Downloads/required_binary_phylo_files'))

import statsmodels.api as sm

In [42]:
%ls


binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ACAACC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ACCGCG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ACGTGG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ACTCAC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.AGGATG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ATAGCG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.ATCGAC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.CAAGAG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.CATGAC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.CCTTCG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.CGGTAG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.CTCAGC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.GACACG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.GCATTC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.GCTGCC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.GGCATC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.GTGAGG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.TAGCGG.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.TATCTC.bed.anno.csv
binary_position_RRBS_CW154_double_digestion_TCCTGAGC.TCTCTG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACAACC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACCGCG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACGTGG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ACTCAC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.AGGATG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATAGCG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.ATCGAC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CAAGAG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CATGAC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CCTTCG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CGGTAG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.CTCAGC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GACACG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCATTC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GCTGCC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GGCATC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.GTGAGG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TAGCGG.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TATCTC.bed.anno.csv
binary_position_RRBS_cw154_CutSmart_proteinase_K_TAGGCATG.TCTCTG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ACAACC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ACCGCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ACGTGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ACTCAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.AGGATG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ATAGCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.ATCGAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.CATGAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.CCTTCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.CGGTAG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.CTATTG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.CTCAGC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GACACG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GCATTC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GCTGCC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GGCATC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GTGAGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.GTTGAG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.TAGCGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.TATCTC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_CTCTCTAC.TCTCTG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACAACC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACCGCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACGTGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ACTCAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.AGGATG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATAGCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.ATCGAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.CATGAC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.CCTTCG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.CGGTAG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.CTCAGC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.GACACG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCATTC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.GCTGCC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.GGCATC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.GTGAGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.TAGCGG.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.TATCTC.bed.anno.csv
binary_position_RRBS_cw154_Tris_protease_GR_CAGAGAGG.TCTCTG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.ACAACC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.ACGTGG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.ACTCAC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.ATAGCG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.ATCGAC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.CAAGAG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.CATGAC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.CCTTCG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.CGGTAG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.CTATTG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GACACG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GCATTC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GCTGCC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GGCATC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GTGAGG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.GTTGAG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.TAGCGG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.TATCTC.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.TCTCTG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.TGACAG.bed.anno.csv
binary_position_RRBS_trito_pool_1_TAAGGCGA.TGCTGC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.ACAACC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.ACGTGG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.ACTCAC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.AGGATG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.ATAGCG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.ATCGAC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.CAAGAG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.CATGAC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.CCTTCG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.CGGTAG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.CTATTG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GACACG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GCATTC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GCTGCC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GGCATC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GTGAGG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.GTTGAG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.TAGCGG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.TATCTC.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.TCTCTG.bed.anno.csv
binary_position_RRBS_trito_pool_2_CGTACTAG.TGACAG.bed.anno.csv
leftovers/
trito1_binary.csv
trito1_binary.phy
trito1_binary.txt
tritopool_1_final.phy
tritopool_2_final.phy
tritopool_total_final.phy

In [43]:
# Every per-barcode table for both trito pools.
# glob returns matches in arbitrary, OS-dependent order, so sort them to get
# a listing that is stable between runs and machines.
trito_files = sorted(glob.glob("binary_position_RRBS_trito_pool*"))

In [44]:
len(trito_files)


Out[44]:
42

In [ ]:
# Read the 21 per-barcode tables of trito pool 1 with pandas defaults.
# The frames are bound to df1..df21 in the barcode order listed here; the
# later cells (column drop, df_list, column labels) rely on that order.
_pool1_barcodes = [
    "ACAACC", "ACGTGG", "ACTCAC", "ATAGCG", "ATCGAC", "CAAGAG", "CATGAC",
    "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC", "GGCATC",
    "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG", "TGCTGC",
]
_pool1_paths = [
    "binary_position_RRBS_trito_pool_1_TAAGGCGA." + barcode + ".bed.anno.csv"
    for barcode in _pool1_barcodes
]
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21) = [
    pd.read_csv(path) for path in _pool1_paths
]

In [9]:
# Remove the "Unnamed: 0" column from every pool-1 frame (presumably the row
# index that was saved into the CSV -- confirm against the files).
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21) = [
    frame.drop("Unnamed: 0", axis=1, errors="ignore")
    for frame in (df1, df2, df3, df4, df5, df6, df7,
                  df8, df9, df10, df11, df12, df13, df14,
                  df15, df16, df17, df18, df19, df20, df21)
]

In [10]:
# Pool-1 frames in barcode order; the merged matrix gets one column per
# entry, in exactly this order.
df_list = [
    df1, df2, df3, df4, df5, df6, df7,
    df8, df9, df10, df11, df12, df13, df14,
    df15, df16, df17, df18, df19, df20, df21,
]

In [ ]:


In [11]:
df1.head()


Out[11]:
position Methyl
0 chr1_10496 1.0
1 chr1_10524 1.0
2 chr1_10541 1.0
3 chr1_10562 1.0
4 chr1_10570 1.0

In [ ]:
# Align all frames on their "position" key (one column per frame, NaN where a
# frame has no row for a position) and cast everything to object dtype.
_merged = pd.concat([frame.set_index("position") for frame in df_list], axis=1)
# Clear the index name so reset_index() always emits a column called "index".
# Newer pandas releases may keep the shared "position" name through concat,
# which would make the later drop("index") cell fail.
_merged.index.name = None
trito_matrix = _merged.reset_index().astype(object)

In [ ]:
trito_matrix.shape

In [ ]:
trito_matrix.head()

In [ ]:
# Discard the position labels, keeping only the per-sample value columns.
# Depending on the pandas version the label column produced by reset_index()
# is called "index" or "position"; accept either, and do not fail when the
# cell is executed a second time.
trito_matrix = trito_matrix.drop(["index", "position"], axis=1, errors="ignore")

In [ ]:
# Label each column with its sample name (library prefix + barcode).
# The barcode order must match the order of the frames in df_list.
_pool1_prefix = "RRBS_trito_pool_1_TAAGGCGA."
trito_matrix.columns = [
    _pool1_prefix + barcode
    for barcode in (
        "ACAACC", "ACGTGG", "ACTCAC", "ATAGCG", "ATCGAC", "CAAGAG", "CATGAC",
        "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC", "GGCATC",
        "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG", "TGCTGC",
    )
]

In [ ]:


In [ ]:


In [ ]:
#trito_matrix = trito_matrix.T.astype(object)  # don't transpose

In [ ]:
# trito_matrix.applymap(int)

In [ ]:
trito_matrix.head()

In [ ]:
#trito_matrix.to_csv("trito1_binary.csv", index=False, header=False)

In [ ]:
#f = pd.read_csv("trito1_binary.csv")

In [ ]:
#f

In [ ]:


In [ ]:


In [ ]:
# Encode every cell as one symbol: missing values become "?", everything
# else is truncated to an int (e.g. 1.0 -> 1).
trito_matrix = trito_matrix.applymap(
    lambda cell: "?" if pd.isnull(cell) else int(cell)
)

In [ ]:
#trito_matrix.to_csv("tritopool_binary1.phy", header=None, index=None, sep=' ')

In [ ]:
trito_matrix.shape

In [ ]:
%pwd

In [ ]:
trito_matrix

In [ ]:
#trito_matrix.applymap(lambda x: int(x) if pd.notnull(x) else str("?"))

In [ ]:
#trito_matrix.to_csv("tritopool_correct1.phy", header=None, index=None)

In [ ]:
#trito_matrix.to_csv("tritopool_correct2.phy", header=None, index=None, sep=' ')

In [ ]:
#trito_matrix.to_csv("tritopool_correct3.phy", header=None, index=None, sep='')

In [ ]:


In [ ]:
# Collapse each column into one long string by concatenating its cells from
# top to bottom. The result is a Series indexed by column label, so from here
# on trito_matrix is no longer a DataFrame.
_cells_as_text = trito_matrix.astype(str)
trito_matrix = _cells_as_text.apply(lambda column: ''.join(column))

In [ ]:
trito_matrix

In [ ]:
type(trito_matrix)

In [ ]:
trito_matrix.index

In [ ]:
# Build one text line per sample: "<label><4 spaces><sequence>".
_labels = trito_matrix.index.astype(str)
_sequences = trito_matrix.astype(str)
matt = pd.Series(_labels.str.cat(_sequences, sep='    '))

In [ ]:
matt

In [ ]:
# Rebind trito_matrix to the Series of "<label>    <sequence>" lines (matt);
# the per-column sequence Series it held before is no longer reachable by name.
trito_matrix = matt

In [ ]:
# Write the lines to disk, one per row, without a header row or index column.
# NOTE(review): no PHYLIP "<n_taxa> <n_chars>" header line is written here;
# confirm whether the downstream tool needs one.
_pool1_out = "tritopool_1_final.phy"
trito_matrix.to_csv(_pool1_out, header=None, index=None)

In [ ]:
trito_matrix

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
# Per-barcode tables of trito pool 2 only.
# glob returns matches in arbitrary, OS-dependent order, so sort them to get
# a listing that is stable between runs and machines.
trito_files2 = sorted(glob.glob("binary_position_RRBS_trito_pool_2*"))

In [ ]:
len(trito_files2)

In [ ]:
trito_files2

In [12]:
# Read the 21 per-barcode tables of trito pool 2 with pandas defaults.
# df1..df21 are rebound here (they previously held the pool-1 frames), in the
# barcode order listed below; later cells rely on that order.
_pool2_barcodes = [
    "ACAACC", "ACGTGG", "ACTCAC", "AGGATG", "ATAGCG", "ATCGAC", "CAAGAG",
    "CATGAC", "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC",
    "GGCATC", "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG",
]
_pool2_paths = [
    "binary_position_RRBS_trito_pool_2_CGTACTAG." + barcode + ".bed.anno.csv"
    for barcode in _pool2_barcodes
]
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21) = [
    pd.read_csv(path) for path in _pool2_paths
]

In [13]:
# Remove the "Unnamed: 0" column from every pool-2 frame (presumably the row
# index that was saved into the CSV -- confirm against the files).
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21) = [
    frame.drop("Unnamed: 0", axis=1, errors="ignore")
    for frame in (df1, df2, df3, df4, df5, df6, df7,
                  df8, df9, df10, df11, df12, df13, df14,
                  df15, df16, df17, df18, df19, df20, df21)
]

In [14]:
# Pool-2 frames in barcode order; the merged matrix gets one column per
# entry, in exactly this order.
df_list = [
    df1, df2, df3, df4, df5, df6, df7,
    df8, df9, df10, df11, df12, df13, df14,
    df15, df16, df17, df18, df19, df20, df21,
]

In [15]:
# Align all frames on their "position" key (one column per frame, NaN where a
# frame has no row for a position) and cast everything to object dtype.
_merged2 = pd.concat([frame.set_index("position") for frame in df_list], axis=1)
# Clear the index name so reset_index() always emits a column called "index".
# Newer pandas releases may keep the shared "position" name through concat,
# which would make the later drop("index") cell fail.
_merged2.index.name = None
trito_matrix2 = _merged2.reset_index().astype(object)

In [16]:
trito_matrix2.shape


Out[16]:
(5081248, 22)

In [17]:
trito_matrix2.head()


Out[17]:
index Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl Methyl
0 chr10_10000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 NaN 1 NaN NaN NaN NaN 1 NaN
1 chr10_10000009 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN
2 chr10_10000203 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN
3 chr10_10000232 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN
4 chr10_10000258 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN

In [18]:
# Discard the position labels, keeping only the per-sample value columns.
# Depending on the pandas version the label column produced by reset_index()
# is called "index" or "position"; accept either, and do not fail when the
# cell is executed a second time.
trito_matrix2 = trito_matrix2.drop(["index", "position"], axis=1, errors="ignore")

In [19]:
# Label each column with its sample name (library prefix + barcode).
# The barcode order must match the order of the frames in df_list.
_pool2_prefix = "RRBS_trito_pool_2_CGTACTAG."
trito_matrix2.columns = [
    _pool2_prefix + barcode
    for barcode in (
        "ACAACC", "ACGTGG", "ACTCAC", "AGGATG", "ATAGCG", "ATCGAC", "CAAGAG",
        "CATGAC", "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC",
        "GGCATC", "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG",
    )
]

In [20]:
# Encode every cell as one symbol: missing values become "?", everything
# else is truncated to an int (e.g. 1.0 -> 1).
trito_matrix2 = trito_matrix2.applymap(
    lambda cell: "?" if pd.isnull(cell) else int(cell)
)

In [21]:
# Collapse each column into one long string by concatenating its cells from
# top to bottom. The result is a Series indexed by column label, so from here
# on trito_matrix2 is no longer a DataFrame.
_cells_as_text2 = trito_matrix2.astype(str)
trito_matrix2 = _cells_as_text2.apply(lambda column: ''.join(column))

In [22]:
# Build one text line per sample: "<label><4 spaces><sequence>".
_labels2 = trito_matrix2.index.astype(str)
_sequences2 = trito_matrix2.astype(str)
trito2 = pd.Series(_labels2.str.cat(_sequences2, sep='    '))

In [23]:
# Rebind trito_matrix2 to the Series of "<label>    <sequence>" lines (trito2);
# the per-column sequence Series it held before is no longer reachable by name.
trito_matrix2 = trito2

In [24]:
# Write the lines to disk, one per row, without a header row or index column.
# NOTE(review): no PHYLIP "<n_taxa> <n_chars>" header line is written here;
# confirm whether the downstream tool needs one.
_pool2_out = "tritopool_2_final.phy"
trito_matrix2.to_csv(_pool2_out, header=None, index=None)

In [ ]:
trito_matrix2

In [ ]:
trito_matrix2.shape

In [ ]:
trito_matrix.shape

In [ ]:
type(trito_matrix)

In [ ]:


In [ ]:


In [26]:
# Read all 42 per-barcode tables (21 from pool 1 followed by 21 from pool 2)
# with pandas defaults and bind them to df1..df42 in that order; later cells
# rely on this order.
_pool_layout = [
    ("RRBS_trito_pool_1_TAAGGCGA", [
        "ACAACC", "ACGTGG", "ACTCAC", "ATAGCG", "ATCGAC", "CAAGAG", "CATGAC",
        "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC", "GGCATC",
        "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG", "TGCTGC",
    ]),
    ("RRBS_trito_pool_2_CGTACTAG", [
        "ACAACC", "ACGTGG", "ACTCAC", "AGGATG", "ATAGCG", "ATCGAC", "CAAGAG",
        "CATGAC", "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC",
        "GGCATC", "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG",
    ]),
]
_all_paths = [
    "binary_position_" + library + "." + barcode + ".bed.anno.csv"
    for library, barcodes in _pool_layout
    for barcode in barcodes
]
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21,
 df22, df23, df24, df25, df26, df27, df28,
 df29, df30, df31, df32, df33, df34, df35,
 df36, df37, df38, df39, df40, df41, df42) = [
    pd.read_csv(path) for path in _all_paths
]

In [27]:
# Remove the "Unnamed: 0" column from all 42 frames (presumably the row
# index that was saved into the CSV -- confirm against the files).
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
(df1, df2, df3, df4, df5, df6, df7,
 df8, df9, df10, df11, df12, df13, df14,
 df15, df16, df17, df18, df19, df20, df21,
 df22, df23, df24, df25, df26, df27, df28,
 df29, df30, df31, df32, df33, df34, df35,
 df36, df37, df38, df39, df40, df41, df42) = [
    frame.drop("Unnamed: 0", axis=1, errors="ignore")
    for frame in (df1, df2, df3, df4, df5, df6, df7,
                  df8, df9, df10, df11, df12, df13, df14,
                  df15, df16, df17, df18, df19, df20, df21,
                  df22, df23, df24, df25, df26, df27, df28,
                  df29, df30, df31, df32, df33, df34, df35,
                  df36, df37, df38, df39, df40, df41, df42)
]

In [28]:
# All 42 frames (pool 1 first, then pool 2); the merged matrix gets one
# column per entry, in exactly this order.
df_list = [
    df1, df2, df3, df4, df5, df6, df7,
    df8, df9, df10, df11, df12, df13, df14,
    df15, df16, df17, df18, df19, df20, df21,
    df22, df23, df24, df25, df26, df27, df28,
    df29, df30, df31, df32, df33, df34, df35,
    df36, df37, df38, df39, df40, df41, df42,
]

In [29]:
# Align all frames on their "position" key (one column per frame, NaN where a
# frame has no row for a position) and cast everything to object dtype.
_merged3 = pd.concat([frame.set_index("position") for frame in df_list], axis=1)
# Clear the index name so reset_index() always emits a column called "index".
# Newer pandas releases may keep the shared "position" name through concat,
# which would make the later drop("index") cell fail.
_merged3.index.name = None
trito_matrix3 = _merged3.reset_index().astype(object)

In [30]:
trito_matrix3.shape


Out[30]:
(5751027, 43)

In [31]:
# Discard the position labels, keeping only the per-sample value columns.
# Depending on the pandas version the label column produced by reset_index()
# is called "index" or "position"; accept either, and do not fail when the
# cell is executed a second time.
trito_matrix3 = trito_matrix3.drop(["index", "position"], axis=1, errors="ignore")

In [32]:
# Label each column with its sample name (library prefix + barcode): the 21
# pool-1 samples first, then the 21 pool-2 samples. The order must match the
# order of the frames in df_list.
_pool1_codes = (
    "ACAACC", "ACGTGG", "ACTCAC", "ATAGCG", "ATCGAC", "CAAGAG", "CATGAC",
    "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC", "GGCATC",
    "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG", "TGCTGC",
)
_pool2_codes = (
    "ACAACC", "ACGTGG", "ACTCAC", "AGGATG", "ATAGCG", "ATCGAC", "CAAGAG",
    "CATGAC", "CCTTCG", "CGGTAG", "CTATTG", "GACACG", "GCATTC", "GCTGCC",
    "GGCATC", "GTGAGG", "GTTGAG", "TAGCGG", "TATCTC", "TCTCTG", "TGACAG",
)
trito_matrix3.columns = (
    ["RRBS_trito_pool_1_TAAGGCGA." + code for code in _pool1_codes]
    + ["RRBS_trito_pool_2_CGTACTAG." + code for code in _pool2_codes]
)

In [33]:
# Encode every cell as one symbol: missing values become "?", everything
# else is truncated to an int (e.g. 1.0 -> 1).
trito_matrix3 = trito_matrix3.applymap(
    lambda cell: "?" if pd.isnull(cell) else int(cell)
)

In [34]:
# Collapse each column into one long string by concatenating its cells from
# top to bottom. The result is a Series indexed by column label, so from here
# on trito_matrix3 is no longer a DataFrame.
_cells_as_text3 = trito_matrix3.astype(str)
trito_matrix3 = _cells_as_text3.apply(lambda column: ''.join(column))

In [35]:
# Build one text line per sample: "<label><4 spaces><sequence>".
_labels3 = trito_matrix3.index.astype(str)
_sequences3 = trito_matrix3.astype(str)
trito3 = pd.Series(_labels3.str.cat(_sequences3, sep='    '))

In [36]:
# Rebind trito_matrix3 to the Series of "<label>    <sequence>" lines (trito3);
# the per-column sequence Series it held before is no longer reachable by name.
trito_matrix3 = trito3

In [37]:
# Write the lines to disk, one per row, without a header row or index column.
# NOTE(review): no PHYLIP "<n_taxa> <n_chars>" header line is written here;
# confirm whether the downstream tool needs one.
_total_out = "tritopool_total_final.phy"
trito_matrix3.to_csv(_total_out, header=None, index=None)

In [38]:
trito_matrix3.head()


Out[38]:
0    RRBS_trito_pool_1_TAAGGCGA.ACAACC    ?????????...
1    RRBS_trito_pool_1_TAAGGCGA.ACGTGG    ?????????...
2    RRBS_trito_pool_1_TAAGGCGA.ACTCAC    ?????????...
3    RRBS_trito_pool_1_TAAGGCGA.ATAGCG    ?????????...
4    RRBS_trito_pool_1_TAAGGCGA.ATCGAC    ?????????...
dtype: object

In [ ]:


In [ ]:


In [ ]: