In [1]:
import pandas as pd
import csv
In [23]:
def removespecchar(test):
    """Strip tab, double-quote and newline characters from a string value.

    Intended to be applied cell-by-cell to a DataFrame before it is written
    out as a tab-separated file, so that embedded tabs/newlines/quotes cannot
    break the row/column structure of the output.

    Parameters
    ----------
    test : object
        A single cell value. Only ``str`` values are modified.

    Returns
    -------
    object
        The cleaned string if ``test`` is a ``str``; otherwise ``test``
        itself, unchanged (numbers and other cell values pass straight through).
    """
    if isinstance(test, str):
        # Single pass over the string: the third argument of str.maketrans
        # lists the characters to delete (tab, double quote, newline).
        return test.translate(str.maketrans("", "", '\t"\n'))
    return test
In [20]:
def createbed(dfgenes, disease, sheetindex, workbook="../data/MyeloidGeneKB01.xlsx"):
    """Merge one disease sheet into the gene table and write ``<disease>.bed``.

    Reads the first five columns of sheet ``sheetindex`` from ``workbook``
    (indexed by ``GeneSymbol``), prefixes its column names with
    ``"<disease>_"``, outer-joins it onto ``dfgenes``, blanks out missing
    values, strips tab/quote/newline characters from every cell and writes
    the result as a tab-separated file named ``disease + ".bed"`` in the
    current working directory.

    Parameters
    ----------
    dfgenes : pandas.DataFrame
        Base gene table; joined on its index (the caller indexes it by
        ``GeneSymbol``). It is not modified.
    disease : str
        Label used both as the column-name prefix and as the output file stem.
    sheetindex : int or str
        Sheet of the workbook to read (passed to ``pd.read_excel``).
    workbook : str, optional
        Path of the Excel workbook. Defaults to the path the notebook has
        always used.

    Returns
    -------
    None
        The function's only effect is writing the ``.bed`` file.
    """
    # `sheet_name` / `usecols` are the current spellings of the old
    # `sheetname` / `parse_cols` keywords, which recent pandas no longer accepts.
    disease_df = pd.read_excel(
        workbook,
        sheet_name=sheetindex,
        index_col="GeneSymbol",
        usecols=[0, 1, 2, 3, 4],
    )
    disease_df.columns = disease + "_" + disease_df.columns

    merged = dfgenes.join(disease_df, how="outer").reset_index().fillna("")

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
    # whichever the installed version provides.
    apply_cellwise = merged.map if hasattr(merged, "map") else merged.applymap
    merged = apply_cellwise(removespecchar)

    # Move columns 6..8 to the front, followed by columns 0..5, then the rest.
    # NOTE(review): presumably columns 6..8 are the disease sheet's
    # chrom/start/end fields required first in a BED file - this depends on
    # `dfgenes` contributing exactly five columns after GeneSymbol; confirm.
    collist = list(merged.columns)
    newcollist = collist[6:9] + collist[0:6] + collist[9:]
    merged.to_csv(disease + ".bed", sep="\t", columns=newcollist, index=False)
In [22]:
# Workbook holding the gene table (sheet 0) and one sheet per disease (1..4).
MYELOID_WORKBOOK = "../data/MyeloidGeneKB01.xlsx"

# `sheet_name` replaces the old `sheetname` keyword. `dtype=object` replaces
# the misspelt `dtypes=object`, which is not a read_excel parameter; reading
# as object keeps cell values as they appear in the sheet instead of letting
# pandas infer numeric dtypes.
dfgenes = pd.read_excel(
    MYELOID_WORKBOOK,
    sheet_name=0,
    index_col="GeneSymbol",
    dtype=object,
)

# Disease labels and the workbook sheet index each one is stored on.
myeloidDiseaseList = ["Unknown", "AML", "MDS", "MPN"]
myeloidSheetList = [1, 2, 3, 4]

# Write one <disease>.bed file per disease sheet.
for disease, sheetindex in zip(myeloidDiseaseList, myeloidSheetList):
    createbed(dfgenes, disease, sheetindex)
In [ ]: