In [119]:
import pandas as pd
In [120]:
# Load the tab-separated RefSeq table, keeping only the columns at positions 0, 1, 2, 3 and 7.
# NOTE(review): the dtype keys are lowercase 'start'/'stop', but later cells index the
# columns as 'Start'/'Stop'. Column labels are case-sensitive, so this dtype mapping
# presumably matches nothing — confirm against the file header before relying on it.
df = pd.read_table("../data/refseqOutVarSeq", dtype = {'start': int, 'stop': int}, usecols= [0,1,2,3,7])
In [121]:
# Preview the first rows of the loaded table.
df.head()
Out[121]:
In [122]:
# Load the gene spreadsheet into a DataFrame.
dfgenes = pd.read_excel("../data/GeneKB01.xlsx")
In [123]:
# Values of the 'GeneSymbol' column as a plain Python list.
myeloidList = list(dfgenes['GeneSymbol'])
In [124]:
# Keep only the rows whose gene name appears in myeloidList.
is_listed_gene = df['Gene Name'].isin(myeloidList)
df = df.loc[is_listed_gene]

# Group the remaining rows per gene and show the number of distinct values per column.
bygene = df.groupby(by='Gene Name')
bygene.nunique()
Out[124]:
In [125]:
# Spot-check: display every row belonging to the gene DNMT3A.
df.loc[df['Gene Name'].eq('DNMT3A')]
Out[125]:
In [126]:
# A bare `bygene.apply()` used to sit here. GroupBy.apply requires a function argument,
# so the call raised TypeError and stopped a top-to-bottom run of the notebook.
# The actual per-gene aggregation is `bygene.apply(minmax)`, run after minmax is defined.
In [127]:
def minmax(group):
    """Collapse the rows of one group into a single chr/start/stop record.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with 'Chromosome', 'Start' and 'Stop' columns.

    Returns
    -------
    pandas.Series
        'chr'   -- 'chr' prefixed to the Chromosome value of the first row,
        'start' -- smallest value found in either the Start or Stop column,
        'stop'  -- largest value found in either the Start or Stop column.
    """
    # Only the first row's chromosome is used.
    # NOTE(review): assumes every row of a group sits on the same chromosome — confirm.
    mychr = 'chr' + str(group['Chromosome'].iloc[0])

    # Both columns feed both bounds, so a row with Start > Stop is still covered.
    # DataFrame reductions replace the `pd.np` alias, which no longer exists in
    # current pandas releases.
    coords = group[['Start', 'Stop']]
    mymin = coords.min().min()
    mymax = coords.max().max()

    return pd.Series({'chr': mychr, 'start': mymin, 'stop': mymax})
In [128]:
# Apply minmax to each gene group; the result has one row per gene with chr/start/stop columns.
tempdf = bygene.apply(minmax)
In [129]:
# Write the per-gene table as tab-separated text into the current working directory.
tempdf.to_csv("refseq_start_stop.tsv", sep="\t")
In [ ]:
In [ ]: