In [119]:
import pandas as pd

In [120]:
df = pd.read_table("../data/refseqOutVarSeq", dtype = {'start': int, 'stop': int}, usecols= [0,1,2,3,7])


/home/glen/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [121]:
df.head()


Out[121]:
Chromosome Start Stop Gene Name Strand
0 1 11873 14409 DDX11L1 +
1 1 14361 29370 WASH7P -
2 1 17368 17436 MIR6859-1 -
3 1 30365 30503 MIR1302-2 +
4 1 34610 36081 FAM138A -

In [122]:
dfgenes = pd.read_excel("../data/GeneKB01.xlsx")

In [123]:
myeloidList = list(dfgenes['GeneSymbol'])

In [124]:
df = df[df['Gene Name'].isin(myeloidList)]
bygene = df.groupby('Gene Name')
bygene.nunique()


Out[124]:
Chromosome Start Stop Gene Name Strand
Gene Name
ABL1 1 2 1 1 1
AKT1 1 1 2 1 1
ALK 1 1 1 1 1
APC 1 2 1 1 1
ASXL1 1 1 2 1 1
ATM 1 1 1 1 1
BCOR 1 1 2 1 1
BCORL1 1 1 1 1 1
BRAF 1 1 1 1 1
CALR 1 1 1 1 1
CBL 1 1 1 1 1
CBLB 1 1 5 1 1
CDH1 1 1 1 1 1
CDKN2A 1 1 3 1 1
CEBPA 1 1 1 1 1
CSF1R 1 1 2 1 1
CSF3R 1 1 1 1 1
CTNNB1 1 2 2 1 1
DNMT3A 1 2 4 1 1
EGFR 1 3 5 1 1
ERBB2 1 2 2 1 1
ERBB4 1 1 1 1 1
ETV6 1 1 1 1 1
EZH2 1 1 2 1 1
FBXW7 1 2 4 1 1
FGFR1 1 1 2 1 1
FGFR2 1 2 5 1 1
FGFR3 1 1 1 1 1
FLT3 1 1 1 1 1
GATA1 1 1 1 1 1
... ... ... ... ... ...
NOTCH1 1 1 1 1 1
NPM1 1 1 2 1 1
NRAS 1 1 1 1 1
PDGFRA 1 3 2 1 1
PHF6 1 1 2 1 1
PIK3CA 1 1 1 1 1
PML 1 1 3 1 1
PTEN 1 1 1 1 1
PTPN11 1 1 2 1 1
RAD21 1 1 1 1 1
RB1 1 1 1 1 1
RET 1 1 2 1 1
RUNX1 1 2 2 1 1
SETBP1 1 2 2 1 1
SF3B1 1 2 1 1 1
SMAD4 1 1 1 1 1
SMARCB1 1 1 1 1 1
SMC1A 1 1 1 1 1
SMC3 1 1 1 1 1
SMO 1 1 1 1 1
SRC 1 2 1 1 1
SRSF2 1 1 1 1 1
STAG2 1 3 1 1 1
STK11 1 1 1 1 1
TET2 1 2 2 1 1
TP53 1 1 2 1 1
U2AF1 1 1 1 1 1
VHL 1 1 1 1 1
WT1 1 1 2 1 1
ZRSR2 1 1 1 1 1

81 rows × 5 columns


In [125]:
df[df['Gene Name'] == 'DNMT3A']


Out[125]:
Chromosome Start Stop Gene Name Strand
6845 2 25455829 25475000 DNMT3A -
6846 2 25455829 25475184 DNMT3A -
6847 2 25455829 25564784 DNMT3A -
6848 2 25455829 25565459 DNMT3A -
6849 2 25504320 25564784 DNMT3A -
6850 2 25504320 25565459 DNMT3A -

In [126]:
bygene.apply()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-126-e8e9da76f2c8> in <module>()
----> 1 bygene.apply()

TypeError: apply() missing 1 required positional argument: 'func'

In [127]:
def minmax(group):
    mychr = 'chr' + str(group['Chromosome'].iloc[0])
    mymax = pd.np.max([pd.np.max(group['Start']),pd.np.max(group['Stop'])])
    mymin = pd.np.min([pd.np.min(group['Start']),pd.np.min(group['Stop'])])
    return pd.Series({'chr': mychr, 'start': mymin, 'stop': mymax})

In [128]:
tempdf = bygene.apply(minmax)

In [129]:
tempdf.to_csv("refseq_start_stop.tsv", sep="\t")

In [ ]:


In [ ]: