In [2]:
cd ..


/cellar/users/agross/TCGA_Code/DX/Notebooks

In [3]:
import NotebookImport
from DX_screen import *


importing IPython notebook from DX_screen
importing IPython notebook from Imports

In [4]:
cd ../DX/Notebooks/


/cellar/users/agross/TCGA_Code/DX/Notebooks

In [5]:
from Imports import *
from Preprocessing.ClinicalDataFilters import *


importing IPython notebook from Preprocessing/ClinicalDataFilters
importing IPython notebook from Preprocessing/ClinicalData
PCPG
UCS
TGCT
CHOL
THYM
MESO
FPPP
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/Processing/ProcessClinicalDataPortal.py:37: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  f['vitalstatus'] = f['daystodeath'].isnull()

uPA protease


In [6]:
# Interactive lookup: IPython's `??` shows the source of paired_bp_tn_split.
# It produces no analysis result and can be dropped from a final notebook.
paired_bp_tn_split??

In [7]:
# Distinct, non-null entries of `codes` for the first-level column labels of
# matched_rna (presumably the cancer-type code of each matched patient).
# NOTE(review): `.ix` was removed in pandas 1.0; on a modern stack this lookup
# needs `.reindex` / `.loc` -- left as-is to match the environment in use.
cc = (codes
      .ix[matched_rna.columns.get_level_values(0)]
      .dropna()
      .unique())

# One ttest_rel result per code, computed on the PLAU row restricted to the
# entries where `codes == c`; transposed so each code becomes a row.
r = pd.DataFrame(dict(
    (c, ttest_rel(matched_rna.ix['PLAU'].ix[ti(codes == c)]))
    for c in cc
)).T

In [8]:
# Subset of codes shown in the PLAU figure.
# NOTE(review): this rebinds `cc`, which an earlier cell uses for the full set
# of codes -- re-running that earlier cell afterwards restores the other value.
cc = ['HNSC', 'LUSC', 'LUAD', 'BLCA', 'THCA', 'BRCA', 'COAD', 'READ']

fig, ax = subplots(figsize=(7, 3))

# Plot PLAU on the prepared axes, keeping only samples whose code is in `cc`.
paired_bp_tn_split(
    matched_rna.ix['PLAU'],
    codes[codes.isin(cc)],
    ax=ax,
)

# NOTE(review): absolute, user-specific output path -- the cell fails on any
# machine without this directory.
fig.savefig('/cellar/users/agross/figures/plau.pdf')



In [ ]:


In [30]:
# Show the per-code test results ordered by the 'p' column (the displayed
# table runs from the smallest p-value upward).
# NOTE(review): DataFrame.sort was later replaced by sort_values in pandas;
# confirm the installed version before changing the call.
r.sort('p')


Out[30]:
p t
THCA 9.19e-16 10.96
HNSC 1.17e-14 11.58
LUAD 7.37e-14 9.82
LUSC 9.43e-14 10.46
KIRC 8.75e-13 -8.69
COAD 2.75e-12 9.88
BRCA 3.31e-12 7.82
PRAD 8.73e-12 -8.79
KICH 1.23e-10 -10.73
LIHC 8.18e-06 4.98
KIRP 5.05e-05 -4.70
CESC 9.36e-04 32.66
BLCA 2.60e-03 3.49
READ 7.71e-03 3.53
CHOL 1.70e-02 3.00
PCPG 3.34e-02 5.34
UCEC 8.09e-02 1.83
THYM 2.58e-01 -2.33
PAAD 3.58e-01 1.18
SARC 9.74e-01 0.04

In [10]:
# Same ttest_rel call on the whole PLAU row of matched_rna, i.e. without
# restricting to a single code; the result displays a 'p' and a 't' entry.
ttest_rel(matched_rna.ix['PLAU'])


Out[10]:
p    6.13e-16
t    8.29e+00
dtype: float64

tPA protease


In [5]:
# Plot PLAT expression from matched_rna, split by `codes`
# (presumably paired tumor-vs-normal boxplots, going by the helper's name -- confirm).
paired_bp_tn_split(matched_rna.ix['PLAT'], codes)


Collagenase


In [31]:
# Plot MMP1 expression from matched_rna, split by `codes`
# (presumably paired tumor-vs-normal boxplots, going by the helper's name -- confirm).
paired_bp_tn_split(matched_rna.ix['MMP1'], codes)


Elastases


In [6]:
# Gene symbols plotted in the elastase panel grid, in display order.
g = [
    'CELA1',
    'CELA2A', 'CELA2B',
    'CELA3A', 'CELA3B',
    'CTRC',
    'ELANE',
    'MMP12',
]

In [13]:
# Interactive lookup: IPython's `?` shows the docstring of paired_bp_tn_split.
# It produces no analysis result and can be dropped from a final notebook.
paired_bp_tn_split?

In [17]:
# One stacked panel per gene in `g`, all sharing the x axis.
# The row count is taken from len(g) rather than hard-coded, so the grid
# cannot fall out of sync with the gene list (too few rows would raise an
# IndexError on axs[i]; too many would leave blank panels).
fig, axs = subplots(len(g), 1, figsize=(15,20), sharex=True)
for i,gene in enumerate(g):
    # data_type='' presumably blanks the helper's data-type label -- confirm.
    paired_bp_tn_split(matched_rna.ix[gene], codes, ax=axs[i],
                       data_type='')


Cathepsin


In [27]:
# Cathepsin gene symbols plotted in the cathepsin panel grid, in display order.
# NOTE(review): this rebinds `g`, so any earlier cell that reads `g` now sees
# this list if re-run. CTSL1 / CTSL2 look like older gene symbols -- confirm
# they match the row labels of the expression matrix in use.
g = [
    'CTSA', 'CTSB', 'CTSC', 'CTSD', 'CTSE',
    'CTSF', 'CTSG', 'CTSH', 'CTSK', 'CTSL1',
    'CTSL2', 'CTSO', 'CTSS', 'CTSW', 'CTSZ',
]
# Displayed as the cell output: number of genes in the list.
len(g)


Out[27]:
15

In [30]:
# One stacked panel per gene in `g`, all sharing the x axis.
# The row count is taken from len(g) rather than hard-coded, so the grid
# cannot fall out of sync with the gene list (too few rows would raise an
# IndexError on axs[i]; too many would leave blank panels).
fig, axs = subplots(len(g), 1, figsize=(15,40), sharex=True)
for i,gene in enumerate(g):
    # data_type='' presumably blanks the helper's data-type label -- confirm.
    paired_bp_tn_split(matched_rna.ix[gene], codes, ax=axs[i],
                       data_type='')


Is there a way for you to query TCGA about all extracellular proteases in an unbiased fashion, i.e. not by asking about specific proteases by name but by asking about all extracellular proteases at once?

If yes, can you please help me do this?

If no, the data that you already have are really useful - can we put them in the same table, ranking the most highly expressed proteases for all cancers, with HNSCC being the first cancer on the x axis (similar to panel a in the figure inserted above)?


In [ ]: