Global Imports


In [3]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

External Package Imports


In [4]:
import os as os
import pickle as pickle
import pandas as pd

Module Imports


In [5]:
from Stats.Scipy import *
from Stats.Survival import *
from Helpers.Pandas import *

from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Survival import draw_survival_curve, survival_and_stats
from Figures.Survival import draw_survival_curves
from Figures.Survival import survival_stat_plot

In [6]:
import Data.Firehose as FH

Tweaking Display Parameters


In [7]:
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 12

In [8]:
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = plt.rcParams['axes.color_cycle']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']

Function to Pull a Firehose Run Container


In [9]:
def get_run(firehose_dir, version='Latest'):
    '''
    Helper to get a run from the file-system. 
    '''
    path = '{}/ucsd_analyses'.format(firehose_dir)
    if version is 'Latest':
        version = sorted(os.listdir(path))[-1]
    run = pickle.load(open('{}/{}/RunObject.p'.format(path, version), 'rb'))
    return run

Read In Data

Here we read in the pre-processed data that we downloaded and initialized in the download_data notebook.


In [10]:
print 'populating namespace with data'


populating namespace with data

In [11]:
OUT_PATH = '/cellar/users/agross/TCGA_Code/CancerData/Data'
RUN_DATE = '2014_07_15'
VERSION = 'all'
CANCER = 'HNSC'
FIGDIR = '../Figures/'
if not os.path.isdir(FIGDIR):
    os.makedirs(FIGDIR)

In [12]:
run_path  = '{}/Firehose__{}/'.format(OUT_PATH, RUN_DATE)
run = get_run(run_path, 'Run_' + VERSION)
run.data_path = run_path 
run.result_path = run_path + 'ucsd_analyses'
run.report_path = run_path + 'ucsd_analyses/Run_all'

cancer = run.load_cancer(CANCER)
cancer.path = '{}/{}'.format(run.report_path , cancer.name)
clinical = cancer.load_clinical()

mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()

In [13]:
rna = FH.read_rnaSeq(run.data_path, cancer.name, tissue_code='All')
mirna = FH.read_miRNASeq(run.data_path, cancer.name, tissue_code='All')

In [14]:
keepers_o = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA_Working/Data/Firehose__2014_04_16/' + 'old_keepers.csv', index_col=0,
                        squeeze=True)
keepers_o = array(keepers_o)

Update Clinical Data


In [15]:
from Processing.ProcessClinicalDataPortal import update_clinical_object

p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
path  = p + '/Followup_R9/HNSC/'
clinical = update_clinical_object(clinical, path)
clinical.clinical.shape


Out[15]:
(508, 56)

In [16]:
#hpv = clinical.hpv
surv = clinical.survival.survival_5y
age = clinical.clinical.age.astype(float)
old = pd.Series(1.*(age>=75), name='old')

In [139]:
p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
f = p + '/MAFs/PR_TCGA_HNSC_PAIR_Capture_All_Pairs_QCPASS_v4.aggregated.capture.tcga.uuid.automated.somatic.maf.txt'
mut_new = pd.read_table(f, skiprows=4, low_memory=False)
keep = (mut_new.Variant_Classification.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False)
mut_new = mut_new[keep]
mut_new['barcode'] = mut_new.Tumor_Sample_Barcode.map(lambda s: s[:12])
mut_new = mut_new.groupby(['barcode','Hugo_Symbol']).size().unstack().fillna(0).T
mut_new = mut.df.combine_first(mut_new).fillna(0)

In [214]:
gistic = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
del_3p = gistic.ix['3p14.2'].median(0)
del_3p.name = '3p_deletion'

HPV Data


In [18]:
p = '/cellar/users/agross/TCGA_Code/TCGA/'
hpv_all = pd.read_csv(p + '/Extra_Data/hpv_summary_3_20_13_distribute.csv', index_col=0)
hpv = hpv_all.Molecular_HPV.map({0:'HPV-', 1:'HPV+'})
hpv.name = 'HPV'
hpv_seq = hpv

In [19]:
status = clinical.clinical[['hpvstatusbyishtesting','hpvstatusbyp16testing']]
status = status.replace('[Not Evaluated]', nan)
hpv_clin = (status.dropna() == 'Positive').sum(1)
hpv_clin = hpv_clin.map({2: 'HPV+', 0:'HPV-', 1:nan}).dropna()

In [20]:
hpv_clin.value_counts()


Out[20]:
HPV-    65
HPV+    18
dtype: int64

In [21]:
hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts()


Out[21]:
HPV-    24
HPV+    14
dtype: int64

In [22]:
hpv_new = pd.read_table(p + '/Data/Followup_R6/HNSC/auxiliary_hnsc.txt',
                  skiprows=[1], index_col=0, na_values=['[Not Available]'])
hpv_new = hpv_new['hpv_status']

In [23]:
hpv = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive')
hpv.name = 'HPV'

In [24]:
hpv.value_counts()


Out[24]:
False    450
True      78
dtype: int64

In [25]:
n = ti(hpv==False)

In [26]:
fisher_exact_test(del_3p<0, mut_new.ix['TP53'].ix[n.diff(keepers_o)]>0)


Out[26]:
odds_ratio    6.79e+00
p             5.46e-07
dtype: float64