Global Imports



In [3]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib

External Package Imports



In [4]:

    
import os as os
import pickle as pickle
import pandas as pd

Module Imports



In [5]:

    
from Stats.Scipy import *
from Stats.Survival import *
from Helpers.Pandas import *

from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Survival import draw_survival_curve, survival_and_stats
from Figures.Survival import draw_survival_curves
from Figures.Survival import survival_stat_plot



In [6]:

    
import Data.Firehose as FH

Tweaking Display Parameters



In [7]:

    
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 12



In [8]:

    
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = plt.rcParams['axes.color_cycle']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']

Function to Pull a Firehose Run Container



In [9]:

    
def get_run(firehose_dir, version='Latest'):
    '''
    Helper to get a run from the file-system. 
    '''
    path = '{}/ucsd_analyses'.format(firehose_dir)
    if version is 'Latest':
        version = sorted(os.listdir(path))[-1]
    run = pickle.load(open('{}/{}/RunObject.p'.format(path, version), 'rb'))
    return run

Read In Data

Here we read in the pre-processed data that we downloaded and initialized in the download_data notebook.



In [10]:

    
print 'populating namespace with data'









    



populating namespace with data



In [11]:

    
OUT_PATH = '/cellar/users/agross/TCGA_Code/CancerData/Data'
RUN_DATE = '2014_07_15'
VERSION = 'all'
CANCER = 'HNSC'
FIGDIR = '../Figures/'
if not os.path.isdir(FIGDIR):
    os.makedirs(FIGDIR)



In [12]:

    
run_path  = '{}/Firehose__{}/'.format(OUT_PATH, RUN_DATE)
run = get_run(run_path, 'Run_' + VERSION)
run.data_path = run_path 
run.result_path = run_path + 'ucsd_analyses'
run.report_path = run_path + 'ucsd_analyses/Run_all'

cancer = run.load_cancer(CANCER)
cancer.path = '{}/{}'.format(run.report_path , cancer.name)
clinical = cancer.load_clinical()

mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()



In [13]:

    
rna = FH.read_rnaSeq(run.data_path, cancer.name, tissue_code='All')
mirna = FH.read_miRNASeq(run.data_path, cancer.name, tissue_code='All')



In [14]:

    
keepers_o = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA_Working/Data/Firehose__2014_04_16/' + 'old_keepers.csv', index_col=0,
                        squeeze=True)
keepers_o = array(keepers_o)

Update Clinical Data



In [15]:

    
from Processing.ProcessClinicalDataPortal import update_clinical_object

p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
path  = p + '/Followup_R9/HNSC/'
clinical = update_clinical_object(clinical, path)
clinical.clinical.shape









    Out[15]:





(508, 56)



In [16]:

    
#hpv = clinical.hpv
surv = clinical.survival.survival_5y
age = clinical.clinical.age.astype(float)
old = pd.Series(1.*(age>=75), name='old')



In [139]:

    
p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
f = p + '/MAFs/PR_TCGA_HNSC_PAIR_Capture_All_Pairs_QCPASS_v4.aggregated.capture.tcga.uuid.automated.somatic.maf.txt'
mut_new = pd.read_table(f, skiprows=4, low_memory=False)
keep = (mut_new.Variant_Classification.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False)
mut_new = mut_new[keep]
mut_new['barcode'] = mut_new.Tumor_Sample_Barcode.map(lambda s: s[:12])
mut_new = mut_new.groupby(['barcode','Hugo_Symbol']).size().unstack().fillna(0).T
mut_new = mut.df.combine_first(mut_new).fillna(0)



In [214]:

    
gistic = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
del_3p = gistic.ix['3p14.2'].median(0)
del_3p.name = '3p_deletion'

HPV Data



In [18]:

    
p = '/cellar/users/agross/TCGA_Code/TCGA/'
hpv_all = pd.read_csv(p + '/Extra_Data/hpv_summary_3_20_13_distribute.csv', index_col=0)
hpv = hpv_all.Molecular_HPV.map({0:'HPV-', 1:'HPV+'})
hpv.name = 'HPV'
hpv_seq = hpv



In [19]:

    
status = clinical.clinical[['hpvstatusbyishtesting','hpvstatusbyp16testing']]
status = status.replace('[Not Evaluated]', nan)
hpv_clin = (status.dropna() == 'Positive').sum(1)
hpv_clin = hpv_clin.map({2: 'HPV+', 0:'HPV-', 1:nan}).dropna()



In [20]:

    
hpv_clin.value_counts()









    Out[20]:





HPV-    65
HPV+    18
dtype: int64



In [21]:

    
hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts()









    Out[21]:





HPV-    24
HPV+    14
dtype: int64



In [22]:

    
hpv_new = pd.read_table(p + '/Data/Followup_R6/HNSC/auxiliary_hnsc.txt',
                  skiprows=[1], index_col=0, na_values=['[Not Available]'])
hpv_new = hpv_new['hpv_status']



In [23]:

    
hpv = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive')
hpv.name = 'HPV'



In [24]:

    
hpv.value_counts()









    Out[24]:





False    450
True      78
dtype: int64



In [25]:

    
n = ti(hpv==False)



In [26]:

    
fisher_exact_test(del_3p<0, mut_new.ix['TP53'].ix[n.diff(keepers_o)]>0)









    Out[26]:





odds_ratio    6.79e+00
p             5.46e-07
dtype: float64