In [3]:
%pylab inline
In [4]:
import os as os
import pickle as pickle
import pandas as pd
In [5]:
from Stats.Scipy import *
from Stats.Survival import *
from Helpers.Pandas import *
from Figures.FigureHelpers import *
from Figures.Pandas import *
from Figures.Boxplots import *
from Figures.Survival import draw_survival_curve, survival_and_stats
from Figures.Survival import draw_survival_curves
from Figures.Survival import survival_stat_plot
In [6]:
import Data.Firehose as FH
In [7]:
pd.set_option('precision', 3)
pd.set_option('display.width', 300)
plt.rcParams['font.size'] = 12
In [8]:
'''Color schemes for paper taken from http://colorbrewer2.org/'''
colors = plt.rcParams['axes.color_cycle']
colors_st = ['#CA0020', '#F4A582', '#92C5DE', '#0571B0']
colors_th = ['#E66101', '#FDB863', '#B2ABD2', '#5E3C99']
In [9]:
def get_run(firehose_dir, version='Latest'):
'''
Helper to get a run from the file-system.
'''
path = '{}/ucsd_analyses'.format(firehose_dir)
if version is 'Latest':
version = sorted(os.listdir(path))[-1]
run = pickle.load(open('{}/{}/RunObject.p'.format(path, version), 'rb'))
return run
Here we read in the pre-processed data that we downloaded and initialized in the download_data notebook.
In [10]:
print 'populating namespace with data'
In [11]:
OUT_PATH = '/cellar/users/agross/TCGA_Code/CancerData/Data'
RUN_DATE = '2014_07_15'
VERSION = 'all'
CANCER = 'HNSC'
FIGDIR = '../Figures/'
if not os.path.isdir(FIGDIR):
os.makedirs(FIGDIR)
In [12]:
run_path = '{}/Firehose__{}/'.format(OUT_PATH, RUN_DATE)
run = get_run(run_path, 'Run_' + VERSION)
run.data_path = run_path
run.result_path = run_path + 'ucsd_analyses'
run.report_path = run_path + 'ucsd_analyses/Run_all'
cancer = run.load_cancer(CANCER)
cancer.path = '{}/{}'.format(run.report_path , cancer.name)
clinical = cancer.load_clinical()
mut = cancer.load_data('Mutation')
mut.uncompress()
cn = cancer.load_data('CN_broad')
cn.uncompress()
In [13]:
rna = FH.read_rnaSeq(run.data_path, cancer.name, tissue_code='All')
mirna = FH.read_miRNASeq(run.data_path, cancer.name, tissue_code='All')
In [14]:
keepers_o = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA_Working/Data/Firehose__2014_04_16/' + 'old_keepers.csv', index_col=0,
squeeze=True)
keepers_o = array(keepers_o)
In [15]:
from Processing.ProcessClinicalDataPortal import update_clinical_object
p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
path = p + '/Followup_R9/HNSC/'
clinical = update_clinical_object(clinical, path)
clinical.clinical.shape
Out[15]:
In [16]:
#hpv = clinical.hpv
surv = clinical.survival.survival_5y
age = clinical.clinical.age.astype(float)
old = pd.Series(1.*(age>=75), name='old')
In [139]:
p = '/cellar/users/agross/TCGA_Code/TCGA/Data'
f = p + '/MAFs/PR_TCGA_HNSC_PAIR_Capture_All_Pairs_QCPASS_v4.aggregated.capture.tcga.uuid.automated.somatic.maf.txt'
mut_new = pd.read_table(f, skiprows=4, low_memory=False)
keep = (mut_new.Variant_Classification.isin(['Silent', 'Intron', "3'UTR", "5'UTR"])==False)
mut_new = mut_new[keep]
mut_new['barcode'] = mut_new.Tumor_Sample_Barcode.map(lambda s: s[:12])
mut_new = mut_new.groupby(['barcode','Hugo_Symbol']).size().unstack().fillna(0).T
mut_new = mut.df.combine_first(mut_new).fillna(0)
In [214]:
gistic = FH.get_gistic_gene_matrix(run.data_path, cancer.name)
del_3p = gistic.ix['3p14.2'].median(0)
del_3p.name = '3p_deletion'
In [18]:
p = '/cellar/users/agross/TCGA_Code/TCGA/'
hpv_all = pd.read_csv(p + '/Extra_Data/hpv_summary_3_20_13_distribute.csv', index_col=0)
hpv = hpv_all.Molecular_HPV.map({0:'HPV-', 1:'HPV+'})
hpv.name = 'HPV'
hpv_seq = hpv
In [19]:
status = clinical.clinical[['hpvstatusbyishtesting','hpvstatusbyp16testing']]
status = status.replace('[Not Evaluated]', nan)
hpv_clin = (status.dropna() == 'Positive').sum(1)
hpv_clin = hpv_clin.map({2: 'HPV+', 0:'HPV-', 1:nan}).dropna()
In [20]:
hpv_clin.value_counts()
Out[20]:
In [21]:
hpv_clin.ix[hpv_clin.index.diff(hpv_seq.index)].value_counts()
Out[21]:
In [22]:
hpv_new = pd.read_table(p + '/Data/Followup_R6/HNSC/auxiliary_hnsc.txt',
skiprows=[1], index_col=0, na_values=['[Not Available]'])
hpv_new = hpv_new['hpv_status']
In [23]:
hpv = (hpv_seq.dropna() == 'HPV+').combine_first(hpv_new == 'Positive')
hpv.name = 'HPV'
In [24]:
hpv.value_counts()
Out[24]:
In [25]:
n = ti(hpv==False)
In [26]:
fisher_exact_test(del_3p<0, mut_new.ix['TP53'].ix[n.diff(keepers_o)]>0)
Out[26]: