Ras Differential Expression Analysis

Perform a t-test across all genes to determine differentially expressed genes between Ras wildtype and Ras mutant genes. Compare with what the model learns and output the data for visualization with R/ggplot2.



In [1]:

    
import os
import sys
import pandas as pd
import scipy
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg

sys.path.insert(0, os.path.join('..', 'scripts', 'util'))
from tcga_util import integrate_copy_number









    



/home/gway/anaconda3/lib/python3.5/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools



In [2]:

    
%matplotlib inline
plt.style.use('seaborn-notebook')

Load Data

Loading RNAseq (X matrix), copy number and mutation (Y matrix), and coefficients data (W matrix from machine learning model)



In [3]:

    
# Load RNAseq matrix
expr_file = os.path.join('..', 'data', 'pancan_rnaseq_freeze.tsv')
rnaseq_full_df = pd.read_table(expr_file, index_col=0)



In [4]:

    
# Load Mutation matrix
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
mutation_df = pd.read_table(mut_file, index_col=0)



In [5]:

    
# Load sample freeze data and cancer genes
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
sample_freeze = pd.read_table(sample_freeze_file, index_col=0)

cancer_gene_file = os.path.join('..', 'data', 'vogelstein_cancergenes.tsv')
cancer_genes = pd.read_table(cancer_gene_file)



In [6]:

    
# Load copy number data to determine final status matrix
copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv')
copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv')

copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
copy_gain_df = pd.read_table(copy_gain_file, index_col=0)



In [7]:

    
# Load Coefficients File
coef_file = os.path.join('..', 'classifiers', 'RAS', 'classifier_coefficients.tsv')
coef_df = pd.read_table(coef_file, index_col=0)

Process and Prep Data for Differential Expression Analysis



In [8]:

    
# Process y matrix
genes = ['KRAS', 'HRAS', 'NRAS']
y = mutation_df[genes]

y_df = integrate_copy_number(y=y, cancer_genes_df=cancer_genes,
                             genes=genes, loss_df=copy_loss_df,
                             gain_df=copy_gain_df)

y_df = y_df.assign(total_status=y.max(axis=1))
y_df = y_df.reset_index().merge(sample_freeze,
                          how='left').set_index('SAMPLE_BARCODE')



In [9]:

    
y_df['total_status'].value_counts()









    Out[9]:





0    8008
1    1066
Name: total_status, dtype: int64



In [10]:

    
# Write out files for easy use in R DEG analysis
ras_status_file = os.path.join('..', 'data', 'Ras_sample_status.tsv')
y_df.to_csv(ras_status_file, sep='\t')



In [11]:

    
ras_mad_genes_file = os.path.join('..', 'data', 'RNAseq_scaled_all_genes.tsv')

x_df = rnaseq_full_df.dropna(axis=1)
x_df_update = StandardScaler().fit_transform(x_df)
x_df_update = pd.DataFrame(x_df_update, columns=x_df.columns, index=x_df.index)
x_df = x_df_update

x_df.to_csv(ras_mad_genes_file, sep='\t')



In [12]:

    
x_df.shape









    Out[12]:





(9074, 20500)

Example Naive Analysis using simple unadjusted t-tests



In [13]:

    
# Get two arrays to compare
ras_wt_samples = y_df[y_df['total_status'] == 0].index
ras_mut_samples = y_df[y_df['total_status'] == 1].index

x_wt_df = x_df.loc[ras_wt_samples, :]
x_mut_df = x_df.loc[ras_mut_samples, :]



In [14]:

    
ttest_results = scipy.stats.ttest_ind(x_mut_df, x_wt_df)
t_stat = ttest_results.statistic
p_val = ttest_results.pvalue









    



/home/gway/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in greater
  return (self.a < x) & (x < self.b)
/home/gway/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:879: RuntimeWarning: invalid value encountered in less
  return (self.a < x) & (x < self.b)
/home/gway/anaconda3/lib/python3.5/site-packages/scipy/stats/_distn_infrastructure.py:1818: RuntimeWarning: invalid value encountered in less_equal
  cond2 = cond0 & (x <= self.a)



In [15]:

    
ttest_df = pd.DataFrame(t_stat, columns=['stat'])
ttest_df = ttest_df.assign(pval = p_val)
ttest_df = ttest_df.assign(gene = x_wt_df.columns)



In [16]:

    
plot_df = pd.merge(ttest_df, coef_df, left_on='gene', right_on='feature')
plot_df.head(2)



In [17]:

    
p = (gg.ggplot(plot_df, gg.aes(x='weight', y='stat')) +
     gg.geom_point(size=4, alpha=0.6) +
     gg.theme_seaborn(style='whitegrid') +
     gg.xlab('Ras Classifier Weight') +
     gg.ylab('Differential Expression Score') +
     gg.ggtitle('') +
     gg.theme(
        plot_title=gg.element_text(size=22),
        axis_title_x=gg.element_text(size=16),
        axis_title_y=gg.element_text(size=16),
        axis_text_x=gg.element_text(size=14),
        axis_text_y=gg.element_text(size=14),
        axis_ticks_length=4,
        legend_position=(1.0, 0.5),
        legend_background=gg.element_blank(),
        legend_key=gg.element_rect(fill='white'),
        legend_text=gg.element_text(size=9),
        legend_title=gg.element_text(size=12),
        panel_border=gg.element_blank(),
        panel_grid_major=gg.element_blank(),
        panel_grid_minor=gg.element_blank()))
p









    












    Out[17]:





<ggplot: (-9223363283404045077)>

	stat	pval	gene	feature	weight	abs
0	-3.084877	0.002042	A2M	A2M	0.0	0.0
1	-4.428324	0.000010	A4GALT	A4GALT	0.0	0.0