In [1]:
import os
import sys
import pandas as pd
import scipy
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg
sys.path.insert(0, os.path.join('..', 'scripts', 'util'))
from tcga_util import integrate_copy_number
In [2]:
%matplotlib inline
plt.style.use('seaborn-notebook')
In [3]:
# Load RNAseq matrix
expr_file = os.path.join('..', 'data', 'pancan_rnaseq_freeze.tsv')
rnaseq_full_df = pd.read_table(expr_file, index_col=0)
In [4]:
# Load Mutation matrix
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')
mutation_df = pd.read_table(mut_file, index_col=0)
In [5]:
# Load sample freeze data and cancer genes
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
sample_freeze = pd.read_table(sample_freeze_file, index_col=0)
cancer_gene_file = os.path.join('..', 'data', 'vogelstein_cancergenes.tsv')
cancer_genes = pd.read_table(cancer_gene_file)
In [6]:
# Load copy number data to determine final status matrix
copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv')
copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv')
copy_loss_df = pd.read_table(copy_loss_file, index_col=0)
copy_gain_df = pd.read_table(copy_gain_file, index_col=0)
In [7]:
# Load Coefficients File
coef_file = os.path.join('..', 'classifiers', 'RAS', 'classifier_coefficients.tsv')
coef_df = pd.read_table(coef_file, index_col=0)
In [8]:
# Process y matrix
genes = ['KRAS', 'HRAS', 'NRAS']
y = mutation_df[genes]
y_df = integrate_copy_number(y=y, cancer_genes_df=cancer_genes,
genes=genes, loss_df=copy_loss_df,
gain_df=copy_gain_df)
y_df = y_df.assign(total_status=y.max(axis=1))
y_df = y_df.reset_index().merge(sample_freeze,
how='left').set_index('SAMPLE_BARCODE')
In [9]:
y_df['total_status'].value_counts()
Out[9]:
In [10]:
# Write out files for easy use in R DEG analysis
ras_status_file = os.path.join('..', 'data', 'Ras_sample_status.tsv')
y_df.to_csv(ras_status_file, sep='\t')
In [11]:
ras_mad_genes_file = os.path.join('..', 'data', 'RNAseq_scaled_all_genes.tsv')
x_df = rnaseq_full_df.dropna(axis=1)
x_df_update = StandardScaler().fit_transform(x_df)
x_df_update = pd.DataFrame(x_df_update, columns=x_df.columns, index=x_df.index)
x_df = x_df_update
x_df.to_csv(ras_mad_genes_file, sep='\t')
In [12]:
x_df.shape
Out[12]:
In [13]:
# Get two arrays to compare
ras_wt_samples = y_df[y_df['total_status'] == 0].index
ras_mut_samples = y_df[y_df['total_status'] == 1].index
x_wt_df = x_df.loc[ras_wt_samples, :]
x_mut_df = x_df.loc[ras_mut_samples, :]
In [14]:
ttest_results = scipy.stats.ttest_ind(x_mut_df, x_wt_df)
t_stat = ttest_results.statistic
p_val = ttest_results.pvalue
In [15]:
ttest_df = pd.DataFrame(t_stat, columns=['stat'])
ttest_df = ttest_df.assign(pval = p_val)
ttest_df = ttest_df.assign(gene = x_wt_df.columns)
In [16]:
plot_df = pd.merge(ttest_df, coef_df, left_on='gene', right_on='feature')
plot_df.head(2)
Out[16]:
In [17]:
p = (gg.ggplot(plot_df, gg.aes(x='weight', y='stat')) +
gg.geom_point(size=4, alpha=0.6) +
gg.theme_seaborn(style='whitegrid') +
gg.xlab('Ras Classifier Weight') +
gg.ylab('Differential Expression Score') +
gg.ggtitle('') +
gg.theme(
plot_title=gg.element_text(size=22),
axis_title_x=gg.element_text(size=16),
axis_title_y=gg.element_text(size=16),
axis_text_x=gg.element_text(size=14),
axis_text_y=gg.element_text(size=14),
axis_ticks_length=4,
legend_position=(1.0, 0.5),
legend_background=gg.element_blank(),
legend_key=gg.element_rect(fill='white'),
legend_text=gg.element_text(size=9),
legend_title=gg.element_text(size=12),
panel_border=gg.element_blank(),
panel_grid_major=gg.element_blank(),
panel_grid_minor=gg.element_blank()))
p
Out[17]: