We sought to validate the Ras classifier trained on TCGA pan-cancer data by generating predictions on cell line data. A good classifier should generalize to predicting Ras status in other samples. We apply the classifier on two datasets:
These data were accessed via publicly available resources with help from links in the UCSD-CCAL Onco-GPS github repository
In [1]:
import os
import numpy as np
import pandas as pd
from decimal import Decimal
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_chisquare
from sklearn.preprocessing import StandardScaler
from Bio.SeqUtils import IUPACData
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg
In [2]:
# Store protein change dictionary
aa = IUPACData.protein_letters_1to3_extended
In [3]:
%matplotlib inline
In [4]:
classifier_file = os.path.join('..', 'classifiers', 'RAS', 'classifier_summary.txt')
with open(classifier_file) as class_fh:
for line in class_fh:
line = line.strip().split('\t')
if line[0] == 'Coefficients:':
all_coef_df = pd.read_table(os.path.join('..', line[1]), index_col=0)
# Only non-zero coefficients contribute to model performance
coef_df = all_coef_df[all_coef_df['abs'] > 0]
print(coef_df.shape)
coef_df.head(10)
Out[4]:
In [5]:
kras_file = 'https://raw.githubusercontent.com/UCSD-CCAL/onco_gps_paper_analysis/master/data/gene_x_kras_isogenic_and_imortalized_celllines.gct'
kras_cellline_df = pd.read_table(kras_file, skiprows=2, index_col=0)
print(kras_cellline_df.shape)
kras_cellline_df.head()
Out[5]:
In [6]:
# Determine the extent of coefficient overlap
common_genes = list(set(coef_df['feature']) & set(kras_cellline_df.index))
common_coef = coef_df[coef_df['feature'].isin(common_genes)]
print('There are a total of {} out of {} genes in common between the datasets'
.format(common_coef.shape[0], coef_df.shape[0]))
In [7]:
# Subset cell line data and reorder
kras_cellline_df = kras_cellline_df.loc[common_coef['feature'], kras_cellline_df.columns[1:]]
print(kras_cellline_df.shape)
In [8]:
# Which Genes are Missing?
missing_genes = list(set(coef_df['feature']).difference(set(kras_cellline_df.index)))
all_coef_df[all_coef_df['feature'].isin(missing_genes)]
Out[8]:
In [9]:
# Transform the cell line data by z-score
scaled_fit = StandardScaler().fit(kras_cellline_df.T)
kras_cellline_df = pd.DataFrame(scaled_fit.transform(kras_cellline_df.T),
index=kras_cellline_df.columns,
columns=kras_cellline_df.index)
In [10]:
# Get the weights ready for applying the classifier
apply_weights = pd.DataFrame(common_coef['weight'])
apply_weights.index = common_coef.feature
In [11]:
# Apply a logit transform [y = 1/(1+e^(-wX))] to output probabilities
result = apply_weights.T.dot(kras_cellline_df.T)
result = 1 / (1 + np.exp(-1 * result))
In [12]:
result.T.sort_values(by='weight')
Out[12]:
In [13]:
# Mutation status from Onco-GPS repository
output = result.T.assign(Ras_Mutation = [1, 1, 1, 1, 1, 1, -1, -1, -1, -1])
output = output.assign(sample_name = output.index)
output = output.assign(dummy_y = 0)
output
Out[13]:
In [14]:
# Perform a t-test to determine if weights are significantly different
ras_geo_mutant = output[output['Ras_Mutation'] == 1]
ras_geo_wt = output[output['Ras_Mutation'] == -1]
# Output t-test results
t_results_geo_ras = ttest_ind(a = ras_geo_mutant['weight'],
b = ras_geo_wt['weight'], equal_var = False)
print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_ras[0],
Decimal(t_results_geo_ras[1])))
In [15]:
p = (gg.ggplot(output,
gg.aes(x='weight', y='dummy_y', color='factor(Ras_Mutation)')) +
gg.geom_hline(gg.aes(yintercept=0), linetype='solid') +
gg.geom_point(size=4) +
gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['Wild-Type', 'Mutant']) +
gg.ylim([-0.1, 0.1]) +
gg.xlim([-0.001, 1.001]) +
gg.theme_seaborn(style='whitegrid') +
gg.xlab('Ras Classifier Score') +
gg.ylab('') +
gg.labs(color='Ras Status') +
gg.ggtitle('GSE94937 Cell Lines\n') +
gg.theme(
plot_title=gg.element_text(size=22),
axis_title_x=gg.element_text(size=16),
axis_text_x=gg.element_text(size=14),
axis_text_y=gg.element_blank(),
axis_ticks_length=4,
axis_ticks_major_y=gg.element_blank(),
axis_ticks_minor_y=gg.element_blank(),
axis_ticks_minor_x=gg.element_blank(),
legend_position=(1.0, 0.5),
legend_background=gg.element_blank(),
legend_key=gg.element_rect(fill='white'),
legend_text=gg.element_text(size=9),
legend_title=gg.element_text(size=12),
panel_border=gg.element_blank(),
panel_grid_major=gg.element_blank(),
panel_grid_minor=gg.element_blank()))
kras_fig_file = os.path.join('..', 'figures', 'cell_line', 'kras_cell_line_predictions.pdf')
p.save(kras_fig_file, width=6, height=0.5)
p
Out[15]:
In [16]:
ccle_file_name = os.path.join('..', '..', 'onco-gps-paper-analysis', 'data',
'rpkm__gene_x_ccle_cellline.gct')
ccle_df = pd.read_table(ccle_file_name, skiprows=2, index_col=0)
print(ccle_df.shape)
ccle_df.head()
Out[16]:
In [17]:
# Subset to common genes in the classifier and CCLE data
common_genes = list(set(coef_df['feature']) & set(ccle_df.index))
common_ccle_coef = coef_df[coef_df['feature'].isin(common_genes)]
print(common_ccle_coef.shape)
common_ccle_coef.head()
Out[17]:
In [18]:
ccle_df = ccle_df.loc[common_ccle_coef['feature'], ccle_df.columns[1:]]
In [19]:
scaled_fit = StandardScaler().fit(ccle_df.T)
ccle_df = pd.DataFrame(scaled_fit.transform(ccle_df.T),
index=ccle_df.columns,
columns=ccle_df.index)
In [20]:
# Apply a logit transform [y = 1/(1+e^(-wX))] to output probabilities
result_ccle = apply_weights.T.dot(ccle_df.T)
result_ccle = 1 / (1 + np.exp(-1 * result_ccle))
In [21]:
# Distribution of predictions of the Ras Classifier applied to CCLE data
result_ccle.T.hist();
In [22]:
# Load CCLE Mutation Data
ccle_mut_file_name = os.path.join('..', '..', 'onco-gps-paper-analysis', 'data',
'mutation__gene_x_ccle_cellline.gct')
ccle_all_mut_df = pd.read_table(ccle_mut_file_name, skiprows=2, index_col=0)
ccle_all_mut_df.shape
Out[22]:
In [23]:
# Load CCLE Variant Data
ccle_maf_file = 'https://data.broadinstitute.org/ccle/CCLE_DepMap_18Q1_maf_20180207.txt'
ccle_maf_df = pd.read_table(ccle_maf_file, index_col=15)
print(ccle_maf_df.shape)
ccle_maf_df.head(2)
Out[23]:
In [24]:
# Identify all cell lines with mutations in Ras genes, also subset BRAF mutant samples
ras_genes = ['KRAS_MUT', 'HRAS_MUT', 'NRAS_MUT']
ras_status = ccle_all_mut_df.loc[ras_genes, :].T.apply(max, axis=1)
# BRAF mutations do not contribute to Ras status in this case
ccle_mut_df = (
ccle_all_mut_df.loc[ras_genes + ['BRAF_MUT'], :].T
.assign(ras_status=ras_status).drop(['Description'])
)
print(ccle_mut_df.shape)
ccle_mut_df.head(5)
Out[24]:
In [25]:
# Join classifier scores with mutation status
ccle_full_df = ccle_mut_df.join(result_ccle.T).dropna()
ccle_full_df = ccle_full_df.assign(sample_name = ccle_full_df.index)
ccle_full_df = ccle_full_df.sort_values(by='weight', ascending=False)
ccle_full_df.index.name = 'cell_line'
print(ccle_full_df.shape)
ccle_full_df.head(5)
Out[25]:
In [26]:
# Write CCLE Scores to file
ccle_scores_file = os.path.join('..', 'results', 'ccle_Ras_classifier_scores.tsv')
ccle_full_df.to_csv(ccle_scores_file, sep='\t')
In [27]:
# Use Seaborn for the 2nd plot
sns.set(style="whitegrid")
sns.set_context("paper", rc={"font.size":11, "axes.titlesize":11, "axes.labelsize":16,
'xtick.labelsize':11, 'ytick.labelsize':11})
In [28]:
# Ras mutant vs. Ras wildtype
ras_mutant = ccle_full_df[ccle_full_df['ras_status'] == 1]
ras_wt = ccle_full_df[ccle_full_df['ras_status'] == 0]
# Also interested in BRAF status within Ras wildtype samples
braf_mutant = ras_wt[ras_wt['BRAF_MUT'] == 1]
braf_wt = ras_wt[ras_wt['BRAF_MUT'] == 0]
# Output t-test results
t_results_ras = ttest_ind(a = ras_mutant['weight'],
b = ras_wt['weight'], equal_var = False)
print('Ras Status:')
print(t_results_ras)
t_results_braf = ttest_ind(a = braf_mutant['weight'],
b = braf_wt['weight'], equal_var = False)
print('\nBRAF Status in Ras Wild-Type Samples:')
print(t_results_braf)
In [29]:
# Plot Results
x1, x2 = 0, 1
x3, x4 = -0.2, 0.2
y1, y2, h = 1.17, 1, 0.03
plt.rcParams['figure.figsize']=(3.5, 4)
ax = sns.boxplot(x="ras_status", y="weight", data=ccle_full_df,
hue='BRAF_MUT', palette = {0: "whitesmoke", 1: 'gainsboro'},
fliersize=0)
ax = sns.stripplot(x='ras_status', y='weight', hue='BRAF_MUT',
data=ccle_full_df,
dodge=True, edgecolor='gray',
palette = {1: "seagreen", 0: 'goldenrod'},
jitter=0.25, size=2, alpha=0.65)
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[2:4], ['Wild-Type', 'Mutant'], bbox_to_anchor=(.63, 0.2), loc=2, borderaxespad=0.)
l.set_title("BRAF")
ax.axes.set_ylim(0, 1.3)
ax.set_yticklabels([0, 0.2, 0.4, 0.6, 0.8, 1, ''])
ax.set_xticklabels(['Ras Wild-Type', 'Ras Mutant'])
ax.set_ylabel('Ras Classifier Score')
ax.set_xlabel('CCLE Data')
ax.legend
plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)
# Add Ras T-Test Results
plt.plot([x1, x1, x2, x2], [y1, y1+h, y1+h, y1], lw=1.2, c='black')
plt.text(.5, y1+h, "{:.2E}".format(Decimal(t_results_ras.pvalue)),
ha='center', va='bottom', color="black")
# Add BRAF t-test results
plt.plot([x3, x3, x4, x4], [y2, y2+h, y2+h, y2], lw=1.2, c='black')
plt.text(0, y2+h, "{:.2E}".format(Decimal(t_results_braf.pvalue)),
ha='center', va='bottom', color="black")
plt.tight_layout()
ccle_fig_file = os.path.join('..', 'figures', 'cell_line', 'ccle_predictions.pdf')
plt.savefig(ccle_fig_file)
In [30]:
# Assign a label to what the predictions are given classifier scores
ccle_full_df = ccle_full_df.assign(predictions = 'wild-type')
ccle_full_df.loc[ccle_full_df['weight'] > 0.5, 'predictions'] = 'mutant'
In [31]:
# Stratify cell lines based on predictions and ground truth status
positive_ras_predictions_ccle = ccle_full_df[ccle_full_df['weight'] > 0.5]
negative_ras_predictions_ccle = ccle_full_df[ccle_full_df['weight'] <= 0.5]
positive_ras_lines_ccle = ccle_full_df[ccle_full_df['ras_status'] == 1]
negative_ras_lines_ccle = ccle_full_df[ccle_full_df['ras_status'] == 0]
In [32]:
# Of wild-type Ras cell lines, how many are predicted correctly?
# True Negative Rate, Specificity
negative_ras_lines_ccle['predictions'].value_counts()
Out[32]:
In [33]:
# Of mutated Ras cell lines, how many are predicted correctly?
# True Positive Rate (TPR), Recall, Sensitivity
positive_ras_lines_ccle['predictions'].value_counts()
Out[33]:
In [34]:
# Of the wild-type predictions, how many are actually wild-type?
# Negative Predictive Value (NPV)
neg_ccle_results = negative_ras_predictions_ccle['ras_status'].value_counts()
true_neg = neg_ccle_results[0]
predicted_condition_neg = neg_ccle_results.sum()
print('{} out of {} Ras wild-type predictions '
'are true ({:.1f}%)'.format(true_neg, predicted_condition_neg,
true_neg * 100 / predicted_condition_neg))
In [35]:
# Of the mutated predictions, how many are actually mutated?
# Positive Predictive Value (PPV) -or- precision
pos_ccle_results = positive_ras_predictions_ccle['ras_status'].value_counts()
false_pos, true_pos = pos_ccle_results
predicted_condition_pos = pos_ccle_results.sum()
print('{} out of {} Ras mutation predictions '
'are true ({:.1f}%)'.format(true_pos, predicted_condition_pos,
true_pos * 100 / predicted_condition_pos))
In [36]:
total_correct = true_pos + true_neg
print('{} of {} Total cell lines '
'predicted correctly ({:.1f}%)'.format(total_correct, ccle_full_df.shape[0],
total_correct * 100 / ccle_full_df.shape[0]))
In [37]:
# Of the False positives, how many are BRAF mutant?
wt_ras_braf_ccle = positive_ras_predictions_ccle[positive_ras_predictions_ccle['ras_status'] == 0]
braf_neg, braf_pos = wt_ras_braf_ccle['BRAF_MUT'].value_counts()
print('{} of {} total false positives '
'have BRAF mutations ({:.1f}%)'.format(braf_pos, false_pos,
braf_pos * 100 / false_pos))
In [38]:
# If include BRAF mutations, how many correct
correct_braf = wt_ras_braf_ccle['BRAF_MUT'].value_counts()[1]
true_pos_with_braf = true_pos + correct_braf
print('Including BRAF mutants, {} of {} Ras mutation predictions '
'have Ras pathway mutations ({:.1f}%)'.format(true_pos_with_braf,
predicted_condition_pos,
true_pos_with_braf * 100 / predicted_condition_pos))
In [39]:
print('Of the false positives, there are {} BRAF mutated cell lines '
'and {} BRAF wild-type cell lines'.format(braf_pos, braf_neg))
In [40]:
total_braf_wildtype, total_braf_mut = ccle_full_df['BRAF_MUT'].value_counts()
print('In all of CCLE, there are {} BRAF mutated cell lines '
'and {} BRAF wild-type cell lines'.format(total_braf_mut, total_braf_wildtype))
In [41]:
# How many MAF tumors also have CCLE and mutation data?
len(set(ccle_full_df.index).intersection(set(ccle_maf_df.index)))
Out[41]:
In [42]:
# Of the false negaves, what RAS mutations do they harbor?
false_negatives_df = negative_ras_predictions_ccle.query('ras_status == 1')
common_neg_ccle_samples = set(ccle_maf_df.index).intersection(set(false_negatives_df.index))
false_neg_maf_df = ccle_maf_df.loc[list(common_neg_ccle_samples), :]
In [43]:
# What about true positives?
true_positives_df = positive_ras_predictions_ccle[positive_ras_predictions_ccle['ras_status'] == 1]
common_pos_ccle_samples = set(ccle_maf_df.index).intersection(set(true_positives_df.index))
true_pos_maf = ccle_maf_df.loc[list(common_pos_ccle_samples), :]
In [44]:
# Subset to only Ras genes
false_neg_ras = false_neg_maf_df.query('Hugo_Symbol in ["KRAS", "HRAS", "NRAS"]')
tru_pos_ras = true_pos_maf.query('Hugo_Symbol in ["KRAS", "HRAS", "NRAS"]')
In [45]:
# Remove duplicate cell-lines. Assume 1 cosmic variant supercedes lack of cosmic
false_neg_dup = false_neg_ras.groupby('Tumor_Sample_Barcode')['isCOSMIChotspot']
cosmic_false_neg = (
false_neg_dup.value_counts()
.reset_index(name='count')
.sort_values(by='isCOSMIChotspot')
.drop_duplicates(subset='Tumor_Sample_Barcode', keep='last')
)
n_false_neg_cosmic = cosmic_false_neg['isCOSMIChotspot'].value_counts()[1]
n_false_neg_obs = cosmic_false_neg['isCOSMIChotspot'].shape[0]
In [46]:
# What is the proportion of COSMIC variant (True) to non-COSMIC variant (False)
print(cosmic_false_neg['isCOSMIChotspot'].value_counts())
cosmic_false_neg['isCOSMIChotspot'].value_counts(normalize=True)
Out[46]:
In [47]:
# Remove duplicate cell-lines. Assume 1 cosmic variant supercedes lack of cosmic
tru_pos_dup = tru_pos_ras.groupby('Tumor_Sample_Barcode')['isCOSMIChotspot']
cosmic_true_pos = (
tru_pos_dup.value_counts()
.reset_index(name='count')
.sort_values(by='isCOSMIChotspot')
.drop_duplicates(subset='Tumor_Sample_Barcode', keep='last')
)
n_true_pos_cosmic = cosmic_true_pos['isCOSMIChotspot'].value_counts()[1]
n_tru_pos_obs = cosmic_true_pos['isCOSMIChotspot'].shape[0]
In [48]:
print(cosmic_true_pos['isCOSMIChotspot'].value_counts())
cosmic_true_pos['isCOSMIChotspot'].value_counts(normalize=True)
Out[48]:
In [49]:
# Test if the proportions of observed COSMIC variants are significantly different
# between true positives and false negatives. All of these samples have Ras mutations.
# The question is asking if the proportion of Ras variants annotated in the COSMIC
# database is lower in False negative tumors (those the classifier predicted as
# Ras wild-type) than in True positive tumors
cosmic_prop_chi = proportions_chisquare(count = [n_false_neg_cosmic, n_true_pos_cosmic],
nobs = [n_false_neg_obs, n_tru_pos_obs])
print('Chi Square = {}, p value = {}'.format(cosmic_prop_chi[0], cosmic_prop_chi[1]))
print('There is a significant difference in the expected proportion of COSMIC variants.')
In [50]:
# Load TCGA PanCanAtlas Core Ras Pathway genes
ras_genes_file = os.path.join('..', 'data', 'ras_genes.csv')
ras_core_df = pd.read_table(ras_genes_file)
ras_core_df.head()
Out[50]:
In [51]:
# Subset MAF file to Ras pathway variants and merge with CCLE classifier scores
ras_pathway_genes = ras_core_df['genes'].tolist()
all_common_lines = set(ccle_maf_df.index).intersection(set(ccle_full_df.index))
# Subset to common cell lines
subset_maf = ccle_maf_df.loc[list(all_common_lines), :]
subset_maf = (
subset_maf.query('Hugo_Symbol in @ras_pathway_genes')
.loc[:, ['Hugo_Symbol', 'Protein_Change', 'cDNA_Change']]
.merge(ccle_full_df, left_index=True, right_index=True)
)
subset_maf.head(3)
Out[51]:
In [52]:
# Get the mean classifier scores for CCLE nucleotide variants
mean_nuc_data = (
pd.DataFrame(subset_maf
.groupby(['cDNA_Change', 'Hugo_Symbol'])['weight']
.mean())
)
mean_nuc_data.columns = ['ccle_mean_weight']
mean_nuc_data = mean_nuc_data.reset_index()
# Get the sd classifier scores for CCLE variants
sd_nuc_data = (
pd.DataFrame(subset_maf
.groupby(['cDNA_Change', 'Hugo_Symbol'])['weight']
.std())
)
sd_nuc_data.columns = ['ccle_sd_weight']
sd_nuc_data = sd_nuc_data.reset_index()
# Counts of CCLE variants altering amino acids
count_nuc_data = (
pd.DataFrame(subset_maf
.groupby(['cDNA_Change', 'Hugo_Symbol'])['weight']
.count())
)
count_nuc_data.columns = ['ccle_count']
count_nuc_data = count_nuc_data.reset_index()
In [53]:
# Merge protein data
nuc_merge_on = ['Hugo_Symbol', 'cDNA_Change']
nuc_change_df = (
mean_nuc_data.merge(sd_nuc_data,
left_on=nuc_merge_on, right_on=nuc_merge_on)
.merge(count_nuc_data, left_on=nuc_merge_on, right_on=nuc_merge_on)
)
nuc_change_df.sort_values('ccle_count').tail(5)
Out[53]:
In [54]:
data_s4_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'nucleotide_mutation_scores.tsv')
data_s4_df = pd.read_table(data_s4_file)
# Merge the CCLE nucleotide scores
data_s4_df = data_s4_df.merge(nuc_change_df, left_on = ['Hugo_Symbol', 'HGVSc'],
right_on = ['Hugo_Symbol', 'cDNA_Change'],
how='outer')
updated_data_s4_df = data_s4_df.sort_values(by='count', ascending=False)
updated_data_s4_file = os.path.join('..', 'tables', 'updated_Data_S4.csv')
updated_data_s4_df.to_csv(updated_data_s4_file, sep=',', index=False)
updated_data_s4_df.head()
Out[54]:
In [55]:
# Get the mean classifier scores for CCLE variants
mean_protein_data = (
pd.DataFrame(subset_maf
.groupby(['Protein_Change', 'Hugo_Symbol'])['weight']
.mean())
)
mean_protein_data.columns = ['ccle_mean_weight']
mean_protein_data = mean_protein_data.reset_index()
# Get the sd classifier scores for CCLE variants
sd_protein_data = (
pd.DataFrame(subset_maf
.groupby(['Protein_Change', 'Hugo_Symbol'])['weight']
.std())
)
sd_protein_data.columns = ['ccle_sd_weight']
sd_protein_data = sd_protein_data.reset_index()
# Counts of CCLE variants altering amino acids
count_protein_data = (
pd.DataFrame(subset_maf
.groupby(['Protein_Change', 'Hugo_Symbol'])['weight']
.count())
)
count_protein_data.columns = ['ccle_count']
count_protein_data = count_protein_data.reset_index()
In [56]:
# Merge protein data
merge_on = ['Hugo_Symbol', 'Protein_Change']
protein_change_df = (
mean_protein_data.merge(sd_protein_data,
left_on=merge_on, right_on=merge_on)
.merge(count_protein_data, left_on=merge_on, right_on=merge_on)
)
protein_change_df.sort_values('ccle_count').tail(5)
Out[56]:
In [57]:
# Convert amino acid to 3 letters
protein_convert = [''.join([aa[x] if x in aa.keys() else x for x in y])
for y in protein_change_df['Protein_Change']]
protein_change_df = protein_change_df.assign(conversion = protein_convert)
In [58]:
data_s5_file = os.path.join('..', 'classifiers', 'RAS', 'tables',
'amino_acid_mutation_scores.tsv')
data_s5_df = pd.read_table(data_s5_file)
# Merge the CCLE protein scores
data_s5_df = data_s5_df.merge(protein_change_df, left_on = ['Hugo_Symbol', 'HGVSp'],
right_on = ['Hugo_Symbol', 'conversion'],
how='outer')
# Sort by the total number of mutations observed
updated_data_s5_df = (
data_s5_df.drop(['Protein_Change'], axis=1).sort_values(by='count', ascending=False)
)
updated_data_s5_file = os.path.join('..', 'tables', 'updated_Data_S5.csv')
updated_data_s5_df.to_csv(updated_data_s5_file, sep=',', index=False)
updated_data_s5_df.head()
Out[58]:
Here, we process drug efficacy data on the CCLE dataset. Data obtained from https://portals.broadinstitute.org/ccle/data (Pharmacologic profiling) (signin required).
A processed .tsv file is output to be visualized in scripts/viz/ras_ccle_pharmacology.R.
In [59]:
# Load in pharmacological results
pharm_file = os.path.join('..', 'data', 'CCLE_NP24.2009_Drug_data_2015.02.24.csv')
pharm_df = pd.read_csv(pharm_file, index_col=0)
pharm_df = pharm_df.assign(tissue = [' '.join(x[1:]) for x in pharm_df.index.str.split('_')])
pharm_full_df = pharm_df.merge(ccle_full_df, left_index=True, right_index=True)
pharm_full_df.head()
Out[59]:
In [60]:
common_celllines_pharm = set(ccle_full_df.index).intersection(set(pharm_df.index))
print('There are {} cell lines in common'.format(len(common_celllines_pharm)))
pharm_full_df['Compound'].value_counts()
Out[60]:
In [61]:
pharm_full_df['tissue'].value_counts()
Out[61]:
In [62]:
# What is the cell line tissue representation?
compound_heatmap = pd.pivot_table(pharm_full_df[['tissue', 'Compound']],
columns='tissue', index='Compound',
aggfunc=len)
compound_heatmap = pd.DataFrame(compound_heatmap.unstack()).reset_index()
compound_heatmap.columns = ['tissue', 'Compound', 'count']
compound_heatmap = compound_heatmap.sort_values(by=['tissue', 'Compound'])
In [63]:
gg.ggplot(compound_heatmap, gg.aes('factor(tissue)', 'factor(Compound)', fill='count')) + \
gg.geom_tile(gg.aes(width=.95, height=.95)) + \
gg.theme(axis_text_x=gg.element_text(rotation=90),
panel_background=gg.element_rect(fill='white'))
Out[63]:
In [64]:
# Write out pharm_full_df to file to plot in ggplot2
# plotnine does not include all the functionality required to create the plot
pharm_file = os.path.join('..', 'data', 'pharmacology_predictions_ccle.tsv')
pharm_full_df.to_csv(pharm_file, sep='\t')