This IPython notebook accompanies the chapter 'Phosphoproteomics-based profiling of kinase activities in cancer cell' in the book 'Methods of Molecular Biology: Cancer Systems Biology' from Springer, 2016.
The script aims to demonstrate the methodology of KSEA, to facilitate grasping the operations performed in the provided code, and to enable reproduction of the implementation in other programming languages where required.
In [1]:
# Import useful libraries
import numpy as np
import pandas as pd
# Import required libraries for data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
# Import the package
import kinact
# Magic
%matplotlib inline
In [4]:
# import data
data_fc, data_p_value = kinact.get_example_data()
# import prior knowledge
adj_matrix = kinact.get_kinase_targets()
print data_fc.head()
print
print data_p_value.head()
In [5]:
# Perform ksea using the Mean method
score, p_value = kinact.ksea.ksea_mean(data_fc=data_fc['5min'].dropna(),
interactions=adj_matrix,
mP=data_fc['5min'].values.mean(),
delta=data_fc['5min'].values.std())
print pd.DataFrame({'score': score, 'p_value': p_value}).head()
In [6]:
# Perform ksea using the Alternative Mean method
score, p_value = kinact.ksea.ksea_mean_alt(data_fc=data_fc['5min'].dropna(),
p_values=data_p_value['5min'],
interactions=adj_matrix,
mP=data_fc['5min'].values.mean(),
delta=data_fc['5min'].values.std())
print pd.DataFrame({'score': score, 'p_value': p_value}).head()
In [7]:
# Perform ksea using the Delta method
score, p_value = kinact.ksea.ksea_delta(data_fc=data_fc['5min'].dropna(),
p_values=data_p_value['5min'],
interactions=adj_matrix)
print pd.DataFrame({'score': score, 'p_value': p_value}).head()
In order to perform the described kinase enrichment analysis, we load the data into a Pandas DataFrame. Here, we use the data from de Graaf et al., 2014 for demonstration of KSEA. The data is available as supplemental material to the article online under http://mcponline.org/content/13/9/2426/suppl/DC1. The dataset of interest can be found in the Supplemental Table 2.
When downloading the dataset from the internet, it will be provided as Excel spreadsheet. For the use in this script, it will have to saved as csv-file, using the 'Save As' function in Excel.
In the accompanying github repository, we will provide an already processed csv-file together with the code for KSEA.
In [8]:
# Read data
data_raw = pd.read_csv('../kinact/data/deGraaf_2014_jurkat.csv', sep=',', header=0)
# Filter for those p-sites that were matched ambiguously
data_reduced = data_raw[~data_raw['Proteins'].str.contains(';')]
# Create identifier for each phosphorylation site, e.g. P06239_S59 for the Serine 59 in the protein Lck
data_reduced.loc[:, 'ID'] = data_reduced['Proteins'] + '_' + data_reduced['Amino acid'] + \
data_reduced['Positions within proteins']
data_indexed = data_reduced.set_index('ID')
# Extract only relevant columns
data_relevant = data_indexed[[x for x in data_indexed if x.startswith('Average')]]
# Rename columns
data_relevant.columns = [x.split()[-1] for x in data_relevant]
# Convert abundaces into fold changes compared to control (0 minutes after stimulation)
data_fc = data_relevant.sub(data_relevant['0min'], axis=0)
data_fc.drop('0min', axis=1, inplace=True)
# Also extract the p-values for the fold changes
data_p_value = data_indexed[[x for x in data_indexed if x.startswith('p value') and x.endswith('vs0min')]]
data_p_value.columns = [x.split('_')[-1].split('vs')[0] + 'min' for x in data_p_value]
data_p_value = data_p_value.astype('float') # Excel saved the p-values as strings, not as floating point numbers
print data_fc.head()
print data_p_value.head()
In the following example, we use the data from the PhosphoSitePlus database, which can be downloaded here: http://www.phosphosite.org/staticDownloads.action.
Consider, that the downloaded file contains a disclaimer at the top of the file, which has to be removed before the file can be used as described below.
In [9]:
# Read data
ks_rel = pd.read_csv('../kinact/data/PhosphoSitePlus.txt', sep='\t')
# The data from the PhosphoSitePlus database is not provided as comma-separated value file (csv),
# but instead, a tab = \t delimits the individual cells
# Restrict the data on interactions in the organism of interest
ks_rel_human = ks_rel.loc[(ks_rel['KIN_ORGANISM'] == 'human') & (ks_rel['SUB_ORGANISM'] == 'human')]
# Create p-site identifier of the same format as before
ks_rel_human.loc[:, 'psite'] = ks_rel_human['SUB_ACC_ID'] + '_' + ks_rel_human['SUB_MOD_RSD']
# Create adjencency matrix (links between kinases (columns) and p-sites (rows) are indicated with a 1, NA otherwise)
ks_rel_human.loc[:, 'value'] = 1
adj_matrix = pd.pivot_table(ks_rel_human, values='value', index='psite', columns='GENE', fill_value=0)
print adj_matrix.head()
print adj_matrix.sum(axis=0).sort_values(ascending=False).head()
In [10]:
score, p_value = kinact.ksea.ksea_delta(data_fc=data_fc['5min'],
p_values=data_p_value['5min'],
interactions=adj_matrix,
)
print pd.DataFrame({'score': score, 'p_value': p_value}).head()
In [11]:
# Calculate the KSEA scores for all data with the ksea_mean method
activity_mean = pd.DataFrame({c: kinact.ksea.ksea_mean(data_fc=data_fc[c],
interactions=adj_matrix,
mP=data_fc.values.mean(),
delta=data_fc.values.std())[0]
for c in data_fc})
activity_mean = activity_mean[['5min', '10min', '20min', '30min', '60min']]
print activity_mean.head()
# Calculate the KSEA scores for all data with the ksea_mean method, using the median
activity_median = pd.DataFrame({c: kinact.ksea.ksea_mean(data_fc=data_fc[c],
interactions=adj_matrix,
mP=data_fc.values.mean(),
delta=data_fc.values.std(), median=True)[0]
for c in data_fc})
activity_median = activity_median[['5min', '10min', '20min', '30min', '60min']]
print activity_median.head()
# Calculate the KSEA scores for all data with the ksea_mean_alt method
activity_mean_alt = pd.DataFrame({c: kinact.ksea.ksea_mean_alt(data_fc=data_fc[c],
p_values=data_p_value[c],
interactions=adj_matrix,
mP=data_fc.values.mean(),
delta=data_fc.values.std())[0]
for c in data_fc})
activity_mean_alt = activity_mean_alt[['5min', '10min', '20min', '30min', '60min']]
print activity_mean_alt.head()
# Calculate the KSEA scores for all data with the ksea_mean method, using the median
activity_median_alt = pd.DataFrame({c: kinact.ksea.ksea_mean_alt(data_fc=data_fc[c],
p_values=data_p_value[c],
interactions=adj_matrix,
mP=data_fc.values.mean(),
delta=data_fc.values.std(),
median=True)[0]
for c in data_fc})
activity_median_alt = activity_median_alt[['5min', '10min', '20min', '30min', '60min']]
print activity_median_alt.head()
# Calculate the KSEA scores for all data with the ksea_delta method
activity_delta = pd.DataFrame({c: kinact.ksea.ksea_delta(data_fc=data_fc[c],
p_values=data_p_value[c],
interactions=adj_matrix)[0]
for c in data_fc})
activity_delta = activity_delta[['5min', '10min', '20min', '30min', '60min']]
print activity_delta.head()
In [12]:
sns.set(context='poster', style='ticks')
sns.heatmap(activity_mean_alt, cmap=sns.blend_palette([sns.xkcd_rgb['amber'],
sns.xkcd_rgb['almost black'],
sns.xkcd_rgb['bright blue']],
as_cmap=True))
plt.show()
In de Graaf et al., they associated (amongst others) the Casein kinase II alpha (CSNK2A1) with higher activity after prolonged stimulation with prostaglandin E2. Here, we plot the activity scores of CSNK2A1 for all three methods of KSEA, which are in good agreement.
In [13]:
kinase='CSNK2A1'
df_plot = pd.DataFrame({'mean': activity_mean.loc[kinase],
'delta': activity_delta.loc[kinase],
'mean_alt': activity_mean_alt.loc[kinase]})
df_plot['time [min]'] = [5, 10, 20, 30, 60]
df_plot = pd.melt(df_plot, id_vars='time [min]', var_name='method', value_name='activity score')
g = sns.FacetGrid(df_plot, col='method', sharey=False, size=3, aspect=1)
g = g.map(sns.pointplot, 'time [min]', 'activity score')
plt.subplots_adjust(top=.82)
plt.show()
In [14]:
data_condition = data_fc['60min'].copy()
p_values = data_p_value['60min']
kinase = 'CDK1'
In [15]:
substrates = adj_matrix[kinase].replace(0, np.nan).dropna().index
detected_p_sites = data_fc.index
intersect = list(set(substrates).intersection(detected_p_sites))
In [16]:
mS = data_condition.loc[intersect].mean()
mP = data_fc.values.mean()
m = len(intersect)
delta = data_fc.values.std()
z_score = (mS - mP) * np.sqrt(m) * 1/delta
from scipy.stats import norm
p_value_mean = norm.sf(abs(z_score))
print mS, p_value_mean
In [17]:
cut_off = -np.log10(0.05)
set_alt = data_condition.loc[intersect].where(p_values.loc[intersect] > cut_off).dropna()
mS_alt = set_alt.mean()
z_score_alt = (mS_alt - mP) * np.sqrt(len(set_alt)) * 1/delta
p_value_mean_alt = norm.sf(abs(z_score_alt))
print mS_alt, p_value_mean_alt
In [18]:
cut_off = -np.log10(0.05)
score_delta = len(data_condition.loc[intersect].where((data_condition.loc[intersect] > 0) &
(p_values.loc[intersect] > cut_off)).dropna()) -\
len(data_condition.loc[intersect].where((data_condition.loc[intersect] < 0) &
(p_values.loc[intersect] > cut_off)).dropna())
M = len(data_condition)
n = len(intersect)
N = len(np.where(p_values.loc[adj_matrix.index.tolist()] > cut_off)[0])
from scipy.stats import hypergeom
hypergeom_dist = hypergeom(M, n, N)
p_value_delta = hypergeom_dist.pmf(len(p_values.loc[intersect].where(p_values.loc[intersect] > cut_off).dropna()))
print score_delta, p_value_delta
In [ ]: