In [1]:
import os
import numpy as np
import pandas as pd
from scipy import linalg
In [2]:
def load_from_taiga(data_id, index_col = None):
    '''Load dataset from taiga with caching.
    INPUTS:
        data_id: unique ID of taiga dataset
        index_col: column number to use as index
    OUTPUTS:
        loaded dataset as pandas df
    '''
    taiga_url = 'http://datasci-dev.broadinstitute.org:8999/rest/v0/datasets/'
    data_dir = os.path.expanduser('~/.taiga') #directory for caching taiga files
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    data_file = os.path.join(data_dir, data_id + ".csv")
    data_source = taiga_url + data_id + "?format=tabular_csv"
    if os.path.exists(data_file):
        print('Loading ' + data_file + ' from disk')
        data = pd.read_csv(data_file, header = 0)
    else:
        try:
            print('Loading ' + data_source + ' from taiga')
            data = pd.read_csv(data_source)
        except Exception:
            #fall back to the plain csv format if tabular_csv fails
            data_source = taiga_url + data_id + "?format=csv"
            print('Loading ' + data_source + ' from taiga')
            data = pd.read_csv(data_source)
        print('Saving to disk')
        data.to_csv(data_file, index = False)
    if index_col is not None:
        data.set_index(data.columns[index_col], inplace=True)
    return data
In [55]:
min_strong_deps = 5 #minimum number of < -2 sigma deps for using a gene
dep_data_id = "966b70bf-c414-43ad-97ec-1a8b50f91ed6" #achilles-v2-20-1-demeter-z-scores-ignoring-expression-expanded-families v2
GE_data_id = "013458ff-bb36-4fd7-ac87-dcc04b15174e" # ccle-rnaseq-gene-expression-rpkm-for-analysis-in-manuscripts-protein-coding-genes-only-hgnc-mapped v3
target_geneset_data_id = 'e714c26b-80c8-42b1-bb61-a88b7bb1b334' #target genes to be analyzed in depmap manuscript
#grab datasets from taiga
Dep = load_from_taiga(dep_data_id, index_col = 0).transpose()
GE = load_from_taiga(GE_data_id, index_col = 0)
#align GE and Dep data to common set of cell lines
used_CLs = np.intersect1d(Dep.index.values, GE.index.values)
Dep = Dep.loc[used_CLs]
GE = GE.loc[used_CLs]
#restrict target variables to those genes with at least min_strong_deps strong dependencies
n_strong_deps = np.nansum(Dep < -2, axis = 0)
Y_mat = Dep.values[:, n_strong_deps >= min_strong_deps].copy()
X_mat = GE.values.copy()
n_features = X_mat.shape[1] #number of predictors
(n_CLs, n_targets) = Y_mat.shape
print('CLs: {}, Features: {}, Targets: {}'.format(n_CLs, n_features, n_targets))
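As a quick sanity check (not part of the original analysis), the cell below verifies that the dependency and expression matrices ended up aligned to the same cell lines in the same order, and reports the fraction of missing values in the target matrix; it only assumes the variables defined above are in scope.
In [ ]:
#sanity check: Dep and GE should share an identical, equally ordered cell-line index
assert (Dep.index == GE.index).all()
assert X_mat.shape[0] == Y_mat.shape[0] == len(used_CLs)
print('Fraction of missing values in Y_mat: %.3f' % np.mean(np.isnan(Y_mat)))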
In [66]:
def fit_mtRidge(X, Y, l2_lambda = 0, min_rows = 10, verbose = True, force_fast = False):
    '''Fit 'multi-task' ridge regression model
    Inputs:
        X: nxp matrix of predictors (no missing values allowed)
        Y: nxq matrix of response variables (can have missing values)
        l2_lambda: L2 regularization parameter
        min_rows: minimum number of non-missing rows required to fit a block of Y columns
            (the default of 10 is an arbitrary cutoff)
        verbose: set to False to silence output
        force_fast: set to True to ignore the differing patterns of missing data across columns of Y.
            This can be much faster when those patterns are highly variable, but it only gives an
            approximation to the ML solution.
    '''
    #save y intercepts separately and then subtract them out
    y_int = np.nanmean(Y, axis = 0)
    Yc = Y.copy() - y_int
    if force_fast:
        #ignore differing patterns of missing values in the columns of Y
        #(missing entries of the centered Y are set to 0, i.e. imputed with the column mean)
        Yc[np.isnan(Yc)] = 0
        U, s, Vt = linalg.svd(X, full_matrices=False)
        nnz = s > 1e-15 # same default cutoff as scipy.linalg.pinv
        s_nnz = s[nnz][:, np.newaxis]
        UTy = np.dot(U.T, Yc)
        #ridge solution in the SVD basis: coefs = V diag(s / (s^2 + lambda)) U^T Yc
        d = np.zeros((s.size, Yc.shape[1]))
        d[nnz] = s_nnz / (s_nnz ** 2 + l2_lambda)
        d_UT_y = d * UTy
        coefs = np.dot(Vt.T, d_UT_y)
    else:
        nan_sets = np.isnan(Yc)
        nan_patterns = pd.DataFrame(nan_sets.T).drop_duplicates() #unique patterns of missing rows, over columns of Y
        num_nan_patterns = nan_patterns.shape[0]
        if verbose:
            print('Found %d unique patterns of missing rows' % (num_nan_patterns))
        coefs = np.ones((X.shape[1], Y.shape[1])) * np.nan
        for pat_idx, nan_pattern in enumerate(nan_patterns.values):
            #columns of Y sharing this pattern of missing rows
            cur_cols = np.all(nan_sets == np.tile(nan_pattern, (nan_sets.shape[1], 1)).T, axis = 0)
            if verbose:
                print('Pattern %d/%d. Found %d/%d columns with %d/%d missing rows' % \
                      (pat_idx + 1, num_nan_patterns, np.sum(cur_cols), nan_sets.shape[1],
                       np.sum(nan_pattern), nan_sets.shape[0]))
            if np.sum(~nan_pattern) >= min_rows: #if there are enough usable rows
                U, s, Vt = linalg.svd(X[~nan_pattern, :], full_matrices=False)
                nnz = s > 1e-15 # same default cutoff as scipy.linalg.pinv
                s_nnz = s[nnz][:, np.newaxis]
                d = np.zeros((s.size, Yc.shape[1]))
                d[nnz] = s_nnz / (s_nnz ** 2 + l2_lambda)
                Y_sub = Yc[~nan_pattern, :]
                UTy = np.dot(U.T, Y_sub[:, cur_cols])
                d_UT_y = d[:, cur_cols] * UTy
                coefs[:, cur_cols] = np.dot(Vt.T, d_UT_y)
            elif verbose:
                print('Not enough rows for this block')
    mod = {'coefs': coefs, 'y_int': y_int}
    return mod
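For intuition, the SVD update above is the closed-form ridge solution coefs = V diag(s / (s^2 + lambda)) U^T Yc, computed once per missingness pattern. The following cell is a minimal sketch, not part of the original analysis, that checks the force_fast path against a direct normal-equations solve on small synthetic data with no missing values (the sizes, seed, and lambda are arbitrary choices).
In [ ]:
#check the SVD-based ridge solution against a direct normal-equations solve
#on small synthetic data (no missing values, so force_fast is exact here)
rng = np.random.RandomState(0)
X_test = rng.randn(50, 20)
Y_test = rng.randn(50, 3)
lam = 10.0
mod_test = fit_mtRidge(X_test, Y_test, l2_lambda = lam, verbose = False, force_fast = True)
#direct solve of (X^T X + lambda*I) B = X^T Yc on mean-centered responses
Yc_test = Y_test - Y_test.mean(axis = 0)
B_direct = linalg.solve(X_test.T.dot(X_test) + lam * np.eye(20), X_test.T.dot(Yc_test))
print('Max abs difference between SVD and direct solutions: %.2e' % np.max(np.abs(mod_test['coefs'] - B_direct)))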
In [67]:
%%time
mod = fit_mtRidge(X_mat, Y_mat, l2_lambda = 1000, verbose = True)
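The returned dict contains only the coefficient matrix and per-target intercepts, so predictions are X.dot(coefs) + y_int. The cell below is a hedged sketch, not from the original notebook, showing how one might compute in-sample per-target R^2 from the fitted model, skipping any targets whose missingness block had too few usable rows to be fit.
In [ ]:
#in-sample R^2 per target, ignoring missing entries of Y_mat
pred = X_mat.dot(mod['coefs']) + mod['y_int']
fit_targets = ~np.isnan(mod['coefs']).any(axis = 0) #targets whose block had enough usable rows
resid_ss = np.nansum((Y_mat - pred) ** 2, axis = 0)
total_ss = np.nansum((Y_mat - np.nanmean(Y_mat, axis = 0)) ** 2, axis = 0)
R2 = 1 - resid_ss / total_ss
print('Fit %d/%d targets; median in-sample R^2: %.3f' %
      (fit_targets.sum(), Y_mat.shape[1], np.median(R2[fit_targets])))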
In [ ]: