In [1]:
import os
import numpy as np
import pandas as pd
from scipy import linalg

In [2]:
def load_from_taiga(data_id, index_col = None):
    '''Load dataset from taiga with caching.

    On first access the CSV is downloaded from the taiga server and cached
    under ~/.taiga; subsequent calls read the cached file from disk.

    INPUTS:
        data_id: Unique ID of taiga dataset
        index_col: column number to use as index (None leaves the default RangeIndex)
    OUTPUTS:
        loaded dataset as pandas df
    '''
    taiga_url = 'http://datasci-dev.broadinstitute.org:8999/rest/v0/datasets/'
    data_dir = os.path.expanduser('~/.taiga') #directory for caching taiga files

    # make sure the cache directory exists before trying to write into it
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)

    data_file = os.path.join(data_dir, data_id + ".csv")
    data_source = taiga_url + data_id + "?format=tabular_csv"
    if os.path.exists(data_file):
        print('Loading ' + data_file + ' from disc')
        data = pd.read_csv(data_file, header = 0)
    else:
        try:
            print('Loading ' + data_source + ' from taiga')
            data = pd.read_csv(data_source)
        except Exception:
            # tabular_csv format not available for this dataset; fall back to
            # plain csv. Reassign the URL *before* printing so the message
            # shows the URL actually being fetched (the original printed the
            # old URL twice). Catch Exception, not bare except, so Ctrl-C and
            # SystemExit still propagate.
            data_source = taiga_url + data_id + "?format=csv"
            print('Loading ' + data_source + ' from taiga')
            data = pd.read_csv(data_source)

        print('Saving to disc')
        data.to_csv(data_file, index = False)
    if index_col is not None:
        # promote the requested column to the index (matches cached and fresh loads)
        data.set_index(data.columns[index_col], inplace=True)
    return data

In [55]:
min_strong_deps = 5 #minimum number of < -2 sigma deps for using a gene

dep_data_id = "966b70bf-c414-43ad-97ec-1a8b50f91ed6" #achilles-v2-20-1-demeter-z-scores-ignoring-expression-expanded-families- v2
GE_data_id = "013458ff-bb36-4fd7-ac87-dcc04b15174e" # ccle-rnaseq-gene-expression-rpkm-for-analysis-in-manuscripts-protein-coding-genes-only-hgnc-mapped v3
target_geneset_data_id = 'e714c26b-80c8-42b1-bb61-a88b7bb1b334' #target genes to be analyzed in depmap manuscript

#grab datasets from taiga
Dep = load_from_taiga(dep_data_id, index_col = 0).transpose()
GE = load_from_taiga(GE_data_id, index_col = 0)

#align GE and Dep data to common set of cell lines
used_CLs = np.intersect1d(Dep.index.values, GE.index.values)
# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc does the
# label-based row selection intended here
Dep = Dep.loc[used_CLs]
GE = GE.loc[used_CLs]

#restrict target variables to those genes with at least min_strong_deps strong dependencies
n_strong_deps = np.nansum(Dep < -2, axis = 0)
Y_mat = Dep.values[:, n_strong_deps >= min_strong_deps].copy()

X_mat = GE.values.copy()

n_features = X_mat.shape[1] #number of predictors
(n_CLs, n_targets) = Y_mat.shape

print('CLs: {}, Features: {}, Targets: {}'.format(n_CLs, X_mat.shape[1], n_targets))


Loading /root/.taiga/966b70bf-c414-43ad-97ec-1a8b50f91ed6.csv from disc
Loading /root/.taiga/013458ff-bb36-4fd7-ac87-dcc04b15174e.csv from disc
CLs: 478, Features: 18772, Targets: 5332

In [66]:
def fit_mtRidge(X, Y, l2_lambda = 0, verbose = True, force_fast = False, min_rows = 10):
    '''Fit 'multi-task' ridge regression model.

    Columns of Y sharing the same pattern of missing rows are solved together
    from a single SVD of the corresponding row-subset of X.

    Inputs:
        X: nxp matrix of predictors (no missing values allowed)
        Y: nxq matrix of response variables (can have missing values)
        l2_lambda: L2 regularization parameter
        verbose: set to False to silence output
        force_fast: set to True if you want to ignore the differing patterns of missing data in the columns of Y
            This can be much faster if the patterns of missing data among columns of Y are highly variable.
            It only gives an approximation to the ML solution though.
        min_rows: minimum number of non-missing rows needed to fit a block of
            columns (blocks with fewer usable rows are left as NaN in coefs).
            NOTE(review): the original read an undefined global `min_rows`,
            which would raise NameError; it is now an explicit parameter.
    Outputs:
        dict with:
            'coefs': pxq coefficient matrix (NaN for columns that were skipped)
            'y_int': length-q vector of per-column intercepts (column means of Y)
    '''
    #save y intercepts separately and then subtract them out
    y_int = np.nanmean(Y, axis = 0)
    Yc = Y.copy() - y_int

    if force_fast:
        #ignore differing patterns of missing values in the columns of Y;
        #zero-filled residuals contribute nothing to the normal equations
        Yc[np.isnan(Yc)] = 0
        U, s, Vt = linalg.svd(X, full_matrices=False)
        nnz = s > 1e-15  # same default value as scipy.linalg.pinv
        s_nnz = s[nnz][:, np.newaxis]
        UTy = np.dot(U.T, Yc)

        # ridge shrinkage factors s/(s^2 + lambda) applied in the SVD basis
        d = np.zeros((s.size, Yc.shape[1]))
        d[nnz] = s_nnz / (s_nnz ** 2 + l2_lambda)
        d_UT_y = d * UTy
        coefs = np.dot(Vt.T, d_UT_y)
    else:
        nan_sets = np.isnan(Yc)
        nan_patterns = pd.DataFrame(nan_sets.T).drop_duplicates() #unique patterns of missing rows, over columns of Y
        num_nan_patterns = nan_patterns.shape[0]
        if verbose:
            print('Found %d unique patterns of missing rows' % (num_nan_patterns))

        coefs = np.ones((X.shape[1], Y.shape[1])) * np.nan
        # (renamed loop index: the original reused `idx` for both the loop
        # counter and the singular-value mask inside the body)
        for pat_num, nan_pattern in enumerate(nan_patterns.values):
            # boolean mask over Y's columns selecting those that share this
            # exact missing-row pattern
            cur_cols = np.all(nan_sets == np.tile(nan_pattern, (nan_sets.shape[1],1)).T, axis = 0)
            if verbose:
                    print('Pattern %d/%d. Found %d/%d columns with %d/%d missing rows' % \
                      (pat_num+1, num_nan_patterns, np.sum(cur_cols), nan_sets.shape[1],
                       np.sum(nan_pattern), nan_sets.shape[0]))
            if np.sum(~nan_pattern) >= min_rows: #if there are enough usable rows
                U, s, Vt = linalg.svd(X[~nan_pattern,:], full_matrices=False)
                nnz = s > 1e-15  # same default value as scipy.linalg.pinv
                s_nnz = s[nnz][:, np.newaxis]
                d = np.zeros((s.size, Yc.shape[1]))
                d[nnz] = s_nnz / (s_nnz ** 2 + l2_lambda)
                Y_sub = Yc[~nan_pattern,:]
                UTy = np.dot(U.T, Y_sub[:, cur_cols])
                d_UT_y = d[:, cur_cols] * UTy
                coefs[:, cur_cols] = np.dot(Vt.T, d_UT_y)
            elif verbose:
                print('Not enough rows for this block')

    mod = {'coefs': coefs, 'y_int': y_int}
    return mod

In [67]:
%%time
# Fit the multi-task ridge on the full expression (X) / dependency (Y) matrices;
# %%time reports the cost so readers know this cell is expensive to re-run.
mod = fit_mtRidge(X_mat, Y_mat, l2_lambda = 1000, verbose = True)


Found 5 unique patterns of missing rows
Pattern 1/5. Found 4261/5332 columns with 0/478 missing rows
Pattern 2/5. Found 1037/5332 columns with 211/478 missing rows
Pattern 3/5. Found 28/5332 columns with 19/478 missing rows
Pattern 4/5. Found 4/5332 columns with 267/478 missing rows
Pattern 5/5. Found 2/5332 columns with 192/478 missing rows
CPU times: user 1min 28s, sys: 10.2 s, total: 1min 39s
Wall time: 24.4 s

In [ ]: