The purpose of this Notebook is to benchmark different methods for principal coordinates analysis (PCoA). This Notebook depends on scikit-bio 0.2.3-dev, which includes a revised PCoA API. The specific methods to be evaluated are:

  • Current QIIME default (eigh)
  • Split-combine MDS (scmds)
  • Nystrom (nystrom)
  • LAPACK (svd)
  • Stochastic SVD (ssvd)
  • Fast SVD (fsvd)
  • IR Arnoldi-Lanczos (eigsh)

Support for eigh and eigsh is already provided by NumPy and SciPy, respectively, and both are available through scikit-bio. The remaining methods are implemented within this Notebook.
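
For reference, a minimal sketch of how those two decompositions map onto the underlying NumPy/SciPy calls (the small symmetric matrix here is an arbitrary stand-in for a Gower-centered F-matrix and is not part of the benchmark):

import numpy as np
import scipy.sparse.linalg

rng = np.random.RandomState(0)
X = rng.randn(20, 20)
S = (X + X.T) / 2                       # arbitrary symmetric matrix

# eigh: full dense eigendecomposition (the current QIIME default)
eigvals_all, eigvecs_all = np.linalg.eigh(S)

# eigsh: implicitly restarted Arnoldi/Lanczos; returns only the k
# largest-magnitude eigenpairs
eigvals_k, eigvecs_k = scipy.sparse.linalg.eigsh(S, k=3)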


In [36]:
from __future__ import absolute_import, division, print_function
from unittest import TestCase, main as unittest_main

import numpy.testing as npt
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg
from skbio.stats.ordination import OrdinationResults

from mod.principal_coordinate_analysis import PCoABase

class DecompTests(TestCase):
    def setUp(self):
        np.random.seed(0)
        # a random rectangular matrix and a symmetric positive semi-definite
        # matrix derived from it, shared by the decomposition tests below
        self.mat1 = np.random.randn(9, 6)
        self.mat2 = np.dot(self.mat1.T, self.mat1)

Stochastic SVD, as derived from the R implementation.


In [37]:
# Method implemented by Jamie Morton and Antonio Gonzalez
def ssvd(A, k=10, p=10, qiter=0, compute_v=True,  **kwargs):
    """Runs the SSVD method

    This method is based on the algorithm described in "Finding Structure with
    Randomness: Probabilistic Algorithms for Constructing Approximate Matrix
    Decompositions" [1]. The method implemented is derived from the R
    implementation found in [2].

    Parameters
    ----------
    A : np.ndarray
        The result of PCoABase._A
    k : unsigned int, optional
        The number of eigenvectors and eigenvalues to find. A lower k results in
        lower quality eigenvectors and eigenvalues.
    p : unsigned int, optional
        Oversampling parameter; this is added to k to boost accuracy.
    qiter : unsigned int, optional
        The number of power iterations to perform.
    compute_v : bool, optional
        Whether to compute and return the second set of eigenvectors (V) in
        addition to the eigenvalues and U. True by default.

    Returns
    -------
    eigvals : np.ndarray
        The resulting k eigenvalues.
    U_ssvd : np.ndarray
        The first set of resulting k eigenvectors.
    V_ssvd : np.ndarray
        The second set of resulting k eigenvectors.
        Returned only if compute_v is True.

    References
    ----------
    .. [1] http://epubs.siam.org/doi/abs/10.1137/090771806
    .. [2] https://mahout.apache.org/users/dim-reduction/ssvd.html

    """

    A = np.atleast_2d(A)
    if A.ndim > 2:
        raise ValueError("Input matrix can only have two dimensions or less")

    m, n = A.shape

    p = min(min(m, n) - k, p)      # an m x n matrix has at most min(m, n) singular
                                   # values, so cap the oversampling accordingly
    r = k + p                      # target rank plus oversampling parameter p

    omega = np.random.standard_normal(size = (n, r))   # generate random matrix omega
    y = np.dot(A, omega)           # compute a sample matrix Y: apply A to random
                                   # vectors to identify part of its range corresponding
                                   # to largest singular values
    Q, R = sp.linalg.qr(y, mode='economic')   # economic QR: find an orthonormal Q such that Y = QQ'Y
    b = np.dot(Q.T, A)             # multiply A by Q whose columns form
                                   # an orthonormal basis for the range of Y

    # often, no power iterations are required to reach the small error bound
    # in eqn. 1.5 of [1]
    for i in range(qiter):
        y = np.dot(A, b.T)
        Q, R = sp.linalg.qr(y, mode='economic')
        b = np.dot(Q.T, A)

    bbt = np.dot(b, b.T)
    eigvals, U_ssvd = scipy.sparse.linalg.eigsh(bbt, k)
    if compute_v: # compute svd of much smaller matrix b
        V_ssvd = np.dot(b.T, np.dot(U_ssvd, np.diag(1 / np.sqrt(eigvals))))
        U_ssvd = np.dot(Q, U_ssvd)
        return np.sqrt(eigvals), U_ssvd, V_ssvd.T
    else:
        U_ssvd = np.dot(Q, U_ssvd)
        return np.sqrt(eigvals), U_ssvd


class SSVDTests(DecompTests):
    def test_ssvd(self):
        np.random.seed(0)
        test_s, test_U, test_V = ssvd(self.mat1, k=3)
        actual_U, actual_s, actual_V = scipy.sparse.linalg.svds(self.mat1, k=3)
        npt.assert_allclose(-1*test_U, actual_U, rtol=4)
        npt.assert_allclose(test_s, actual_s, rtol=4)
        npt.assert_allclose(-1*test_V, actual_V, rtol=4)

    def test_ssvd_eig(self):
        np.random.seed(0)
        test_s, test_U = ssvd(self.mat2, k=5, compute_v=False)
        actual_s, actual_U = scipy.sparse.linalg.eigsh(self.mat2, k=5)
        npt.assert_allclose(-1*test_U, actual_U, rtol=4)
        npt.assert_allclose(test_s, actual_s, rtol=4)
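
Outside the unit tests, the oversampling and power-iteration parameters can be exercised like this (illustrative only; the matrix is an arbitrary symmetric example and is not part of the benchmark):

rng = np.random.RandomState(42)
X = rng.randn(50, 30)
S = np.dot(X.T, X)                                  # symmetric positive semi-definite

# k target eigenpairs, p extra random projections, one power iteration
approx_vals, approx_U, approx_Vt = ssvd(S, k=5, p=10, qiter=1)

# reference: the five largest eigenvalues from the dense solver (ascending order)
dense_vals = np.linalg.eigvalsh(S)[-5:]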

Split-combine MDS. The decomposition and scoring functions below are placeholders: they currently raise NotImplementedError, and their tests simply call self.fail() for now.


In [38]:
def scmds(A):
    raise NotImplementedError


def scmds_scores(ids, eigvals, eigvecs):
    raise NotImplementedError
    
    
class SCMDSTests(DecompTests):
    def test_scmds(self):
        self.fail()
        
    def test_scmds_scores(self):
        self.fail()

Nystrom. As with scmds, the functions below are placeholders that raise NotImplementedError.


In [39]:
def nystrom(A, **kwargs):
    raise NotImplementedError


def nystrom_scores(ids, eigvals, eigvecs):
    raise NotImplementedError
    
class NystromTests(DecompTests):
    def test_scmds(self):
        self.fail()
        
    def test_nystrom_scores(self):
        self.fail()

FSVD. The decomposition is implemented below; its test remains a placeholder.


In [40]:
def fsvd(A, k=10, i=1, usePowerMethod=0, **kwargs):
    """Takes an F-matrix and returns the eigenvalues and eigenvectors computed by the FSVD method.

    Based on the algorithm described in "An Algorithm for the Principal Component Analysis
    of Large Data Sets" by N. Halko, P. G. Martinsson, Y. Shkolnisky, and M. Tygert,
    and on the Matlab code at
    http://stats.stackexchange.com/questions/2806/best-pca-algorithm-for-huge-number-of-features

    A: the F-matrix to decompose
    k: the number of dimensions (eigenpairs) to return
    i: the number of levels of the Krylov method to use; for most applications,
       i=1 or i=2 is sufficient
    usePowerMethod: changes the power of the spectral norm (minimizing the error).
       See p. 11 / eq. 8.1 of DOI 10.1137/100804139
    """
    m, n = A.shape

    if m < n:
        A = A.T

    m, n = A.shape    # dimensions may have changed after the transpose above
    l = k + 2

    # entries are independent, identically distributed Gaussian random variables
    # of zero mean and unit variance
    G = np.random.standard_normal(size=(n, l))
    if usePowerMethod == 1:
        H = np.dot(A, G)
        for x in range(i):
            H = np.dot(A, np.dot(A.T, H))   # enhance decay of singular values
    else:
        # build the block Krylov matrix H = [A*G, (A*A')*A*G, ..., (A*A')^i * A*G]
        H = np.dot(A, G)
        blocks = [H]
        for x in range(i):
            blocks.append(np.dot(A, np.dot(A.T, blocks[-1])))
        H = np.hstack(blocks)
    
    Q, R = np.linalg.qr(H)    # reduced (unpivoted) QR decomposition of H
    T = np.dot(A.T, Q) #step 3
    
    Vt, St, W = np.linalg.svd(T, full_matrices=False) #step 4 (as documented in paper)
    W = W.T
    
    Ut = np.dot(Q,W)

    if m < n:
        V_fsvd = Ut[:,:k]
        U_fsvd = Vt[:,:k]
    else:
        U_fsvd = Ut[:,:k]
        V_fsvd = Vt[:,:k]
    
    #drop imaginary component, if we got one
    eigvals = (St[:k]**2).real
    eigvecs = U_fsvd.real

    return eigvals, eigvecs.T


class FSVDTests(DecompTests):
    def test_fsvd(self):
        self.fail()
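
A quick illustrative call (not part of the benchmark; the matrix below is an arbitrary symmetric example) showing the shapes fsvd returns, since the eigenvectors come back transposed:

rng = np.random.RandomState(0)
X = rng.randn(40, 25)
S = np.dot(X.T, X)

fsvd_vals, fsvd_vecs = fsvd(S, k=5, i=1)
print(fsvd_vals.shape, fsvd_vecs.shape)    # (5,) and (5, 25)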

Execute all the associated test code for the decomposition and scoring methods (where appropriate).


In [41]:
_ = unittest_main(exit=False, argv=['ipynb'])


FFFFF..
======================================================================
FAIL: test_fsvd (__main__.FSVDTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-40-e6ad2152c9d0>", line 7, in test_fsvd
    self.fail()
AssertionError: None

======================================================================
FAIL: test_nystrom_scores (__main__.NystromTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-39-d84a53266cc3>", line 13, in test_nystrom_scores
    self.fail()
AssertionError: None

======================================================================
FAIL: test_scmds (__main__.NystromTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-39-d84a53266cc3>", line 10, in test_scmds
    self.fail()
AssertionError: None

======================================================================
FAIL: test_scmds (__main__.SCMDSTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-38-52de35b3c930>", line 11, in test_scmds
    self.fail()
AssertionError: None

======================================================================
FAIL: test_scmds_scores (__main__.SCMDSTests)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-38-52de35b3c930>", line 14, in test_scmds_scores
    self.fail()
AssertionError: None

----------------------------------------------------------------------
Ran 7 tests in 0.006s

FAILED (failures=5)

In [42]:
def default_score_method(ids, eigvals, eigvecs):
    # this is an adaptation of skbio's PCoA.scores. This method is explicitly not
    # tested as it is identical (short of referring to self) to the skbio implementation
    num_positive = (eigvals >= 0).sum()
    # operate on copies so the caller's arrays are not modified in place
    eigvecs = eigvecs.copy()
    eigvecs[:, num_positive:] = np.zeros(eigvecs[:, num_positive:].shape)
    eigvals = eigvals.copy()
    eigvals[num_positive:] = np.zeros(eigvals[num_positive:].shape)

    coordinates = eigvecs * np.sqrt(eigvals)

    proportion_explained = eigvals / eigvals.sum()

    return OrdinationResults(eigvals=eigvals, site=coordinates,
                             proportion_explained=proportion_explained,
                             site_ids=ids)

class APCoA(PCoABase):
    short_method_name = 'APCoA'
    long_method_name = 'Approximate Principal Coordinate Analysis'
    
    # The eig method dictionary is expected to be {"method_name": function}
    # where the function must accept an F_matrix (the result of PCoABase._F_matrix)
    # and **kwargs. The method must return (eigvals, eigvecs).
    eig_methods = {'ssvd': ssvd,
                   'fsvd': fsvd,
                   'nystrom': nystrom,
                   'scmds': scmds}  
    
    # The score methods dictionary is expected to be {"method_name": function}
    # with a 1-1 mapping of the keys to eig_methods. The function must accept
    # ids, eigvals, eigvecs and **kwargs. The method must return an OrdinationResults
    # object.
    score_methods = {'ssvd': default_score_method,
                     'fsvd': default_score_method,
                     'nystrom': nystrom_scores,
                     'scmds': scmds_scores}  
    
    def scores(self):
        return self.score_methods[self._eig_name](self.ids, self.eigvals, self.eigvecs)
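
As a hypothetical sketch of the plug-in contract described above (the 'eigh_dense' name and the wrapper below are illustrative assumptions, not part of the benchmark), an additional decomposition could be registered like this:

def eigh_dense(A, k=10, **kwargs):
    # np.linalg.eigh returns eigenvalues in ascending order; keep the k largest
    eigvals, eigvecs = np.linalg.eigh(A)
    return eigvals[-k:], eigvecs[:, -k:]

APCoA.eig_methods['eigh_dense'] = eigh_dense
APCoA.score_methods['eigh_dense'] = default_score_method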

In [ ]: