Run PCP to clean sonar data


In [1]:
import os, sys
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from numpy.linalg import inv, eig, pinv
from scipy import linalg
from scipy import pi, multiply, power, tanh, exp, cosh
from scipy import random

In [2]:
sys.path.insert(0,'..')
import db_diff
import decomp_plot


/home/wu-jung/anaconda3/envs/py27/lib/python2.7/site-packages/matplotlib/__init__.py:1405: UserWarning: 
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

In [3]:
# This is the pcp(Principal Projection Pursuit) function from https://github.com/dfm/pcp

"""
An implementation of the Principal Component Pursuit algorithm for robust PCA
as described in `Candes, Li, Ma, & Wright <http://arxiv.org/abs/0912.3599>`_.
An alternative Python implementation using non-standard dependencies and
different hyperparameter choices is available at:
http://blog.shriphani.com/2013/12/18/
    robust-principal-component-pursuit-background-matrix-recovery/
"""

from __future__ import division, print_function

__all__ = ["pcp"]

import time
# import fbpca
import logging
import numpy as np
from scipy.sparse.linalg import svds


def pcp(M, delta=1e-6, mu=None, maxiter=500, verbose=False, missing_data=True,
        svd_method="approximate", **svd_args):
    # Check the SVD method.
    allowed_methods = ["approximate", "exact", "sparse"]
    if svd_method not in allowed_methods:
        raise ValueError("'svd_method' must be one of: {0}"
                         .format(allowed_methods))

    # Check for missing data.
    shape = M.shape
    if missing_data:
        missing = ~(np.isfinite(M))
        if np.any(missing):
            M = np.array(M)
            M[missing] = 0.0
    else:
        missing = np.zeros_like(M, dtype=bool)
        if not np.all(np.isfinite(M)):
            logging.warn("The matrix has non-finite entries. "
                         "SVD will probably fail.")

    # Initialize the tuning parameters.
    lam = 1.0 / np.sqrt(np.max(shape))
    if mu is None:
        mu = 0.25 * np.prod(shape) / np.sum(np.abs(M))
        if verbose:
            print("mu = {0}".format(mu))

    # Convergence criterion.
    norm = np.sum(M ** 2)

    # Iterate.
    i = 0
    rank = np.min(shape)
    S = np.zeros(shape)
    Y = np.zeros(shape)
    while i < max(maxiter, 1):
        # SVD step.
        strt = time.time()
        u, s, v = _svd(svd_method, M - S + Y / mu, rank+1, 1./mu, **svd_args)
        svd_time = time.time() - strt

        s = shrink(s, 1./mu)
        rank = np.sum(s > 0.0)
        u, s, v = u[:, :rank], s[:rank], v[:rank, :]
        L = np.dot(u, np.dot(np.diag(s), v))

        # Shrinkage step.
        S = shrink(M - L + Y / mu, lam / mu)

        # Lagrange step.
        step = M - L - S
        # step[missing] = 0.0
        Y += mu * step

        # Check for convergence.
        err = np.sqrt(np.sum(step ** 2) / norm)
        if verbose:
            print(("Iteration {0}: error={1:.3e}, rank={2:d}, nnz={3:d}, "
                   "time={4:.3e}")
                  .format(i, err, np.sum(s > 0), np.sum(S > 0), svd_time))
        if err < delta:
            break
        i += 1

    if i >= maxiter:
        logging.warn("convergence not reached in pcp")
    return L, S, (u, s, v)


def shrink(M, tau):
    sgn = np.sign(M)
    S = np.abs(M) - tau
    S[S < 0.0] = 0.0
    return sgn * S


def _svd(method, X, rank, tol, **args):
    rank = min(rank, np.min(X.shape))
    if method == "approximate":
        return fbpca.pca(X, k=rank, raw=True, **args)
    elif method == "exact":
        return np.linalg.svd(X, full_matrices=False, **args)
    elif method == "sparse":
        if rank >= np.min(X.shape):
            return np.linalg.svd(X, full_matrices=False)
        u, s, v = svds(X, k=rank, tol=tol)
        u, s, v = u[:, ::-1], s[::-1], v[::-1, :]
        return u, s, v
    raise ValueError("invalid SVD method")

In [4]:
MVBS_path = '/media/wu-jung/wjlee_apl_2/ooi_zplsc_new/'
MVBS_fname = '20150817-20151017_MVBS.h5'

In [5]:
# Reading 2-month sonar time series
import h5py
f = h5py.File(os.path.join(MVBS_path,MVBS_fname),"r")
MVBS = np.array(f['MVBS'])
depth_bin_size = np.array(f['depth_bin_size'])
ping_time = np.array(f['ping_time'])
f.close()

In [6]:
ping_per_day_mvbs = 144

Run Robust PCA and check results


In [26]:
# converting to linear domain
mvbs = 10**(MVBS[:,1:-2,:]/10)  # 62-day stretch
mvbs_3freq = np.array([mvbs[ff,:,:].T.reshape((-1,ping_per_day_mvbs*mvbs.shape[1])) for ff in range(3)])
mvbs_long = mvbs_3freq.swapaxes(0,1).reshape((-1,ping_per_day_mvbs*mvbs.shape[1]*3))

In [27]:
%%time
# applying pcp to the data in log domain (i.e. the way data comes from ooi)
L, S, (u,s,v) = pcp(10*np.log10(mvbs_long),maxiter=500, verbose=False,svd_method="exact")


CPU times: user 20.8 s, sys: 80 ms, total: 20.9 s
Wall time: 5.26 s

In [28]:
L_sep, L_plot = decomp_plot.separate_transform_result(L,mvbs,ping_per_day_mvbs,log_opt = 0)
S_sep, S_plot = decomp_plot.separate_transform_result(S,mvbs,ping_per_day_mvbs,log_opt = 0)

In [29]:
import copy

In [30]:
plot_param_base = dict([("x_ticks_spacing", 5),\
                       ("y_ticks_num",5),\
                       ("y_start_idx",1),\
                       ("y_end_idx",-2),\
                       ("c_min",-80),\
                       ("c_max",-40),\
                       ("c_ticks_spacing",10),\
                       ("ping_per_day_mvbs",ping_per_day_mvbs),\
                       ("depth_bin_size",depth_bin_size),\
                       ("ping_time",ping_time)])

In [31]:
# plotting the raw data
plot_param_raw = copy.deepcopy(plot_param_base)
db_diff.plot_echogram(MVBS,1,60,plot_param_raw,fig_size=(16,5),cmap_name='viridis')



In [32]:
# plotting the low-rank component
plot_param_L = copy.deepcopy(plot_param_base)
plot_param_L["y_start_idx"] = 0
plot_param_L["y_end_idx"] = 0
db_diff.plot_echogram(L_plot,1,60,plot_param_L,fig_size=(16,5),cmap_name='viridis')



In [33]:
# plotting the sparse component
plot_param_S = copy.deepcopy(plot_param_base)
plot_param_S["y_start_idx"] = 0
plot_param_S["y_end_idx"] = 0
plot_param_S["c_min"] = -20
plot_param_S["c_max"] = 20
db_diff.plot_echogram(S_plot,1,60,plot_param_S,fig_size=(16,5),cmap_name='viridis')


Save PCP-cleaned sonar data


In [37]:
# Reading 2-month sonar time series
MVBS_clean_fname = '20150817-20151017_MVBS_PCPcleaned.h5'

f = h5py.File(os.path.join(MVBS_path,MVBS_clean_fname),"w")
f.create_dataset("L", data=L)
f.create_dataset("L_sep", data=L_sep)
f.create_dataset("L_plot", data=L_plot)
f.create_dataset("S", data=S)
f.create_dataset("S_sep", data=S_sep)
f.create_dataset("S_plot", data=S_plot)
f.create_dataset("ping_time", data=ping_time)
f.create_dataset("depth_bin_size",data=depth_bin_size)
f.create_dataset("ping_per_day_mvbs",data=ping_per_day_mvbs)
f.close()

In [38]:
f = h5py.File(os.path.join(MVBS_path,MVBS_clean_fname),"r")

In [41]:
test = np.array(f['ping_per_day_mvbs'])

In [42]:
test


Out[42]:
array(144)

In [ ]: