Normalisation of vector data

Setting up the environment (do not use pylab)


In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as sps
import os

pd.set_option('display.width', 4000)
%matplotlib inline

In [ ]:
raw_data = pd.read_csv('../DataWrangling/census_abs2011_summary.csv')
raw_data.head()

Normalise to unit interval


In [ ]:
def normalise_01(data):
    minimum = np.min(data)
    maximum = np.max(data)
    return (data-minimum)/(maximum-minimum)

normalise_01(raw_data).head()

Zero mean and unit variance (z normalisation)

This standardises the features to all have zero mean and similar (unit) spread. Note that this does not mean that the data is converted into a Gaussian distribution, it just shifts and scales each feature. $$ x_{normalised} = \frac{x-\mu}{\sigma} $$ where $\mu$ is the mean and $\sigma$ is the standard deviation. In this case, $\mu$ and $\sigma$ are estimated from the data.


In [ ]:
def normalise_z(data):
    mu = np.mean(data, axis=0)
    sigma = np.std(data, axis=0)
    assert np.any(sigma > 0.0), 'Zero variance'
    return (data-mu)/sigma

normalise_z(raw_data).head()

Box-Cox transform

The Box Cox transform converts data into a normal (Gaussian distribution). It is given by $$ x_{new} = \begin{cases} \frac{x^\lambda - 1}{\lambda}& \lambda\neq 0\\ \ln(x)&\lambda = 0 \end{cases} $$ Finding the parameter $\lambda$ is done by maximising the log likelihood.

The code below is updated from WeatherGod at github


In [ ]:
def boxcox_auto(x):
    """
    Automatically determines the lambda needed to perform a boxcox
    transform of the given vector of data points.  Note that it will
    also automatically offset the datapoints so that the minimum value
    is just above 0 to satisfy the criteria for the boxcox transform.
    
    The object returned by this function contains the transformed data
    ('bc_data'), the lambda ('lamb'), and the offset used on the data
    points ('dataOffset').  This object can be fed easily into
    boxcox_inverse() to retrieve the original data values like so:
    
    EXAMPLE:
    >>> bc_results = boxcox_auto(data_vector)
    >>> print bc_results
    {'lamb': array([ 0.313]), 'bc_data': array([ 0.47712018,  1.33916353,  6.66393874, ...,  3.80242394,
    3.79166974,  0.47712018]), 'data_offset': 2.2204460492503131e-16}
    >>> reconstit = boxcox_inverse(**bc_results)
    >>> print numpy.mean((dataVector - reconstit) ** 2)
    5.6965875916e-29
    """
    const_offset = -np.min(x) + 1e-8
    tempX = x + const_offset
    bclambda = sp.optimize.fmin(boxcox_opt, 0.0, args=(tempX), maxiter=2000, disp=0)
    bclambda = bclambda[0]

    # Generate the transformed data using the optimal lambda.
    return({'bc_data': boxcox(tempX, bclambda), 'lamb': bclambda, 'data_offset': const_offset})

def boxcox(x, lamb):
    """
    boxcox() performs the boxcox transformation upon the data vector 'x',
    using the supplied lambda value 'lamb'.
    Note that this function does not check for minimum value of the data,
    and it will not correct for values being below 0.
 
    The function returns a vector the same size of x containing the
    the transformed values.
    """
    if (lamb != 0.0) :
        return(((x ** lamb) - 1) / lamb)
    else:
        return(np.log(x))
    
def boxcox_inverse(bc_data, lamb, data_offset = 0.0) :
    """
    Performs the inverse operation of the boxcox transform.
    Note that one can use the output of boxcox_auto() to easily
    run boxcox_inverse:
    
    >>> bc_results = boxcox_auto(data)
    >>> reconstit_data = boxcox_inverse(**bc_results)
    
    Also can be used directly like so:
    >>> trans_data = boxcox(orig_data, lamb_val)
    >>> trans_data = do_processing(trans_data)
    >>> reconstit_data = boxcox_inverse(trans_data, lamb_val)
    """
    if (lamb != 0.0) :
        return((((bc_data * lamb) + 1) ** (1.0/lamb)) - data_offset)
    else :
        return(np.exp(bc_data) - data_offset)


def boxcox_opt(lamb, *pargs):
    """
    Don't call this function, it is meant to be
    used by boxcox_auto().
    """
    x = np.array(pargs)
    
    # Transform data using a particular lambda.
    xhat = boxcox(x, lamb[0])
    
    # The algorithm calls for maximizing the LLF; however, since we have
    # only functions that minimize, the LLF is negated so that we can 
    # minimize the function instead of maximixing it to find the optimum lambda.
    return(-(-(len(x)/2.0) * np.log(np.std(xhat.T)**2) + (lamb[0] - 1.0)*(np.sum(np.log(x)))))

n_data = boxcox_auto(np.array(raw_data.values))
pd.DataFrame(n_data['bc_data']).head()

Quantile normalisation

Quantile normalisation converts the empirical distribution to match a given distribution.

It is popular in bioinformatics for normalising microarray data, where instead of using an arbitrary distribution, it uses the average empirical distribution.


In [ ]:
def quantile_normalise(data, avg=np.mean):
    """Perform quantile normalisation on the data (assumed to be a 2D array).
    Assume each column is an example, and quantile normalise the rows.

    Replace value with the mean of the values of the same rank.
    """
    sort_idx = np.argsort(data, axis=1)
    new_data = np.zeros(data.shape)
    for idx in range(data.shape[1]):
        new_val = avg(data[sort_idx==idx])
        new_data[sort_idx == idx] = new_val
    return new_data

n_data = quantile_normalise(raw_data.values)
pd.DataFrame(n_data).head()

Rank normalisation


In [ ]:
def _score2rank(orig_scores):
    """Convert an array of scores into an array of normalised ranks,
    such that the highest score has the highest rank (1/num_ex)."""
    scores = orig_scores.copy()
    idx_sort = np.argsort(scores)[::-1]
    unsort = np.argsort(idx_sort)
    scores = scores[idx_sort]

    ranks = infer_ranks(scores)
    
    assert len(ranks) == len(scores), 'Shape mismatch'
    ranks = ranks/(len(scores)+1.0)
    ranks = ranks[unsort]
    return ranks

def score2rank(orig_data):
    """Convert each column of scores into normalised ranks"""
    ranks = np.zeros(orig_data.shape)
    for ix in range(orig_data.shape[1]):
        ranks[:,ix] = _score2rank(orig_data[:,ix])
    return ranks
        

def infer_ranks(scores):
    """impute the ranks of the scores, taking care of ties."""
    num_ex = len(scores)
    unique_scores = np.unique(scores)
    sc = scores.copy()
    sc = sc[::-1]
    left = num_ex - sc.searchsorted(unique_scores, side='right')
    right = num_ex - sc.searchsorted(unique_scores)
    ranks = np.zeros(num_ex)
    for idx in range(len(unique_scores)):
        cur_rank = 0.5*(left[idx]+right[idx]+1)
        ranks[left[idx]:right[idx]] = cur_rank
    return ranks


def normalise_rank(data):
    """Return rank normalised data in the interval [-0.5, 0.5]"""
    return (1.-score2rank(data)) - 0.5

def gauss_dist(data):
    """Return a Gaussian distributed dataset,
    with zero mean and unit variance.
    """
    X = 1.-score2rank(data)
    D = sps.norm()
    return D.ppf(X)

n_data = normalise_rank(raw_data.values)
pd.DataFrame(n_data).head()

In [ ]: