Setting up the environment (do not use pylab)
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as sps
import os
pd.set_option('display.width', 4000)
%matplotlib inline
In [ ]:
raw_data = pd.read_csv('../DataWrangling/census_abs2011_summary.csv')
raw_data.head()
In [ ]:
def normalise_01(data):
minimum = np.min(data)
maximum = np.max(data)
return (data-minimum)/(maximum-minimum)
normalise_01(raw_data).head()
This standardises the features to all have zero mean and similar (unit) spread. Note that this does not mean that the data is converted into a Gaussian distribution, it just shifts and scales each feature. $$ x_{normalised} = \frac{x-\mu}{\sigma} $$ where $\mu$ is the mean and $\sigma$ is the standard deviation. In this case, $\mu$ and $\sigma$ are estimated from the data.
In [ ]:
def normalise_z(data):
mu = np.mean(data, axis=0)
sigma = np.std(data, axis=0)
assert np.any(sigma > 0.0), 'Zero variance'
return (data-mu)/sigma
normalise_z(raw_data).head()
The Box Cox transform converts data into a normal (Gaussian distribution). It is given by $$ x_{new} = \begin{cases} \frac{x^\lambda - 1}{\lambda}& \lambda\neq 0\\ \ln(x)&\lambda = 0 \end{cases} $$ Finding the parameter $\lambda$ is done by maximising the log likelihood.
The code below is updated from WeatherGod at github
In [ ]:
def boxcox_auto(x):
"""
Automatically determines the lambda needed to perform a boxcox
transform of the given vector of data points. Note that it will
also automatically offset the datapoints so that the minimum value
is just above 0 to satisfy the criteria for the boxcox transform.
The object returned by this function contains the transformed data
('bc_data'), the lambda ('lamb'), and the offset used on the data
points ('dataOffset'). This object can be fed easily into
boxcox_inverse() to retrieve the original data values like so:
EXAMPLE:
>>> bc_results = boxcox_auto(data_vector)
>>> print bc_results
{'lamb': array([ 0.313]), 'bc_data': array([ 0.47712018, 1.33916353, 6.66393874, ..., 3.80242394,
3.79166974, 0.47712018]), 'data_offset': 2.2204460492503131e-16}
>>> reconstit = boxcox_inverse(**bc_results)
>>> print numpy.mean((dataVector - reconstit) ** 2)
5.6965875916e-29
"""
const_offset = -np.min(x) + 1e-8
tempX = x + const_offset
bclambda = sp.optimize.fmin(boxcox_opt, 0.0, args=(tempX), maxiter=2000, disp=0)
bclambda = bclambda[0]
# Generate the transformed data using the optimal lambda.
return({'bc_data': boxcox(tempX, bclambda), 'lamb': bclambda, 'data_offset': const_offset})
def boxcox(x, lamb):
"""
boxcox() performs the boxcox transformation upon the data vector 'x',
using the supplied lambda value 'lamb'.
Note that this function does not check for minimum value of the data,
and it will not correct for values being below 0.
The function returns a vector the same size of x containing the
the transformed values.
"""
if (lamb != 0.0) :
return(((x ** lamb) - 1) / lamb)
else:
return(np.log(x))
def boxcox_inverse(bc_data, lamb, data_offset = 0.0) :
"""
Performs the inverse operation of the boxcox transform.
Note that one can use the output of boxcox_auto() to easily
run boxcox_inverse:
>>> bc_results = boxcox_auto(data)
>>> reconstit_data = boxcox_inverse(**bc_results)
Also can be used directly like so:
>>> trans_data = boxcox(orig_data, lamb_val)
>>> trans_data = do_processing(trans_data)
>>> reconstit_data = boxcox_inverse(trans_data, lamb_val)
"""
if (lamb != 0.0) :
return((((bc_data * lamb) + 1) ** (1.0/lamb)) - data_offset)
else :
return(np.exp(bc_data) - data_offset)
def boxcox_opt(lamb, *pargs):
"""
Don't call this function, it is meant to be
used by boxcox_auto().
"""
x = np.array(pargs)
# Transform data using a particular lambda.
xhat = boxcox(x, lamb[0])
# The algorithm calls for maximizing the LLF; however, since we have
# only functions that minimize, the LLF is negated so that we can
# minimize the function instead of maximixing it to find the optimum lambda.
return(-(-(len(x)/2.0) * np.log(np.std(xhat.T)**2) + (lamb[0] - 1.0)*(np.sum(np.log(x)))))
n_data = boxcox_auto(np.array(raw_data.values))
pd.DataFrame(n_data['bc_data']).head()
Quantile normalisation converts the empirical distribution to match a given distribution.
It is popular in bioinformatics for normalising microarray data, where instead of using an arbitrary distribution, it uses the average empirical distribution.
In [ ]:
def quantile_normalise(data, avg=np.mean):
"""Perform quantile normalisation on the data (assumed to be a 2D array).
Assume each column is an example, and quantile normalise the rows.
Replace value with the mean of the values of the same rank.
"""
sort_idx = np.argsort(data, axis=1)
new_data = np.zeros(data.shape)
for idx in range(data.shape[1]):
new_val = avg(data[sort_idx==idx])
new_data[sort_idx == idx] = new_val
return new_data
n_data = quantile_normalise(raw_data.values)
pd.DataFrame(n_data).head()
In [ ]:
def _score2rank(orig_scores):
"""Convert an array of scores into an array of normalised ranks,
such that the highest score has the highest rank (1/num_ex)."""
scores = orig_scores.copy()
idx_sort = np.argsort(scores)[::-1]
unsort = np.argsort(idx_sort)
scores = scores[idx_sort]
ranks = infer_ranks(scores)
assert len(ranks) == len(scores), 'Shape mismatch'
ranks = ranks/(len(scores)+1.0)
ranks = ranks[unsort]
return ranks
def score2rank(orig_data):
"""Convert each column of scores into normalised ranks"""
ranks = np.zeros(orig_data.shape)
for ix in range(orig_data.shape[1]):
ranks[:,ix] = _score2rank(orig_data[:,ix])
return ranks
def infer_ranks(scores):
"""impute the ranks of the scores, taking care of ties."""
num_ex = len(scores)
unique_scores = np.unique(scores)
sc = scores.copy()
sc = sc[::-1]
left = num_ex - sc.searchsorted(unique_scores, side='right')
right = num_ex - sc.searchsorted(unique_scores)
ranks = np.zeros(num_ex)
for idx in range(len(unique_scores)):
cur_rank = 0.5*(left[idx]+right[idx]+1)
ranks[left[idx]:right[idx]] = cur_rank
return ranks
def normalise_rank(data):
"""Return rank normalised data in the interval [-0.5, 0.5]"""
return (1.-score2rank(data)) - 0.5
def gauss_dist(data):
"""Return a Gaussian distributed dataset,
with zero mean and unit variance.
"""
X = 1.-score2rank(data)
D = sps.norm()
return D.ppf(X)
n_data = normalise_rank(raw_data.values)
pd.DataFrame(n_data).head()
In [ ]: