Categorical Features, One-hot Encoding and Dimensionality Reduction

This notebook contains code and corresponding explanations for handling categorical data, in particular the pitfalls of applying PCA after one-hot encoding.


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Data Generation

The next block contains the main method for generating samples.

    n_samples : number of (random) samples to generate
        n_cat : number of categorical features 
   n_cat_bins : number of discrete values for each categorical feature
       n_cont : number of additional continuous Gaussian random features
       center : (Boolean) subtract the mean
    normalize : (Boolean) subtract the mean, normalize each feature to be in range [-1, +1]
       whiten : (Boolean) subtract the mean, normalize each feature to have unit variance.
       return : (n_cat*n_cat_bins+n_cont) x n_samples numpy array

In [164]:
def generate_mixed_attribute_samples(n_samples=100, n_cat=1, n_cat_bins=5, n_cont=0, 
                                     center=False, normalize=False, whiten=False):
    """Generate random samples with one-hot encoded categorical and Gaussian features.

    Each categorical feature is drawn uniformly from ``n_cat_bins`` discrete
    values and stored as ``n_cat_bins`` consecutive one-hot rows. The
    continuous features follow and are drawn from a zero-mean Gaussian with
    standard deviation 0.1.

    Parameters
    ----------
    n_samples : int
        Number of random samples (columns) to generate.
    n_cat : int
        Number of categorical features.
    n_cat_bins : int
        Number of discrete values of each categorical feature.
    n_cont : int
        Number of additional continuous Gaussian features.
    center : bool
        Subtract the per-feature mean.
    normalize : bool
        Subtract the per-feature mean, then divide each feature by its
        maximum absolute value so it lies in [-1, +1].
    whiten : bool
        Subtract the per-feature mean, then divide each feature by its
        standard deviation.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_cat * n_cat_bins + n_cont, n_samples)``;
        one row per feature, one column per sample.

    Notes
    -----
    The options are applied in the order center, normalize, whiten and may be
    combined. Features that are constant (e.g. a bin that was never drawn)
    are left at zero by ``normalize``/``whiten`` instead of being divided by
    zero. ``normalize`` and ``whiten`` print the per-feature maximum and
    variance, respectively, as a diagnostic.
    """
    n_onehot = n_cat * n_cat_bins
    # Builtin float: the np.float / np.int aliases no longer exist in NumPy >= 1.24.
    samples = np.zeros((n_onehot + n_cont, n_samples), dtype=float)
    sample_idx = np.arange(n_samples)

    for n in range(n_cat):
        d = np.random.randint(0, n_cat_bins, size=n_samples)
        # Set exactly one of the n_cat_bins rows of feature n per sample.
        samples[n * n_cat_bins + d, sample_idx] = 1.

    # The continuous features occupy the remaining rows (none if n_cont == 0).
    samples[n_onehot:, :] = 0.1 * np.random.randn(n_cont, n_samples)

    if center:
        samples -= np.mean(samples, axis=1, keepdims=True)
    if normalize:
        samples -= np.mean(samples, axis=1, keepdims=True)
        scale = np.max(np.abs(samples), axis=1, keepdims=True)
        scale[scale == 0] = 1.  # constant feature: avoid 0/0 -> NaN
        samples /= scale
        print(np.max(samples, axis=1))
    if whiten:
        samples -= np.mean(samples, axis=1, keepdims=True)
        scale = np.std(samples, axis=1, keepdims=True)
        scale[scale == 0] = 1.  # constant feature: avoid 0/0 -> NaN
        samples /= scale
        print(np.var(samples, axis=1))
    return samples

In [251]:
# Draw 100 samples of a single categorical feature with 3 bins, one-hot
# encoded and mean-centered (features in rows, samples in columns).
X = generate_mixed_attribute_samples(n_samples=100, n_cat=1, n_cat_bins=3, n_cont=0, 
                                     center=True, normalize=False, whiten=False)
# X is centered, so X X^T / n_samples is the (biased) sample covariance
# matrix of the features.
C = X.dot(X.T) / X.shape[1]

# Left: the centered data (samples x features). Right: the covariance matrix.
fig, (ax_data, ax_cov) = plt.subplots(1, 2)

data_mesh = ax_data.pcolor(X.T)
ax_data.set_title("Centered one-hot samples")
ax_data.set_xlabel("feature")
ax_data.set_ylabel("sample")
fig.colorbar(data_mesh, ax=ax_data)

cov_mesh = ax_cov.pcolor(C)
ax_cov.set_title("Covariance matrix")
ax_cov.set_xlabel("feature")
ax_cov.set_ylabel("feature")
fig.colorbar(cov_mesh, ax=ax_cov)

fig.tight_layout()
plt.show()


Out[251]:
<matplotlib.colorbar.Colorbar at 0x1317c0668>

In [250]:
# Eigendecomposition of the covariance matrix C. C is symmetric, so use
# eigh: it returns real eigenvalues in ascending order and the matching
# orthonormal eigenvectors as the COLUMNS of v (v[:, i] belongs to u[i]).
u, v = np.linalg.eigh(C)

# Project the data onto all eigenvectors except the one with the smallest
# eigenvalue. For a single centered one-hot feature the rows of X sum to zero
# in every sample, so that eigenvalue is expected to be (numerically) zero.
# NOTE(review): the original expression multiplied by diag(u) between the
# eigenvector matrix and X; this is interpreted here as weighting each
# retained component by its eigenvalue -- confirm that this is intended.
Xr = (v[:, 1:] * u[1:]).T.dot(X)

fig, (ax_eig, ax_proj) = plt.subplots(1, 2)

# Left: eigenvalue spectrum, numbered from 1 (smallest) upwards.
ax_eig.bar(np.arange(u.size), u)
ax_eig.set_xticks(np.arange(u.size))
ax_eig.set_xticklabels(np.arange(u.size) + 1)
ax_eig.set_title("Eigenvalues of C (ascending)")
ax_eig.set_xlabel("component")
ax_eig.set_ylabel("eigenvalue")

# Right: the samples expressed in the retained components.
proj_mesh = ax_proj.pcolor(Xr.T)
ax_proj.set_title("Projected samples")
ax_proj.set_xlabel("retained component")
ax_proj.set_ylabel("sample")
fig.colorbar(proj_mesh, ax=ax_proj)

fig.tight_layout()
plt.show()


Out[250]:
<matplotlib.colorbar.Colorbar at 0x131642ba8>

In [ ]: