In [7]:
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances

In [4]:
X = np.random.normal(size=(10, 10))

Entropy

Based on the entropy measure from the Mitra paper. The distance between data points $p$ and $q$ is defined as:

$$D_{pq} = \left( \sum_{j=1}^M \left(\frac{x_{p,j} - x_{q,j}}{\text{max}_j - \text{min}_j}\right)^2 \right)^{1/2}$$

This is simply the Euclidean distance computed on MinMaxScaler output (verified in a cell below).

Then we will further define

$$\text{sim}(p, q) = e^{-\alpha D_{pq}}$$

where $\alpha = \frac{-\log 0.5}{\bar{D}}$ and $\bar{D}$ is the average pairwise distance over the whole dataset.

Then using this, we can calculate Entropy:

$$E = -\sum_{p=1}^l \sum_{q=1}^l \left(\text{sim}(p, q) \times \log \text{sim}(p, q) + (1-\text{sim}(p, q))\times \log(1-\text{sim}(p, q))\right)$$
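
A quick sanity check (not from the original notebook, and using a hypothetical value for $\bar{D}$): this choice of $\alpha$ pins the similarity at exactly 0.5 for a pair sitting at the average distance.

In [ ]:
# Sketch (not in the original): with alpha = -log(0.5) / D_bar, a pair at
# exactly the average distance gets similarity 0.5.
D_bar = 1.7                      # hypothetical average pairwise distance
alpha = -np.log(0.5) / D_bar
np.exp(-alpha * D_bar)           # -> 0.5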

In [6]:
mm = MinMaxScaler()
X_mm = mm.fit_transform(X)
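
As a quick check (not in the original), $D_{pq}$ computed directly from the formula should match euclidean_distances on the MinMax-scaled data:

In [ ]:
# Sketch (not in the original): the normalized distance formula above is the
# same as Euclidean distance on MinMax-scaled features.
feat_range = np.ptp(X, axis=0)                    # max_j - min_j per feature
diffs = (X[:, None, :] - X[None, :, :]) / feat_range
D_direct = np.sqrt((diffs ** 2).sum(axis=-1))
np.allclose(D_direct, euclidean_distances(X_mm))  # -> True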

In [25]:
np.exp(np.array([1,1,1,1,1,1]))


Out[25]:
array([ 2.71828183,  2.71828183,  2.71828183,  2.71828183,  2.71828183,
        2.71828183])

In [36]:
def entropy(X):
    # Scale features to [0, 1] so Euclidean distance matches D_pq above
    mm = MinMaxScaler()
    X_mm = mm.fit_transform(X)
    Dpq = euclidean_distances(X_mm)
    # Use the strict upper triangle: each pair once, no self-distances
    iu = np.triu_indices_from(Dpq, k=1)
    D_bar = Dpq[iu].mean()
    alpha = -np.log(0.5) / D_bar
    sim_pq = np.exp(-alpha * Dpq[iu])
    # Off-diagonal similarities lie in (0, 1), so both logs stay finite
    pair_entropy = sim_pq * np.log(sim_pq) + (1 - sim_pq) * np.log(1 - sim_pq)
    # Double the sum to cover the symmetric (q, p) terms of the double sum
    return -2 * np.sum(pair_entropy)

In [41]:
entropy(np.random.normal(size=(10, 2)))


Out[41]:
56.444984076641092
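
As a further experiment (not part of the original, on hypothetical data), the measure should come out lower when the data has tight cluster structure, since most pairwise similarities then sit near 0 or 1 rather than near 0.5.

In [ ]:
# Sketch (not in the original): compare entropy on uniform vs. clustered data.
# Well-separated clusters should give the lower value.
rng = np.random.RandomState(0)
X_uniform = rng.uniform(size=(50, 2))
X_clustered = np.vstack([rng.normal(0.0, 0.05, size=(25, 2)),
                         rng.normal(1.0, 0.05, size=(25, 2))])
entropy(X_uniform), entropy(X_clustered)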

In [42]:
from sklearn.mixture import BayesianGaussianMixture

In [61]:
bgm = BayesianGaussianMixture(n_components=10)

In [62]:
X = np.random.normal(size=(1000,)).reshape(-1, 1)

In [63]:
bgm.fit(X)


c:\users\chapm\anaconda3\lib\site-packages\sklearn\mixture\base.py:237: ConvergenceWarning: Initialization 1 did not converge. Try different init parameters, or increase max_iter, tol or check for degenerate data.
  % (init + 1), ConvergenceWarning)
Out[63]:
BayesianGaussianMixture(covariance_prior=None, covariance_type='full',
            degrees_of_freedom_prior=None, init_params='kmeans',
            max_iter=100, mean_precision_prior=None, mean_prior=None,
            n_components=10, n_init=1, random_state=None, reg_covar=1e-06,
            tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
            weight_concentration_prior=None,
            weight_concentration_prior_type='dirichlet_process')

In [64]:
bgm.predict(X)


Out[64]:
array([1, 2, 0, 1, 1, 1, 2, 0, 0, 1, 1, 0, 1, 1, 0, 2, 1, 0, 0, 0, 0, 2, 0,
       2, 2, 2, 2, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 2,
       2, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 2, 0, 1, 2,
       1, 1, 2, 1, 0, 1, 2, 2, 1, 0, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 1,
       2, 1, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 2, 1, 2, 0, 2, 0, 2, 1, 1, 0,
       1, 1, 0, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 2,
       1, 0, 2, 1, 2, 1, 0, 1, 2, 2, 1, 2, 2, 0, 2, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 2, 2, 1, 1, 0, 1, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0,
       0, 0, 0, 0, 1, 2, 1, 1, 1, 2, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 2, 1,
       0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 2, 2, 0, 0, 1,
       1, 1, 2, 2, 2, 2, 0, 2, 0, 0, 1, 0, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       0, 0, 2, 1, 2, 1, 0, 2, 2, 1, 0, 2, 2, 1, 2, 1, 1, 2, 2, 0, 0, 0, 0,
       0, 2, 2, 1, 0, 0, 0, 2, 2, 1, 0, 1, 2, 0, 0, 2, 1, 1, 2, 0, 2, 1, 0,
       0, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 2, 0, 2, 2, 0, 0, 1, 0, 2, 0, 2, 0, 0, 0, 1, 1, 2, 0, 1, 0, 1,
       2, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 2, 0, 2, 0, 0, 1, 2, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 2,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 2, 0, 1, 0, 2, 1, 2, 2, 2, 1, 1, 0, 1, 0,
       0, 2, 1, 0, 0, 0, 0, 0, 1, 1, 2, 1, 0, 0, 2, 2, 2, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 2, 0, 1, 1, 0, 1, 2, 0, 0, 2, 1, 2, 1, 1, 2, 0, 2, 2,
       0, 2, 0, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 2, 1,
       1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 0, 0, 1, 2, 1, 1, 2, 1, 1, 1, 0, 0,
       0, 1, 2, 0, 2, 2, 0, 0, 1, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0,
       1, 1, 0, 2, 0, 1, 2, 0, 2, 1, 0, 1, 2, 1, 0, 1, 0, 2, 0, 0, 1, 0, 1,
       2, 0, 1, 0, 1, 0, 2, 0, 1, 0, 0, 1, 1, 2, 0, 2, 1, 2, 0, 1, 2, 1, 0,
       2, 1, 1, 1, 0, 0, 1, 0, 0, 2, 0, 1, 2, 0, 0, 0, 2, 1, 2, 1, 2, 1, 1,
       2, 1, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 2, 0, 2, 2,
       0, 2, 2, 1, 1, 0, 2, 2, 0, 0, 0, 2, 1, 1, 1, 0, 0, 1, 0, 1, 0, 2, 2,
       0, 1, 0, 1, 0, 2, 2, 0, 0, 1, 2, 2, 0, 2, 2, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 2, 0, 2, 1, 2, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 2,
       2, 0, 2, 0, 1, 0, 1, 2, 2, 1, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 1, 1, 0,
       1, 2, 0, 0, 0, 2, 1, 2, 0, 1, 1, 2, 2, 0, 1, 1, 2, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 2, 0, 1, 0, 2, 1, 0, 0, 1, 1, 0, 2, 1, 1, 0, 0, 0, 1,
       2, 0, 2, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 1, 2, 2, 0, 2,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 2, 0, 2, 1, 1, 2, 1, 0, 2,
       0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 2, 2, 0, 0, 1, 0, 0, 0,
       2, 1, 0, 1, 0, 0, 0, 2, 0, 2, 1, 2, 1, 1, 0, 0, 0, 2, 0, 1, 0, 1, 2,
       2, 1, 0, 0, 1, 1, 1, 2, 1, 0, 2, 1, 2, 1, 2, 0, 1, 0, 0, 1, 0, 1, 2,
       1, 2, 1, 2, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 1, 1, 0, 1, 0, 2, 1,
       0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 1, 2, 1, 1, 0, 0, 2, 0, 1, 2, 2, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 2, 2, 0, 0, 2, 1, 2, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 2, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)
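
One last check (not in the original): the predicted labels above only use a few of the 10 components, which is what the Dirichlet-process prior tends to do on data drawn from a single Gaussian. Inspecting bgm.weights_ shows how much mass the remaining components actually carry; increasing max_iter is the usual way to address the ConvergenceWarning from the fit.

In [ ]:
# Sketch (not in the original): most of the 10 components should carry
# negligible weight, since X comes from a single Gaussian.
labels = bgm.predict(X)
np.unique(labels), np.round(bgm.weights_, 3)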