Scikit-learn kernel density estimation



In [1]:
%matplotlib inline
from sklearn.neighbors import KernelDensity
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Synthetic 2-D data set:
#   * 100 background points drawn uniformly from [0, 100) x [0, 100)
#   * 25 extra points drawn uniformly from [0, 5) x [0, 5), i.e. a dense
#     cluster in the corner near the origin.
# The global NumPy RNG is seeded so that Restart & Run All reproduces the same data.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

values = np.concatenate((np.random.rand(100, 2) * 100, np.random.rand(25, 2) * 5))

# [min, max] of each coordinate.
x_lim = [values[:, 0].min(), values[:, 0].max()]
y_lim = [values[:, 1].min(), values[:, 1].max()]

In [3]:
# Overlay the marginal distributions of both coordinates on a single Axes.
# Each curve gets a label and the Axes a legend, so the two overlapping
# distributions can be told apart when the figure is skimmed.
# NOTE(review): `sns.distplot` is deprecated in seaborn >= 0.11; switch to
# `sns.histplot(..., kde=True)` when the environment is upgraded.
ax = sns.distplot(values[:, 0], label='x')
sns.distplot(values[:, 1], label='y', ax=ax)
ax.set(xlabel='coordinate value', ylabel='density',
       title='Marginal distributions of the sample coordinates')
ax.legend();  # trailing ';' suppresses the Legend repr in the cell output


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fabacdff518>

In [4]:
# Scatter of the raw points: the transpose of the two-column `values` array
# unpacks into the first column (horizontal axis) and the second column
# (vertical axis); 'o' draws markers only.
plt.plot(*values.T, 'o')


Out[4]:
[<matplotlib.lines.Line2D at 0x7faba99ab6d8>]

In [5]:
# Fit a kernel density estimator to the 2-D points using the estimator's
# default settings. `fit` returns the estimator itself, so construction and
# fitting can be chained.
# NOTE(review): the default bandwidth is narrow relative to data that spans
# roughly 0-100 per axis; consider tuning it.
density = KernelDensity().fit(values)
density


Out[5]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [6]:
# Draw new points from the fitted density and score them under the same model.
N_SAMPLES = 10    # number of points to draw from the fitted density
SAMPLE_SEED = 0   # fixed seed so the drawn samples are reproducible on re-run

samples = density.sample(N_SAMPLES, random_state=SAMPLE_SEED)
# `score_samples` returns the *log*-density of each point, so values are
# typically negative and a lower value means a less likely location.
scores = density.score_samples(samples)

df = pd.DataFrame({'x': samples[:, 0], 'y': samples[:, 1], 'score': scores})
df


Out[6]:
score x y
0 -5.139336 0.526086 3.235091
1 -7.245322 33.159791 4.127004
2 -5.855119 4.337169 -0.691243
3 -7.133952 63.798958 98.172368
4 -7.384817 24.392525 78.664047
5 -6.034676 -0.580813 0.882175
6 -6.763766 96.753210 75.944110
7 -6.739576 0.829453 34.725105
8 -6.682077 56.099210 34.285628
9 -7.709388 48.959824 65.713919

In [7]:
# Locate the drawn sample with the lowest score and report its position
# in `samples`, its coordinates, and the score itself.
argmin_score = scores.argmin()
min_sample, min_score = samples[argmin_score], scores[argmin_score]
report = "Min idx: %d, sample: %s, score: %.2f" % (argmin_score, min_sample, min_score)
print(report)


Min idx: 9, sample: [ 48.9598241   65.71391925], score: -7.71