# Distribution Fitting

Goals:

1. Load the raw data from the previous post
2. Inspect the distribution of a feature
3. Postulate a fit function
4. Use scipy.optimize to fit the function to the distribution

### 1. Load the Raw Data



In [1]:

import pandas as pd




In [2]:
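
# The load step didn't survive this export; assuming the dataframe was
# saved to CSV in the previous post (the filename below is a placeholder)
data_df = pd.read_csv("raw_data.csv", index_col="eventID")
data_df.head()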




Out[2]:

| eventID | AfterInhMATRIX5 | PrescaleMATRIX5 | RawMATRIX4 | RawTriggers | label |
|--------:|----------------:|----------------:|-----------:|------------:|------:|
| 430001  | 27094           | 12              | 1598521    | 10759045    | 2     |
| 430002  | 34901           | 14              | 1670878    | 11813291    | 3     |
| 430003  | 36317           | 15              | 1675869    | 12002554    | 3     |
| 430004  | 34088           | 14              | 1637602    | 11564482    | 3     |
| 430005  | 27489           | 12              | 1587623    | 10627391    | 4     |



### 2. Inspect the Distribution for a Feature



In [3]:

import matplotlib.pyplot as plt
import seaborn as sns

# Show plots in notebook
%matplotlib inline

# Set some styling options
sns.set_style("darkgrid")
sns.set_context("paper", font_scale=1.4)




In [4]:

feature = "AfterInhMATRIX5"
sns.distplot(data_df.query("label == 1")[feature])




Out[4]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f7d45caf650>



#### Extract the values of the KDE curve for fitting purposes



In [5]:

from sklearn.neighbors import KernelDensity
import numpy as np

# Put the data you want to apply the KDE to into an array
data = data_df.query("label == 1")[feature].values[:, np.newaxis]
# Create a KDE object and then fit it to the data
kde = KernelDensity(kernel='gaussian', bandwidth=1400).fit(data)



#### Let's plot it to make sure it looks like what we've seen above



In [6]:

X_plot = np.linspace(10000, 100000, 1000)[:, np.newaxis]
# Get log density values for each point on the x-axis
log_dens = kde.score_samples(X_plot)
Y_plot = np.exp(log_dens)

# Plot the two against each other
sns.distplot(data_df.query("label == 1")[feature],
             hist=False, color='black',
             label='Seaborn KDE')
plt.plot(X_plot, Y_plot, '--', color='red', lw=2,
         label='scikit-learn KDE')
plt.legend(loc='best')




Out[6]:

<matplotlib.legend.Legend at 0x7f7d0b4d9490>



Good! The scikit-learn KDE looks like an exact match for the seaborn curve.
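
The bandwidth of 1400 in the KDE above was presumably tuned by hand until the curves lined up. As an aside (a sketch, not part of the original analysis; the grid bounds here are arbitrary), scikit-learn can choose the bandwidth by cross-validated log-likelihood instead:

In [ ]:

from sklearn.model_selection import GridSearchCV

# Score each candidate bandwidth by held-out log-likelihood
grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                    {'bandwidth': np.linspace(500, 3000, 26)},
                    cv=5)
grid.fit(data)
print(grid.best_params_)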

### 3. Postulate a Fit Function

I'm guessing that this distribution is four Gaussian functions added together.

- Define a Gaussian function
- Define our custom function made up of Gaussians
- Make a guess at our parameters
- Use scipy.optimize.curve_fit to optimize the parameters

$$f_{G}(x) = a \cdot \exp\left( -\frac{(x-\mu)^2}{2\sigma^2} \right)$$


In [7]:

def gauss_dist(xdata, amp, mean, stddev):
    return amp * np.exp(-np.square(xdata - mean) / (2 * stddev**2))




In [8]:

# Take four amplitudes, means, and standard deviations
# Compute sum of four Gaussians
def my_fit(xdata,
           a1, a2, a3, a4,
           m1, m2, m3, m4,
           s1, s2, s3, s4):
    exp1 = gauss_dist(xdata, a1, m1, s1)
    exp2 = gauss_dist(xdata, a2, m2, s2)
    exp3 = gauss_dist(xdata, a3, m3, s3)
    exp4 = gauss_dist(xdata, a4, m4, s4)
    return exp1 + exp2 + exp3 + exp4
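
As an aside (a sketch, not in the original), the same idea generalizes to any number of components if the parameters are packed into one flat sequence; curve_fit accepts a variadic signature as long as p0 is supplied:

In [ ]:

# Sum of N Gaussians; params packed as [a1..aN, m1..mN, s1..sN]
def multi_gauss(xdata, *params):
    n = len(params) // 3
    amps, means, stds = params[:n], params[n:2*n], params[2*n:]
    return sum(gauss_dist(xdata, a, m, s)
               for a, m, s in zip(amps, means, stds))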



#### Here I make my ballpark guesses for the amplitudes, means, and deviations



In [9]:

p0 = [0.00001, 0.00002, 0.000061, 0.000005,
      31000, 51000, 66000, 83000,
      1000, 1500, 2000, 3000]



#### Take a look at what that gives us



In [10]:

my_guess = my_fit(X_plot[:, 0], *p0)
plt.plot(X_plot, my_guess, '-')




Out[10]:

[<matplotlib.lines.Line2D at 0x7f7d241ab350>]



...This doesn't look good, but it's a nice little nucleation point for an optimization routine.

### 4. Use scipy.optimize to Fit the Function to the Distribution



In [11]:

from scipy.optimize import curve_fit
popt, pcov = curve_fit(my_fit, X_plot[:, 0], Y_plot, p0)




In [12]:

print(popt)




[  7.46200603e-06   1.98656889e-05   5.84597102e-05   4.69580815e-06
   3.03665488e+04   5.30710216e+04   6.64936414e+04   8.05486741e+04
   2.50726590e+03   5.68857720e+03   4.19554544e+03   4.69961442e+03]
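
We never used the covariance matrix pcov that curve_fit returned; as a quick sanity check (a sketch, not in the original), one-sigma uncertainties on the fitted parameters come from the square root of its diagonal:

In [ ]:

# One-sigma parameter uncertainties from the fit covariance matrix
perr = np.sqrt(np.diag(pcov))
print(perr)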



#### Put in these optimized parameters and see what we get



In [13]:

optim_fit = my_fit(X_plot[:, 0], *popt)
# Plot the whole fit
plt.plot(X_plot, optim_fit, '-.', lw=3, color='black')
# Along with the constituent Gaussians
for i in range(0, 4):
    plt.plot(X_plot,
             gauss_dist(X_plot[:, 0], popt[i], popt[i+4], popt[i+8]),
             '-', lw=2)






#### Compare to the actual KDE distribution



In [14]:

plt.plot(X_plot, Y_plot, '-', lw=3, color='black',
         label='scikit-learn KDE')
plt.plot(X_plot, optim_fit, '-', color='red',
         label=r'$\sum_{i=1}^4\ f_G(a_i, \mu_i, \sigma_i)$')
plt.legend(loc='best')
plt.legend(loc='best')




Out[14]:

<matplotlib.legend.Legend at 0x7fa1f5bce990>



### Calculate a $\chi^2$ Value for This Fit

With the hypothesis that these two distributions are the same, we calculate:

$$\chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i}$$



In [15]:

def calc_chisq(obs, exp):
    chisq = 0.0
    for i in range(0, len(obs)):
        chisq += (obs[i] - exp[i])**2 / exp[i]
    return chisq
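
Since Y_plot and optim_fit are NumPy arrays, the same sum can also be written as one vectorized line (an equivalent form of the loop above):

In [ ]:

# Vectorized equivalent of calc_chisq
print(np.sum((Y_plot - optim_fit)**2 / optim_fit))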




In [16]:

print(calc_chisq(Y_plot, optim_fit))




0.0482006570666



For 12 degrees of freedom (our 12 fit parameters), a $\chi^2$ table gives $p > 0.995$, so this is a good fit to the distribution.

#### Or use scipy.stats to get the $\chi^2$ and p-value



In [17]:

from scipy.stats import chisquare
chisq, pvalue = chisquare(Y_plot, optim_fit, ddof=12)
print("Chi-squared: %.2f\np-value: %.2f" % (chisq, pvalue))




Chi-squared: 0.05
p-value: 1.00



### Overlay onto the Original Data



In [18]:

sns.distplot(data_df.query("label == 1")[feature],
             kde=False, hist=True, norm_hist=True)
for i in range(0, 4):
    plt.plot(X_plot,
             gauss_dist(X_plot[:, 0], popt[i], popt[i+4], popt[i+8]),
             '-', lw=2)
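
Each Gaussian's area is $a_i \sigma_i \sqrt{2\pi}$, so the relative weight of each component falls straight out of the fitted parameters; a quick estimate (a sketch, not in the original):

In [ ]:

# Fraction of the distribution carried by each Gaussian:
# area = amp * stddev * sqrt(2*pi)
areas = popt[0:4] * popt[8:12] * np.sqrt(2 * np.pi)
print(areas / np.sum(areas))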






