In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 16, 12
In [ ]:
import seaborn as sns
import numpy as np
import scipy.stats
import pandas as pd
We can compare pairs of time series (e.g. rates) as long as the sampling times for the two series are (roughly) the same. Below we make two almost identical series A and B, and look at their correlation. Even though we corrupt B with some noise, the correlation is still pretty high.
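(A brief aside, not part of the original analysis: Pearson's r is just the covariance of the two series divided by the product of their standard deviations. The sketch below, with a hypothetical helper pearson_by_hand, computes it directly from that definition and checks it against scipy.stats.pearsonr.)
In [ ]:
import numpy as np
import scipy.stats

# Pearson's r from its definition: covariance / (std_x * std_y).
def pearson_by_hand(x, y):
    x, y = np.asarray(x), np.asarray(y)
    cov = np.mean((x - x.mean()) * (y - y.mean()))
    return cov / (x.std() * y.std())

xs = np.sin(np.arange(0, 20, 0.1)) ** 2
ys = xs + np.random.exponential(0.2, size=xs.shape)
print(pearson_by_hand(xs, ys), scipy.stats.pearsonr(xs, ys)[0])  # should agree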
In [ ]:
time = np.arange(0,20,0.1)
A = [np.sin(x)**2 for x in time]
sns.tsplot(A,interpolate=False)
In [ ]:
B = [np.sin(x)**2 + np.random.exponential(0.2) for x in time]
sns.tsplot(B,interpolate=False)
In [ ]:
phi, p = scipy.stats.pearsonr(A,B)
print(phi)  # Pearson correlation lies between -1 and 1: -1 means one series goes up when the other goes down, +1 means they move together.
The more we corrupt B with noise, the lower the correlation:
In [ ]:
genB = lambda r: [np.sin(x)**2 + np.random.exponential(r) for x in time]
In [ ]:
corruption = np.linspace(0.1, 3, 100)
d = pd.DataFrame({"scale": corruption, "correlation": [scipy.stats.pearsonr(A,genB(r))[0] for r in corruption]})
sns.regplot("scale","correlation",d,fit_reg=False)
We can also show that, by delaying the sine wave by an increasing amount, the correlation moves from 1 to -1 and back again (sin² has period π, so a delay of π returns us to the original series):
In [ ]:
genC = lambda phi: [np.sin(x + phi)**2 for x in time]
#data = pd.DataFrame({"0-phase":genC(0), "pi-phase":genC(np.pi), "time":time})
#data = pd.melt(data,id_vars=["time"])
#sns.tsplot(time="time",data=data, value="value", condition="variable", interpolate=False,err_style=None)
plt.figure()
plt.plot(time, genC(0), label="phase 0")
plt.plot(time, genC(np.pi/4), label="phase pi/4")
plt.plot(time, genC(np.pi/2), label="phase pi/2")
plt.legend()
plt.show()
In [ ]:
phase = np.linspace(0,np.pi,100)
d = pd.DataFrame({"phase": phase, "correlation": [scipy.stats.pearsonr(A,genC(phi))[0] for phi in phase]})
sns.regplot("phase","correlation",d,fit_reg=False)
In [ ]:
k = 10
categories = range(k)
In [ ]:
# we can pull a random categorical distribution from the Dirichlet, which is a distribution over vectors that sum to 1.
# This is super cool but don't worry too much about it!
P = np.random.dirichlet([1 for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":P, "categories":categories}))
In [ ]:
Q = np.random.dirichlet([1 for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":Q, "categories":categories}))
In [ ]:
DKL = lambda p,q : sum(p[i] * np.log(p[i]/q[i]) for i in range(len(p)))
Note that the Kullback–Leibler divergence IS NOT A DISTANCE! Among other things, it's not symmetric: DKL(P,Q) ≠ DKL(Q,P) in general. We can use the symmetrised KL divergence, which is just the sum DKL(P,Q) + DKL(Q,P). This still isn't a true distance (it doesn't satisfy the triangle inequality), but at least it's symmetric.
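As a small sketch (the helper name sym_DKL is mine, not from the original), the symmetrised divergence can be wrapped up as a function; the two argument orderings then agree by construction.
In [ ]:
# Symmetrised KL divergence: DKL(p, q) + DKL(q, p). Symmetric, but still
# not a metric (no triangle inequality).
sym_DKL = lambda p, q: DKL(p, q) + DKL(q, p)
print(sym_DKL(P, Q), sym_DKL(Q, P))  # identical by construction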
In [ ]:
print(DKL(P,Q), DKL(Q,P), DKL(P,Q) + DKL(Q,P))
print(scipy.stats.entropy(P,Q))  # note that scipy.stats.entropy(P, Q) will also give you the KL divergence!
If we make a distribution that is markedly different, we should see a higher divergence:
In [ ]:
R = np.random.dirichlet([np.exp(i+1) for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":R, "categories":categories}))
In [ ]:
DKL(P,R)+DKL(R,P), DKL(R,Q)+DKL(Q,R)