In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 12)

In [ ]:
import seaborn as sns
import numpy as np
import scipy.stats
import pandas as pd

Time series comparison

We can compare pairs of time series (e.g. rates) as long as the sampling times of the two series are (roughly) the same. Below we make two almost identical series, A and B, and look at their correlation. Even though we corrupt B with some noise, the correlation is still pretty high.


In [ ]:
time = np.arange(0,20,0.1)
A = [np.sin(x)**2 for x in time]
sns.tsplot(A,interpolate=False)

In [ ]:
B = [np.sin(x)**2 + np.random.exponential(0.2)  for x in time]
sns.tsplot(B,interpolate=False)

In [ ]:
phi, p = scipy.stats.pearsonr(A, B)
print(phi)  # correlation is between -1 and 1; -1 means that one series goes up when the other goes down
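
As a sanity check, here is a minimal sketch that computes Pearson's r directly from its definition (the covariance of the two series divided by the product of their standard deviations); it should agree with the scipy value above. The A_arr and B_arr names are just local array copies introduced for this check.

In [ ]:
# Pearson's r from first principles: cov(A, B) / (std(A) * std(B))
A_arr, B_arr = np.asarray(A), np.asarray(B)
r_manual = np.mean((A_arr - A_arr.mean()) * (B_arr - B_arr.mean())) / (A_arr.std() * B_arr.std())
print(r_manual)  # should match scipy.stats.pearsonr(A, B)[0]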

The more we corrupt B with noise, the lower the correlation:


In [ ]:
genB = lambda r: [np.sin(x)**2 + np.random.exponential(r)  for x in time]

In [ ]:
corruption = np.linspace(0.1, 3, 100)
d = pd.DataFrame({"scale": corruption, "correlation": [scipy.stats.pearsonr(A,genB(r))[0] for r in corruption]})
sns.regplot(x="scale", y="correlation", data=d, fit_reg=False)

We can also show that, by delaying the sine wave by an increasing amount, the correlation moves from 1 to -1 and back again: at a delay of pi/2 we have sin(x + pi/2)^2 = cos(x)^2 = 1 - sin(x)^2, which is perfectly anti-correlated with A.


In [ ]:
genC = lambda phi: [np.sin(x + phi)**2 for x in np.arange(0,20,0.1)]
#data = pd.DataFrame({"0-phase":genC(0), "pi-phase":genC(np.pi), "time":time})
#data = pd.melt(data,id_vars=["time"])

#sns.tsplot(time="time",data=data, value="value", condition="variable", interpolate=False,err_style=None)
plt.figure()
plt.plot(time, genC(0), label="phase 0")
plt.plot(time, genC(np.pi/4), label="phase pi/4")
plt.plot(time, genC(np.pi/2), label="phase pi/2")
plt.legend()
plt.show()

In [ ]:
phase = np.linspace(0,np.pi,100)
d = pd.DataFrame({"phase": phase, "correlation": [scipy.stats.pearsonr(A,genC(phi))[0] for phi in phase]})
sns.regplot(x="phase", y="correlation", data=d, fit_reg=False)

Distribution comparison

If we're interested in comparing how a variable is distributed in two different streams, we can compare their histograms using a quantity known as the "Kullback-Leibler divergence", normally written D_KL. Let's make a histogram P and compare it to another histogram Q.
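
For discrete distributions P and Q over the same categories, it is defined as

$$D_{KL}(P \,\|\, Q) = \sum_{i} P(i) \, \log \frac{P(i)}{Q(i)}$$

It is zero when P and Q are identical and grows as the two histograms disagree more.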


In [ ]:
k = 10
categories = range(k)

In [ ]:
# we can pull a random categorical distribution from the Dirichlet, which is a distribution over vectors that sum to 1. 
# This is super cool but don't worry too much about it!
P = np.random.dirichlet([1 for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":P, "categories":categories}))
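
A quick check on the claim in the comment above: a Dirichlet draw really is a probability vector, so its entries sum to 1.

In [ ]:
print(P.sum())           # should be 1.0 (up to floating point error)
print(P.min(), P.max())  # every entry lies strictly between 0 and 1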

In [ ]:
Q = np.random.dirichlet([1 for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":Q, "categories":categories}))

In [ ]:
DKL = lambda p,q : sum(p[i] * np.log(p[i]/q[i]) for i in range(len(p)))

Note that the Kullback-Leibler divergence is NOT a distance! It is not symmetric, among other things. We can use the symmetrised D_KL, which is just the sum of the divergence in both directions. That still isn't a distance (it doesn't satisfy the triangle inequality), but at least it is symmetric.
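
A one-line helper for the symmetrised version, assuming the DKL lambda defined above (the name DKL_sym is just a choice made here):

In [ ]:
# symmetrised divergence: D_KL(P||Q) + D_KL(Q||P)
DKL_sym = lambda p, q: DKL(p, q) + DKL(q, p)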


In [ ]:
print(DKL(P,Q), DKL(Q,P), DKL(P,Q) + DKL(Q,P))
print(scipy.stats.entropy(P, Q))  # note that scipy.stats.entropy will also give you the KL divergence!

If we make a distribution that is markedly different, we should see a higher divergence:


In [ ]:
R = np.random.dirichlet([np.exp(i+1) for i in categories])
sns.barplot(x="categories", y="probability", data=pd.DataFrame({"probability":R, "categories":categories}))

In [ ]:
DKL(P,R)+DKL(R,P), DKL(R,Q)+DKL(Q,R)
