Kernel Density Estimation. Plotly plot of the joint pdf and marginal pdf estimations


In [1]:
import pandas as pd
import numpy as np

Read data from an Excel file:


In [2]:
# Open the workbook inside a context manager so the underlying file handle is
# released as soon as the sheet has been parsed into a DataFrame.
with pd.ExcelFile("CSCEng.xls") as xl:
    dfc = xl.parse("Sheet1")
dfc.columns


Out[2]:
Index([u'multiannual', u'bachelor-th'], dtype='object')

In [3]:
# Preview the first rows of the loaded sheet.
dfc.head()


Out[3]:
multiannual bachelor-th
0 8.01 7.95
1 8.63 8.63
2 7.03 8.37
3 8.53 8.05
4 8.41 9.53

We estimate the joint pdf of the two columns `dfc['multiannual']` and `dfc['bachelor-th']`, using a Gaussian kernel:


In [4]:
import scipy.stats as st
def kde_scipy(vals1, vals2, x_range, y_range, N):
    """Estimate a bivariate Gaussian KDE and evaluate it on an N x N grid.

    The density is fitted to the sample pairs ``(vals1[k], vals2[k])``.

    Parameters
    ----------
    vals1, vals2 : sequences of equal length
        Observed values of the two variables. ``vals1`` is the first KDE
        dimension and ``vals2`` the second.
    x_range : (a, b)
        End points of the returned ``x`` grid. NOTE: the *second* sample,
        ``vals2``, is evaluated along this grid (see the comment on
        ``positions`` below). Usually wider than the range of the data.
    y_range : (c, d)
        End points of the returned ``y`` grid, along which the *first*
        sample, ``vals1``, is evaluated.
    N : int
        Number of grid points along each axis.

    Returns
    -------
    list ``[x, y, Z]``
        ``x`` and ``y`` are the 1-D grids and ``Z`` has shape ``(N, N)`` with
        ``Z[i, j]`` equal to the estimated density at
        ``(vals1 = y[i], vals2 = x[j])``, i.e. rows follow ``y`` and columns
        follow ``x``, which is the layout a contour plot expects.
    """
    # The two intervals used to be tuple parameters in the signature, which is
    # Python-2-only syntax; unpacking them here works on Python 2 and 3 and
    # keeps positional calls such as kde_scipy(v1, v2, (a, b), (c, d), N) valid.
    a, b = x_range
    c, d = y_range

    x = np.linspace(a, b, N)
    y = np.linspace(c, d, N)
    X, Y = np.meshgrid(x, y)

    # Y is stacked before X, so the first KDE dimension (vals1) is paired with
    # the y grid and the second (vals2) with the x grid. Callers that want
    # vals1 on the x axis must therefore pass their samples in swapped order.
    positions = np.vstack([Y.ravel(), X.ravel()])

    values = np.vstack([vals1, vals2])
    kernel = st.gaussian_kde(values)
    # kernel(positions) is a flat array of N*N densities in row-major grid order.
    Z = np.reshape(kernel(positions), X.shape)

    return [x, y, Z]

In [5]:
# End points of the square [a, b] x [a, b] on which the joint pdf is evaluated,
# and the number of grid points per axis (the grid is N x N).
a, b = 5, 11
N = 200

In [6]:
# Pull the two columns out of the frame as plain Python lists.
x, y = (list(dfc[column]) for column in ('multiannual', 'bachelor-th'))

In [7]:
# One-dimensional Gaussian KDEs: pdfx is estimated from the x-values and
# pdfy from the y-values.
pdfx, pdfy = st.gaussian_kde(x), st.gaussian_kde(y)

# Evaluation grids on [a, b]: X holds abscissas for pdfx, yy holds ordinates
# for pdfy (the marginal of y is a function of the y-variable).
X, yy = np.linspace(a, b, 100), np.linspace(a, b, 100)

# Density values on those grids: Y = pdfx(X) and xx = pdfy(yy).
Y, xx = pdfx(X), pdfy(yy)

In [8]:
# Attention: the columns are passed in reversed order. kde_scipy pairs its
# first sample with the y grid and its second with the x grid, so this puts
# 'multiannual' on the x axis and 'bachelor-th' on the y axis.
Xvals, Yvals, Zvals = kde_scipy(
    dfc['bachelor-th'],
    dfc['multiannual'],
    (a, b),
    (a, b),
    N,
)

Define Data and Layout for Plotly plot:


In [9]:
import plotly.plotly as py
from plotly.graph_objs import *

Set the text to be displayed when hovering the mouse over the contour plot of the joint pdf:


In [22]:
# hover_xy[i][j] is the label shown at grid node (Xvals[j], Yvals[i]), whose
# estimated density is Zvals[i][j]. The label used to end with an unmatched
# ')' (e.g. 'f(6.72, 6.87)= 0.05)'); that stray character is dropped here.
hover_xy = [
    ['f({:0.2f}, {:0.2f})= {:0.2f}'.format(Xvals[j], Yvals[i], Zvals[i][j])
     for j in range(len(Xvals))]
    for i in range(len(Yvals))
]

In [11]:
# Spot-check one label: first index selects the Yvals entry, second the Xvals entry.
hover_xy[62][57]


Out[11]:
'f(6.72, 6.87)= 0.05)'

Plotly version of the matplotlib cmocean.salinity colormap:


In [12]:
# Colorscale as a list of [position, color] stops: 21 colors placed at the
# evenly spaced positions 0.0, 0.05, ..., 1.0 (index / 20).
pl_salinity = [
    [index / 20.0, color]
    for index, color in enumerate((
        'rgb(41,24,107)',
        'rgb(45,27,137)',
        'rgb(40,39,162)',
        'rgb(24,61,158)',
        'rgb(12,77,150)',
        'rgb(15,91,144)',
        'rgb(24,102,140)',
        'rgb(35,113,138)',
        'rgb(44,124,136)',
        'rgb(52,135,136)',
        'rgb(59,147,135)',
        'rgb(66,158,132)',
        'rgb(74,169,128)',
        'rgb(85,181,122)',
        'rgb(100,193,113)',
        'rgb(122,203,102)',
        'rgb(148,211,93)',
        'rgb(179,217,94)',
        'rgb(208,224,109)',
        'rgb(232,231,131)',
        'rgb(253,238,153)',
    ))
]

Define a Contour object:


In [13]:
# Contour trace of the joint pdf estimate: grid coordinates and densities come
# from kde_scipy, colors from pl_salinity, and hovering shows the prebuilt
# hover_xy labels only.
trace1 = Contour(
    x=Xvals,
    y=Yvals,
    z=Zvals,
    text=hover_xy,
    hoverinfo='text',
    colorscale=pl_salinity,
    showscale=False,
    contours=Contours(showlines=False),
)

Set hover text for the two marginal pdfs:


In [14]:
# Hover labels for the marginal curves: each pairs a grid value with the
# density estimated there, both printed with two decimals.
textx = ['(x,g(x))=({:0.2f}, {:0.2f})'.format(x_val, g_val)
         for x_val, g_val in zip(X, Y)]
texty = ['(y,h(y))=({:0.2f}, {:0.2f})'.format(y_val, h_val)
         for y_val, h_val in zip(yy, xx)]

In [23]:
# Marginal pdf g(x): filled curve drawn against the axes pair (x1, y2).
trace2 = Scatter(
    name='pdf-x',
    x=X,
    y=Y,
    xaxis='x1',
    yaxis='y2',
    mode='lines',
    line=Line(width=2, color='rgb(66,158,132)', shape='spline'),
    fill='tozeroy',
    fillcolor='rgb(122,203,102)',
    text=textx,
    hoverinfo='text',
)

# Marginal pdf h(y): density values run along x and the y grid along y, so the
# curve is filled towards x = 0; drawn against the axes pair (x2, y1).
trace3 = Scatter(
    name='pdf-y',
    x=xx,
    y=yy,
    xaxis='x2',
    yaxis='y1',
    mode='lines',
    line=Line(width=2, color='rgb(66,158,132)', shape='spline'),
    fill='tozerox',
    fillcolor='rgb(122,203,102)',
    text=texty,
    hoverinfo='text',
)

In [24]:
# Collect the contour trace and the two marginal-pdf traces for the figure.
data = Data([trace1, trace2, trace3])

Set the plot layout:


In [26]:
# Figure layout. The main axes (xaxis / yaxis) take the [0, 0.8] share of each
# direction and span [a, b]; the secondary axes (xaxis2 / yaxis2) take the
# [0.82, 1] strips at the right and at the top.
layout = Layout(
    title='Kernel Density Estimation',
    font=Font(size=11),
    autosize=False,
    width=650,
    height=550,
    margin=Margin(t=50),
    showlegend=False,
    hovermode='closest',
    # Main horizontal axis.
    xaxis=XAxis(
        title='x',
        titlefont=Font(size=11),
        domain=[0, 0.8],
        range=[a, b],
        tickvals=list(range(6, 12)),
        showgrid=False,
        zeroline=False,
    ),
    # Main vertical axis.
    yaxis=YAxis(
        title='y',
        titlefont=Font(size=11),
        domain=[0, 0.8],
        range=[a, b],
        showgrid=False,
        zeroline=False,
    ),
    # Secondary horizontal axis, with its ticks placed on the top side.
    xaxis2=XAxis(
        domain=[0.82, 1],
        side='top',
        ticklen=4,
        showgrid=False,
        zeroline=False,
    ),
    # Secondary vertical axis.
    yaxis2=YAxis(
        domain=[0.82, 1],
        ticklen=4,
        showgrid=False,
        zeroline=False,
    ),
)
fig = Figure(data=data, layout=layout)

In [27]:
import plotly
# Initialise Plotly's offline mode for this notebook session.
plotly.offline.init_notebook_mode()



In [28]:
# Render the assembled figure inline using offline mode.
plotly.offline.iplot(fig)



In [29]:
from IPython.core.display import HTML
def css_styling():
    """Return the contents of ``./custom.css`` wrapped in an ``HTML`` object.

    The path is relative to the current working directory; ``open`` raises
    ``IOError``/``OSError`` if the file is missing.
    """
    # Read inside a context manager so the file handle is always closed,
    # instead of being left open until garbage collection.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
# Last expression of the cell: the returned HTML object becomes the cell output.
css_styling()


Out[29]: