Violin Plot with Python Plotly

Violin plot is a trace that visually encodes the distribution of a data set, along with its summary statistics. It displays the graph of the estimated probability density function (pdf) mirrored about y-axis, and inside the violin-like shaped region, the elements of a box plot (median, lower and upper quartile, whisker position).

In this Jupyter Notebook we define functions to get the Plotly plot of a violin plot. In order to get more insights into distributional properties we add the option to overlay onto the same axis the rug plot of the data set.


In [2]:
from IPython.display import HTML
HTML('<iframe src=https://plot.ly/~empet/13680/violin-rug-plot/ width=400 height=500></iframe>')


Out[2]:

In [3]:
import numpy as np
import pandas as pd
from scipy import stats

Compute the summary statistics of data:


In [4]:
def calc_stats(data) :
    x=np.asarray(data, np. float)    
    vals_min=np.min(x)
    vals_max=np.max(x)
    q2=np.percentile(x, 50, interpolation='linear')  
    q1=np.percentile(x, 25, interpolation='lower')
    q3=np.percentile(x, 75, interpolation='higher')
    IQR=q3-q1
    whisker_dist = 1.5 * IQR
    #in order to prevent drawing whiskers outside the interval 
    #of data  one defines the whisker positions as:
    d1 = np.min(x[x >= (q1 - whisker_dist)])
    d2 = np.max(x[x <= (q3 + whisker_dist)])
    return vals_min, vals_max, q1, q2 ,q3, d1,d2

In [5]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

Functions that define violin components:


In [6]:
def make_half_violin( x,  y,  fillcolor='#1f77b4',  linecolor='rgb(50,50,50)'): 
    text=['(pdf(y), y)=('+'{:0.2f}'.format(x[i])+', '+'{:0.2f}'.format(y[i])+')'
          for i in range(len(x))] 
    return Scatter(x=x, 
                   y=y, mode='lines',
                   name='',
                   text=text,
                   fill='tonextx', 
                   fillcolor= fillcolor,
                   line=Line(width=0.5, color=linecolor, shape='spline'),
                   hoverinfo='text',
                   opacity=0.5
                   )
             

def make_rugplot(vals, pdf_max, distance, color='#1f77b4'):
    return Scatter(y=vals, 
                   x=[-pdf_max-distance]*len(vals),
                   marker=Marker(
                             color=color,
                             symbol='line-ew-open'
                             ),
                   mode='markers',
                   name='',
                   showlegend=False,
                   hoverinfo='y'
                )  
def make_quartiles(q1, q3):
    return Scatter(x=[0, 0],
                   y=[q1, q3],
                   text=['lower-quartile: '+'{:0.2f}'.format(q1), 
                         'upper-quartile: '+'{:0.2f}'.format(q3)],
                   mode='lines',
                   line=Line(width=4, color='rgb(0,0,0)'),
                   hoverinfo='text'
                  )
def make_median(q2):
    return   Scatter(x=[0],
                     y=[q2], 
                     text=['median: '+'{:0.2f}'.format(q2)],
                     mode='markers',
                     marker=dict(symbol='square', color='rgb(255,255,255)'),
                     hoverinfo='text'
                            )
def make_non_outlier_interval(d1,d2):
    return Scatter(x=[0,  0],
                   y=[d1, d2],
                   name='',
                   mode='lines',
                   line=Line(width=1.5, color='rgb(0,0,0)')
                   )

Set axes:


In [7]:
def make_XAxis(xaxis_title, xaxis_range):
    xaxis=XAxis(title=xaxis_title,
                range=xaxis_range,
                showgrid=False,
                zeroline=False,
                showline=False,
                mirror=False,
                ticks='',
                showticklabels=False,
               )
    return xaxis


def make_YAxis(yaxis_title):
    yaxis = YAxis(title=yaxis_title,
                  showticklabels=True,
                  autorange=True,
                  ticklen=4,
                  showline=True,
                  zeroline=False,
                  showgrid=False,
                  mirror=False)                   
    return yaxis

Data values, vals, can be given in a numeric list, numpy array of shape (n, ) or a pandas series.

Because a violin plot is symmetric with respect to a vertical axis, we define the range of x values in the plot either of the form range=[-a,a] or of the form [-b,a], when a rug plot is overlaid.


In [8]:
def create_violinplot(vals,  fillcolor='#1f77b4', rugplot=True):
    vals=np.asarray(vals, np.float)
    vals_min, vals_max, q1, q2, q3, d1, d2=calc_stats(vals)#summary statistics
    
    pdf= stats.gaussian_kde(vals)# kernel density estimation of pdf
    xx=np.linspace(vals_min, vals_max, 100)# grid over the data interval
    yy=pdf(xx)#evaluate the pdf at the grid xx
    max_pdf=np.max(yy)
    distance=2.0*max_pdf/10 if rugplot else 0# distance from the violin plot to rugplot
    plot_xrange=[-max_pdf-distance-0.1, max_pdf+0.1]# range for x values in the plot
    
    plot_data=[make_half_violin(-yy, xx, fillcolor=fillcolor),
               make_half_violin(yy, xx, fillcolor=fillcolor),
               make_non_outlier_interval(d1, d2),
               make_quartiles(q1,q3),
               make_median(q2)]
    if rugplot: 
        plot_data.append(make_rugplot(vals, max_pdf, distance=distance, color=fillcolor))
    return plot_data, plot_xrange

Let us define first a single violin plot:


In [37]:
df=pd.read_excel('Violin-plot-data.xlsx')
df.head()


Out[37]:
Score
0 6.55
1 9.13
2 8.46
3 9.38
4 6.35

In [38]:
x=list(df['Score'])
plot_data, plot_xrange=create_violinplot(x, fillcolor='rgb(102,194,163)')

In [39]:
layout=Layout(title='Violin  and  Rug Plot',
    autosize=False,
    font=Font(size=11),
    height=450,
    showlegend=False,
    width=350,
    xaxis=make_XAxis('', plot_xrange),     
    yaxis=make_YAxis(''),
    hovermode='closest'
    )

In [40]:
layout['yaxis'].update(dict(showline=False, showticklabels=False, ticks=''))

In [41]:
fig=Figure(data=Data(plot_data), layout=layout)

In [42]:
py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='Violin-Plot-Example')


Out[42]:

Data summary encoded in a violin plot facilitate comparison of multiple data sets. In the following we generate a few data sets and their violin plots:


In [9]:
np.random.seed(619517)
Nr=250
y = np.random.randn(Nr)
gr = np.random.choice(list("ABCDE"), Nr)
norm_params=[(0, 1.2), (0.7, 1), (-0.5, 1.4), (0.3, 1), (0.8, 0.9)]# mean and standard deviations 

for i, letter in enumerate("ABCDE"):
    y[gr == letter] *=norm_params[i][1]+ norm_params[i][0]
df = pd.DataFrame(dict(Score=y, Group=gr))
df.head()


Out[9]:
Group Score
0 B 1.656178
1 C -1.379259
2 C 1.567691
3 B 1.484571
4 E 0.410634

Group data:


In [10]:
gb=df.groupby(['Group'])
group_name=['A', 'B', 'C', 'D', 'E']
L=len(group_name)

Each violin plot will be displayed in a subplot:


In [11]:
fig = tls.make_subplots(rows=1, cols=L,  shared_yaxes=True, 
                           horizontal_spacing=0.025,                      
                           print_grid=True)


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]  [ (1,3) x3,y1 ]  [ (1,4) x4,y1 ]  [ (1,5) x5,y1 ]

Set colors for violins:


In [12]:
violet_colors=['#604d9e','#6c4774','#9e70a2','#caaac2','#d6c7dd']

Get plot data for each group, and assign them to the corresponding subplot:


In [13]:
for k, gr in enumerate(group_name):
    vals= np.asarray( gb.get_group(gr)['Score'], np.float)
    plot_data, plot_xrange=create_violinplot(vals, fillcolor=violet_colors[k])
    for item in plot_data:
        fig.append_trace(item, 1, k+1)
    fig['layout'].update({'xaxis{}'.format(k+1): 
                          make_XAxis('Group '+'{:d}'.format(k+1), plot_xrange)})    
fig['layout'].update({'yaxis{}'.format(1): make_YAxis('')})# set the sharey axis style

In [14]:
pl_width=900
pl_height=500
title = 'Violin Plots'

fig['layout'].update(title=title,                                 
                        font= Font(family='Georgia, serif'),
                        showlegend=False,     
                        hovermode='closest',  
                        autosize=False,       
                        width=pl_width,       
                        height=pl_height,
                        margin=Margin(
                                      l=65,
                                      r=65,
                                      b=85,
                                      t=150
                                     )
                       )

In [15]:
py.sign_in('empet', 'my_api_key')
py.iplot(fig, filename='Multiple-Violins')


Out[15]:

In [16]:
from IPython.core.display import HTML
def  css_styling():
    styles = open("./custom.css", "r").read()
    return HTML(styles)
css_styling()


Out[16]: