A violin plot is a trace that visually encodes the distribution of a data set, along with its summary statistics. It displays the graph of the estimated probability density function (pdf) mirrored about the y-axis and, inside the violin-shaped region, the elements of a box plot (median, lower and upper quartiles, whisker positions).
In this Jupyter Notebook we define functions that build a violin plot with Plotly. To give more insight into the distributional properties, we add the option to overlay the rug plot of the data set on the same axes.
In [2]:
from IPython.display import HTML
# Embed the plot.ly-hosted violin/rug figure in the notebook output.
iframe_markup = '<iframe src=https://plot.ly/~empet/13680/violin-rug-plot/ width=400 height=500></iframe>'
HTML(iframe_markup)
Out[2]:
In [3]:
import numpy as np
import pandas as pd
from scipy import stats
Compute the summary statistics of data:
In [4]:
def _percentile(x, q, how):
    """Return the q-th percentile of x using the interpolation rule `how`.

    NumPy 1.22 renamed the ``interpolation`` keyword of ``np.percentile`` to
    ``method``; the new spelling is tried first, with a fallback to the old
    one so the function also runs on older NumPy releases.
    """
    try:
        return np.percentile(x, q, method=how)
    except TypeError:
        return np.percentile(x, q, interpolation=how)


def calc_stats(data):
    """Compute the box-plot summary statistics of a data set.

    Parameters
    ----------
    data : array-like of numbers
        The sample; it is converted to a float NumPy array.

    Returns
    -------
    tuple
        ``(vals_min, vals_max, q1, q2, q3, d1, d2)`` where ``q1``/``q2``/``q3``
        are the lower quartile, median and upper quartile, and ``d1``/``d2``
        are the lower/upper whisker positions.

    Raises
    ------
    ValueError
        If `data` is empty.
    """
    # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
    x = np.asarray(data, dtype=float)
    if x.size == 0:
        raise ValueError("calc_stats requires at least one data value")

    vals_min = np.min(x)
    vals_max = np.max(x)
    q2 = _percentile(x, 50, 'linear')
    q1 = _percentile(x, 25, 'lower')
    q3 = _percentile(x, 75, 'higher')
    IQR = q3 - q1
    whisker_dist = 1.5 * IQR
    # In order to prevent drawing whiskers outside the interval of data,
    # the whisker positions are the extreme data values that still lie
    # within 1.5 * IQR of the quartiles.
    d1 = np.min(x[x >= (q1 - whisker_dist)])
    d2 = np.max(x[x <= (q3 + whisker_dist)])
    return vals_min, vals_max, q1, q2, q3, d1, d2
In [5]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
Functions that define violin components:
In [6]:
def make_half_violin(x, y, fillcolor='#1f77b4', linecolor='rgb(50,50,50)'):
    """Return the filled line trace that draws one half of a violin.

    `x` and `y` are the coordinates of the outline; each point gets a hover
    label showing its (x, y) pair formatted to two decimals.  `fillcolor`
    and `linecolor` set the fill and outline colours.
    """
    hover_labels = []
    for idx, x_val in enumerate(x):
        label = ('(pdf(y), y)=(' + '{:0.2f}'.format(x_val)
                 + ', ' + '{:0.2f}'.format(y[idx]) + ')')
        hover_labels.append(label)

    outline = Line(width=0.5, color=linecolor, shape='spline')
    return Scatter(
        x=x,
        y=y,
        mode='lines',
        name='',
        text=hover_labels,
        hoverinfo='text',
        fill='tonextx',
        fillcolor=fillcolor,
        line=outline,
        opacity=0.5,
    )
def make_rugplot(vals, pdf_max, distance, color='#1f77b4'):
    """Return the marker trace for the rug plot of `vals`.

    Every value is drawn as a horizontal tick at the same x position,
    ``-pdf_max - distance``; hovering shows the y value only.
    """
    rug_x = -pdf_max - distance
    tick_style = Marker(color=color, symbol='line-ew-open')
    return Scatter(
        x=[rug_x] * len(vals),
        y=vals,
        mode='markers',
        marker=tick_style,
        name='',
        showlegend=False,
        hoverinfo='y',
    )
def make_quartiles(q1, q3):
    """Return the thick vertical segment at x=0 joining the quartiles.

    The segment runs from `q1` to `q3`; each endpoint carries a hover label
    with the quartile value formatted to two decimals.
    """
    hover_labels = [
        'lower-quartile: ' + '{:0.2f}'.format(q1),
        'upper-quartile: ' + '{:0.2f}'.format(q3),
    ]
    box_line = Line(width=4, color='rgb(0,0,0)')
    return Scatter(
        x=[0, 0],
        y=[q1, q3],
        mode='lines',
        line=box_line,
        text=hover_labels,
        hoverinfo='text',
    )
def make_median(q2):
    """Return the white square marker placed at (0, `q2`) for the median.

    The hover label shows the median formatted to two decimals.
    """
    median_label = 'median: ' + '{:0.2f}'.format(q2)
    square = {'symbol': 'square', 'color': 'rgb(255,255,255)'}
    return Scatter(
        x=[0],
        y=[q2],
        mode='markers',
        marker=square,
        text=[median_label],
        hoverinfo='text',
    )
def make_non_outlier_interval(d1, d2):
    """Return the thin vertical segment at x=0 from `d1` to `d2`.

    `d1` and `d2` are the two endpoints of the segment (the whisker
    positions when called from ``create_violinplot``).
    """
    whisker_line = Line(width=1.5, color='rgb(0,0,0)')
    return Scatter(
        x=[0, 0],
        y=[d1, d2],
        mode='lines',
        name='',
        line=whisker_line,
    )
Set axes:
In [7]:
def make_XAxis(xaxis_title, xaxis_range):
    """Return an x-axis with the given title and range and nothing else drawn.

    Grid, zero line, axis line, mirror, ticks and tick labels are all
    switched off.
    """
    return XAxis(
        title=xaxis_title,
        range=xaxis_range,
        ticks='',
        showticklabels=False,
        showline=False,
        showgrid=False,
        zeroline=False,
        mirror=False,
    )
def make_YAxis(yaxis_title):
    """Return an auto-ranged y-axis with the given title.

    The axis line and tick labels are shown (tick length 4); grid, zero
    line and mirror are switched off.
    """
    return YAxis(
        title=yaxis_title,
        autorange=True,
        showline=True,
        showticklabels=True,
        ticklen=4,
        showgrid=False,
        zeroline=False,
        mirror=False,
    )
Data values, `vals`, can be given as a numeric list, a numpy array of shape (n,), or a pandas Series.
Because a violin plot is symmetric with respect to a vertical axis, we define the range of x values in the plot either of the form `range=[-a, a]` or, when a rug plot is overlaid, of the form `[-b, a]`.
In [8]:
def create_violinplot(vals, fillcolor='#1f77b4', rugplot=True):
    """Build the Plotly traces of a violin plot, optionally with a rug plot.

    Parameters
    ----------
    vals : array-like of numbers
        The data set; it is converted to a float NumPy array.
    fillcolor : str
        Colour used to fill the violin and to draw the rug ticks.
    rugplot : bool
        When true, a rug plot of `vals` is appended to the left of the violin.

    Returns
    -------
    (plot_data, plot_xrange)
        `plot_data` is the list of traces (two violin halves, whisker
        segment, quartile segment, median marker and, if requested, the
        rug plot); `plot_xrange` is the ``[low, high]`` x-axis range that
        fits them.
    """
    # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
    vals = np.asarray(vals, dtype=float)
    vals_min, vals_max, q1, q2, q3, d1, d2 = calc_stats(vals)  # summary statistics
    pdf = stats.gaussian_kde(vals)  # kernel density estimation of pdf
    xx = np.linspace(vals_min, vals_max, 100)  # grid over the data interval
    yy = pdf(xx)  # evaluate the pdf at the grid xx
    max_pdf = np.max(yy)
    # Distance from the violin to the rug plot: 20% of the peak density.
    distance = 2.0 * max_pdf / 10 if rugplot else 0
    # Range for x values in the plot, padded by 0.1 on both sides.
    plot_xrange = [-max_pdf - distance - 0.1, max_pdf + 0.1]
    plot_data = [
        make_half_violin(-yy, xx, fillcolor=fillcolor),  # left half (mirrored)
        make_half_violin(yy, xx, fillcolor=fillcolor),   # right half
        make_non_outlier_interval(d1, d2),
        make_quartiles(q1, q3),
        make_median(q2),
    ]
    if rugplot:
        plot_data.append(
            make_rugplot(vals, max_pdf, distance=distance, color=fillcolor)
        )
    return plot_data, plot_xrange
Let us first define a single violin plot:
In [37]:
# Load the example data from the Excel workbook and preview the first rows.
data_file = 'Violin-plot-data.xlsx'
df = pd.read_excel(data_file)
df.head()
Out[37]:
In [38]:
# Build the violin (with rug plot) traces for the 'Score' column.
scores = df['Score']
x = list(scores)
plot_data, plot_xrange = create_violinplot(
    x,
    fillcolor='rgb(102,194,163)',
)
In [39]:
# Fixed-size 350x450 layout for the single violin; both axes are untitled.
x_axis = make_XAxis('', plot_xrange)
y_axis = make_YAxis('')
layout = Layout(
    title='Violin and Rug Plot',
    font=Font(size=11),
    autosize=False,
    width=350,
    height=450,
    showlegend=False,
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
)
In [40]:
# For the single violin, hide the y-axis line, tick labels and ticks.
layout['yaxis'].update({'showline': False, 'showticklabels': False, 'ticks': ''})
In [41]:
# Assemble the figure from the violin traces and the layout.
traces = Data(plot_data)
fig = Figure(data=traces, layout=layout)
In [42]:
# NOTE(review): 'my_api_key' looks like a placeholder -- supply a real key
# before running this cell.
plotly_user, plotly_key = 'empet', 'my_api_key'
py.sign_in(plotly_user, plotly_key)
py.iplot(fig, filename='Violin-Plot-Example')
Out[42]:
The data summary encoded in a violin plot facilitates the comparison of multiple data sets. In the following we generate a few data sets and their violin plots:
In [9]:
# Generate 250 standard-normal scores, assign each one at random to a group
# A..E, then rescale every group to its own mean and standard deviation.
np.random.seed(619517)
Nr = 250
y = np.random.randn(Nr)
gr = np.random.choice(list("ABCDE"), Nr)
norm_params = [(0, 1.2), (0.7, 1), (-0.5, 1.4), (0.3, 1), (0.8, 0.9)]  # mean and standard deviations
for letter, (mean, std) in zip("ABCDE", norm_params):
    mask = gr == letter
    # N(mean, std) = std * N(0, 1) + mean.  The earlier in-place
    # `*= std + mean` multiplied by the sum instead of scaling then shifting.
    y[mask] = y[mask] * std + mean
df = pd.DataFrame(dict(Score=y, Group=gr))
df.head()
Out[9]:
Group data:
In [10]:
# Group by the column label itself rather than a one-element list: with a
# list key, recent pandas expects get_group() to receive a tuple such as
# ('A',), whereas the code that consumes `gb` passes the plain group name.
gb = df.groupby('Group')
group_name = ['A', 'B', 'C', 'D', 'E']
L = len(group_name)  # number of groups = number of subplot columns
Each violin plot will be displayed in a subplot:
In [11]:
# One row with L side-by-side subplots that share a single y-axis.
subplot_options = dict(
    rows=1,
    cols=L,
    shared_yaxes=True,
    horizontal_spacing=0.025,
    print_grid=True,
)
fig = tls.make_subplots(**subplot_options)
Set colors for violins:
In [12]:
# Fill colours for the violins, one per group.
violet_colors = [
    '#604d9e',
    '#6c4774',
    '#9e70a2',
    '#caaac2',
    '#d6c7dd',
]
Get plot data for each group, and assign them to the corresponding subplot:
In [13]:
# Build the violin traces of every group and place them in subplot column
# k+1.  The loop variable is called `label` so that it does not overwrite
# the `gr` array of group labels.
for k, label in enumerate(group_name):
    # The builtin float replaces the np.float alias, which NumPy 1.24 removed.
    vals = np.asarray(gb.get_group(label)['Score'], dtype=float)
    plot_data, plot_xrange = create_violinplot(vals, fillcolor=violet_colors[k])
    for item in plot_data:
        fig.append_trace(item, 1, k + 1)
    # Each subplot has its own x-axis, titled "Group <k+1>".
    fig['layout'].update({
        'xaxis{}'.format(k + 1):
            make_XAxis('Group ' + '{:d}'.format(k + 1), plot_xrange)
    })
# Set the style of the shared y-axis; it does not depend on the group, so
# it is applied once after the loop.
fig['layout'].update({'yaxis{}'.format(1): make_YAxis('')})
In [14]:
# Overall size, title, font and margins of the multi-violin figure.
pl_width = 900
pl_height = 500
title = 'Violin Plots'
figure_margin = Margin(l=65, r=65, b=85, t=150)
fig['layout'].update(
    title=title,
    font=Font(family='Georgia, serif'),
    autosize=False,
    width=pl_width,
    height=pl_height,
    margin=figure_margin,
    showlegend=False,
    hovermode='closest',
)
In [15]:
# NOTE(review): 'my_api_key' looks like a placeholder -- supply a real key
# before running this cell.
plotly_user, plotly_key = 'empet', 'my_api_key'
py.sign_in(plotly_user, plotly_key)
py.iplot(fig, filename='Multiple-Violins')
Out[15]:
In [16]:
from IPython.core.display import HTML
def css_styling():
    """Return the contents of ./custom.css wrapped in an HTML display object."""
    # A context manager closes the file handle even if reading fails.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()
Out[16]: