Violin-Plot-Example



In [1]:
import MySQLdb as mdb
import pandas as pd

In [ ]:
# Construct your query of choice
query = ("""
         SELECT eventID, name, value, label
         FROM my_table
         """)

try:
    # Connect to server
    db = Mdb.connect(read_default_file='../../.my.cnf',
                     read_default_group='guest',
                     host='mysql.server.host.name',
                     db='my_schema_name')
    
    # Read out result of query into a DataFrame
    data_df = pd.read_sql(query, db)
    data_df.index.name="id"
    
    # Close the connection
    db.close()
    
# Catch database exceptions
except Mdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])

In [63]:
# Write data to csv for offline usage
data_df.to_csv('violin-plot-data.csv', index_label='id')

In [2]:
# Load offline data into DataFrame
data_df = pd.read_csv('violin-plot-data.csv', index_col='id')

In [3]:
# Here we have all the data in *Long* format
data_df.head()


Out[3]:
eventID key value label
id
0 441627 AfterInhMATRIX5 50999 1
1 441628 AfterInhMATRIX5 52941 1
2 441629 AfterInhMATRIX5 49258 1
3 441630 AfterInhMATRIX5 51881 1
4 441631 AfterInhMATRIX5 48586 1

In [4]:
# Use the pivot function to get it into *wide* format
wide_df = data_df.pivot(index='eventID',
                        columns='key',
                        values='value')
wide_df.head()


Out[4]:
key AfterInhMATRIX5 PrescaleMATRIX5 RawMATRIX4 RawTriggers
eventID
430001 27094 12 1598521 10759045
430002 34901 14 1670878 11813291
430003 36317 15 1675869 12002554
430004 34088 14 1637602 11564482
430005 27489 12 1587623 10627391

In [5]:
# This pivoted DataFrame doesn't have the *class* field
# Use the SQL-esque *merge* function to add it to each entry
label_df = (data_df[['eventID', 'label']].drop_duplicates()
                                         .set_index('eventID'))
wide_df = wide_df.merge(label_df,
                        left_index=True,
                        right_index=True)
wide_df = wide_df.sort_index()
wide_df.head()


Out[5]:
key AfterInhMATRIX5 PrescaleMATRIX5 RawMATRIX4 RawTriggers label
eventID
430001 27094 12 1598521 10759045 2
430002 34901 14 1670878 11813291 3
430003 36317 15 1675869 12002554 3
430004 34088 14 1637602 11564482 3
430005 27489 12 1587623 10627391 4

In [6]:
import matplotlib.pylab as plt
import seaborn as sns

# Show plots in notebook
%matplotlib inline

# Set some styling options
sns.set_style("darkgrid")
sns.set_context("paper", font_scale=1.4)

In [7]:
# Using Pandas
feature = "AfterInhMATRIX5"
wide_df.query("label == 1")[feature].hist()


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24c4b25890>

In [8]:
# Using Seaborn's 'distplot' shows histogram and a kde plot
sns.distplot(wide_df.query("label == 1")[feature])


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24c4859650>

In [9]:
for label in range(1,8):
    sns.distplot(
        wide_df.query("label == @label")[feature],
        label=str(label))
plt.legend(loc='best')


Out[9]:
<matplotlib.legend.Legend at 0x7f24c45c1050>

In [10]:
sns.set_style("whitegrid", )
sns.boxplot(x='label', y=feature,
            data=wide_df.sort_values('label'),
            palette='muted')


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24c4477390>

In [11]:
sns.violinplot(x='label', y=feature,
               data=wide_df.sort_values('label'),
               palette='muted')


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f24bfcabfd0>

In [ ]: