Contour Similarity Algorithms Benchmark

Setup: load extensions, imports and the Django environment


In [1]:
# Load external jupyter stuff
%load_ext autoreload
%load_ext snakeviz

In [2]:
# jupyter settings
%autoreload 2
%matplotlib inline
%autocall 1

# imports
import numpy
import scipy
import pandas
import sklearn
import sklearn.cluster
import os
import glob
import json
from matplotlib import pyplot as plt

import itertools
import collections
import music21
import Levenshtein

from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "contourMetrics.settings.production")
application = get_wsgi_application()

import lib.contour
import cProfile

from django.db.models import Q
from lib.contour import ContourPoint as CP
from lib.contour import Contour as C

import lib.generator
import lib.utils
import lib.analysis
import lib

from apps.calculator import models


Automatic calling is: Smart

In [3]:
# Auxiliary function
def log_progress(sequence, every=None, size=None):
    """Yield the items of ``sequence`` while updating a progress widget.

    Adapted from https://github.com/alexanderkuk/log-progress

    Args:
        sequence: iterable to traverse. If it has no ``len()`` and ``size``
            is not given, the total is unknown and ``every`` is mandatory.
        every: the widget is refreshed on the first item and then on each
            item whose 1-based position is a multiple of ``every``. When
            omitted and the total is known, it is 1 for totals up to 200
            and ``int(size / 200)`` otherwise.
        size: total number of items; defaults to ``len(sequence)``.

    Yields:
        Every item of ``sequence``, in order.

    Raises:
        AssertionError: when the total is unknown and ``every`` is None.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Resolve the total; length-less iterables are flagged as "unknown".
    unknown_total = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_total = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_total:
        # No meaningful maximum: show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            if count == 1 or count % every == 0:
                if unknown_total:
                    caption.value = '{index} / ?'.format(index=count)
                else:
                    bar.value = count
                    caption.value = u'{index} / {size}'.format(index=count, size=size)
            yield item
    except BaseException:
        # Any interruption (including errors raised by the consumer) turns
        # the bar red before the exception continues to propagate.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = str(count or '?')

class PieceAux(object):
    """Per-piece helper bundling phrase contours with their similarity data.

    On construction it collects the normalized contour of every phrase of
    the given voice, wraps them in a ``lib.analysis.GeneralComparison`` and
    computes one similarity series per name in ``algorithms``. Similarity
    maps are built on first request and cached per algorithm.
    """

    def __init__(self, piece_model, voice=0):
        """Gather the contours of ``piece_model`` for ``voice`` and compare them."""
        voice_phrases = piece_model.phrase_set.filter(voice=voice)
        self.contours = [phrase.contour.normalized for phrase in voice_phrases]
        self.piece_id = piece_model.id
        self.piece_name = piece_model.name
        contour_objects = [lib.contour.Contour(points) for points in self.contours]
        self.comparison = lib.analysis.GeneralComparison(contour_objects)
        self.algorithms = ['OSC', 'AGP']
        self.similarity_series = {}
        for name in self.algorithms:
            self.similarity_series[name] = self.comparison.similarity_series(name)
        # Filled lazily by get_similarity_map, keyed by algorithm name.
        self._similarity_maps = {}

    def __repr__(self):
        label = self.piece_name
        return '<P {}>'.format(label)

    def get_mean(self):
        """Return an array with the mean of each algorithm's similarity series.

        Values follow the order of ``self.algorithms``.
        """
        means = [self.similarity_series[name].mean() for name in self.algorithms]
        return numpy.array(means)

    def get_similarity_map(self, algorithm='OSC'):
        """Return the similarity map for ``algorithm``, computing it only once."""
        cache = self._similarity_maps
        if algorithm in cache:
            return cache[algorithm]
        cache[algorithm] = lib.utils.ExtendedDataFrame(self.comparison.similarity_map(algorithm))
        return cache[algorithm]

    def plot_similarity_map(self, algorithm='OSC'):
        """Return the result of ``heat_plot()`` on the map for ``algorithm``."""
        return self.get_similarity_map(algorithm).heat_plot()

    def get_adjacent_similarity(self, algorithm='OSC'):
        """Return the first superdiagonal of the similarity map as a Series.

        Entry ``i`` is the map value at row ``i``, column ``i + 1``; it is
        labelled with the map's index starting from its second label.
        """
        matrix = self.get_similarity_map(algorithm)
        neighbours = numpy.array(matrix).diagonal(1)
        labels = matrix.index[1:]
        return pandas.Series(neighbours, index=labels)

    def plot_adjacent_similarity(self, algorithm='OSC'):
        """Plot the adjacent-similarity series for ``algorithm`` with a grid."""
        heading = 'Similarity with previous contour P.{} ({})'.format(self.piece_name, algorithm)
        series = self.get_adjacent_similarity(algorithm)
        return series.plot(grid=True, title=heading)
    
def make_similarity_df(phrases):
    """Build a table of mean similarities, one row per piece.

    Args:
        phrases: iterable of objects exposing ``get_mean()`` and
            ``piece_name`` (e.g. ``PieceAux`` instances).

    Returns:
        A ``pandas.DataFrame`` indexed by piece name with the columns
        ``'OSC'`` and ``'AGP'`` holding the values from ``get_mean()``.
    """
    # Get similarity means by piece
    collected = [(piece.get_mean(), piece.piece_name) for piece in phrases]
    means = [mean for mean, _ in collected]
    names = [name for _, name in collected]
    return pandas.DataFrame(means, index=names, columns=['OSC', 'AGP'])

Unexpected values


In [4]:
# Only contours of size 2-5 (size > 1 and size < 6)
# Wrap the stored 'normalized' value of each selected row in a Contour
csegs = [
    lib.contour.Contour(row['normalized'])
    for row in (
        models.Contour.objects
        .filter(size__gt=1)
        .filter(size__lt=6)
        .values()
    )
]
# Single comparison object shared by the cells below
gc = lib.analysis.GeneralComparison(csegs)

In [5]:
algorithms = ['OSC', 'AGP', 'EMB']
# Label every unordered pair of contours by their two sequences as tuples
ind = [
    (tuple(first.sequence), tuple(second.sequence))
    for first, second in itertools.combinations(csegs, 2)
]
# One similarity series per algorithm, with a progress bar over algorithms
seq = [gc.similarity_series(name) for name in log_progress(algorithms)]


/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2487: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: invalid value encountered in multiply
  c *= 1. / np.float64(fact)
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2569: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[:, None]
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2570: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[None, :]

In [6]:
# Stack the per-algorithm series and transpose so each algorithm is a column
gc_df = pandas.DataFrame(seq).T
gc_df.columns = algorithms
# Label rows with the contour-sequence pairs from `ind`.
# NOTE(review): assumes gc.similarity_series returns values in the same
# order as itertools.combinations(csegs, 2) - confirm in lib.analysis.
gc_df.index=ind

In [7]:
# OSC < 0.1 AND AGP > 0.9
gc_df[(gc_df.OSC<0.1)&(gc_df.AGP>0.9)]


Out[7]:
OSC AGP EMB
((1, 0, 1), (1, 0, 2)) 0.0 1.0 0.625000
((1, 0, 1), (2, 0, 1)) 0.0 1.0 0.625000
((1, 0, 2), (2, 0, 1)) 0.0 1.0 0.750000
((1, 2, 0), (0, 1, 0)) 0.0 1.0 0.625000
((1, 2, 0), (0, 2, 1)) 0.0 1.0 0.750000
((0, 1, 0), (0, 2, 1)) 0.0 1.0 0.625000
((0, 1, 0, 1, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((0, 1, 0, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.634615
((0, 1, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.653846
((0, 2, 0, 1, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.634615
((0, 2, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.653846
((0, 2, 0, 3, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.615385
((0, 2, 1, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.750000
((0, 2, 1, 3, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((0, 3, 1, 3, 2), (0, 2, 1, 2, 0)) 0.0 1.0 0.692308
((0, 3, 1, 4, 2), (0, 2, 1, 2, 0)) 0.0 1.0 0.557692
((1, 0, 1, 0, 1), (2, 0, 1, 0, 2)) 0.0 1.0 0.673077
((1, 0, 1, 0, 2), (2, 0, 1, 0, 2)) 0.0 1.0 0.750000
((1, 0, 2, 0, 2), (2, 0, 1, 0, 2)) 0.0 1.0 0.653846
((0, 2, 1, 4, 3), (0, 2, 1, 2, 0)) 0.0 1.0 0.538462
((0, 3, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.596154
((0, 3, 1, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.788462
((0, 3, 1, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((1, 2, 0, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.653846
((1, 2, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.730769
((1, 2, 0, 3, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.596154
((1, 2, 0, 3, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.692308
((1, 2, 0, 3, 2), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((1, 2, 1, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.750000
((1, 2, 1, 3, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
... ... ... ...
((2, 0, 1, 0, 2), (1, 0, 2, 0, 3)) 0.0 1.0 0.692308
((2, 0, 1, 0, 2), (2, 0, 2, 1, 3)) 0.0 1.0 0.673077
((2, 0, 1, 0, 2), (3, 0, 2, 0, 1)) 0.0 1.0 0.692308
((2, 0, 1, 0, 2), (3, 1, 4, 0, 2)) 0.0 1.0 0.519231
((2, 0, 1, 0, 2), (3, 2, 3, 0, 1)) 0.0 1.0 0.596154
((2, 0, 1, 0, 2), (1, 0, 3, 0, 2)) 0.0 1.0 0.615385
((2, 0, 1, 0, 2), (3, 2, 4, 0, 1)) 0.0 1.0 0.519231
((2, 0, 1, 0, 2), (1, 0, 3, 2, 4)) 0.0 1.0 0.538462
((2, 0, 1, 0, 2), (2, 1, 4, 0, 3)) 0.0 1.0 0.519231
((1, 3, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.692308
((1, 3, 0, 4, 2), (0, 2, 1, 2, 0)) 0.0 1.0 0.519231
((1, 3, 0, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.615385
((1, 3, 1, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((2, 3, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.673077
((2, 3, 1, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.730769
((2, 3, 1, 3, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.692308
((2, 3, 1, 4, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.557692
((2, 4, 0, 3, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.519231
((2, 4, 1, 3, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.557692
((0, 1, 0, 3, 2), (0, 2, 1, 2, 0)) 0.0 1.0 0.596154
((3, 4, 1, 2, 0), (0, 2, 1, 2, 0)) 0.0 1.0 0.538462
((0, 3, 2, 3, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.788462
((3, 4, 0, 2, 1), (0, 2, 1, 2, 0)) 0.0 1.0 0.519231
((0, 2, 1, 2, 0), (0, 2, 1, 3, 0)) 0.0 1.0 0.788462
((0, 2, 1, 2, 0), (0, 4, 1, 3, 2)) 0.0 1.0 0.557692
((0, 2, 1, 2, 0), (2, 3, 0, 1, 0)) 0.0 1.0 0.596154
((0, 2, 1, 2, 0), (0, 2, 1, 3, 2)) 0.0 1.0 0.730769
((0, 2, 1, 2, 0), (1, 3, 0, 3, 2)) 0.0 1.0 0.615385
((0, 2, 1, 2, 0), (2, 3, 0, 4, 1)) 0.0 1.0 0.519231
((0, 2, 1, 2, 0), (1, 2, 0, 4, 3)) 0.0 1.0 0.519231

97 rows × 3 columns


In [8]:
# OSC > 0.9 AND AGP < 0.1
gc_df[(gc_df.AGP<0.1)&(gc_df.OSC>0.9)]


Out[8]:
OSC AGP EMB
((2, 1, 0), (0, 1, 2, 3)) 1.0 0.0 0.0
((2, 1, 0), (0, 1, 2, 3, 4)) 1.0 0.0 0.0
((0, 1, 2, 3), (1, 0)) 1.0 0.0 0.0
((0, 1, 2, 3), (3, 2, 1, 0)) 1.0 0.0 0.0
((0, 1, 2, 3), (4, 3, 2, 1, 0)) 1.0 0.0 0.0
((1, 0), (0, 1, 2, 3, 4)) 1.0 0.0 0.0
((0, 1, 2), (3, 2, 1, 0)) 1.0 0.0 0.0
((0, 1, 2), (4, 3, 2, 1, 0)) 1.0 0.0 0.0
((3, 2, 1, 0), (0, 1, 2, 3, 4)) 1.0 0.0 0.0
((3, 2, 1, 0), (0, 1)) 1.0 0.0 0.0
((0, 1, 2, 3, 4), (4, 3, 2, 1, 0)) 1.0 0.0 0.0
((4, 3, 2, 1, 0), (0, 1)) 1.0 0.0 0.0

Distribution


In [9]:
gc_df.describe()


Out[9]:
OSC AGP EMB
count 56280.000000 56280.000000 56280.000000
mean 0.987313 0.693195 0.574718
std 0.111919 0.152539 0.121426
min 0.000000 0.000000 0.000000
25% 1.000000 0.571429 0.500000
50% 1.000000 0.750000 0.576923
75% 1.000000 0.750000 0.653846
max 1.000000 1.000000 0.972973

In [10]:
gc_df.boxplot()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a8fa4e0>

In [11]:
gc_df.hist()


Out[11]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11923f198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e5ab70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x119ecc710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119f1db38>]], dtype=object)

Correlation


In [12]:
gc_df.corr()


Out[12]:
OSC AGP EMB
OSC 1.000000 -0.030572 -0.011572
AGP -0.030572 1.000000 0.414281
EMB -0.011572 0.414281 1.000000

In [13]:
gc_df.plot(kind='scatter', x='OSC', y='AGP')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a09cf60>

In [14]:
gc_df.plot(kind='scatter', x='OSC', y='EMB')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a184f98>

In [15]:
gc_df.plot(kind='scatter', x='EMB', y='AGP')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a2662b0>

Chorales similarity means


In [16]:
chorales_pieces = models.Piece.objects.filter(collection__name='Chorales')
# Voice 0 feeds the soprano list and voice 3 feeds the bass list
soprano_phrases = [
    PieceAux(piece, voice=0)
    for piece in log_progress(chorales_pieces)
]
bass_phrases = [
    PieceAux(piece, voice=3)
    for piece in log_progress(chorales_pieces)
]


/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2487: RuntimeWarning: Degrees of freedom <= 0 for slice
  warnings.warn("Degrees of freedom <= 0 for slice", RuntimeWarning)
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: divide by zero encountered in double_scalars
  c *= 1. / np.float64(fact)
/Users/marcossampaio/.virtualenvs/contourmetrics/lib/python3.6/site-packages/numpy/lib/function_base.py:2496: RuntimeWarning: invalid value encountered in multiply
  c *= 1. / np.float64(fact)

In [17]:
s_df = make_similarity_df(soprano_phrases)
b_df = make_similarity_df(bass_phrases)

Distribution


In [18]:
s_df.describe()


Out[18]:
OSC AGP
count 370.000000 370.000000
mean 0.690155 0.645402
std 0.148880 0.090429
min 0.225145 0.308449
25% 0.591566 0.586671
50% 0.679591 0.642998
75% 0.797152 0.704137
max 1.000000 0.910707

In [19]:
b_df.describe()


Out[19]:
OSC AGP
count 370.000000 370.000000
mean 0.553334 0.615559
std 0.131005 0.068001
min 0.056036 0.248631
25% 0.458412 0.578458
50% 0.538479 0.620698
75% 0.628838 0.655401
max 0.903703 0.833333

In [20]:
s_df.boxplot()


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bb910b8>

In [21]:
b_df.boxplot()


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bc3e438>

In [22]:
s_df.hist(), b_df.hist()


Out[22]:
(array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11bc68a90>,
         <matplotlib.axes._subplots.AxesSubplot object at 0x11beccbe0>]], dtype=object),
 array([[<matplotlib.axes._subplots.AxesSubplot object at 0x119c734e0>,
         <matplotlib.axes._subplots.AxesSubplot object at 0x11bfd4f98>]], dtype=object))

Correlation


In [23]:
s_df.corr().OSC.AGP, b_df.corr().OSC.AGP


Out[23]:
(0.17980912140641339, -0.25425603607896358)

In [24]:
s_df.plot(kind='scatter', x='OSC', y='AGP', grid=True), b_df.plot(kind='scatter', x='OSC', y='AGP', grid=True)


Out[24]:
(<matplotlib.axes._subplots.AxesSubplot at 0x11bf36ef0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x11c369c88>)

The most and least similar chorales


In [25]:
v = 0.9
s_df[(s_df.OSC > v)&(s_df.AGP > v)]


Out[25]:
OSC AGP
233 0.954414 0.910707
350 0.944823 0.910707

In [26]:
v = 0.4
s_df[(s_df.OSC < v)&(s_df.AGP < v)]


Out[26]:
OSC AGP
124 0.225145 0.308449

The most similar


In [27]:
s1 = [s for s in soprano_phrases if s.piece_name=='233'][0]
s1.contours


Out[27]:
[[1, 2, 3, 2, 1, 0],
 [2, 3, 4, 2, 1, 0],
 [0, 1, 2, 1, 0],
 [0, 1, 2, 1, 0],
 [1, 2, 3, 2, 1, 0],
 [2, 3, 4, 3, 2, 1, 0]]

In [28]:
s1.get_similarity_map('OSC')


Out[28]:
< 1 2 3 2 1 0 > < 2 3 4 2 1 0 > < 0 1 2 1 0 > < 0 1 2 1 0 > < 1 2 3 2 1 0 > < 2 3 4 3 2 1 0 >
< 1 2 3 2 1 0 > 1.000000 0.990070 0.929396 0.929396 1.000000 0.987036
< 2 3 4 2 1 0 > 0.990070 1.000000 0.868283 0.868283 0.990070 0.954672
< 0 1 2 1 0 > 0.929396 0.868283 1.000000 1.000000 0.929396 0.976585
< 0 1 2 1 0 > 0.929396 0.868283 1.000000 1.000000 0.929396 0.976585
< 1 2 3 2 1 0 > 1.000000 0.990070 0.929396 0.929396 1.000000 0.987036
< 2 3 4 3 2 1 0 > 0.987036 0.954672 0.976585 0.976585 0.987036 1.000000

In [29]:
s1.plot_similarity_map('OSC')


<matplotlib.figure.Figure at 0x119efcba8>

In [30]:
s1.plot_adjacent_similarity('OSC')


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c52d6a0>

In [31]:
# AGP internal similarities
s1.get_similarity_map('AGP')


Out[31]:
< 1 2 3 2 1 0 > < 2 3 4 2 1 0 > < 0 1 2 1 0 > < 0 1 2 1 0 > < 1 2 3 2 1 0 > < 2 3 4 3 2 1 0 >
< 1 2 3 2 1 0 > 1.000000 1.000000 0.888889 0.888889 1.000000 0.909091
< 2 3 4 2 1 0 > 1.000000 1.000000 0.888889 0.888889 1.000000 0.909091
< 0 1 2 1 0 > 0.888889 0.888889 1.000000 1.000000 0.888889 0.800000
< 0 1 2 1 0 > 0.888889 0.888889 1.000000 1.000000 0.888889 0.800000
< 1 2 3 2 1 0 > 1.000000 1.000000 0.888889 0.888889 1.000000 0.909091
< 2 3 4 3 2 1 0 > 0.909091 0.909091 0.800000 0.800000 0.909091 1.000000

In [32]:
s1.plot_similarity_map('AGP')


<matplotlib.figure.Figure at 0x11c7e5978>

In [33]:
s1.plot_adjacent_similarity('AGP')


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c83da58>

The least similar


In [34]:
s2 = [s for s in soprano_phrases if s.piece_name=='124'][0]
s2.contours


Out[34]:
[[0],
 [3, 2, 1, 0],
 [0, 1, 2, 3, 4, 3, 2, 1, 0],
 [0],
 [3, 2, 1, 2, 1, 0],
 [0, 1, 2, 3, 4, 5, 4, 3, 2, 1]]

In [35]:
# OSC internal similarities
s2.get_similarity_map('OSC')


Out[35]:
< 0 > < 3 2 1 0 > < 0 1 2 3 4 3 2 1 0 > < 0 > < 3 2 1 2 1 0 > < 0 1 2 3 4 5 4 3 2 1 >
< 0 > 0.0 0.000000 0.000000 0.0 0.000000 0.000000
< 3 2 1 0 > 0.0 1.000000 0.716789 0.0 0.605203 0.707940
< 0 1 2 3 4 3 2 1 0 > 0.0 0.716789 1.000000 0.0 0.139881 0.990820
< 0 > 0.0 0.000000 0.000000 0.0 0.000000 0.000000
< 3 2 1 2 1 0 > 0.0 0.605203 0.139881 0.0 1.000000 0.216541
< 0 1 2 3 4 5 4 3 2 1 > 0.0 0.707940 0.990820 0.0 0.216541 1.000000

In [36]:
s2.plot_similarity_map('OSC')


<matplotlib.figure.Figure at 0x11c847710>

In [37]:
s2.plot_adjacent_similarity('OSC')


Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x11cb50470>

In [38]:
# AGP internal similarities
s2.get_similarity_map('AGP')


Out[38]:
< 0 > < 3 2 1 0 > < 0 1 2 3 4 3 2 1 0 > < 0 > < 3 2 1 2 1 0 > < 0 1 2 3 4 5 4 3 2 1 >
< 0 > 1.0 0.000000 0.000000 1.0 0.000000 0.000000
< 3 2 1 0 > 0.0 1.000000 0.545455 0.0 0.750000 0.500000
< 0 1 2 3 4 3 2 1 0 > 0.0 0.545455 1.000000 0.0 0.461538 0.941176
< 0 > 1.0 0.000000 0.000000 1.0 0.000000 0.000000
< 3 2 1 2 1 0 > 0.0 0.750000 0.461538 0.0 1.000000 0.428571
< 0 1 2 3 4 5 4 3 2 1 > 0.0 0.500000 0.941176 0.0 0.428571 1.000000

In [39]:
s2.plot_similarity_map('AGP')


<matplotlib.figure.Figure at 0x11ce514a8>

In [40]:
s2.plot_adjacent_similarity('AGP')


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x11cf80748>