MUsicBrainz requires a recording artist for each Recording. For classical music, the style guidelines (CSG) recommend to use the performers of a recording as recording artist instead of the composer. So cleaning up the page of the main classical music composers in MusicBrainz requires to change hundreds or thousands recordings (up to a hundred thousands for Bach) to the correct recording artist.
How did this number evolve, particularly for composers that had a "Community cleanup" organized (where this number should be close to zero) and how much work is left for the main composers (Bach, Mozart, Beethoven)?
In [1]:
%run startup.ipy
In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
Since we can't directly go back to a point in time to check what the number of recordings was for each composer, we had to compute the difference with today; edits changing the number of recordings for a composer are:
For each of those we need a SQL query to determine (for a given composer) how many edits of each type we have and what date they were created at.
In [3]:
# lib/MusicBrainz/Server/Constants.pm
EDIT_MEDIUM_CREATE = 51
EDIT_RECORDING_CREATE = 71
EDIT_RECORDING_EDIT = 72
EDIT_RECORDING_DELETE = 73
EDIT_RECORDING_MERGE = 74
def current_recording_count(artist_name):
return sql("""
SELECT COUNT(*) AS cnt
FROM artist AS a
JOIN artist_credit_name AS acn ON a.id = acn.artist
JOIN artist_credit AS ac ON ac.id = acn.artist_credit
JOIN recording AS r ON r.artist_credit = ac.id
WHERE a.name = %(artist_name)s
""", artist_name=artist_name).cnt[0]
def edits_from_new_medium(artist_name):
artist_id = sql("SELECT id FROM artist WHERE name=%(artist_name)s",
artist_name=artist_name).id[0]
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit.id AS new,
edit_data.data->'tracklist' AS tracklist,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.name = %(artist_name)s
AND edit.type = %(EDIT_TYPE)s
;""", artist_name=artist_name, EDIT_TYPE=EDIT_MEDIUM_CREATE)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
df['new'] = df.tracklist.apply(lambda tl: [t['artist_credit']['names'][0]['artist']['id']
for t in tl].count(artist_id))
del df['tracklist']
return df
def edits_from_standalone_recordings(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit.id AS standalone,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.name = %(artist_name)s
AND edit.type = %(EDIT_TYPE)s
;""", artist_name=artist_name, EDIT_TYPE=EDIT_RECORDING_CREATE)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
def edits_from_recordings_changed_from(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit.id AS changed_from,
CASE WHEN edit_note.text ILIKE E'%%\r\nGM script:%%' THEN 1
ELSE 0
END AS userscript,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
JOIN edit_note ON edit_note.edit = edit.id
WHERE artist.name = %(artist_name)s
AND edit.type = %(EDIT_TYPE)s
AND artist.id = CAST(edit_data.data->'old'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
;""", artist_name=artist_name, EDIT_TYPE=EDIT_RECORDING_EDIT)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
def edits_from_recordings_changed_to(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit.id AS changed_to,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.name = %(artist_name)s
AND edit.type = %(EDIT_TYPE)s
AND artist.id = CAST(edit_data.data->'new'->'artist_credit'->'names'->0->'artist'->>'id' AS integer)
;""", artist_name=artist_name, EDIT_TYPE=EDIT_RECORDING_EDIT)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
def edits_from_recordings_merge(artist_name):
df = sql("""
SELECT date_trunc('day', edit.open_time) AS date,
edit.id AS merged,
editor.name AS editor
FROM artist
JOIN edit_artist ON artist.id = edit_artist.artist
JOIN edit ON edit.id = edit_artist.edit
JOIN edit_data ON edit_data.edit = edit_artist.edit
JOIN editor ON editor.id = edit.editor
WHERE artist.name = %(artist_name)s
AND edit.type = %(EDIT_TYPE)s
;""", artist_name=artist_name, EDIT_TYPE=EDIT_RECORDING_MERGE)
df.index = df['date'].apply(lambda d: d.date())
del df['date']
return df
We basically sum these contributions (which are probably not 100% accurate but we don't care too much).
In [4]:
def recording_evolution(artist_name, editor='loujin'):
current = current_recording_count(artist_name)
df_new = edits_from_new_medium(artist_name)
df_standalone = edits_from_standalone_recordings(artist_name)
df_changed_to = edits_from_recordings_changed_to(artist_name)
df_changed_from = edits_from_recordings_changed_from(artist_name)
df_merged = edits_from_recordings_merge(artist_name)
df = pd.concat([df_new.new.groupby('date').count(),
df_standalone.standalone.groupby('date').count(),
df_changed_to.changed_to.groupby('date').count(),
- df_changed_from.changed_from.groupby('date').count(),
- df_merged.merged.groupby('date').count(),
], axis=1, sort=True).fillna(0)
df['total'] = (df.new + df.standalone + df.changed_to + df.changed_from + df.merged).cumsum()
df['total'] += current - df['total'][-1]
df['userscript'] = - df_changed_from.userscript.groupby('date').count()
df['userscript'] = df['userscript'].fillna(0)
df['total_userscript'] = df.userscript.cumsum()
df['total_userscript'] += current - df['total_userscript'][-1]
# df_editor = pd.concat([df_new.new[df_new.editor == editor].groupby('date').count(),
# df_standalone.standalone[df_standalone.editor == editor].groupby('date').count(),
# df_changed_to.changed_to[df_changed_to.editor == editor].groupby('date').count(),
# df_changed_from.changed_from[df_changed_from.editor == editor].groupby('date').count(),
# df_merged.merged[df_merged.editor == editor].groupby('date').count(),
# ], axis=1, sort=True).fillna(0)
# df[editor] = (df_editor.new + df_editor.standalone + df_editor.changed_to
# - df_editor.changed_from - df_editor.merged)
# df['total_' + editor] = df[editor].fillna(0).cumsum()
# df['total_' + editor] += current - df['total_' + editor][-1]
iplot({
'data': [{
'x': df.index,
'y': df[col],
'name': col,
} for col in df.columns],
'layout': {
'title': artist_name,
'xaxis': {'title': 'Date'},
'yaxis': {'title': 'Number of recordings'},
},
}, show_link=False)
return df
Let's test with a composer that never had too many recordings:
In [5]:
webern = recording_evolution('Anton Webern')
We can see the decreasing total (converging to zero hopefully) and independent contributions (all centered around zero). The "total_userscript" line filters edits which edit note suggests a userscript was used to automatize the edit (like my "Replace recording artist from a release page" script), it gives an idea how the contribution of this script to the global cleanup effort.
We can test other composers:
In [6]:
ravel = recording_evolution('Maurice Ravel')
In [7]:
debussy = recording_evolution('Claude Debussy')
In [8]:
chopin = recording_evolution('Fryderyk Chopin')
In [9]:
bach = recording_evolution('Johann Sebastian Bach')
In [10]:
brahms = recording_evolution('Johannes Brahms')
In [11]:
beethoven = recording_evolution('Ludwig van Beethoven')
In [12]:
schubert = recording_evolution('Franz Schubert')
In [13]:
dvorak = recording_evolution('Antonín Dvořák')
In [15]:
mozart = recording_evolution('Wolfgang Amadeus Mozart')