From the live database
In [2]:
from datetime import datetime
import pandas
import sqlalchemy
SITE_URL = 'https://musicbrainz.org'
# import postgresql environ variables
# defined in postgres-dockerfile/postgres.env
PGHOST = '127.0.0.1'
PGDATABASE = 'musicbrainz'
PGUSER = 'musicbrainz'
PGPASSWORD = 'musicbrainz'
PGPORT = 5433
def sql(query, **kwargs):
"""helper function for SQL queries using the %(...) syntax
Parameters for the query must be passed as keyword arguments
e.g. sql('SELECT * FROM artist WHERE name=%(singer)s', singer='Bob Dylan')
"""
engine = sqlalchemy.create_engine(
'postgresql+psycopg2://'
'{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}'.format(**globals()),
isolation_level='READ UNCOMMITTED')
return pandas.read_sql(query, engine, params=kwargs)
# helper function to build canonical URLs
def _mb_link(entity_type, mbid):
return '<a href="{url}/{entity_type}/{mbid}">{mbid}</a>'.format(
url=SITE_URL, **locals())
mb_release_link = lambda mbid: _mb_link('release', mbid) # noqa
In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)
In [4]:
sql("""SELECT id, gid, name, comment FROM label WHERE name='hyperion';""")
Out[4]:
In [5]:
sql("""
SELECT
DISTINCT label.id,
label.name AS label,
label.gid AS mbid,
COUNT(*) AS releases_count
FROM release AS r
JOIN release_label AS rl ON rl.release = r.id
JOIN label ON rl.label = label.id
JOIN l_release_url AS lu ON r.id = lu.entity0
JOIN url AS u ON u.id = lu.entity1
WHERE u.url ILIKE '%%hyperion-records%%'
GROUP BY label.id, label.name, label.gid
ORDER BY releases_count DESC
;""")
Out[5]:
We keep only the first two for the moment, the other ones seem to only be distributed by hyperion.
In [6]:
mb_labels = sql("""
SELECT
label.name AS label,
COUNT(*) AS releases_count
FROM release AS r
JOIN release_label AS rl ON rl.release = r.id
JOIN label ON rl.label = label.id
WHERE label.id IN (388, 22708, 16197, 2900, 53283, 59800, 95583)
GROUP BY label.name
ORDER BY releases_count DESC
;""")
mb_labels
Out[6]:
In [7]:
sql("""
SELECT
COUNT(*) AS releases_count
FROM release AS r
LEFT OUTER JOIN release_label AS rl ON rl.release = r.id
JOIN l_release_url AS lu ON r.id = lu.entity0
JOIN url AS u ON u.id = lu.entity1
WHERE u.url ILIKE '%%hyperion-records%%'
AND rl.label IS NULL
;""")
Out[7]:
In [8]:
sql("""
SELECT
COUNT(r.name)
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
WHERE label.id IN (388, 22708);
""")
Out[8]:
In [9]:
mb_releases = sql("""
SELECT
rl.catalog_number,
r.barcode,
r.name AS title,
ac.name AS ac,
label.name AS label,
CASE WHEN r.quality=-1 THEN 'unset'
WHEN r.quality=0 THEN 'low'
WHEN r.quality=1 THEN 'normal'
WHEN r.quality=2 THEN 'high'
END AS quality,
STRING_AGG(DISTINCT at.name, ', ') AS covers,
STRING_AGG(DISTINCT
CASE WHEN u.url ILIKE '%%hyperion-records%%' THEN u.url
ELSE NULL
END, ', ') AS hyperion_url,
r.gid AS mb_url
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
JOIN artist_credit AS ac ON r.artist_credit = ac.id
LEFT OUTER JOIN cover_art_archive.cover_art AS cova ON cova.release = r.id
LEFT OUTER JOIN cover_art_archive.cover_art_type AS covat ON covat.id = cova.id
LEFT OUTER JOIN cover_art_archive.art_type AS at ON covat.type_id = at.id
LEFT OUTER JOIN l_release_url AS lu ON r.id = lu.entity0
LEFT OUTER JOIN url AS u ON u.id = lu.entity1
WHERE label.id IN (388, 22708)
GROUP BY rl.catalog_number, r.barcode, r.name, ac.name, label.name, r.quality, r.gid
ORDER BY rl.catalog_number, r.barcode;
""")
mb_releases['mb_url'] = mb_releases.mb_url.apply(mb_release_link)
mb_releases['hyperion_url'] = mb_releases.hyperion_url.apply(lambda url: '<a href="{0}">{0}</a>'.format(url) if url else None)
mb_releases.head()
Out[9]:
In [10]:
mb_releases.shape
Out[10]:
1677 releases already in MB, some might be duplicates
In [11]:
mb_releases_by_quality = sql("""
SELECT
CASE WHEN r.quality=-1 THEN 'unset'
WHEN r.quality=0 THEN 'low'
WHEN r.quality=1 THEN 'normal'
WHEN r.quality=2 THEN 'high'
END AS quality,
COUNT(r.gid)
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
WHERE label.id IN (388, 22708)
GROUP BY r.quality
ORDER BY r.quality;
""")
mb_releases_by_quality
Out[11]:
In [12]:
mb_releases_with_front = sql("""
SELECT
at.name,
COUNT(r.gid)
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
JOIN cover_art_archive.cover_art AS cova ON cova.release = r.id
JOIN cover_art_archive.cover_art_type AS covat ON covat.id = cova.id
JOIN cover_art_archive.art_type AS at ON covat.type_id = at.id
WHERE label.id IN (388, 22708)
AND at.name = 'Front'
GROUP BY at.name;
""")
mb_releases_with_front
Out[12]:
In [13]:
mb_releases_with_booklet = sql("""
SELECT
rl.catalog_number,
r.name AS title,
STRING_AGG(at.name, ', ') AS covers,
r.gid AS mb_url
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
JOIN cover_art_archive.cover_art AS cova ON cova.release = r.id
JOIN cover_art_archive.cover_art_type AS covat ON covat.id = cova.id
JOIN cover_art_archive.art_type AS at ON covat.type_id = at.id
WHERE label.id IN (388, 22708)
AND at.name = 'Booklet'
AND cova.mime_type = 'application/pdf'
GROUP BY rl.catalog_number, r.name, r.gid
ORDER BY rl.catalog_number;
""")
mb_releases_with_booklet['mb_url'] = mb_releases_with_booklet.mb_url.apply(mb_release_link)
mb_releases_with_booklet.head()
Out[13]:
In [14]:
mb_releases_with_booklet.shape
Out[14]:
In [15]:
#mb_releases_wo_catno = sql("""
#SELECT release.barcode,
# release.name AS title,
# label.name AS label,
# release.gid AS mb_url
# FROM release_label AS rl
# JOIN release ON rl.release = release.id
# JOIN label ON rl.label = label.id
# WHERE label.id IN (388, 22708, 16197)
# AND rl.catalog_number IS NULL
#ORDER BY rl.catalog_number;
#""")
#mb_releases_wo_catno['mb_url'] = mb_releases_wo_catno.mb_url.apply(mb_release_link)
#mb_releases_wo_catno.head()
In [16]:
#mb_releases_wo_catno.shape
In [17]:
mb_releases_wo_link = sql("""
SELECT
rl.catalog_number,
r.barcode,
r.name AS title,
label.name AS label,
r.gid AS mb_url
FROM release_label AS rl
JOIN release AS r ON rl.release = r.id
JOIN label ON rl.label = label.id
LEFT OUTER JOIN l_release_url AS lu ON r.id = lu.entity0
LEFT OUTER JOIN url AS u ON u.id = lu.entity1
WHERE label.id IN (388, 22708)
AND u.url IS NULL
ORDER BY rl.catalog_number, r.barcode;
""")
mb_releases_wo_link['mb_url'] = mb_releases_wo_link.mb_url.apply(mb_release_link)
mb_releases_wo_link.head()
Out[17]:
In [18]:
mb_releases_wo_link.shape
Out[18]:
In [ ]:
In [19]:
hyperion_releases = pd.read_html('https://www.hyperion-records.co.uk/n.asp?n=1')[0][:-1]
hyperion_releases.rename(columns={0: 'catalog_number', 1: 'name'}, inplace=True)
hyperion_releases['barcode'] = hyperion_releases.name.str[-12:]
hyperion_releases['name'] = hyperion_releases.name.str[:-12]
hyperion_releases['url'] = hyperion_releases.catalog_number.apply(
lambda no: '<a href="https://www.hyperion-records.co.uk/dc.asp?dc=D_{0}">{0}</a>'.format(no))
In [20]:
hyperion_releases.head()
Out[20]:
In [21]:
barcodes_not_in_hyperion = set(mb_releases.barcode) - set(hyperion_releases.barcode) - {''} - {None}
print(barcodes_not_in_hyperion)
catno_not_in_hyperion = set(mb_releases.catalog_number) - set(hyperion_releases.catalog_number) - {''} - {None}
print(sorted(list(catno_not_in_hyperion)))
In [22]:
barcodes_not_in_hyperion = set(mb_releases.barcode) - set(hyperion_releases.barcode) - {''} - {None}
releases_not_in_hyperion = mb_releases[mb_releases.barcode.apply(lambda b: b in barcodes_not_in_hyperion)]
releases_not_in_hyperion.head()
Out[22]:
In [23]:
releases_not_in_hyperion.select(lambda x: not releases_not_in_hyperion.ix[x].catalog_number.startswith('CDA'))
Out[23]:
In [24]:
set(releases_not_in_hyperion.catalog_number).intersection(set(hyperion_releases.catalog_number))
Out[24]:
In [25]:
hyphel_releases = hyperion_releases[hyperion_releases.catalog_number.str.startswith('CD')]
In [26]:
barcodes_not_in_musicbrainz = set(hyphel_releases.barcode) - set(mb_releases.barcode)
releases_not_in_mb = hyphel_releases[hyphel_releases.barcode.apply(lambda b: b in barcodes_not_in_musicbrainz)]
In [27]:
from jinja2 import Template
template = Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Community Cleanup #4: Hyperion</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Pre-Classical Clean Up #4</h1>
<a href="https://community.metabrainz.org/t/community-cleanup-4-hyperion/356213">
<img src="https://i.imgur.com/qtAyjmj.png" width="668" height="187">
</a>
<p>Latest update: {{ datetime.utcnow().isoformat()[:19] }}</p>
<ol>
<li><a href="#labels">Labels corresponding to Hyperion in MusicBrainz</a></li>
<li><a href="#booklets">Hyperion releases with PDF booklet in MB</a></li>
<li><a href="#wo_catno">Hyperion releases with no catalog number in MB</a></li>
<li><a href="#not_in_hyp">Releases in MusicBrainz not found in Hyperion catalogue</a></li>
<li><a href="#quality">Hyperion grouped by quality in MB</a></li>
<li><a href="#all_mb">All Hyperion releases in MusicBrainz</a></li>
<li><a href="#not_in_mb">Releases in Hyperion catalogue missing in MusicBrainz</a></li>
<li><a href="#official">Official Hyperion catalogue</a></li>
</ol>
<h2 id="labels">Labels corresponding to Hyperion in MusicBrainz</h2>
{{ mb_labels.to_html(index=False) }}
<p>We keep only the first two for the moment, the other ones seem to only be distributed by hyperion.</p>
<h2 id="booklets">Hyperion releases with PDF booklet in MB</h2>
<p>{{ mb_releases_with_booklet.shape[0] }} releases (Feb. 2nd: 51)</p>
<p>These should be checked and the booklets removed if coming from hyperion</p>
{{ mb_releases_with_booklet.to_html(index=False) }}
<h2 id="wo_catno">Hyperion releases with no catalog number in MB</h2>
<p>Cleaned! (Only one left which is a pseudo-release)</p>
<h2 id="not_in_hyp">Releases in MusicBrainz not found in Hyperion catalogue</h2>
<p>{{ releases_not_in_hyperion.shape[0] }} releases found by naive barcode comparison (Feb. 1st: 45)</p>
<p>These should be checked and barcode fixed if that is the explanation</p>
{{ releases_not_in_hyperion.to_html(index=False) }}
<h2 id="quality">Hyperion releases grouped by quality in MB</h2>
{{ mb_releases_by_quality.to_html(index=False) }}
<h2 id="all_mb">All Hyperion releases in MusicBrainz</h2>
<p>{{ mb_releases.shape[0] }} releases (Feb. 1st: ~1551)</p>
{{ mb_releases.to_html(index=True) }}
<h2 id="not_in_mb">Releases in Hyperion catalogue missing in MusicBrainz</h2>
<p>{{ releases_not_in_mb.shape[0] }} releases found by naive barcode comparison</p>
{{ releases_not_in_mb.to_html(index=True) }}
<h2 id="official">Official Hyperion catalogue</h2>
<p>{{ hyperion_releases.shape[0] }} releases</p>
{{ hyperion_releases.to_html(index=True) }}
</body>
</html>
""")
with open('docs/hyperion_releases.html', 'w') as f:
f.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))