In [155]:
import re
import io
import json
from datetime import datetime
import urllib
from urllib.parse import quote, unquote
import pandas as pd
import requests
from jinja2 import Template
import pandas as pd
# Never truncate long cell values (e.g. URLs) when frames are rendered.
# `None` is the supported "no limit" value; the legacy `-1` is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
import recordlinkage
In [2]:
# Execute startup.ipy inside this notebook's namespace (-i).
# NOTE(review): presumably this is where sql(), mb_artist_link() and mb_database_version() come from — TODO confirm.
%run -i startup.ipy
The data is embedded in the JavaScript bundle loaded by the main page, not in the HTML itself, so we need to download and parse that bundle.
In [3]:
# Download the site's JavaScript bundle; the following cells extract the data from it.
resp = requests.get('https://composerdiversity.com/dist/build.js', timeout=60)
# Fail here, rather than in the parsing cells, if the bundle could not be fetched.
resp.raise_for_status()
In [4]:
# Locate the bundle line holding the column descriptors and capture its JSON array of objects.
for line in io.StringIO(resp.content.decode()):
    if 'module.exports' in line and 'labelI18nKey' in line:
        headers_match = re.match(r".*(\[\{.*\}\]).*", line)
        if headers_match:
            break
else:
    # Without this guard a layout change would surface as a NameError, or silently
    # reuse a stale value left in the kernel by a previous run.
    raise ValueError("column descriptors not found in build.js; "
                     "the bundle layout may have changed")

column_specs = json.loads(headers_match.group(1))
# Use each descriptor's 'class' as the column name, except for the last five
# descriptors, which are replaced by four explicit column names.
headers = [spec.get('class') for spec in column_specs][:-5] + [
    'region1', 'region2', 'country', 'url']
In [5]:
# Locate the bundle line holding the data rows and capture its JSON array of arrays.
for line in io.StringIO(resp.content.decode()):
    if 'module.exports' in line and 'Composer Diversity Database' in line:
        table_match = re.match(r".*(\[\[.*\]\]).*", line)
        if table_match:
            break
else:
    # Without this guard a layout change would surface as a NameError, or silently
    # reuse a stale value left in the kernel by a previous run.
    raise ValueError("data table not found in build.js; "
                     "the bundle layout may have changed")

table = json.loads(table_match.group(1))
# One row per table entry, labelled with the column names computed in the previous cell.
diversity_df = pd.DataFrame(table, columns=headers)
In [6]:
# (rows, columns) of the table extracted from the bundle.
diversity_df.shape
Out[6]:
Some names carry a death year as a " (d. YYYY)" suffix: split it off the artist name and store it in the "dead" column.
In [7]:
# Names may end with a " (d. YYYY)" suffix: move the year to the 'dead' column
# and keep only the bare name.
DEATH_MARKER = ' (d.'

# Rows yielded by iterrows() may be copies, so assigning to them is not guaranteed
# to update the frame; write through .at on the frame itself instead.
# Testing for the full marker (not just 'd.') avoids an IndexError on names that
# merely contain "d." (e.g. an abbreviated first name).
for idx, full_name in list(diversity_df['name'].items()):
    if not isinstance(full_name, str) or DEATH_MARKER not in full_name:
        continue
    bare_name, _, death_part = full_name.partition(DEATH_MARKER)
    diversity_df.at[idx, 'name'] = bare_name
    # Drop surrounding whitespace and the closing parenthesis.
    diversity_df.at[idx, 'dead'] = death_part.strip().rstrip(')').strip()
Fix URLs
In [8]:
def _decode_url(raw_url):
    """Percent-decode a URL; falsy values (empty string, None) become None."""
    return unquote(raw_url) if raw_url else None


diversity_df['url'] = diversity_df['url'].apply(_decode_url)
In [9]:
# Preview the table after the name and URL clean-up.
diversity_df.head()
Out[9]:
In [10]:
# Root of the MusicBrainz web-service URLs requested below.
ws = 'https://musicbrainz.org/ws/2'
# MBID of the collection fetched below (same id as the "Collection link" in the HTML report).
collection_mbid = '2d5b6052-9f4b-49c1-8e86-2c83cdc3b6e3'
In [11]:
# Fetch the first page (up to 100 artists) of the collection.
first_page = requests.get(
    f'{ws}/collection/{collection_mbid}/artists?fmt=json&limit=100',
    timeout=30)
# An error response carries no 'artists' key; fail here with the HTTP status
# instead of a KeyError in the next cell.
first_page.raise_for_status()
resp = first_page.json()
In [12]:
# Artists from the first page; later pages are appended to this same list.
collection = resp['artists']
# Total number of artists reported for the collection (drives the pagination below).
collection_size = resp['artist-count']
print(collection_size)
In [13]:
# Inspect one raw artist record to see which fields are returned.
collection[0]
Out[13]:
In [14]:
# Fetch the remaining pages, 100 artists at a time.
PAGE_SIZE = 100

# Start at the number of artists already loaded: on a first run that is the first
# page, and on a re-run of this cell nothing is fetched, so no duplicates are
# appended. Stopping at collection_size also avoids one empty request when the
# size is an exact multiple of the page size.
for offset in range(len(collection), collection_size, PAGE_SIZE):
    page = requests.get(
        f'{ws}/collection/{collection_mbid}/artists'
        f'?fmt=json&limit={PAGE_SIZE}&offset={offset}',
        timeout=30)
    page.raise_for_status()
    collection.extend(page.json()['artists'])
In [15]:
# Tabulate the collection, drop 'disambiguation', and replace the raw 'id'
# by a 'url' column computed with mb_artist_link.
collection_df = (
    pd.DataFrame(collection)
    .drop(columns=['disambiguation'])
    .assign(url=lambda frame: frame['id'].apply(mb_artist_link))
    .drop(columns=['id'])
)
In [16]:
# (rows, columns) of the collection table.
collection_df.shape
Out[16]:
In [17]:
# Preview the collection table.
collection_df.head()
Out[17]:
We attempt a record linkage between the Diversity Project table and the MusicBrainz collection: each Diversity Project name is compared with each MusicBrainz sort name using the Jaro-Winkler similarity, and two artists are considered the same when the similarity of their names is at least 95%.
In [18]:
# Jaro-Winkler similarity threshold passed to the name comparison below.
THRESHOLD = 0.95
In [19]:
# --- Indexation: pair every Diversity Project row with every collection row ---
indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(diversity_df, collection_df)
print(len(pairs))

# --- Comparison: Jaro-Winkler on DP 'name' vs MB 'sort-name' ---
compare_cl = recordlinkage.Compare()
compare_cl.string(
    'name', 'sort-name',
    method='jarowinkler', threshold=THRESHOLD, label='name',
)
features = compare_cl.compute(pairs, diversity_df, collection_df)

# --- Classification: keep the pairs with a positive feature sum ---
is_match = features.sum(axis=1) > 0
linkage = [
    [
        diversity_df.at[dp_idx, 'name'],
        collection_df.at[mb_idx, 'sort-name'],
        collection_df.at[mb_idx, 'url'],
    ]
    for dp_idx, mb_idx in features[is_match].index
]
collection_linkage = pd.DataFrame(
    linkage, columns=['DP name', 'MB sortname', 'mbid'])
print(collection_linkage.shape)
We found 149 common names.
In [20]:
# Distinct values on each side of the linkage: Diversity Project names, then MB links.
for column in ('DP name', 'mbid'):
    distinct_values = set(collection_linkage[column])
    print(len(distinct_values))
However, there are only 148 distinct MusicBrainz entities among these links, so at least one MusicBrainz artist is linked to two Diversity Project names and at least one record link is wrong.
In [21]:
# Preview the name-based links.
collection_linkage.head()
Out[21]:
We deduce the list of artists present in the collection and missing from the Diversity Project:
In [25]:
# Collection artists whose sort name was not linked to any Diversity Project name.
linked_sort_names = set(collection_linkage['MB sortname'])
is_linked = collection_df['sort-name'].isin(linked_sort_names)
missing_from_dp = collection_df.loc[~is_linked, ['sort-name', 'url']]
print(missing_from_dp.shape)
missing_from_dp.head()
Out[25]:
Escape the single quotes contained in some URLs, so that the URLs can be embedded as string literals in the SQL query below.
In [122]:
# Keep only the rows that actually have a URL.
dp_urls = diversity_df['url'].dropna()
# Percent-decode first, then double the single quotes. Doing it the other way
# round would leave any quote produced by decoding (%27) unescaped, which would
# terminate the SQL string literal built from these URLs in the query below.
dp_urls = dp_urls.apply(unquote)
dp_urls = dp_urls.str.replace("'", "''", regex=False)
In [113]:
# Spot-check two entries, looked up by hard-coded index values.
# NOTE(review): presumably these are rows whose URL contains a single quote — TODO confirm.
print(dp_urls[797])
print(dp_urls[876])
Number of links for various sites:
In [127]:
# Count the URLs mentioning each site.
for site_label, site_key in (('Wikidata', 'wikidata'),
                             ('Google', 'google'),
                             ('Wikipedia', 'wikipedia')):
    site_mask = dp_urls.str.match(f'.*{site_key}.*')
    print(f"{site_label} links: {len(dp_urls[site_mask])}")
In [143]:
# Find the MusicBrainz artists that have a URL relationship to one of the
# Diversity Project URLs.
# NOTE(review): the IN (...) list is spliced into the SQL text with % formatting,
# so correctness relies on the quote doubling applied when dp_urls was built.
# Prefer bound parameters if sql() supports them — TODO confirm.
link_from_url = sql("""
SELECT artist.name,
artist.gid AS url
FROM artist
JOIN l_artist_url AS lau ON lau.entity0 = artist.id
JOIN url ON lau.entity1 = url.id
WHERE url IN ('%s');
""" % "', '".join(dp_urls.tolist()))
# The 'url' column holds artist.gid at this point; convert it with mb_artist_link.
link_from_url['url'] = link_from_url['url'].apply(mb_artist_link)
print(link_from_url.shape)
link_from_url.head()
Out[143]:
517 links, but probably some are missed (e.g. wikipedia in Diversity Project vs wikidata for MusicBrainz)
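From MusicBrainz we restrict the search to artists who are:

- of type 1 (`artist.type = 1`),
- of a recorded gender other than 1 (`artist.gender != 1`, which also drops artists with no gender set),

so we will be missing some results for the Diversity database.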
In [130]:
# Candidate MusicBrainz artists for the name-based linkage: sort name (as 'name'),
# area name (as 'country', NULL when the artist has no area) and gid.
# Only rows with artist.type = 1 and a gender different from 1 are returned.
mb_df = sql("""
SELECT artist.sort_name AS name, area.name AS country, artist.gid
FROM artist
LEFT OUTER JOIN area ON area.id = artist.area
WHERE artist.type = 1
AND artist.gender != 1
ORDER BY name;
""")
In [131]:
# Preview the first MusicBrainz candidates (ordered by sort name).
mb_df.head(10)
Out[131]:
Let's look for similarities with the recordlinkage library
In [132]:
# Indexation step
indexer = recordlinkage.index.SortedNeighbourhoodIndex('name', window=9)
pairs = indexer.index(diversity_df, mb_df)
print(len(pairs))
Comparison algorithm should take into account sex and date of death when known.
In [133]:
# Comparison step: Jaro-Winkler on the names, with the same THRESHOLD constant
# as the collection linkage so both linkages stay in sync if it is tuned.
compare_cl = recordlinkage.Compare()
compare_cl.string('name', 'name', method='jarowinkler',
                  threshold=THRESHOLD, label='name')
# TODO: optionally compare countries as well:
# compare_cl.string('country', 'country', method='jarowinkler',
#                   threshold=0.85, label='country')
features = compare_cl.compute(pairs, diversity_df, mb_df)
# Number of candidate pairs with at least one positive feature.
features[features.sum(axis=1) > 0].shape
Out[133]:
In [134]:
# Classification step: one output row per pair with a positive feature sum.
matched_pairs = features[features.sum(axis=1) > 0].index
linkage = [
    [
        diversity_df.at[dp_idx, 'name'],
        mb_df.at[mb_idx, 'name'],
        mb_artist_link(mb_df.at[mb_idx, 'gid']),
        diversity_df.at[dp_idx, 'country'],
        mb_df.at[mb_idx, 'country'],
        diversity_df.at[dp_idx, 'url'],
    ]
    for dp_idx, mb_idx in matched_pairs
]
In [135]:
# Turn the matched pairs into a labelled table. The name `linkage` is rebound
# from the list built above to this DataFrame, which the HTML report renders.
linkage = pd.DataFrame(linkage, columns=['DP name', 'MB name', 'MBID',
'DP country', 'MB area', 'DP url'])
linkage.head()
Out[135]:
In [154]:
# Look at the Diversity Project columns again before building the "add to MB" links.
diversity_df.head()
Out[154]:
In [173]:
def f(s):
    """Build a MusicBrainz "create artist" URL pre-filled from one Diversity Project row.

    Parameters
    ----------
    s : mapping (e.g. a DataFrame row)
        Must provide the keys 'name', 'male', 'female', 'country', 'dead' and 'url'.

    Returns
    -------
    str
        The artist-creation URL. The name and sort name are always present; the
        gender, area, end year and edit note are appended only when the matching
        field holds a usable value.
    """
    def is_set(value):
        # NaN is truthy in Python: without this check a missing 'male' flag would
        # select gender 1, a missing 'dead' would emit "nan" as the year, and a
        # missing 'country' or 'url' would crash in quote().
        return not pd.isna(value) and bool(value)

    quoted_name = quote(s['name'])
    url = ("https://musicbrainz.org/artist/create"
           f"?name={quoted_name}&sort_name={quoted_name}")
    if is_set(s['male']):
        url += "&gender_id=1"
    elif is_set(s['female']):
        url += "&gender_id=2"
    if is_set(s['country']):
        url += f"&area.name={quote(s['country'])}"
    if is_set(s['dead']):
        # NOTE(review): assumes 'dead' holds a year whenever it is set — TODO confirm.
        url += f"&period.end_date.year={quote(str(s['dead']))}"
    if is_set(s['url']):
        url += f"&edit_note={quote(s['url'])}"
    return url
# One artist-creation URL per row, wrapped in an HTML anchor for the report table.
creation_urls = diversity_df.apply(f, axis=1)
diversity_df['add to MB'] = creation_urls.apply(
    lambda href: f'<a href="{href}">add to MB</a>')
In [175]:
# Jinja2 template of the HTML report. It is rendered with the notebook globals, so
# it refers directly to the frames built above (collection_df, collection_linkage,
# missing_from_dp, link_from_url, linkage, diversity_df) as well as to `datetime`
# and `mb_database_version`.
template = Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Community Project: Composer Diversity</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css"
rel="stylesheet"
integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm"
crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Composer Diversity</h1>
<a href="https://composerdiversity.com/">
Composer Diversity homepage
</a>
<br />
<a href="https://community.metabrainz.org/t/project-composer-diversity-ex-womens-composer-database/387801/5">
Forum link
</a>
<br />
<a href="https://wiki.musicbrainz.org/Project_Composer_Diversity">
Wiki link
</a>
<br />
<a href="https://musicbrainz.org/collection/2d5b6052-9f4b-49c1-8e86-2c83cdc3b6e3">
Collection link
</a>
<p>Latest update: {{ datetime.utcnow().isoformat()[:19] }}</p>
<p>Latest MusicBrainz database update: {{ mb_database_version() }}</p>
<ol>
<li><a href="#collection">MusicBrainz public collection</a></li>
<li><a href="#linkage">Record linkage with collection</a></li>
<li><a href="#missing">Missing in Diversity Project</a></li>
<li><a href="#linkage2">Record linkage through URLs</a></li>
<li><a href="#linkage3">Record linkage with MusicBrainz dump</a></li>
<li><a href="#fulllist">Full list</a></li>
</ol>
<h2 id="collection">MusicBrainz Public collection</h2>
<p>The public MB collection has {{ collection_df.shape[0] }} Artists.</p>
{{ collection_df.to_html(index=True) }}
<h2 id="linkage">Record linkage with collection</h2>
<p>{{ collection_linkage.shape[0] }} linked artists based only on artist names.</p>
{{ collection_linkage.to_html(index=True) }}
<h2 id="missing">Missing in Diversity Project</h2>
<p>{{ missing_from_dp.shape[0] }} artists in the MB collection
and not found in The Diversity Project.</p>
{{ missing_from_dp.to_html(index=True) }}
<h2 id="linkage2">Record linkage through URLs</h2>
<p>{{ link_from_url.shape[0] }} common artists found by matching URLs.</p>
{{ link_from_url.to_html(index=True) }}
<h2 id="linkage3">Record linkage with MusicBrainz dump</h2>
{{ linkage.to_html(index=True) }}
<h2 id="fulllist">Full Diversity Project list</h2>
{{ diversity_df[['name', 'country', 'url', 'add to MB']].to_html(index=True) }}
</body>
</html>
""")
# Render the report with the notebook globals as the template context.
report_html = template.render(**globals())

# DataFrame.to_html() HTML-escapes cell contents, which turns the anchors stored in
# the 'add to MB' column into visible text; restore the angle brackets.
# (Replacing '<' by '<' and '>' by '>' did nothing.)
report_html = report_html.replace('&lt;', '<').replace('&gt;', '>')

# Swap the default pandas table class for Bootstrap table styling.
report_html = report_html.replace(
    'class="dataframe"', 'class="table table-striped table-hover table-sm"')

# Only rewrite the opening tag: replacing the bare word 'thead' also produced
# invalid closing tags of the form </thead class="thead-light">.
report_html = report_html.replace('<thead>', '<thead class="thead-light">')

# The template declares <meta charset="utf-8">, so write the file as UTF-8
# regardless of the platform default encoding.
with open('docs/composer-diversity.html', 'w', encoding='utf-8') as fdout:
    fdout.write(report_html)
In [ ]:
# Hand-picked composer names to look up in MusicBrainz: first by exact name in the
# database, then through the search web service for the names not found.
composers = [
"Meredith Monk",
"Caroline Shaw",
"Joan Tower",
"Kaija Saariaho",
"Pauline Oliveros",
"Julia Wolfe",
"Sofia Gubaidulina",
"Missy Mazzoli",
"Jennifer Higdon",
"Lili Boulanger",
"Augusta Read Thomas",
"Germaine Tailleferre",
"Ruth Crawford Seeger",
"Du Yun",
"Anna Clyne",
"Anna Thorvaldsdottir",
"Lera Auerbach",
"Paola Prestini",
"Unsuk Chin",
"Eve Beglarian",
"Sarah Kirkland Snider",
"Laura Kaminsky",
"Gabriela Lena Frank",
"Lisa Bielawa",
"Melinda Wagner",
"Galina Ustvolskaya",
"Shulamit Ran",
"Chen Yi",
"Amy Beach",
"Valerie Coleman",
"Libby Larsen",
"Florence Price",
"Gloria Coates",
"Judith Weir",
"Cécile Chaminade",
]
In [ ]:
# Exact-name lookup of the composers in the MusicBrainz artist table.
# The names are spliced into the SQL text, so double any single quote first:
# a name such as "O'Connor" would otherwise end the string literal early.
quoted_names = "', '".join(name.replace("'", "''") for name in composers)
df = sql("""
SELECT name, sort_name, gid AS mbid, begin_date_year FROM artist
WHERE name IN ('%s')
ORDER BY name;
""" % quoted_names)
df.head()
Composers not found with exact name matching:
In [ ]:
# Names from the list that have no exact match in the query result.
print(set(composers) - set(df.name))
In [ ]:
# Fall back to the MusicBrainz search web service for the composers that the
# exact-name SQL lookup did not return, and append the first search hit to `df`.
# NOTE(review): the first hit is taken without checking that it is the right
# artist, and requests are sent back to back — confirm this is acceptable for
# the web service's rate limits.
# sorted() makes the request and output order reproducible between runs.
for composer in sorted(set(composers) - set(df.name)):
    # Quote the whole phrase so that non-ASCII and reserved characters are
    # percent-encoded; spaces still become %20 and the double quotes %22.
    phrase = quote('"{}"'.format(' '.join(composer.split())))
    resp = requests.get(
        'https://musicbrainz.org/ws/2/artist?query={}&fmt=json'.format(phrase),
        timeout=30)
    # An error response has no 'artists' key; report the HTTP error instead.
    resp.raise_for_status()
    candidates = resp.json()['artists']  # parse the body once
    if candidates:
        print(composer)
        best = candidates[0]
        df.loc[len(df)] = {'name': best['name'],
                           'sort_name': best['sort-name'],
                           'mbid': best['id'],
                           'begin_date_year': None}
    else:
        print('not found: ' + composer)