Project Composer Diversity


In [155]:
import re
import io
import json
from datetime import datetime
import urllib
from urllib.parse import quote, unquote

import pandas as pd
import requests
from jinja2 import Template

import pandas as pd
pd.set_option('display.max_colwidth', -1)

import recordlinkage

In [2]:
%run -i startup.ipy


Last notebook update: 2018-08-06
Git repo: git@bitbucket.org:loujine/musicbrainz-dataviz.git
Importing libs
Defining database parameters

Defining *sql* helper function
Last database update: 2018-06-02

Python packages versions:
numpy       1.15.0
re          2.2.1
json        2.0.9
pandas      0.23.3
requests    2.19.1
recordlinkage  0.12
pandas      0.23.3
sqlalchemy  1.2.10
CPython 3.7.0b5
IPython 6.5.0

Extract data from the Composer Diversity page

The data is inside the javascript callled on the main page, not the HTML itself, so we need to parse it.


In [3]:
resp = requests.request('GET', 'https://composerdiversity.com/dist/build.js')

In [4]:
for line in io.StringIO(resp.content.decode()):
    if 'module.exports' in line and 'labelI18nKey' in line:
        pattern = re.match(r".*(\[\{.*\}\]).*", line).group(1)
        break
headers = json.loads(pattern)
headers = [h.get('class') for h in headers][:-5] + ['region1', 'region2',
                                                    'country', 'url']

In [5]:
for line in io.StringIO(resp.content.decode()):
    if 'module.exports' in line and 'Composer Diversity Database' in line:
        pattern = re.match(r".*(\[\[.*\]\]).*", line).group(1)
        break        
table = json.loads(pattern)
diversity_df = pd.DataFrame(table, columns=headers)

In [6]:
diversity_df.shape


Out[6]:
(3792, 44)

Split death year from artist name and replace it in the "dead" column


In [7]:
for idx, artist in diversity_df.iterrows():
    if 'd.' in artist['name']:
        death_year = artist['name'].split(' (d.')[1].strip()[:-1]
        artist['name'] = artist['name'].split(' (d.')[0]
        artist['dead'] = death_year

Fix URLs


In [8]:
diversity_df.url = diversity_df.url.apply(lambda url: unquote(url) if url else None)

In [9]:
diversity_df.head()


Out[9]:
name living dead male female non-binary orchestra wind-band chorus chamber ... asian wana south-asian southeast-asian american-indian other region1 region2 country url
0 Abbasi, Anahita X X X X ... X X San Diego, CA San Diego, California USA http://anahitaabbasi.com/
1 Abbott, Katy X X X X X X ... Melbourne Melbourne Australia http://www.katyabbott.com/
2 Abbuehl, Susanne X X ... Lucerne Lucerne Switzerland http://www.susanneabbuehl.com/
3 Abdelfattah, Mohamed Abdelwahab X X X X ... X Cairo Cairo Egypt https://mohamedabdelwahabdelfattah.musicaneo.com/
4 Abdelnour, Christine X X ... Pau Pau France https://christineabdelnoursehnaoui.jimdo.com/

5 rows × 44 columns

Find artists in MB collection and not in Diversity DB


In [10]:
ws = 'https://musicbrainz.org/ws/2'
collection_mbid = '2d5b6052-9f4b-49c1-8e86-2c83cdc3b6e3'

In [11]:
resp = requests.request(
    'GET', 
    f'{ws}/collection/{collection_mbid}/artists?fmt=json&limit=100').json()

In [12]:
collection = resp['artists']
collection_size = resp['artist-count']
print(collection_size)


239

In [13]:
collection[0]


Out[13]:
{'sort-name': 'Zwilich, Ellen Taaffe',
 'name': 'Ellen Taaffe Zwilich',
 'disambiguation': '',
 'id': '0265d7f1-a51b-4573-a7ce-28363ad89394'}

In [14]:
for idx in range(1, (collection_size) // 100 + 1):
    resp = requests.request(
        'GET', 
        f'{ws}/collection/{collection_mbid}/artists?fmt=json&limit=100&offset={100*idx}').json()
    collection.extend(resp['artists'])

In [15]:
collection_df = pd.DataFrame(collection)
del collection_df['disambiguation']
collection_df['url'] = collection_df['id'].apply(mb_artist_link)
del collection_df['id']

In [16]:
collection_df.shape


Out[16]:
(239, 3)

In [17]:
collection_df.head()


Out[17]:
name sort-name url
0 Ellen Taaffe Zwilich Zwilich, Ellen Taaffe <a href="https://musicbrainz.org/artist/0265d7f1-a51b-4573-a7ce-28363ad89394">0265d7f1-a51b-4573-a7ce-28363ad89394</a>
1 Ana Giurgiu-Bondue Giurgiu-Bondue, Ana <a href="https://musicbrainz.org/artist/03739055-e3cc-4eb8-88f3-ffd697a2399f">03739055-e3cc-4eb8-88f3-ffd697a2399f</a>
2 Adaline Shepherd Shepherd, Adaline <a href="https://musicbrainz.org/artist/0769ebb1-1c9c-41b4-ba0c-9a34562a8be0">0769ebb1-1c9c-41b4-ba0c-9a34562a8be0</a>
3 Amy Woodforde-Finden Woodforde-Finden, Amy <a href="https://musicbrainz.org/artist/0f194b11-a3de-4084-b9ec-454190c01f78">0f194b11-a3de-4084-b9ec-454190c01f78</a>
4 Denise Kelly Kelly, Denise <a href="https://musicbrainz.org/artist/102e9f4e-1446-4ecd-a009-56f862eba3d8">102e9f4e-1446-4ecd-a009-56f862eba3d8</a>

We attempt a record linkage (finding artists with the same sorted name in the diversity project webpage and the musicbrainz collection), using the jarowinkler algorithm (artists are the same if the similarity computed on names is > 95%)


In [18]:
THRESHOLD = 0.95

In [19]:
# Indexation step
indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(diversity_df, collection_df)
print(len(pairs))

# Comparison step
compare_cl = recordlinkage.Compare()
compare_cl.string('name', 'sort-name', method='jarowinkler', 
                  threshold=THRESHOLD, label='name')
features = compare_cl.compute(pairs, diversity_df, collection_df)

# Classification step
linkage = []
for (idx0, idx1) in features[features.sum(axis=1) > 0].index:
    linkage.append([
        diversity_df.loc[idx0]['name'],
        collection_df.loc[idx1]['sort-name'],
        collection_df.loc[idx1]['url'],
    ])

collection_linkage = pd.DataFrame(linkage, columns=['DP name', 'MB sortname', 'mbid'])
print(collection_linkage.shape)


WARNING:recordlinkage:indexing - performance warning - A full index can result in large number of record pairs.
906288
(149, 3)

We found 149 common names.


In [20]:
print(len(set(collection_linkage['DP name'])))
print(len(set(collection_linkage['mbid'])))


149
148

Actually we have only 148 distinct MB entities on one side so at least one record link is wrong


In [21]:
collection_linkage.head()


Out[21]:
DP name MB sortname mbid
0 Abbasi, Anahita Abbasi, Anahita <a href="https://musicbrainz.org/artist/aedc6f10-3c2e-40e1-91d9-f877246a97c0">aedc6f10-3c2e-40e1-91d9-f877246a97c0</a>
1 Abbott, Katy Abbott, Katy <a href="https://musicbrainz.org/artist/8cb85238-ed11-479f-88e0-31774aa30a79">8cb85238-ed11-479f-88e0-31774aa30a79</a>
2 Abbuehl, Susanne Abbuehl, Susanne <a href="https://musicbrainz.org/artist/8c63ca0b-8336-445c-937e-fd2f6b09d4ab">8c63ca0b-8336-445c-937e-fd2f6b09d4ab</a>
3 Abdelfattah, Mohamed Abdelwahab Abdelfattah, Mohamed Abdelwahab <a href="https://musicbrainz.org/artist/0e97ab44-09a4-406c-a3cc-ea6fe319f024">0e97ab44-09a4-406c-a3cc-ea6fe319f024</a>
4 Abe, Keiko Abe, Keiko <a href="https://musicbrainz.org/artist/ee151f24-734b-4350-8b88-8175ebe2eb61">ee151f24-734b-4350-8b88-8175ebe2eb61</a>

We deduce the list of artists present in the collection and missing from the Diversity Project:


In [25]:
missing_from_dp = collection_df[~collection_df['sort-name'].isin(
    set(collection_linkage['MB sortname'])
)][['sort-name', 'url']]
print(missing_from_dp.shape)
missing_from_dp.head()


(91, 2)
Out[25]:
sort-name url
1 Giurgiu-Bondue, Ana <a href="https://musicbrainz.org/artist/03739055-e3cc-4eb8-88f3-ffd697a2399f">03739055-e3cc-4eb8-88f3-ffd697a2399f</a>
2 Shepherd, Adaline <a href="https://musicbrainz.org/artist/0769ebb1-1c9c-41b4-ba0c-9a34562a8be0">0769ebb1-1c9c-41b4-ba0c-9a34562a8be0</a>
4 Kelly, Denise <a href="https://musicbrainz.org/artist/102e9f4e-1446-4ecd-a009-56f862eba3d8">102e9f4e-1446-4ecd-a009-56f862eba3d8</a>
6 Horrocks-Hopayian, Cevanne <a href="https://musicbrainz.org/artist/193ef166-933c-40b6-a5d0-a1b7f9f7b27f">193ef166-933c-40b6-a5d0-a1b7f9f7b27f</a>
11 Kurme-Gedroica, Dzintra <a href="https://musicbrainz.org/artist/242b9f0b-49a5-4226-bc11-190c0a260f1e">242b9f0b-49a5-4226-bc11-190c0a260f1e</a>

Escape URLs with single quote in them


In [122]:
dp_urls = diversity_df.url[~diversity_df.url.isnull()]
dp_urls = dp_urls.str.replace("'", "''")
dp_urls = dp_urls.apply(lambda s: unquote(s))

In [113]:
print(dp_urls[797])
print(dp_urls[876])


http://www.prestoclassical.co.uk/c/Davison,\_H
http://auislandora.wrlc.org/islandora/object/auislandora:33018

Number of links for various sites:


In [127]:
print(f"Wikidata links: {len(dp_urls[dp_urls.str.match('.*wikidata.*')])}")
print(f"Google links: {len(dp_urls[dp_urls.str.match('.*google.*')])}")
print(f"Wikipedia links: {len(dp_urls[dp_urls.str.match('.*wikipedia.*')])}")


Wikidata links: 0
Google links: 26
Wikipedia links: 605

In [143]:
link_from_url = sql("""
SELECT artist.name, 
       artist.gid AS url
  FROM artist
  JOIN l_artist_url AS lau ON lau.entity0 = artist.id
  JOIN url                 ON lau.entity1 = url.id
 WHERE url IN ('%s');
""" % "', '".join(dp_urls.tolist()))
link_from_url['url'] = link_from_url['url'].apply(mb_artist_link)
print(link_from_url.shape)
link_from_url.head()


(517, 2)
Out[143]:
name url
0 Alexandra du Bois <a href="https://musicbrainz.org/artist/612bc843-d035-4afd-b700-e3bf3045a8e5">612bc843-d035-4afd-b700-e3bf3045a8e5</a>
1 Anahita Abbasi <a href="https://musicbrainz.org/artist/aedc6f10-3c2e-40e1-91d9-f877246a97c0">aedc6f10-3c2e-40e1-91d9-f877246a97c0</a>
2 Tara Busch <a href="https://musicbrainz.org/artist/a3a62081-271b-4e8e-bc9a-41c09b52792e">a3a62081-271b-4e8e-bc9a-41c09b52792e</a>
3 I Speak Machine <a href="https://musicbrainz.org/artist/f1bd6172-a361-4668-ae3e-2c2f71f8c04a">f1bd6172-a361-4668-ae3e-2c2f71f8c04a</a>
4 Anne Gillis <a href="https://musicbrainz.org/artist/f8ca8636-2a05-42ed-877e-16512015e09c">f8ca8636-2a05-42ed-877e-16512015e09c</a>

517 links, but probably some are missed (e.g. wikipedia in Diversity Project vs wikidata for MusicBrainz)

Full search in MusicBrainz database

From MusicBrainz we restrict the search to artists who are:

  • a person
  • not male

so we will be missing some results for the Diversity database.


In [130]:
mb_df = sql("""
SELECT artist.sort_name AS name, area.name AS country, artist.gid
  FROM artist
  LEFT OUTER JOIN area ON area.id = artist.area
 WHERE artist.type = 1
   AND artist.gender != 1
ORDER BY name;
""")

In [131]:
mb_df.head(10)


Out[131]:
name country gid
0 !Hero: The Rock Opera None 81c4ef77-5129-49a2-b63c-dcb59b02449f
1 "Sevigny", Lucia Rolle A Coruña 6728cfd9-357f-4600-b2d7-1682b6b83d76
2 $hroomy, Diana United States cf6d43e2-1970-4158-8dfc-c37c73190192
3 'Dead, Elle United Kingdom 14332712-fa0f-42fb-be43-0d7b6ab9147f
4 'Em None 5b1215d0-68bf-4610-824b-03f2f0ec036e
5 'Sup Boo United States 3eb9495e-77fb-4b83-8c77-d81644a2a6e5
6 (029) None ff8431d2-19ab-4ff4-94db-ab251861c157
7 (Bhangra Artist) None cfadc2da-6a8f-4885-93a8-96cecc2e924d
8 (Bhangra Artist) None 53b9610a-b712-40d3-9052-00f3c29c2a25
9 (Guest), Nti Indonesia 310c8044-3fbc-4d44-a275-fe92f43e12c0

Let's look for similarities with the recordlinkage library


In [132]:
# Indexation step
indexer = recordlinkage.index.SortedNeighbourhoodIndex('name', window=9)
pairs = indexer.index(diversity_df, mb_df)
print(len(pairs))


/home/chrom/.virtualenvs/dataviz-py37/local/lib/python3.7/site-packages/recordlinkage/utils.py:46: VisibleDeprecationWarning:

class recordlinkage.SortedNeighbourhoodIndex is renamed and moved, use recordlinkage.index.SortedNeighbourhood

30859

Comparison algorithm should take into account sex and date of death when known.


In [133]:
# Comparison step
compare_cl = recordlinkage.Compare()
compare_cl.string('name', 'name', method='jarowinkler', 
                  threshold=0.95, label='name')
#compare_cl.string('country', 'country', method='jarowinkler', 
#                  threshold=0.85, label='country')
features = compare_cl.compute(pairs, diversity_df, mb_df)
features[features.sum(axis=1) > 0].shape


Out[133]:
(1257, 1)

In [134]:
# Classification step
linkage = []
for (idx0, idx1) in features[features.sum(axis=1) > 0].index:
    linkage.append([
        diversity_df.loc[idx0]['name'],
        mb_df.loc[idx1]['name'],
        mb_artist_link(mb_df.loc[idx1]['gid']),
        diversity_df.loc[idx0]['country'],
        mb_df.loc[idx1]['country'],
        diversity_df.loc[idx0]['url'],
    ])

In [135]:
linkage = pd.DataFrame(linkage, columns=['DP name', 'MB name', 'MBID',
                                         'DP country', 'MB area', 'DP url'])
linkage.head()


Out[135]:
DP name MB name MBID DP country MB area DP url
0 Bailie, Joanna Baillie, Joanna <a href="https://musicbrainz.org/artist/7f18f07b-96f5-4573-9b4f-efcd7755beda">7f18f07b-96f5-4573-9b4f-efcd7755beda</a> Belgium United Kingdom http://joannabailie.com/
1 Bailie, Joanna Baillie, Joanna <a href="https://musicbrainz.org/artist/5ed47fc7-8c6e-42fc-900b-edc6a24a1cee">5ed47fc7-8c6e-42fc-900b-edc6a24a1cee</a> Belgium United Kingdom http://joannabailie.com/
2 Beath, Betty Beaton, Betty <a href="https://musicbrainz.org/artist/cbf277df-4217-46e1-a21d-6e3e3531e46b">cbf277df-4217-46e1-a21d-6e3e3531e46b</a> Australia Canada http://www.beathcox.com/bettybeath/
3 Booth, Annie Booth, Connie <a href="https://musicbrainz.org/artist/0e1b1e54-55ba-4c09-8412-dccea69ac328">0e1b1e54-55ba-4c09-8412-dccea69ac328</a> USA United Kingdom http://www.annieboothmusic.com/
4 Coleman, Lisa Coleman, Mia <a href="https://musicbrainz.org/artist/8ba93c4c-7bb4-4af0-8510-ebb7b0b8368b">8ba93c4c-7bb4-4af0-8510-ebb7b0b8368b</a> USA None http://thelisacoleman.com/

HTML output


In [154]:
diversity_df.head()


Out[154]:
name living dead male female non-binary orchestra wind-band chorus chamber ... asian wana south-asian southeast-asian american-indian other region1 region2 country url
0 Abbasi, Anahita X X X X ... X X San Diego, CA San Diego, California USA http://anahitaabbasi.com/
1 Abbott, Katy X X X X X X ... Melbourne Melbourne Australia http://www.katyabbott.com/
2 Abbuehl, Susanne X X ... Lucerne Lucerne Switzerland http://www.susanneabbuehl.com/
3 Abdelfattah, Mohamed Abdelwahab X X X X ... X Cairo Cairo Egypt https://mohamedabdelwahabdelfattah.musicaneo.com/
4 Abdelnour, Christine X X ... Pau Pau France https://christineabdelnoursehnaoui.jimdo.com/

5 rows × 44 columns


In [173]:
def f(s):
    url = f"https://musicbrainz.org/artist/create?name={quote(s['name'])}&sort_name={quote(s['name'])}"
    if s['male']:
        url += "&gender_id=1"
    elif s['female']:
        url += "&gender_id=2"
    if s['country']:
        url += f"&area.name={quote(s['country'])}"
    if s['dead']:
        url += f"&period.end_date.year={s['dead']}"
    if s['url']:
        url += f"&edit_note={quote(s['url'])}"
    return url
diversity_df['add to MB'] = diversity_df.apply(f, axis=1).apply(lambda s: f'<a href="{s}">add to MB</a>')

In [175]:
template = Template("""
<!doctype html>

<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Community Project: Composer Diversity</title>
    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" 
          rel="stylesheet" 
          integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" 
          crossorigin="anonymous">
  </head>

  <body style="margin: 20px;">
    <h1>Composer Diversity</h1>
    
    <a href="https://composerdiversity.com/">
      Composer Diversity homepage
    </a>
    <br />
    <a href="https://community.metabrainz.org/t/project-composer-diversity-ex-womens-composer-database/387801/5">
      Forum link
    </a>
    <br />
    <a href="https://wiki.musicbrainz.org/Project_Composer_Diversity">
      Wiki link
    </a>
    <br />
    <a href="https://musicbrainz.org/collection/2d5b6052-9f4b-49c1-8e86-2c83cdc3b6e3">
      Collection link
    </a>    
    
    <p>Latest update: {{ datetime.utcnow().isoformat()[:19] }}</p>
    <p>Latest MusicBrainz database update: {{ mb_database_version() }}</p>
    
    <ol>
      <li><a href="#collection">MusicBrainz public collection</a></li>
      <li><a href="#linkage">Record linkage with collection</a></li>
      <li><a href="#missing">Missing in Diversity Project</a></li>
      <li><a href="#linkage2">Record linkage through URLs</a></li>
      <li><a href="#linkage3">Record linkage with MusicBrainz dump</a></li>
      <li><a href="#fulllist">Full list</a></li>
    </ol>

    <h2 id="collection">MusicBrainz Public collection</h2>
    <p>The public MB collection has {{ collection_df.shape[0] }} Artists.</p>
    {{ collection_df.to_html(index=True) }}

    <h2 id="linkage">Record linkage with collection</h2>
    <p>{{ collection_linkage.shape[0] }} linked artists based only on artist names.</p>
    {{ collection_linkage.to_html(index=True) }}
    
    <h2 id="missing">Missing in Diversity Project</h2>
    <p>{{ missing_from_dp.shape[0] }} artists in the MB collection 
       and not found in The Diversity Project.</p>
    {{ missing_from_dp.to_html(index=True) }}

    <h2 id="linkage2">Record linkage through URLs</h2>
    <p>{{ link_from_url.shape[0] }} common artists found by matching URLs.</p>
    {{ link_from_url.to_html(index=True) }}
    
    <h2 id="linkage3">Record linkage with MusicBrainz dump</h2>
    {{ linkage.to_html(index=True) }}
    
    <h2 id="fulllist">Full Diversity Project list</h2>
    {{ diversity_df[['name', 'country', 'url', 'add to MB']].to_html(index=True) }}
  </body>
</html>
""")

with open('docs/composer-diversity.html', 'w') as fdout:
    fdout.write(template.render(**globals())
            .replace('&lt;', '<').replace('&gt;', '>')
            .replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
            .replace('thead', 'thead class="thead-light"'))

WaPo 35 top female composers


In [ ]:
composers = [
    "Meredith Monk",
    "Caroline Shaw",
    "Joan Tower",
    "Kaija Saariaho",
    "Pauline Oliveros",
    "Julia Wolfe",
    "Sofia Gubaidulina",
    "Missy Mazzoli",
    "Jennifer Higdon",
    "Lili Boulanger",
    "Augusta Read Thomas",
    "Germaine Tailleferre",
    "Ruth Crawford Seeger",
    "Du Yun",
    "Anna Clyne",
    "Anna Thorvaldsdottir",
    "Lera Auerbach",
    "Paola Prestini",
    "Unsuk Chin",
    "Eve Beglarian",
    "Sarah Kirkland Snider",
    "Laura Kaminsky",
    "Gabriela Lena Frank",
    "Lisa Bielawa",
    "Melinda Wagner",
    "Galina Ustvolskaya",
    "Shulamit Ran",
    "Chen Yi",
    "Amy Beach",
    "Valerie Coleman",
    "Libby Larsen",
    "Florence Price",
    "Gloria Coates",
    "Judith Weir",
    "Cécile Chaminade",
]

In [ ]:
df = sql("""
SELECT name, sort_name, gid AS mbid, begin_date_year FROM artist
 WHERE name IN ('%s')
ORDER BY name;
""" % "', '".join(composers))
df.head()

Composers not found with exact name matching:


In [ ]:
print(set(composers) - set(df.name))

In [ ]:
for composer in set(composers) - set(df.name):
    resp = requests.request(
        'GET', 
        'https://musicbrainz.org/ws/2/artist?query=%22{}%22&fmt=json'.format(
        '%20'.join(composer.split())))
    if resp.json()['artists']:
        print(composer)
        d = resp.json()['artists'][0]
        df.loc[len(df)] = {'name': d['name'],
                           'sort_name': d['sort-name'],
                           'mbid': d['id'],
                           'begin_date_year': None}
    else: 
        print('not found: ' + composer)