In [69]:
import requests
import pandas as pd
import json
from datetime import datetime
import SPARQLWrapper
import recordlinkage
from jinja2 import Template
import pandas as pd
pd.set_option('display.max_colwidth', -1)
In [3]:
%run startup.ipy
In [4]:
# https://lawlesst.github.io/notebook/sparql-dataframe.html
def get_sparql_dataframe(endpoint, query):
"""
Helper function to convert SPARQL results into a Pandas data frame.
"""
sparql = SPARQLWrapper.SPARQLWrapper(endpoint)
sparql.setQuery(query)
sparql.setReturnFormat(SPARQLWrapper.JSON)
result = sparql.query()
processed_results = json.load(result.response)
cols = processed_results['head']['vars']
out = []
for row in processed_results['results']['bindings']:
item = []
for c in cols:
item.append(row.get(c, {}).get('value'))
out.append(item)
return pd.DataFrame(out, columns=cols)
In [5]:
doremus = 'http://data.doremus.org/sparql'
In [7]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?expression SAMPLE(?title) as ?title
WHERE {
?expression a efrbroo:F22_Self-Contained_Expression ;
rdfs:label ?title .
?expCreation efrbroo:R17_created ?expression ;
ecrm:P9_consists_of / ecrm:P14_carried_out_by ?composer .
?composer foaf:name "Wolfgang Amadeus Mozart"
} ORDER BY ?title
""").head()
Out[7]:
In [6]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?composer ?pred ?value
WHERE {
?composer foaf:name "Ludwig van Beethoven" .
?composer ?pred ?value
}
LIMIT 10
""").head()
Out[6]:
In [8]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?artist SAMPLE(?name) as ?name
WHERE {
[] mus:U31_had_function <http://data.doremus.org/vocabulary/function/composer> ;
ecrm:P14_carried_out_by ?artist .
?artist foaf:name ?name
}
LIMIT 10
""").head()
Out[8]:
In [9]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?composer ?viafurl ?mburl ?wdurl ?dburl
WHERE {
?composer foaf:name "Ludwig van Beethoven" .
OPTIONAL {
?composer owl:sameAs ?viafurl FILTER (regex (?viafurl, "viaf.org")) .
}
OPTIONAL {
?composer owl:sameAs ?mburl FILTER (regex (?mburl, "musicbrainz.org")) .
}
OPTIONAL {
?composer owl:sameAs ?wdurl FILTER (regex (?wdurl, "wikidata.org")) .
}
OPTIONAL {
?composer owl:sameAs ?dburl FILTER (regex (?dburl, "dbpedia.org")) .
}
}
""")
Out[9]:
Number of artists with VIAF link:
In [10]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "viaf.org")) .
}
""")
Out[10]:
Idem for composers only
In [11]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "viaf.org")) .
[] mus:U31_had_function <http://data.doremus.org/vocabulary/function/composer> ;
ecrm:P14_carried_out_by ?artist .
}
""")
Out[11]:
Number of artists with MB link
In [12]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "musicbrainz.org")) .
}
""")
Out[12]:
Idem for composers only
In [13]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "musicbrainz.org")) .
[] mus:U31_had_function <http://data.doremus.org/vocabulary/function/composer> ;
ecrm:P14_carried_out_by ?artist .
}
""")
Out[13]:
Number of artists with wikidata link
In [14]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "wikidata.org")) .
}
""")
Out[14]:
Number of artists with DBpedia link
In [15]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT count(DISTINCT ?url)
WHERE {
?artist owl:sameAs ?url FILTER (regex (?url, "dbpedia.org")) .
}
""")
Out[15]:
In [16]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?artist SAMPLE(?name) as ?name
WHERE {
[] a mus:M28_Individual_Performance ;
ecrm:P14_carried_out_by ?artist .
?artist foaf:name ?name
}
LIMIT 10
""")
Out[16]:
performance page: http://data.doremus.org/performance/6d8a5bfa-99c0-3c92-8bb2-6599ab21a2b5
In [17]:
get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT *
WHERE {
?performance a efrbroo:F31_Performance ;
ecrm:P7_took_place_at <http://data.doremus.org/place/687896c9-0db7-31cf-8b20-5fa73dc268d6>
}
LIMIT 10
""")
Out[17]:
In [ ]:
In [18]:
df = get_sparql_dataframe(endpoint=doremus, query="""
SELECT DISTINCT ?doremus ?name ?viaf ?mbid
WHERE {
?doremus foaf:name ?name .
?doremus owl:sameAs ?viaf FILTER (regex (?viaf, "viaf.org")) .
OPTIONAL {
?doremus owl:sameAs ?mbid FILTER (regex (?mbid, "musicbrainz.org")) .
}
}
""")
In [19]:
df.mbid = df.mbid.str.replace('https://musicbrainz.org/artist/', '')
df = df.drop_duplicates(['doremus', 'viaf', 'mbid'])
In [20]:
df.head()
Out[20]:
In [21]:
set(df[df.mbid.isnull()].viaf)
Out[21]:
In [22]:
len(set(df[df.mbid.isnull()].viaf))
Out[22]:
In [23]:
mb_df = sql("""
SELECT artist.name, artist.gid AS mbid, url.url AS viaf
FROM artist
JOIN l_artist_url AS lau ON lau.entity0 = artist.id
JOIN url ON lau.entity1 = url.id
WHERE url IN (%s)
ORDER BY artist.name;
""" % ("'" + "','".join(set(df[df.mbid.isnull()].viaf)) + "'"))
mb_df.head()
Out[23]:
In [24]:
mb_df.shape
Out[24]:
So:
These 4300 should have the MBID added to Doremus. The 15000 others should have their VIAF added to MB or the artist created if necessary
In [42]:
df = df[df.mbid.isnull()]
In [43]:
# Indexation step
indexer = recordlinkage.BlockIndex('viaf')
pairs = indexer.index(df, mb_df)
print(len(pairs))
In [66]:
# Classification step
linkage = []
for (idx0, idx1) in pairs:
linkage.append([
df.loc[idx0]['name'],
mb_df.loc[idx1]['name'],
mb_artist_link(mb_df.loc[idx1]['mbid']),
'<a href="{}">{}</a>'.format(df.loc[idx0]['viaf'],
df.loc[idx0]['viaf'].replace('http://viaf.org/viaf/', '')),
'<a href="{}">{}</a>'.format(df.loc[idx0]['doremus'],
df.loc[idx0]['doremus'].replace('http://data.doremus.org/artist/', '')),
])
In [67]:
linkage = pd.DataFrame(linkage, columns=['Doremus name', 'MB name', 'MBID',
'VIAF', 'Doremus id'])
In [68]:
linkage.head()
Out[68]:
In [70]:
template = Template("""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Doremus/MusicBrainz comparison</title>
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
</head>
<body style="margin: 20px;">
<h1>Doremus/MusicBrainz comparison</h1>
<p>Latest update: {{ datetime.utcnow().isoformat()[:19] }}</p>
<ol>
<li><a href="#viaf">VIAF comparison</a></li>
</ol>
<h2 id="viaf">VIAF comparison</h2>
<p>Record linkage: Doremus records with no MusicBrainz link and a VIAF ID also known in MusicBrainz:</p>
{{ linkage.to_html(index=True) }}
</body>
</html>
""")
with open('docs/doremus.html', 'w') as fdout:
fdout.write(template.render(**globals())
.replace('<', '<').replace('>', '>')
.replace('class="dataframe"', 'class="table table-striped table-hover table-sm"')
.replace('thead', 'thead class="thead-light"'))