# In [1]:
import csv
import urllib
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlopen
def bitstream_info_from_pid(pid_url, mimetype=None):
    """
    Return the bitstreams listed in the CMDI metadata behind a PID.

    Appends the well known magic ``@format=cmdi`` to `pid_url` so that the
    metadata are retrieved instead of the landing page. The metadata are parsed
    and every ``ResourceProxy`` whose ``ResourceType`` text is ``Resource`` is
    reported as one bitstream. Progress and the final list are printed to
    stdout.

    :param pid_url: PID url of the item, without the format suffix.
    :param mimetype: when not None, only bitstreams whose mimetype equals this
        value exactly are returned.
    :return: list of ``(mimetype, url)`` tuples in document order; the mimetype
        is ``"unknown"`` when ``ResourceType`` carries no ``mimetype``
        attribute. Proxies lacking a ``ResourceType``, a ``ResourceRef`` or a
        reference text are skipped.
    :raises: whatever ``urlopen`` raises when the metadata cannot be fetched,
        and ``xml.etree.ElementTree.ParseError`` when they are not valid XML.
    """
    import xml.etree.ElementTree as ET

    pid_metadata_url = pid_url + "@format=cmdi"
    print("Fetching metadata in CMDI format [%s]" % pid_metadata_url)
    ns = "{http://www.clarin.eu/cmd/}"

    # closing() releases the connection even if read() fails; the Python 2
    # response object is not a context manager on its own.
    with closing(urlopen(pid_metadata_url)) as response:
        metadata = response.read()
    root = ET.fromstring(metadata)

    # finding bitstream elements
    bitstream_info_arr = []
    for proxy in root.findall('.//%sResourceProxy' % ns):
        rt = proxy.find("./%sResourceType" % ns)
        rr = proxy.find("./%sResourceRef" % ns)
        # Compare against None explicitly: an Element without children is
        # falsy. Incomplete proxies cannot point at a downloadable bitstream,
        # so they are skipped instead of failing on `None.text`.
        if rt is None or rr is None or not rr.text:
            continue
        if rt.text == "Resource":
            bitstream_info_arr.append(
                (rt.attrib.get("mimetype", "unknown"), rr.text)
            )
    print("Found [%d] bitstreams elements in ResourceProxy elements" % len(bitstream_info_arr))

    if mimetype is not None:
        print("Filtering bitstreams according to specified mimetype [%s]" % mimetype)
        bitstream_info_arr = [x for x in bitstream_info_arr if x[0] == mimetype]

    print("Found")
    print("\n".join(
        "%2d. %s [%s]" % (i, url, mime)
        for i, (mime, url) in enumerate(bitstream_info_arr)
    ))
    return bitstream_info_arr
def plot(arr):
    """
    Just testing one way of visualisation.

    Draws a polar scatter plot with one point per row of `arr`. The angle of
    a point is the second item of its row, read as degrees; every point is
    placed at radius 1. The zero angle is at the top ('N') and angles grow
    clockwise. The figure is displayed with ``plt.show()``.

    :param arr: iterable of two-item rows; the first item is ignored, the
        second must be convertible with ``float``.
    :raises ValueError: when a row does not have exactly two items or its
        second item is not a number.
    """
    from math import radians

    from matplotlib import pyplot as plt

    # Convert first, so malformed input fails before an empty figure is made.
    angles = [radians(float(deg)) for _, deg in arr]

    fig = plt.figure()
    ax = fig.add_subplot(111, polar=True)
    ax.scatter(x=angles, y=[1] * len(angles))
    ax.set_theta_zero_location('N')  # 0 degrees at the top
    ax.set_theta_direction(-1)  # clockwise
    plt.show()
# PID to a clarin-dspace repository
# - metadata attached to the PID at http://hdl.handle.net/11346/TEST--HGGA?noredirect
pid_url = "http://hdl.handle.net/11346/TEST--HGGA"

# number of leading rows (header included) echoed for every bitstream
show_n = 10

# get urls to all bitstreams and process each CSV bitstream in turn
for bitstream_mimetype, bitstream_url in bitstream_info_from_pid(pid_url, mimetype="text/csv"):
    print("Fetching [%s]" % bitstream_url)
    # closing() releases the connection even if the download fails
    with closing(urlopen(bitstream_url)) as response:
        payload = response.read()
    # csv wants native strings: the payload already is `str` on Python 2,
    # on Python 3 it is bytes and has to be decoded.
    # NOTE(review): assumes the bitstream is UTF-8 encoded -- confirm.
    if not isinstance(payload, str):
        payload = payload.decode("utf-8")
    # keep the line ends so quoted multi-line fields are parsed intact
    data_csv = [
        [x.strip() for x in line]
        for line in csv.reader(payload.splitlines(True))
    ]
    print("Number of rows (with header): %8d" % len(data_csv))
    # the slice shows exactly `show_n` rows (the old index check let
    # `show_n + 2` rows through)
    for row in data_csv[:show_n]:
        print(row)
    # skip the header row and cap the plot at the first 2999 data rows
    plot(data_csv[1:3000])