# In [1]:  (Jupyter notebook cell marker, kept as a comment so the file parses)
    
import csv
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
def bitstream_info_from_pid(pid_url, mimetype=None):
    """
        Fetch the CMDI metadata behind `pid_url` and list the bitstreams it describes.

        Appends the well known magic `@format=cmdi` to the `pid_url` and retrieves the
        metadata instead of the landing page. The metadata are parsed to get the
        mimetype and bitstream url. Progress messages are printed to stdout.

        The algorithm looks for ResourceProxy items that have ResourceType set to
        `Resource`. Proxies that lack a ResourceType or a non-empty ResourceRef are
        skipped instead of aborting the whole listing.

        :param pid_url: PID url of the item; the magic suffix is appended here.
        :param mimetype: when not None, keep only the bitstreams whose mimetype is
            exactly equal to this value.
        :return: list of `(mimetype, bitstream_url)` tuples in document order; the
            mimetype is "unknown" when ResourceType has no `mimetype` attribute.
    """
    import xml.etree.ElementTree as ET
    pid_metadata_url = pid_url + "@format=cmdi"

    print("Fetching metadata in CMDI format [%s]" % pid_metadata_url)
    ns = "{http://www.clarin.eu/cmd/}"
    response = urlopen(pid_metadata_url)
    try:
        metadata = response.read()
    finally:
        # the Python 2 response object is not a context manager, close explicitly
        response.close()
    root = ET.fromstring(metadata)

    # finding bitstream elements
    bitstream_info_arr = []
    for proxy in root.findall('.//%sResourceProxy' % ns):
        rt = proxy.find("./%sResourceType" % ns)
        rr = proxy.find("./%sResourceRef" % ns)
        # find() returns None for a missing child; such a proxy cannot be a bitstream
        if rt is None or rr is None:
            continue
        if (rt.text or "").strip() != "Resource":
            continue
        bitstream_url = (rr.text or "").strip()
        if not bitstream_url:
            # an empty reference would only fail later when fetching it
            continue
        bitstream_info_arr.append(
            (rt.attrib.get("mimetype", "unknown"), bitstream_url)
        )
    print("Found [%d] bitstreams elements in ResourceProxy elements" % len(bitstream_info_arr))
    if mimetype is not None:
        print("Filtering bitstreams according to specified mimetype [%s]" % mimetype)
        bitstream_info_arr = [x for x in bitstream_info_arr if x[0] == mimetype]

    print("Found")
    print("\n".join(["%2d. %s [%s]" % (i, x[1], x[0]) for i, x in enumerate(bitstream_info_arr)]))
    return bitstream_info_arr
def plot(arr):
    """
        Just testing one way of visualisation.

        Draws one point per row of `arr` on a polar scatter plot and shows the
        figure. The second item of each row is converted with `float()`, taken
        as an angle in degrees and placed at radius 1; the first item is ignored.
        The plot has its zero angle at the top ('N') and angles grow clockwise.

        :param arr: sized sequence of 2-item rows `(ignored, degrees)`; a row of
            any other length raises ValueError during unpacking.
    """
    from matplotlib import pyplot as plt
    from math import radians
    fig = plt.figure()
    ax = fig.add_subplot(111, polar=True)
    # every point sits on the unit circle, only the angle carries information
    ax.scatter(x=[radians(float(deg)) for _1, deg in arr], y=[1] * len(arr))
    ax.set_theta_zero_location('N')
    ax.set_theta_direction(-1)
    plt.show()
# PID to a clarin-dspace repository
# - metadata attached to the PID at http://hdl.handle.net/11346/TEST--HGGA?noredirect
pid_url = "http://hdl.handle.net/11346/TEST--HGGA"
# number of leading rows (header included) previewed for each bitstream
show_n = 10
# stays None when no matching bitstream is found, so that plotting can be skipped
data_csv = None
# get urls to all bitstreams
for bitstream_mimetype, bitstream_url in bitstream_info_from_pid(pid_url, mimetype="text/csv"):
    print("Fetching [%s]" % bitstream_url)
    response = urlopen(bitstream_url)
    try:
        raw_lines = response.readlines()
    finally:
        response.close()
    if raw_lines and not isinstance(raw_lines[0], str):
        # Python 3 yields bytes while csv.reader needs text
        # NOTE(review): assumes the CSV is UTF-8 encoded -- confirm
        raw_lines = [raw_line.decode("utf-8") for raw_line in raw_lines]
    data_csv = [[x.strip() for x in line] for line in csv.reader(raw_lines)]
    print("Number of rows (with header): %8d" % len(data_csv))
    # preview exactly the first `show_n` rows
    for row in data_csv[:show_n]:
        print(row)
# plot the last fetched bitstream: skip the header row, cap at row index 2999
if data_csv is not None:
    plot(data_csv[1:3000])