This is the test record: http://cds.cern.ch/record/2058156


In [29]:
# MARCXML export of CDS record 2058156 ("Big Bang Passport - New Location"):
# a single photo record whose 856 fields list 5 full-size JPG files plus the
# icon-1440 / icon-640 / icon-180 subformats of each of them.
# NOTE(review): the literal starts with a newline, so the XML declaration is
# not at the very beginning of the string; strict XML parsers reject that --
# confirm PhotoSplitter tolerates it (the outputs below suggest it does).
CDS_PHOTO_RECORD = """
<?xml version="1.0" encoding="UTF-8"?>
<collection xmlns="http://www.loc.gov/MARC21/slim">
<record>
  <controlfield tag="001">2058156</controlfield>
  <controlfield tag="005">20151008225323.0</controlfield>
  <datafield tag="024" ind1="8" ind2=" ">
    <subfield code="a">oai:cds.cern.ch:2058156</subfield>
    <subfield code="p">cerncds:FULLTEXT</subfield>
  </datafield>
  <datafield tag="037" ind1=" " ind2=" ">
    <subfield code="a">CERN-PHOTO-201510-197</subfield>
  </datafield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Bennett, Sophia Elizabeth</subfield>
    <subfield code="0">AUTHOR|(SzGeCERN)780240</subfield>
    <subfield code="u">CERN</subfield>
    <subfield code="m">sophia.bennett@cern.ch</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">Big Bang Passport - New Location</subfield>
  </datafield>
  <datafield tag="260" ind1=" " ind2=" ">
    <subfield code="c">2015</subfield>
  </datafield>
  <datafield tag="269" ind1=" " ind2=" ">
    <subfield code="a">Geneva</subfield>
    <subfield code="b">CERN</subfield>
    <subfield code="c">2015-10-08</subfield>
  </datafield>
  <datafield tag="500" ind1=" " ind2=" ">
    <subfield code="a">General Photo</subfield>
  </datafield>
  <datafield tag="506" ind1=" " ind2=" ">
    <subfield code="a">public</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="a">New loaction</subfield>
  </datafield>
  <datafield tag="542" ind1=" " ind2=" ">
    <subfield code="d">CERN</subfield>
    <subfield code="g">2015</subfield>
  </datafield>
  <datafield tag="595" ind1=" " ind2=" ">
    <subfield code="a">CERN EDS</subfield>
    <subfield code="s">PHOTOLAB</subfield>
  </datafield>
  <datafield tag="650" ind1="1" ind2="7">
    <subfield code="2">SzGeCERN</subfield>
    <subfield code="a">Photolab</subfield>
  </datafield>
  <datafield tag="650" ind1="2" ind2="7">
    <subfield code="2">SzGeCERN</subfield>
    <subfield code="a">Life at CERN</subfield>
  </datafield>
  <datafield tag="653" ind1="1" ind2=" ">
    <subfield code="a">Life at CERN</subfield>
    <subfield code="9">CERN</subfield>
  </datafield>
  <datafield tag="690" ind1="C" ind2=" ">
    <subfield code="a">CERN</subfield>
  </datafield>
  <datafield tag="690" ind1="C" ind2=" ">
    <subfield code="a">PHOTO</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150739</subfield>
    <subfield code="s">20888154</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0388.JPG</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150740</subfield>
    <subfield code="s">18005815</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0390.JPG</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150741</subfield>
    <subfield code="s">24104669</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0396.JPG</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150742</subfield>
    <subfield code="s">21212927</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0399.JPG</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150743</subfield>
    <subfield code="s">21257737</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0407.JPG</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150739</subfield>
    <subfield code="s">1406947</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0388.jpg?subformat=icon-1440</subfield>
    <subfield code="x">icon-1440</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150739</subfield>
    <subfield code="s">342545</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0388.jpg?subformat=icon-640</subfield>
    <subfield code="x">icon-640</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150739</subfield>
    <subfield code="s">73531</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0388.jpg?subformat=icon-180</subfield>
    <subfield code="x">icon-180</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150740</subfield>
    <subfield code="s">1282195</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0390.jpg?subformat=icon-1440</subfield>
    <subfield code="x">icon-1440</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150740</subfield>
    <subfield code="s">345928</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0390.jpg?subformat=icon-640</subfield>
    <subfield code="x">icon-640</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150740</subfield>
    <subfield code="s">79720</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0390.jpg?subformat=icon-180</subfield>
    <subfield code="x">icon-180</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150741</subfield>
    <subfield code="s">1558062</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0396.jpg?subformat=icon-1440</subfield>
    <subfield code="x">icon-1440</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150741</subfield>
    <subfield code="s">384736</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0396.jpg?subformat=icon-640</subfield>
    <subfield code="x">icon-640</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150741</subfield>
    <subfield code="s">78661</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0396.jpg?subformat=icon-180</subfield>
    <subfield code="x">icon-180</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150742</subfield>
    <subfield code="s">1410785</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0399.jpg?subformat=icon-1440</subfield>
    <subfield code="x">icon-1440</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150742</subfield>
    <subfield code="s">366776</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0399.jpg?subformat=icon-640</subfield>
    <subfield code="x">icon-640</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150742</subfield>
    <subfield code="s">81339</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0399.jpg?subformat=icon-180</subfield>
    <subfield code="x">icon-180</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150743</subfield>
    <subfield code="s">1298236</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0407.jpg?subformat=icon-1440</subfield>
    <subfield code="x">icon-1440</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150743</subfield>
    <subfield code="s">327592</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0407.jpg?subformat=icon-640</subfield>
    <subfield code="x">icon-640</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="8">1150743</subfield>
    <subfield code="s">76882</subfield>
    <subfield code="u">http://cds.cern.ch/record/2058156/files/MAX_0407.jpg?subformat=icon-180</subfield>
    <subfield code="x">icon-180</subfield>
  </datafield>
  <datafield tag="859" ind1=" " ind2=" ">
    <subfield code="f">maximilien.brice@cern.ch</subfield>
  </datafield>
  <datafield tag="859" ind1=" " ind2=" ">
    <subfield code="f">Francois.Briard@cern.ch</subfield>
  </datafield>
  <datafield tag="916" ind1=" " ind2=" ">
    <subfield code="s">n</subfield>
    <subfield code="w">201541</subfield>
  </datafield>
  <datafield tag="923" ind1=" " ind2=" ">
    <subfield code="p">CERN</subfield>
    <subfield code="r">Briard Francois &lt;Francois.Briard@cern.ch></subfield>
  </datafield>
  <datafield tag="960" ind1=" " ind2=" ">
    <subfield code="a">86</subfield>
  </datafield>
  <datafield tag="963" ind1=" " ind2=" ">
    <subfield code="a">PUBLIC</subfield>
  </datafield>
  <datafield tag="963" ind1=" " ind2=" ">
    <subfield code="b">VISIBLE</subfield>
  </datafield>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">PHOTOLABCERN</subfield>
  </datafield>
</record>
</collection>
"""

Let's create the splitter!


In [30]:
# PhotoSplitter turns a CDS photo record (MARCXML) into one album record plus
# one record per image, as demonstrated in the cells that follow.
from cds.ext.record_split.photo import PhotoSplitter
splitter = PhotoSplitter()

Now, let's split the photo record. We want to create 1 album record, and 5 photo records. The album should contain the references to the photo records, and the photo records should contain a reference back to the album. A few notes:

  • image subformats should be ignored (as we are going to use IIIF)
  • references will be handled with jsonref: records will be stored with references, which will be expanded when needed (for example, before sending the records to Elasticsearch)

In [31]:
# Split the MARCXML string. The first entry of the result pairs the album
# record with the list of its image records (presumably there is one such
# entry per <record> in the input -- verify against PhotoSplitter.split).
records = splitter.split(CDS_PHOTO_RECORD)

In [32]:
# Our input held a single record, so only the first split result matters:
# index 0 is the album record, index 1 the list of image records.
album_record, image_records = records[0][0], records[0][1]

Let's set up a pretty-printer so that the records are easy to read:


In [33]:
import pprint
# A shared pretty-printer; indent=2 keeps the nested field dictionaries
# compact while still showing their structure.
pp = pprint.PrettyPrinter(indent=2)

In [16]:
# The album keeps the descriptive fields of the original record, drops the
# 856 file entries, and gains 774 links to the five new image records plus a
# 999 field marking it as an ALBUM.
pp.pprint(album_record)


{ '001': ['2058156'],
  '005': ['20151008225323.0'],
  u'0248_': { 'a': 'oai:cds.cern.ch:2058156', 'p': 'cerncds:FULLTEXT'},
  u'037__': { 'a': 'CERN-PHOTO-201510-197'},
  u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
              'a': 'Bennett, Sophia Elizabeth',
              'm': 'sophia.bennett@cern.ch',
              'u': 'CERN'},
  u'245__': { 'a': 'Big Bang Passport - New Location'},
  u'260__': { 'c': '2015'},
  u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
  u'500__': { 'a': 'General Photo'},
  u'506__': { 'a': 'public'},
  u'520__': { 'a': 'New loaction'},
  u'542__': { 'd': 'CERN', 'g': '2015'},
  u'595__': { 'a': 'CERN EDS', 's': 'PHOTOLAB'},
  u'65017': { '2': 'SzGeCERN', 'a': 'Photolab'},
  u'65027': { '2': 'SzGeCERN', 'a': 'Life at CERN'},
  u'6531_': { '9': 'CERN', 'a': 'Life at CERN'},
  u'690C_': [{ 'a': 'CERN'}, { 'a': 'PHOTO'}],
  u'774__': [ { 'a': 'IMAGE', 'r': '5000000'},
              { 'a': 'IMAGE', 'r': '5000001'},
              { 'a': 'IMAGE', 'r': '5000002'},
              { 'a': 'IMAGE', 'r': '5000003'},
              { 'a': 'IMAGE', 'r': '5000004'}],
  u'859__': [ { 'f': 'maximilien.brice@cern.ch'},
              { 'f': 'Francois.Briard@cern.ch'}],
  u'916__': { 's': 'n', 'w': '201541'},
  u'923__': { 'p': 'CERN', 'r': 'Briard Francois <Francois.Briard@cern.ch>'},
  u'960__': { 'a': '86'},
  u'963__': [{ 'a': 'PUBLIC'}, { 'b': 'VISIBLE'}],
  u'980__': { 'a': 'PHOTOLABCERN'},
  u'999__': { 'a': 'ALBUM'}}

In [34]:
# Each image record carries a new 001 id (5000000 onwards), copies of the
# 100/245/260/269/542 fields, a 774 link back to album 2058156, exactly one
# full-size 856 file entry (no icon subformats) and a 999 IMAGE marker.
pp.pprint(image_records)


[ { u'001': ['5000000'],
    u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
                'a': 'Bennett, Sophia Elizabeth',
                'm': 'sophia.bennett@cern.ch',
                'u': 'CERN'},
    u'245__': { 'a': 'Big Bang Passport - New Location'},
    u'260__': { 'c': '2015'},
    u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
    u'542__': { 'd': 'CERN', 'g': '2015'},
    u'774__': { 'a': 'ALBUM', 'r': '2058156'},
    u'8564_': { '8': '1150739',
                's': '20888154',
                'u': 'http://cds.cern.ch/record/2058156/files/MAX_0388.JPG'},
    u'999__': { 'a': 'IMAGE'}},
  { u'001': ['5000001'],
    u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
                'a': 'Bennett, Sophia Elizabeth',
                'm': 'sophia.bennett@cern.ch',
                'u': 'CERN'},
    u'245__': { 'a': 'Big Bang Passport - New Location'},
    u'260__': { 'c': '2015'},
    u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
    u'542__': { 'd': 'CERN', 'g': '2015'},
    u'774__': { 'a': 'ALBUM', 'r': '2058156'},
    u'8564_': { '8': '1150740',
                's': '18005815',
                'u': 'http://cds.cern.ch/record/2058156/files/MAX_0390.JPG'},
    u'999__': { 'a': 'IMAGE'}},
  { u'001': ['5000002'],
    u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
                'a': 'Bennett, Sophia Elizabeth',
                'm': 'sophia.bennett@cern.ch',
                'u': 'CERN'},
    u'245__': { 'a': 'Big Bang Passport - New Location'},
    u'260__': { 'c': '2015'},
    u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
    u'542__': { 'd': 'CERN', 'g': '2015'},
    u'774__': { 'a': 'ALBUM', 'r': '2058156'},
    u'8564_': { '8': '1150741',
                's': '24104669',
                'u': 'http://cds.cern.ch/record/2058156/files/MAX_0396.JPG'},
    u'999__': { 'a': 'IMAGE'}},
  { u'001': ['5000003'],
    u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
                'a': 'Bennett, Sophia Elizabeth',
                'm': 'sophia.bennett@cern.ch',
                'u': 'CERN'},
    u'245__': { 'a': 'Big Bang Passport - New Location'},
    u'260__': { 'c': '2015'},
    u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
    u'542__': { 'd': 'CERN', 'g': '2015'},
    u'774__': { 'a': 'ALBUM', 'r': '2058156'},
    u'8564_': { '8': '1150742',
                's': '21212927',
                'u': 'http://cds.cern.ch/record/2058156/files/MAX_0399.JPG'},
    u'999__': { 'a': 'IMAGE'}},
  { u'001': ['5000004'],
    u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
                'a': 'Bennett, Sophia Elizabeth',
                'm': 'sophia.bennett@cern.ch',
                'u': 'CERN'},
    u'245__': { 'a': 'Big Bang Passport - New Location'},
    u'260__': { 'c': '2015'},
    u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
    u'542__': { 'd': 'CERN', 'g': '2015'},
    u'774__': { 'a': 'ALBUM', 'r': '2058156'},
    u'8564_': { '8': '1150743',
                's': '21257737',
                'u': 'http://cds.cern.ch/record/2058156/files/MAX_0407.JPG'},
    u'999__': { 'a': 'IMAGE'}}]

In [35]:
# Sanity check: there should be one record per full-size image (5 here).
# The parenthesised call prints the same value on Python 2 and is also valid
# Python 3, unlike the bare `print` statement.
print(len(image_records))


5

In [36]:
# Look at the first image record on its own.
pp.pprint(image_records[0])


{ u'001': ['5000000'],
  u'100__': { '0': 'AUTHOR|(SzGeCERN)780240',
              'a': 'Bennett, Sophia Elizabeth',
              'm': 'sophia.bennett@cern.ch',
              'u': 'CERN'},
  u'245__': { 'a': 'Big Bang Passport - New Location'},
  u'260__': { 'c': '2015'},
  u'269__': { 'a': 'Geneva', 'b': 'CERN', 'c': '2015-10-08'},
  u'542__': { 'd': 'CERN', 'g': '2015'},
  u'774__': { 'a': 'ALBUM', 'r': '2058156'},
  u'8564_': { '8': '1150739',
              's': '20888154',
              'u': 'http://cds.cern.ch/record/2058156/files/MAX_0388.JPG'},
  u'999__': { 'a': 'IMAGE'}}

That's it! One CDS record with 5 images was split into 6 CDSLabs records (1 album and 5 images) that are ready to be translated into JSON using doJSON (the first tutorial of the day).


In [ ]: