In [3]:
import csv
import string
import xml.etree.ElementTree as ET
In [4]:
# Sample MARCXML <record> (one bibliographic record) used by the exploratory
# cells below. The literal is split into one piece per field; adjacent string
# literals are concatenated by Python, so the value is a single string.
# The line break inside the 245 $c value is written as "\n" because a
# single-quoted literal cannot span physical lines.
xml = (
    '<record xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">'
    '<leader>00000cam a 00000Ia </leader>'
    '<controlfield tag="001">1000686</controlfield>'
    '<controlfield tag="008">740906s1972 sp 000 0 spa d</controlfield>'
    '<datafield tag="010" ind1=" " ind2=" "><subfield code="a"> 74357424 </subfield></datafield>'
    '<datafield tag="029" ind1="1" ind2=" "><subfield code="a">AU@</subfield><subfield code="b">000028090919</subfield></datafield>'
    '<datafield tag="040" ind1=" " ind2=" "><subfield code="a">NAM</subfield><subfield code="b">eng</subfield><subfield code="c">NAM</subfield><subfield code="d">GZM</subfield><subfield code="d">OCL</subfield><subfield code="d">OCLCQ</subfield></datafield>'
    '<datafield tag="041" ind1="1" ind2=" "><subfield code="a">spa</subfield><subfield code="h">fre</subfield></datafield>'
    '<datafield tag="050" ind1=" " ind2="4"><subfield code="a">PQ2637.A9236</subfield><subfield code="b">C5</subfield></datafield>'
    '<datafield tag="100" ind1="1" ind2=" "><subfield code="a">Sauvajon, Marc-Gilbert.</subfield></datafield>'
    '<datafield tag="245" ind1="1" ind2="0"><subfield code="a">Chao! /</subfield><subfield code="c">Comedia en dos actos ... de Marc-Gilbert Sauvajon. \nAdaptación castellana de Vicente Balart.</subfield></datafield>'
    '<datafield tag="260" ind1=" " ind2=" "><subfield code="a">[Madrid] :</subfield><subfield code="b">Escelicer,</subfield><subfield code="c">[1972]</subfield></datafield>'
    '<datafield tag="300" ind1=" " ind2=" "><subfield code="a">85 pages ;</subfield><subfield code="c">16 cm.</subfield></datafield>'
    '<datafield tag="336" ind1=" " ind2=" "><subfield code="a">text</subfield><subfield code="b">txt</subfield><subfield code="2">rdacontent</subfield></datafield>'
    '<datafield tag="337" ind1=" " ind2=" "><subfield code="a">unmediated</subfield><subfield code="b">n</subfield><subfield code="2">rdamedia</subfield></datafield>'
    '<datafield tag="338" ind1=" " ind2=" "><subfield code="a">volume</subfield><subfield code="b">nc</subfield><subfield code="2">rdacarrier</subfield></datafield>'
    '<datafield tag="490" ind1="0" ind2=" "><subfield code="a">Colección Teatro ;</subfield><subfield code="v">734</subfield></datafield>'
    '<datafield tag="700" ind1="1" ind2=" "><subfield code="a">Balart, Vicente.</subfield></datafield>'
    '<datafield tag="776" ind1="0" ind2="8"><subfield code="i">Online version:</subfield><subfield code="a">Sauvajon, Marc-Gilbert.</subfield><subfield code="t">Chao!</subfield><subfield code="d">[Madrid] Escelicer, [1972]</subfield><subfield code="w">(OCoLC)659093459</subfield></datafield>'
    '</record> '
)
In [5]:
# Parse the sample MARCXML string; `elem` is the root <record> element.
elem = ET.fromstring(xml)
In [6]:
# Direct children of the parsed record element, in document order.
# list(elem) is used because Element.getchildren() was removed in Python 3.9.
child = list(elem)
In [7]:
# Attribute dict of the record's second child (in the sample: the 001 controlfield).
child[1].attrib
Out[7]:
In [8]:
# Text content of the record's second child (in the sample: the 001 controlfield value).
child[1].text
Out[8]:
In [9]:
# Attribute dict of the record's fifth child (in the sample: the 029 datafield,
# which carries tag / ind1 / ind2 attributes).
child[4].attrib
Out[9]:
In [10]:
# Scratch arithmetic: 1,500,000 is an exact multiple of 50,000, so this evaluates to 0.
1500000 % 50000
Out[10]:
In [11]:
# This is a hack for now; a real Python 3 MARC record parser should replace it.
def _clean_marc_text(text):
    """Trim surrounding ASCII punctuation and whitespace from a MARC value.

    Returns None when ``text`` is None (an element with no text content).
    """
    if text is None:
        return None
    return text.strip(string.punctuation).strip().strip(string.punctuation)


def _subfield_values(datafield):
    """Yield ``(code, cleaned_text)`` for every subfield of ``datafield`` that has text."""
    for subfield in datafield:
        value = _clean_marc_text(subfield.text)
        if value is not None:
            yield subfield.attrib.get("code", ""), value


def _marc_record_to_row(record_xml):
    """Parse one MARCXML ``<record>`` string into a row matching the TSV header."""
    control_number = None
    title = None
    uniformtitle = None
    author = None
    publisher = None
    pubplace = None
    pubyear = None
    trans = False
    original = None

    for field in ET.fromstring(record_xml):
        attrs = field.attrib
        tag = attrs.get("tag", "")
        if not tag:
            # Elements without a tag attribute (e.g. the leader) carry nothing we extract.
            continue

        # Control number
        if tag == "001":
            control_number = _clean_marc_text(field.text)
        # Title: $a, with $b appended when present
        elif tag == "245":
            for code, value in _subfield_values(field):
                if code == "a":
                    title = value
                elif code == "b" and title is not None:
                    # NOTE(review): $b is appended with no separator, so
                    # "Title :" + "subtitle /" becomes "Titlesubtitle" -- confirm
                    # whether a space (or ": ") is wanted here.
                    title += value
            if title is not None:
                title = title.rstrip("/")
        # Uniform title: first 130/240 field encountered wins
        elif tag in ("130", "240") and uniformtitle is None:
            for code, value in _subfield_values(field):
                if code == "a":
                    uniformtitle = value
        # Author: first 100/110/111 field encountered wins
        elif tag in ("100", "110", "111") and author is None:
            for code, value in _subfield_values(field):
                if code == "a":
                    author = value
        # Publication info. In field 264 the *second* indicator holds the
        # function of the entity (1 = publication); the first indicator is
        # only the sequence of statements.
        elif tag == "260" or (tag == "264" and attrs.get("ind2", "") == "1"):
            for code, value in _subfield_values(field):
                if code == "a":
                    pubplace = value
                elif code == "b":
                    publisher = value
                elif code == "c":
                    # "©" is not ASCII punctuation, so it survives the cleaning step.
                    pubyear = value.lstrip("©")
        # Language info: ind1 == "1" marks a translation, $h is the original language
        elif tag == "041":
            if attrs.get("ind1", "") == "1":
                trans = True
            for code, value in _subfield_values(field):
                if code == "h":
                    original = value

    return [control_number, title, uniformtitle, author,
            publisher, pubplace, pubyear, trans, original]


def write_tsv(marc_file, out_file):
    """Convert a file of MARCXML records (one ``<record>`` per line) to TSV.

    A header row is written first, then one row per record with the columns
    control_number, title, uniform_title, author, publisher, pub_location,
    pub_year, translation and prev_language. Values that are absent from a
    record are written as empty cells. Whitespace-only lines are skipped.
    A progress message is printed every 500000 input lines.

    Args:
        marc_file: Path of the input file; read as UTF-8 text.
        out_file: Path of the TSV file to create or overwrite (UTF-8).

    Raises:
        xml.etree.ElementTree.ParseError: If a non-blank line is not
            well-formed XML.
        OSError: If either file cannot be opened.
    """
    header = ['control_number', 'title', 'uniform_title', 'author',
              'publisher', 'pub_location', 'pub_year', 'translation',
              'prev_language']
    # The input is opened first so a missing input does not truncate an
    # existing output file; both handles are closed even if parsing fails.
    # newline='' lets the csv module control line endings itself.
    with open(marc_file, 'r', encoding='utf-8') as source, \
            open(out_file, 'w', encoding='utf-8', newline='') as target:
        writer = csv.writer(target, delimiter='\t')
        writer.writerow(header)
        for i, line in enumerate(source):
            if i != 0 and not i % 500000:
                print("processed {} records".format(i))
            if not line.strip():
                continue
            writer.writerow(_marc_record_to_row(line))
In [12]:
# Run the conversion: read data/spanish.xml and write data/output.tsv
# (both paths are relative to the current working directory).
write_tsv('data/spanish.xml', 'data/output.tsv')
In [ ]: