In [1]:
import zipfile
In [137]:
def hundle_file(inputfile, outputfile):
    """Flatten the first member of a zip archive to one XML document per line.

    The first member of the archive is treated as a concatenation of XML
    documents, each introduced by a line starting with
    ``<?xml version="1.0" encoding="UTF-8"?>``. Every declaration line is
    dropped, the remaining lines of a document are stripped of surrounding
    whitespace and joined without separators, and each document is terminated
    with ``\\r\\n`` in the output file.

    Args:
        inputfile: Path (or file-like object) of the zip archive to read.
        outputfile: Path of the file to create or overwrite.

    Returns:
        None. The result is the file written to ``outputfile``.

    Raises:
        IndexError: If the archive contains no members.
    """
    # ZipFile.open() yields bytes, so the whole pipeline stays in bytes and the
    # output is opened in binary mode (which also keeps "\r\n" untranslated).
    xml_declaration = b'<?xml version="1.0" encoding="UTF-8"?>'

    with zipfile.ZipFile(inputfile, 'r') as archive:
        with archive.open(archive.namelist()[0]) as targetxml:
            with open(outputfile, "wb") as outputxml:
                # The first line is unconditionally discarded; it is expected
                # to be the declaration of the first document.
                targetxml.readline()
                for line in targetxml:
                    if line.startswith(xml_declaration):
                        # A new document starts: end the previous output line
                        # and skip the declaration itself.
                        outputxml.write(b"\r\n")
                        continue
                    outputxml.write(line.strip())
                # Terminate the last document.
                outputxml.write(b"\r\n")
In [138]:
%%time
# hundle_file has no return statement, so `data` is bound to None; the useful
# result of this cell is the file written to data/result.xml.
data = hundle_file('data/ipa150716.zip', 'data/result.xml')
In [139]:
from xml.etree import ElementTree
# Open the file written by hundle_file for reading. The handle is kept open
# so that the following cells can read from it.
inputfile = open('data/result.xml','r')
In [140]:
# Read a single line from the open file (one readline call per execution).
line = inputfile.readline()
In [141]:
# Parse that one line as an XML document; fromstring returns its root Element.
node = ElementTree.fromstring(line)
In [146]:
# Show the (name, value) attribute pairs of the first direct child with this tag.
# find() returns None when no such child exists, in which case .items() raises
# AttributeError.
node.find("us-bibliographic-data-application").items()
Out[146]:
In [179]:
# ElementTree position predicates are 1-based: p[1] selects the first <p>
# child of <abstract>. p[0] is not a valid position.
node.findall('./abstract/p[1]')[0].text
Out[179]:
In [ ]: