In [1]:
import zipfile

In [137]:
def hundle_file(inputfile, outputfile):
    """Flatten a zipped multi-document XML file to one document per line.

    The first member of the zip archive ``inputfile`` is read line by line.
    Each line is stripped of surrounding whitespace and appended to the
    current output line. A line starting with the XML declaration marks the
    beginning of a new document: it ends the current output line and is not
    copied itself. The result is that every line of ``outputfile`` holds one
    declaration-free XML document terminated by ``\\r\\n``.

    The very first line of the member is always discarded; it is assumed to
    be the declaration of the first document.

    Args:
        inputfile: Path (or file-like object) of the zip archive to read.
        outputfile: Path of the file to create; overwritten if it exists.

    Returns:
        None.

    Raises:
        IndexError: If the archive contains no members.
    """
    # Work on raw bytes: the zip member is read in binary mode, so comparing
    # and writing bytes behaves the same on Python 2 and 3 and keeps the
    # explicit "\r\n" from being newline-translated by a text-mode file.
    xml_declaration = b'<?xml version="1.0" encoding="UTF-8"?>'

    # Context managers guarantee all three handles are closed even if a read
    # or write fails part-way through.
    with zipfile.ZipFile(inputfile, 'r') as archive:
        with archive.open(archive.namelist()[0]) as targetxml:
            with open(outputfile, "wb") as outputxml:
                targetxml.readline()  # skip the first document declaration

                for line in targetxml:
                    if line.startswith(xml_declaration):
                        # A new document starts here: terminate the previous
                        # output line and drop the declaration itself.
                        outputxml.write(b"\r\n")
                        continue
                    outputxml.write(line.strip())

                outputxml.write(b"\r\n")  # terminate the last document

In [138]:
%%time
# Flatten the zipped XML into data/result.xml. hundle_file returns None,
# so `data` is bound to None; the useful result is the file on disk.
data = hundle_file('data/ipa150716.zip', 'data/result.xml')


CPU times: user 40 s, sys: 2.13 s, total: 42.1 s
Wall time: 44 s

In [139]:
from xml.etree import ElementTree
# Open the flattened output for reading; the handle is kept open so the
# following cells can pull lines from it.
# NOTE(review): no close() is visible in this part of the notebook - confirm
# the handle is closed once it is no longer needed.
inputfile = open('data/result.xml','r')

In [140]:
# Read the first line of data/result.xml (hundle_file writes one whole
# document per line, so this should be a single XML document).
line = inputfile.readline()

In [141]:
# Parse that single line as XML; `node` is the root element of the document.
node = ElementTree.fromstring(line)

In [146]:
# Display the attributes of the first <us-bibliographic-data-application>
# child of the root as (name, value) pairs.
node.find("us-bibliographic-data-application").items()


Out[146]:
[('lang', 'EN'), ('country', 'US')]

In [179]:
# Text of the first <p> inside <abstract>. ElementTree position predicates
# are 1-based: [0] is rejected by current Python versions and silently
# selected the *last* <p> on older ones, so [1] is the correct index.
node.findall('./abstract/p[1]')[0].text


Out[179]:
"The user places the foot on the foot rest to create an axils, so to prevent the rod from slipping backward, therefore, forcing the rod to pivot on itself and elevate the shovel's scoop, as the user pulls on the shovel's handle upwards."

In [ ]: