XML Processing

Here are the contents of input/menu.xml:
<breakfast_menu>
    <food name="Belgian Waffles">
        <price>$5.95</price>
        <description>two of our famous Belgian Waffles with plenty of real maple syrup</description>
        <calories>650</calories>
    </food>
    <food name="Strawberry Belgian Waffles">
        <price>$7.95</price>
        <description>light Belgian waffles covered with strawberries and whipped cream</description>
        <calories>900</calories>
    </food>
    <food name="Berry-Berry Belgian Waffles">
        <price>$8.95</price>
        <description>light Belgian waffles covered with an assortment of fresh berries and whipped cream</description>
        <calories>900</calories>
    </food>
    <food name="French Toast">
        <price>$4.50</price>
        <description>thick slices made from our homemade sourdough bread</description>
        <calories>600</calories>
    </food>
    <food name="Homestyle Breakfast">
        <price>$6.95</price>
        <description>two eggs, bacon or sausage, toast, and our ever-popular hash browns</description>
        <calories>950</calories>
    </food>
</breakfast_menu>

Use ElementTree


In [1]:
# import the appropriate libraries
import xml.etree.ElementTree as ET # xml processing

In [2]:
# read the XML file and parse it into an ElementTree object
tree = ET.parse('input/menu.xml')

In [4]:
# show the parsed tree object itself
print('tree element:\t', tree)


tree element:	 <xml.etree.ElementTree.ElementTree object at 0x10be4fa90>

In [82]:
# get the root element of the tree (the outermost tag of the document)
root = tree.getroot()

In [83]:
# print() must be called as a function: the Python 2 statement form is a
# SyntaxError on Python 3, and the other cells already use the function form
print('root element:\t ', root)


root element:	  <Element 'breakfast_menu' at 0x19d4c50>

In [84]:
# here is the name of the root element: its tag.
# As the last expression in the cell, it is displayed as the cell's output.
root.tag


Out[84]:
'breakfast_menu'

In [85]:
# get the children of breakfast_menu, the next level down the tree
# (Element.getchildren() was removed in Python 3.9; list(element) is the
# documented replacement and returns the same direct children)
children = list(root)

In [86]:
# list the tag name of each direct child of the root
for child in children:
    print(child.tag)


food
food
food
food
food
food

In [87]:
# for each child (node), get its children and print out their names (tags)
for child in children:
    # list(child) replaces Element.getchildren(), which was removed in Python 3.9
    grand_children = list(child)
    print(child.tag, '\t', child.attrib)
    for grand_child in grand_children:
        print(grand_child.tag)


food 	{'category': 'breakfast', 'name': 'Belgian Waffles'}
price
description
calories
food 	{'category': 'breakfast', 'name': 'Strawberry Belgian Waffles'}
price
description
calories
food 	{'category': 'breakfast', 'name': 'Berry-Berry Belgian Waffles'}
price
description
calories
food 	{'category': 'breakfast', 'name': 'French Toast'}
price
description
calories
food 	{'category': 'breakfast', 'name': 'Homestyle Breakfast'}
price
description
calories
food 	{'name': 'Three-Egg Omlette'}
price
description
calories

In [88]:
# gather all <food> elements directly under the root and report how many there are
food_tags = root.findall('food')
food_tag_count = len(food_tags)
print('number of food tags = ', food_tag_count)


number of food tags =  6

In [89]:
# print the <food> tags - it's not what you would expect:
# the list shows Element objects, not their XML content
print (food_tags)


[<Element 'food' at 0x19d4cd0>, <Element 'food' at 0x19d4dd0>, <Element 'food' at 0x19d4f90>, <Element 'food' at 0x19d8110>, <Element 'food' at 0x19d8250>, <Element 'food' at 0x19d8a10>]

In [90]:
# access the entries in the list by index, like any other Python list
first_food_item = food_tags[0]
print ('the first child node is:\t', first_food_item)


the first child node is:	<Element 'food' at 0x19d4cd0>

In [91]:
# here's how we can view a child node's content:
# ET.dump writes the element's XML (tag, attributes and subnodes) to the output
ET.dump(first_food_item)


<food category="breakfast" name="Belgian Waffles">
		<price>$5.95</price>
		<description>two of our famous Belgian Waffles with plenty of real maple syrup</description>
		<calories>650</calories>
	</food>
	

In [92]:
# walk from the root to the first <food> node, then to its <price> subnode
section, tag = 'food', 'price'
node = root.find(section)
subnode = node.find(tag)
print("Path to Price subnode of Food node:")
# print() already applies str() to each argument, so root is passed as-is
print("Root:", root, " Node: ", node, "Subnode: ", subnode)


Path to Price subnode of Food node:
Root: <Element 'breakfast_menu' at 0x19d4c50>  Node:  <Element 'food' at 0x19d4cd0> Subnode:  <Element 'price' at 0x19d4d10>


In [93]:
#node = root.find('food')
#subnode = node.find('prince')
#subsubnode = subnode.find('curr')

In [94]:
# Look up the 'name' attribute on the first <food> node
# (`section` is the tag name set in the preceding cell)
node = root.find(section)
attribute = node.attrib['name']
print("Path to Name attribute of Food node:")
# print() already applies str() to each argument, so root is passed as-is
print("Root:", root, " Node: ", node, "Attribute: ", attribute)


Path to Name attribute of Food node:
Root: <Element 'breakfast_menu' at 0x19d4c50>  Node:  <Element 'food' at 0x19d4cd0> Attribute:  Belgian Waffles


In [95]:
# Walk every node under the root: print each node's tag and attributes,
# then the tag and text of each of its subnodes.
# (print is called as a function; the Python 2 statement form is a SyntaxError on Python 3)
print("All nodes, subnodes and attributes:")
for node in root:
    print(node.tag, node.attrib)
    for subnode in node:
        print(subnode.tag, subnode.text)


All nodes, subnodes and attributes:
food {'category': 'breakfast', 'name': 'Belgian Waffles'}
price $5.95
description two of our famous Belgian Waffles with plenty of real maple syrup
calories 650
food {'category': 'breakfast', 'name': 'Strawberry Belgian Waffles'}
price $7.95
description light Belgian waffles covered with strawberries and whipped cream
calories 900
food {'category': 'breakfast', 'name': 'Berry-Berry Belgian Waffles'}
price $8.95
description light Belgian waffles covered with an assortment of fresh berries and whipped cream
calories 900
food {'category': 'breakfast', 'name': 'French Toast'}
price $6.50
description thick slices made from our homemade sourdough bread
calories 600
food {'category': 'breakfast', 'name': 'Homestyle Breakfast'}
price $6.95
description two eggs, bacon or sausage, toast, and our ever-popular hash browns
calories 950
food {'name': 'Three-Egg Omlette'}
price $7.95
description three-egg omlette with your choice of meat, cheese and vegetables
calories 900


In [96]:
# Tag every <food> element in the tree with category="breakfast"
for node in tree.iter('food'):
    node.set('category', 'breakfast')

In [97]:
# you can search by name
name = 'Belgian Waffles'
# select the <food> elements whose name attribute matches; the predicate is
# attached directly to the tag, as in the documented XPath subset
for selected_name in root.findall("./food[@name='%s']" % name):
    # print the description associated with the selected name
    # (print is called as a function so the cell also runs on Python 3)
    print("Found %s!" % name)
    print(name, ":", selected_name.find('description').text)


Found Belgian Waffles!
Belgian Waffles : two of our famous Belgian Waffles with plenty of real maple syrup

In [98]:
# Locate the 'French Toast' entry and overwrite the text of its <price> subnode
for node in tree.iter('food'):
    if node.attrib['name'] != 'French Toast':
        continue  # not the entry we want to change
    subnode = node.find('price')
    print("Subnode text: ", subnode.text)
    subnode.text = '$6.50'
    print("Modified subnode text: ", subnode.text)


Subnode text:  $6.50
Modified subnode text:  $6.50

In [99]:
# Append a new <food> entry, with price/description/calories subnodes, to the root
new_name = 'Three-Egg Omlette'
new_price = '$7.95'
new_description = 'three-egg omlette with your choice of meat, cheese and vegetables'
new_calories = '900'

# the name attribute is passed as a keyword argument instead of an attrib dict
food_node = ET.SubElement(root, 'food', name=new_name)

# create the subnodes in document order, then fill in their text
price_subnode = ET.SubElement(food_node, 'price')
description_subnode = ET.SubElement(food_node, 'description')
calories_subnode = ET.SubElement(food_node, 'calories')

price_subnode.text = new_price
description_subnode.text = new_description
calories_subnode.text = new_calories

In [100]:
#<el name='x'> 4 </el>
#<price amount='5.5' cur='$$'/>

In [106]:
# Write out the modified XML
# NOTE(review): presumably the output/ directory must already exist - confirm before running on a fresh checkout
tree.write('output/outputMenu.xml')

Use xmltodict

xmltodict is another simple library that aims at making XML feel like working with JSON.


In [5]:
import xmltodict

# Read the file as bytes so that the XML parser, rather than the platform's
# default text encoding, decides how to decode the document
with open('input/menu.xml', 'rb') as fd:
    doc = xmltodict.parse(fd.read())

In [6]:
# show the parsed document: nested dictionaries keyed by tag name,
# with XML attributes stored under '@'-prefixed keys
print(doc)


OrderedDict([('breakfast_menu', OrderedDict([('food', [OrderedDict([('@name', 'Belgian Waffles'), ('price', '$5.95'), ('description', 'two of our famous Belgian Waffles with plenty of real maple syrup'), ('calories', '650')]), OrderedDict([('@name', 'Strawberry Belgian Waffles'), ('price', '$7.95'), ('description', 'light Belgian waffles covered with strawberries and whipped cream'), ('calories', '900')]), OrderedDict([('@name', 'Berry-Berry Belgian Waffles'), ('price', '$8.95'), ('description', 'light Belgian waffles covered with an assortment of fresh berries and whipped cream'), ('calories', '900')]), OrderedDict([('@name', 'French Toast'), ('price', '$4.50'), ('description', 'thick slices made from our homemade sourdough bread'), ('calories', '600')]), OrderedDict([('@name', 'Homestyle Breakfast'), ('price', '$6.95'), ('description', 'two eggs, bacon or sausage, toast, and our ever-popular hash browns'), ('calories', '950')])])]))])

Use untangle

untangle is a simple library that takes an XML document and returns a Python object mirroring the nodes and attributes in its structure.


In [3]:
import untangle
# parse the XML file into an object whose attributes mirror the document's nodes
obj = untangle.parse('input/menu.xml')
# the <calories> element of the first <food> entry; as the last expression it is shown as the cell's output
obj.breakfast_menu.food[0].calories


Out[3]:
Element(name = calories, attributes = {}, cdata = 650)

In [ ]: