In [24]:
from xml.etree import ElementTree as ET
import pandas as pd
In [2]:
# Load the XML file that the example cells below read from.
document_tree = ET.parse('./data/mondial_database_less.xml')
In [3]:
# Show the <name> text of every element directly below the document root.
for country in document_tree.getroot():
    name_node = country.find('name')
    print(name_node.text)
In [4]:
# print names of all countries and their cities
for country in document_tree.iterfind('country'):
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    # It visits every <city> descendant of the country element, not only its
    # direct children.
    city_names = [city.find('name').text for city in country.iter('city')]
    # One output line per country: "* Country: City A, City B".
    # (The previous `print(...),` was a Python 2 idiom; in Python 3 the
    # trailing comma no longer suppresses the newline.)
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))
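Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find: the 10 countries with the lowest infant mortality rates; the 10 cities with the largest population; the 10 ethnic groups with the largest overall populations; and the name and country of the longest river, the largest lake, and the airport at the highest elevation.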
In [21]:
# Load the full XML file that all of the analysis cells below query.
document = ET.parse('./data/mondial_database.xml')
In [57]:
# Countries that carry an <infant_mortality> child, collected once so both
# columns are built from the same sequence of elements.
countries_with_rate = document.findall('country[infant_mortality]')
names = [c.find('name').text for c in countries_with_rate]
infant_mortalities = [
    float(c.find('infant_mortality').text) for c in countries_with_rate
]
results = pd.DataFrame({'name': names, 'infant_mortality': infant_mortalities})
# Ascending sort puts the lowest rates first; show the first ten rows.
results.sort_values('infant_mortality').head(10)
Out[57]:
In [73]:
# Ten cities with the largest population.
#
# Walk every <city> descendant of the document. The earlier paths
# 'country[city]' and 'city[population]' match direct children only, so any
# city nested deeper in a country (e.g. inside a <province>) was silently
# left out of the ranking.
cities = []
populations = []
for city in document.iter('city'):
    counts = [int(p.text) for p in city.findall('population')]
    if not counts:
        continue  # no population figure recorded for this city
    cities.append(city.find('name').text)
    # NOTE(review): the last <population> entry is taken as the most recent
    # estimate, which assumes the entries are in chronological order -- confirm.
    populations.append(counts[-1])
results = pd.DataFrame({'city': cities, 'population': populations})
results.sort_values('population', ascending=False).head(10)
Out[73]:
In [154]:
# Ten ethnic groups with the largest estimated head-count, summed over all
# countries that list the group.
ethnic_groups = [e.text for e in document.findall('.//ethnicgroup')]
ethnic_dict = dict.fromkeys(ethnic_groups, 0)
for element in document.iterfind('country[ethnicgroup]'):
    counts = [int(p.text) for p in element.findall('population')]
    if not counts:
        continue  # cannot estimate head-counts without a population figure
    # NOTE(review): the last <population> entry is taken as the latest
    # estimate -- assumes chronological order in the file.
    population = counts[-1]
    for group in element.findall('ethnicgroup'):
        # The 'percentage' attribute is taken to be on a 0-100 scale, so it
        # must be divided by 100 before scaling the population; multiplying
        # by the raw value overstated every total by a factor of 100.
        share = float(group.get('percentage')) / 100.0
        ethnic_dict[group.text] += share * population
pd.Series(ethnic_dict).round().astype(int).sort_values(ascending=False).head(10)
Out[154]:
In [ ]:
# Lookup table from a country's 'car_code' attribute to its <name> text,
# used by later cells to turn country codes into readable names.
country_dict = {
    country.get('car_code'): country.find('name').text
    for country in document.iterfind('country')
}
In [136]:
# Longest river: scan every <river> that has a <length> child and keep the
# running maximum (the first one wins on ties because the test is strict).
river_name, river_code, river_length = '', '', 0.0
for river in document.iterfind('river[length]'):
    length = float(river.find('length').text)
    if length > river_length:
        river_length = length
        river_name = river.find('name').text
        river_code = river.get('country')
# The 'country' attribute is split on single spaces and each code is looked
# up in country_dict.
country_names = [country_dict[code] for code in river_code.split(' ')]
countries = ', '.join(country_names)
print('longest river \n name: {}\n countries: {}'.format(river_name, countries))
In [137]:
# Largest lake: linear scan over every <lake> with an <area> child, keeping
# the entry with the greatest area seen so far.
lake_name = ''
lake_code = ''
lake_area = 0.0
for lake in document.iterfind('lake[area]'):
    area = float(lake.find('area').text)
    if area > lake_area:
        lake_area, lake_name, lake_code = (
            area,
            lake.find('name').text,
            lake.get('country'),
        )
# Translate each space-separated code of the 'country' attribute via
# country_dict.
countries = ', '.join(country_dict[code] for code in lake_code.split(' '))
print('largest lake\n name: {}\n countries: {}'.format(lake_name, countries))
In [152]:
# Highest airport: scan every <airport> with an <elevation> child and keep
# the one with the greatest elevation.
airport_name, airport_code, airport_elevation = '', '', 0.0
for airport in document.iterfind('airport[elevation]'):
    elevation_text = airport.find('elevation').text
    # An <elevation> element with no text cannot be compared, so it is ignored.
    if elevation_text is not None and float(elevation_text) > airport_elevation:
        airport_elevation = float(elevation_text)
        airport_name = airport.find('name').text
        airport_code = airport.get('country')
# Translate the code(s) of the 'country' attribute into names via country_dict.
country_names = [country_dict[code] for code in airport_code.split(' ')]
countries = ', '.join(country_names)
print('highest airport elevation\n name: {}\n countries: {}'.format(airport_name, countries))