XML example and exercise


  • study examples of accessing nodes in XML tree structure
  • work on exercise to be completed and submitted



In [74]:
from xml.etree import ElementTree as ET

XML example


In [75]:
document_tree = ET.parse( './data/mondial_database_less.xml' )

In [76]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)


Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra

In [77]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':', end=''),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])


* Albania:Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:Skopje, Kumanovo
* Serbia:Beograd, Novi Sad, Niš
* Montenegro:Podgorica
* Kosovo:Prishtine
* Andorra:Andorra la Vella

XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

  1. 10 countries with the lowest infant mortality rates
  2. 10 cities with the largest population
  3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
  4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [78]:
document = ET.parse( './data/mondial_database.xml' )

In [89]:
# print child and attributes
#for child in document.getroot():
#    print (child.tag, child.attrib)

In [80]:
import pandas as pd

In [84]:
# Create a list of country and their Infant Mortality Rate 
country_imr=[]
for country in document.getroot().findall('country'):
    name = country.find('name').text
    infant_mortality_rate = country.find('infant_mortality')
    if infant_mortality_rate is not None:
        infant_mortality_rate=infant_mortality_rate.text
    else :
        infant_mortality_rate = -1
    country_imr.append((name, (float)(infant_mortality_rate)))

10 countries with the lowest infant mortality rates


In [88]:
df = pd.DataFrame(country_imr, columns=['Country', 'Infant_Mortality_Rate'])
df_unknown_removed = df[df.Infant_Mortality_Rate != -1] 
df_unknown_removed.set_index('Infant_Mortality_Rate').sort().head(10)


Out[88]:
Country
Infant_Mortality_Rate
1.81 Monaco
2.13 Japan
2.48 Bermuda
2.48 Norway
2.53 Singapore
2.60 Sweden
2.63 Czech Republic
2.73 Hong Kong
3.13 Macao
3.15 Iceland

In [149]:
city_population=[]
for country in document.iterfind('country'):
    for state in country.iterfind('province'):
        for city in state.iterfind('city'):
            try:
                city_population.append((city.find('name').text, float(city.find('population').text)))
            except:
                next
    for city in country.iterfind('city'):
        try:
            city_population.append((city.find('name').text, float(city.find('population').text)))
        except:
            next

10 cities with the largest population


In [165]:
df = pd.DataFrame(city_population, columns=['City', 'Population'])
#df.info()
df.sort_index(by='Population', ascending=False).head(10)


Out[165]:
City Population
1763 Seoul 10229262
1421 Mumbai 9925891
2594 São Paulo 9412894
1629 Jakarta 8259266
1251 Shanghai 8205598
1942 Ciudad de México 8092449
443 Moskva 8010954
1725 Tokyo 7843000
1250 Beijing 7362426
1467 Delhi 7206704

In [229]:
ethnic_population={}
country_population={}
for country in document.iterfind('country'):
    try:
        country_population[country.find('name').text]= float(country.find('population').text)
    except:
        next
    for state in country.iterfind('province' or 'state'):
        try:
            country_population[country.find('name').text] += float(state.find('population').text)
        except:
            next
        for city in state.iterfind('city'):
            try:
                country_population[country.find('name').text] += float(city.find('population').text)
            except:
                next

for country in document.iterfind('country'):
    for ethnicgroup in country.iterfind('ethnicgroup'):
        try:
            if ethnicgroup.text in ethnic_population:
                ethnic_population[ethnicgroup.text] += country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
            else:
                ethnic_population[ethnicgroup.text] = country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
        except:
            next

10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)


In [241]:
pd.DataFrame(sorted(ethnic_population.items(), key=lambda x:x[1], reverse=True)[:10], columns=['Ethnic_Groups', 'Population'])


Out[241]:
Ethnic_Groups Population
0 Han Chinese 1.593119e+09
1 Indo-Aryan 7.776357e+08
2 European 6.668512e+08
3 African 2.896678e+08
4 Russian 2.705583e+08
5 Dravidian 2.700124e+08
6 Japanese 2.506371e+08
7 German 1.706358e+08
8 Mestizo 1.666139e+08
9 Javanese 1.413179e+08

Longest River


In [249]:
rivers_list=[]
rivers_df = pd.DataFrame()
for rivers in document.iterfind('river'):
    try:
        rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})
    except:
        next
rivers_df = pd.DataFrame(rivers_list)
rivers_df.sort('length', ascending=False).head(1)


Out[249]:
country length name
161 CO 6448 Amazonas

Largest Lake


In [253]:
lake_list=[]
lake_df = pd.DataFrame()
for lakes in document.iterfind('lake'):
    try:
        lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})
    except:
        next
lakes_df = pd.DataFrame(lake_list)
lakes_df.sort('area', ascending=False).head(1)


Out[253]:
area country name
42 386400 R Caspian Sea

Airport At Highest Elevation


In [256]:
ap_list=[]
ap_df = pd.DataFrame()
for ap in document.iterfind('airport'):
    try:
        ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})
    except:
        next
ap_df = pd.DataFrame(ap_list)
ap_df.sort('elevation', ascending=False).head(1)


Out[256]:
country elevation name
80 BOL 4063 El Alto Intl

Done.


In [ ]: