XML example and exercise


  • study examples of accessing nodes in XML tree structure
  • work on exercise to be completed and submitted



In [1]:
from xml.etree import ElementTree as ET

XML example


In [2]:
document_tree = ET.parse( 'data/mondial_database_less.xml' )

In [3]:
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)


Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra

In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])


* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella

XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

  1. 10 countries with the lowest infant mortality rates
  2. 10 cities with the largest population
  3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
  4. name and country of a) longest river, b) largest lake and c) airport at highest elevation

In [5]:
document = ET.parse( './data/mondial_database.xml' )

10 countries with the lowest infant mortality rates


In [6]:
import pandas as pd

In [7]:
# gather all data into a list
root = document.getroot()
df = []
for country in root.findall('country'): 
    infmort = country.find('infant_mortality')
    cname   = country.find('name')
    if infmort != None:
        df.append([cname.text, pd.to_numeric(infmort.text)])

# convert into a dataframe
df = pd.DataFrame(df)

# assign column names
df.columns = ['name', 'infant_mortality']

# sort by infant mortality, ascending=True 
pd.options.display.float_format = '{:,.2f}'.format
df = df.sort_values(['infant_mortality'], ascending=[1]).head(10).reset_index(drop=True)
df


Out[7]:
name infant_mortality
0 Monaco 1.81
1 Japan 2.13
2 Bermuda 2.48
3 Norway 2.48
4 Singapore 2.53
5 Sweden 2.60
6 Czech Republic 2.63
7 Hong Kong 2.73
8 Macao 3.13
9 Iceland 3.15

10 cities with the largest population


In [8]:
# gather population tags per country (excluding per city/prov)
df = []
for root in document.getroot():
    country = root.find('name').text
    for child in root:
        if ((root.tag == 'country') & (child.tag=='population')):
            yr  = child.attrib['year']
            pop = child.text
    df.append([country, yr, pd.to_numeric(pop)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country', 'year', 'population'])

# sort by population, ascending=False 
df.population = df.population.astype(float)
pd.options.display.float_format = '{:,.0f}'.format
print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))


         country  year    population
0          China  2013 1,360,720,000
1          India  2011 1,210,854,977
2  United States  2014   318,857,056
3      Indonesia  2014   252,124,458
4         Brazil  2014   202,768,562
5       Pakistan  2010   173,149,306
6        Nigeria  2011   164,294,516
7     Bangladesh  2011   149,772,364
8         Russia  2014   143,666,931
9          Japan  2013   127,298,000

10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)


In [ ]:


In [9]:
df_eth_cnt = []        
for root in document.getroot():
    country = root.find('name').text
    df_eth = []
    for child in root:
        if ((root.tag == 'country') & (child.tag=='population')):
            yr  = child.attrib['year']
            pop = child.text
        if ((root.tag == 'country') & (child.tag=='ethnicgroup')):
            percent = pd.to_numeric(child.attrib['percentage'])
            ethnic  = child.text
            df_eth.append([percent, ethnic])  
    for k in range(len(df_eth)):
        df_eth_cnt.append([df_eth[k][1], (df_eth[k][0])*(pd.to_numeric(pop)/100)])
    #df_pop.append([country, yr, pd.to_numeric(pop), df_eth_cnt])
df = pd.DataFrame(df_eth_cnt, columns = ['ethnicgroup', 'population'])

print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))


   ethnicgroup    population
0  Han Chinese 1,245,058,800
1   Indo-Aryan   871,815,583
2    Dravidian   302,713,744
3     European   254,958,102
4      African   162,651,571
5      Bengali   146,776,917
6     Japanese   126,534,212
7      Russian   114,646,211
8     Javanese   113,456,006
9     European   108,886,718

In [ ]:

name and country of a) longest river


In [10]:
# gather all data into a list
root = document.getroot()
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  
   
df = []    
for river in root.findall('river'): 
    length = river.find('length')
    rivername   = river.find('name')
    if length != None:
        df.append([river.attrib['country'], rivername.text, pd.to_numeric(length.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'river_name', 'length'])

# print answer
df = df.sort_values(['length'], ascending=[0]).head(1).reset_index(drop=True)

# extract country names
cntry = []
for i in df['country(ies)'].str.split():
    for j in range(len(i)):
        cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])

# concat names into a string
str =''
for k in range(len(cntry)):
    str += (cntry[k]['country_name'].values[0]) + " "

#remove last space
str = str[:-1]   

# update country name
df['country(ies)'] = str

# display answer
df


Out[10]:
country(ies) river_name length
0 Colombia Brazil Peru Amazonas 6,448

name and country of b) largest lake


In [11]:
# gather all data into a list
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  

root = document.getroot()
df = []
for lake in root.findall('lake'): 
    area = lake.find('area')
    lakename   = lake.find('name')
    if area != None:
        df.append([lake.attrib['country'], lakename.text, pd.to_numeric(area.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'lake_name', 'area'])

# get largest lake by area
df = df.sort_values(['area'], ascending=[0]).head(1).reset_index(drop=True)

# extract country names
cntry = []
for i in df['country(ies)'].str.split():
    for j in range(len(i)):
        cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])

# concat names into a string
str =''
for k in range(len(cntry)):
    str += (cntry[k]['country_name'].values[0]) + " "

#remove last space
str = str[:-1]   

# update country name
df['country(ies)'] = str

# display answer
df


Out[11]:
country(ies) lake_name area
0 Russia Azerbaijan Kazakhstan Iran Turkmenistan Caspian Sea 386,400

name and country of c) airport at highest elevation


In [12]:
# create a new dataframe with unique project code and name

root = document.getroot()
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  

# gather all data into a list
df = []    
for airport in root.findall('airport'): 
    area = airport.find('elevation')
    airportname   = airport.find('name')
    if area.text != None:
        df.append([airport.attrib['country'], airportname.text, pd.to_numeric(area.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country_code', 'airport_name', 'elevation'])

# get country with highest elevation
df = df.sort_values(['elevation'], ascending=[0]).head(1).reset_index(drop=True)

# locate the country names from list of unique country codes
dfCntry = dfUniq.loc[dfUniq['country_code'] == df['country_code'].values[0]] 

# merge two dataframes, print answer
df = pd.merge(dfCntry, df, on='country_code', how='outer')

# i don't want to display country code
del df['country_code']
df


Out[12]:
country_name airport_name elevation
0 Bolivia El Alto Intl 4063

In [ ]:


In [ ]: