XML example and exercise

study examples of accessing nodes in XML tree structure
work on exercise to be completed and submitted

reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
data source: http://www.dbis.informatik.uni-goettingen.de/Mondial



In [1]:

    
from xml.etree import ElementTree as ET

XML example

for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html



In [2]:

    
document_tree = ET.parse( 'data/mondial_database_less.xml' )



In [3]:

    
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)









    



Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra



In [4]:

    
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':',)
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])









    



* Albania:
Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:
Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:
Skopje, Kumanovo
* Serbia:
Beograd, Novi Sad, Niš
* Montenegro:
Podgorica
* Kosovo:
Prishtine
* Andorra:
Andorra la Vella

XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

10 countries with the lowest infant mortality rates
10 cities with the largest population
10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
name and country of a) longest river, b) largest lake and c) airport at highest elevation



In [5]:

    
document = ET.parse( './data/mondial_database.xml' )

10 countries with the lowest infant mortality rates



In [6]:

    
import pandas as pd



In [7]:

    
# gather all data into a list
root = document.getroot()
df = []
for country in root.findall('country'): 
    infmort = country.find('infant_mortality')
    cname   = country.find('name')
    if infmort != None:
        df.append([cname.text, pd.to_numeric(infmort.text)])

# convert into a dataframe
df = pd.DataFrame(df)

# assign column names
df.columns = ['name', 'infant_mortality']

# sort by infant mortality, ascending=True 
pd.options.display.float_format = '{:,.2f}'.format
df = df.sort_values(['infant_mortality'], ascending=[1]).head(10).reset_index(drop=True)
df









    Out[7]:






  
    
      
      name
      infant_mortality
    
  
  
    
      0
      Monaco
      1.81
    
    
      1
      Japan
      2.13
    
    
      2
      Bermuda
      2.48
    
    
      3
      Norway
      2.48
    
    
      4
      Singapore
      2.53
    
    
      5
      Sweden
      2.60
    
    
      6
      Czech Republic
      2.63
    
    
      7
      Hong Kong
      2.73
    
    
      8
      Macao
      3.13
    
    
      9
      Iceland
      3.15

10 cities with the largest population



In [8]:

    
# gather population tags per country (excluding per city/prov)
df = []
for root in document.getroot():
    country = root.find('name').text
    for child in root:
        if ((root.tag == 'country') & (child.tag=='population')):
            yr  = child.attrib['year']
            pop = child.text
    df.append([country, yr, pd.to_numeric(pop)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country', 'year', 'population'])

# sort by population, ascending=False 
df.population = df.population.astype(float)
pd.options.display.float_format = '{:,.0f}'.format
print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))









    



         country  year    population
0          China  2013 1,360,720,000
1          India  2011 1,210,854,977
2  United States  2014   318,857,056
3      Indonesia  2014   252,124,458
4         Brazil  2014   202,768,562
5       Pakistan  2010   173,149,306
6        Nigeria  2011   164,294,516
7     Bangladesh  2011   149,772,364
8         Russia  2014   143,666,931
9          Japan  2013   127,298,000

10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)



In [ ]:



In [9]:

    
df_eth_cnt = []        
for root in document.getroot():
    country = root.find('name').text
    df_eth = []
    for child in root:
        if ((root.tag == 'country') & (child.tag=='population')):
            yr  = child.attrib['year']
            pop = child.text
        if ((root.tag == 'country') & (child.tag=='ethnicgroup')):
            percent = pd.to_numeric(child.attrib['percentage'])
            ethnic  = child.text
            df_eth.append([percent, ethnic])  
    for k in range(len(df_eth)):
        df_eth_cnt.append([df_eth[k][1], (df_eth[k][0])*(pd.to_numeric(pop)/100)])
    #df_pop.append([country, yr, pd.to_numeric(pop), df_eth_cnt])
df = pd.DataFrame(df_eth_cnt, columns = ['ethnicgroup', 'population'])

print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))









    



   ethnicgroup    population
0  Han Chinese 1,245,058,800
1   Indo-Aryan   871,815,583
2    Dravidian   302,713,744
3     European   254,958,102
4      African   162,651,571
5      Bengali   146,776,917
6     Japanese   126,534,212
7      Russian   114,646,211
8     Javanese   113,456,006
9     European   108,886,718



In [ ]:

name and country of a) longest river



In [10]:

    
# gather all data into a list
root = document.getroot()
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  
   
df = []    
for river in root.findall('river'): 
    length = river.find('length')
    rivername   = river.find('name')
    if length != None:
        df.append([river.attrib['country'], rivername.text, pd.to_numeric(length.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'river_name', 'length'])

# print answer
df = df.sort_values(['length'], ascending=[0]).head(1).reset_index(drop=True)

# extract country names
cntry = []
for i in df['country(ies)'].str.split():
    for j in range(len(i)):
        cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])

# concat names into a string
str =''
for k in range(len(cntry)):
    str += (cntry[k]['country_name'].values[0]) + " "

#remove last space
str = str[:-1]   

# update country name
df['country(ies)'] = str

# display answer
df









    Out[10]:






  
    
      
      country(ies)
      river_name
      length
    
  
  
    
      0
      Colombia Brazil Peru
      Amazonas
      6,448

name and country of b) largest lake



In [11]:

    
# gather all data into a list
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  

root = document.getroot()
df = []
for lake in root.findall('lake'): 
    area = lake.find('area')
    lakename   = lake.find('name')
    if area != None:
        df.append([lake.attrib['country'], lakename.text, pd.to_numeric(area.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'lake_name', 'area'])

# get largest lake by area
df = df.sort_values(['area'], ascending=[0]).head(1).reset_index(drop=True)

# extract country names
cntry = []
for i in df['country(ies)'].str.split():
    for j in range(len(i)):
        cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])

# concat names into a string
str =''
for k in range(len(cntry)):
    str += (cntry[k]['country_name'].values[0]) + " "

#remove last space
str = str[:-1]   

# update country name
df['country(ies)'] = str

# display answer
df









    Out[11]:






  
    
      
      country(ies)
      lake_name
      area
    
  
  
    
      0
      Russia Azerbaijan Kazakhstan Iran Turkmenistan
      Caspian Sea
      386,400

name and country of c) airport at highest elevation



In [12]:

    
# create a new dataframe with unique project code and name

root = document.getroot()
dfcntry = []
for country in root.findall('country'):
    dfcntry.append([country.attrib['car_code'], country.find('name').text])
    
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])    
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])  

# gather all data into a list
df = []    
for airport in root.findall('airport'): 
    area = airport.find('elevation')
    airportname   = airport.find('name')
    if area.text != None:
        df.append([airport.attrib['country'], airportname.text, pd.to_numeric(area.text)])

# convert into a dataframe
df = pd.DataFrame(df, columns = ['country_code', 'airport_name', 'elevation'])

# get country with highest elevation
df = df.sort_values(['elevation'], ascending=[0]).head(1).reset_index(drop=True)

# locate the country names from list of unique country codes
dfCntry = dfUniq.loc[dfUniq['country_code'] == df['country_code'].values[0]] 

# merge two dataframes, print answer
df = pd.merge(dfCntry, df, on='country_code', how='outer')

# i don't want to display country code
del df['country_code']
df









    Out[12]:






  
    
      
      country_name
      airport_name
      elevation
    
  
  
    
      0
      Bolivia
      El Alto Intl
      4063



In [ ]:



In [ ]:

	name	infant_mortality
0	Monaco	1.81
1	Japan	2.13
2	Bermuda	2.48
3	Norway	2.48
4	Singapore	2.53
5	Sweden	2.60
6	Czech Republic	2.63
7	Hong Kong	2.73
8	Macao	3.13
9	Iceland	3.15