XML example and exercise

study examples of accessing nodes in XML tree structure
work on exercise to be completed and submitted

reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html
data source: http://www.dbis.informatik.uni-goettingen.de/Mondial



In [74]:

    
from xml.etree import ElementTree as ET

XML example

for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html



In [75]:

    
document_tree = ET.parse( './data/mondial_database_less.xml' )



In [76]:

    
# print names of all countries
for child in document_tree.getroot():
    print (child.find('name').text)









    



Albania
Greece
Macedonia
Serbia
Montenegro
Kosovo
Andorra



In [77]:

    
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
    print ('* ' + element.find('name').text + ':', end=''),
    capitals_string = ''
    for subelement in element.getiterator('city'):
        capitals_string += subelement.find('name').text + ', '
    print (capitals_string[:-2])









    



* Albania:Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë
* Greece:Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes
* Macedonia:Skopje, Kumanovo
* Serbia:Beograd, Novi Sad, Niš
* Montenegro:Podgorica
* Kosovo:Prishtine
* Andorra:Andorra la Vella

XML exercise

Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find

10 countries with the lowest infant mortality rates
10 cities with the largest population
10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
name and country of a) longest river, b) largest lake and c) airport at highest elevation



In [78]:

    
document = ET.parse( './data/mondial_database.xml' )



In [89]:

    
# print child and attributes
#for child in document.getroot():
#    print (child.tag, child.attrib)



In [80]:

    
import pandas as pd



In [84]:

    
# Create a list of country and their Infant Mortality Rate 
country_imr=[]
for country in document.getroot().findall('country'):
    name = country.find('name').text
    infant_mortality_rate = country.find('infant_mortality')
    if infant_mortality_rate is not None:
        infant_mortality_rate=infant_mortality_rate.text
    else :
        infant_mortality_rate = -1
    country_imr.append((name, (float)(infant_mortality_rate)))

10 countries with the lowest infant mortality rates



In [88]:

    
df = pd.DataFrame(country_imr, columns=['Country', 'Infant_Mortality_Rate'])
df_unknown_removed = df[df.Infant_Mortality_Rate != -1] 
df_unknown_removed.set_index('Infant_Mortality_Rate').sort().head(10)









    Out[88]:






  
    
      
      Country
    
    
      Infant_Mortality_Rate
      
    
  
  
    
      1.81
      Monaco
    
    
      2.13
      Japan
    
    
      2.48
      Bermuda
    
    
      2.48
      Norway
    
    
      2.53
      Singapore
    
    
      2.60
      Sweden
    
    
      2.63
      Czech Republic
    
    
      2.73
      Hong Kong
    
    
      3.13
      Macao
    
    
      3.15
      Iceland



In [149]:

    
city_population=[]
for country in document.iterfind('country'):
    for state in country.iterfind('province'):
        for city in state.iterfind('city'):
            try:
                city_population.append((city.find('name').text, float(city.find('population').text)))
            except:
                next
    for city in country.iterfind('city'):
        try:
            city_population.append((city.find('name').text, float(city.find('population').text)))
        except:
            next

10 cities with the largest population



In [165]:

    
df = pd.DataFrame(city_population, columns=['City', 'Population'])
#df.info()
df.sort_index(by='Population', ascending=False).head(10)









    Out[165]:






  
    
      
      City
      Population
    
  
  
    
      1763
      Seoul
      10229262
    
    
      1421
      Mumbai
      9925891
    
    
      2594
      São Paulo
      9412894
    
    
      1629
      Jakarta
      8259266
    
    
      1251
      Shanghai
      8205598
    
    
      1942
      Ciudad de México
      8092449
    
    
      443
      Moskva
      8010954
    
    
      1725
      Tokyo
      7843000
    
    
      1250
      Beijing
      7362426
    
    
      1467
      Delhi
      7206704



In [229]:

    
ethnic_population={}
country_population={}
for country in document.iterfind('country'):
    try:
        country_population[country.find('name').text]= float(country.find('population').text)
    except:
        next
    for state in country.iterfind('province' or 'state'):
        try:
            country_population[country.find('name').text] += float(state.find('population').text)
        except:
            next
        for city in state.iterfind('city'):
            try:
                country_population[country.find('name').text] += float(city.find('population').text)
            except:
                next

for country in document.iterfind('country'):
    for ethnicgroup in country.iterfind('ethnicgroup'):
        try:
            if ethnicgroup.text in ethnic_population:
                ethnic_population[ethnicgroup.text] += country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
            else:
                ethnic_population[ethnicgroup.text] = country_population[country.find('name').text]*float(ethnicgroup.get('percentage'))/100
        except:
            next

10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)



In [241]:

    
pd.DataFrame(sorted(ethnic_population.items(), key=lambda x:x[1], reverse=True)[:10], columns=['Ethnic_Groups', 'Population'])









    Out[241]:






  
    
      
      Ethnic_Groups
      Population
    
  
  
    
      0
      Han Chinese
      1.593119e+09
    
    
      1
      Indo-Aryan
      7.776357e+08
    
    
      2
      European
      6.668512e+08
    
    
      3
      African
      2.896678e+08
    
    
      4
      Russian
      2.705583e+08
    
    
      5
      Dravidian
      2.700124e+08
    
    
      6
      Japanese
      2.506371e+08
    
    
      7
      German
      1.706358e+08
    
    
      8
      Mestizo
      1.666139e+08
    
    
      9
      Javanese
      1.413179e+08

Longest River



In [249]:

    
rivers_list=[]
rivers_df = pd.DataFrame()
for rivers in document.iterfind('river'):
    try:
        rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})
    except:
        next
rivers_df = pd.DataFrame(rivers_list)
rivers_df.sort('length', ascending=False).head(1)









    Out[249]:






  
    
      
      country
      length
      name
    
  
  
    
      161
      CO
      6448
      Amazonas

Largest Lake



In [253]:

    
lake_list=[]
lake_df = pd.DataFrame()
for lakes in document.iterfind('lake'):
    try:
        lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})
    except:
        next
lakes_df = pd.DataFrame(lake_list)
lakes_df.sort('area', ascending=False).head(1)









    Out[253]:






  
    
      
      area
      country
      name
    
  
  
    
      42
      386400
      R
      Caspian Sea

Airport At Highest Elevation



In [256]:

    
ap_list=[]
ap_df = pd.DataFrame()
for ap in document.iterfind('airport'):
    try:
        ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})
    except:
        next
ap_df = pd.DataFrame(ap_list)
ap_df.sort('elevation', ascending=False).head(1)









    Out[256]:






  
    
      
      country
      elevation
      name
    
  
  
    
      80
      BOL
      4063
      El Alto Intl

Done.



In [ ]:

	Country
Infant_Mortality_Rate
1.81	Monaco
2.13	Japan
2.48	Bermuda
2.48	Norway
2.53	Singapore
2.60	Sweden
2.63	Czech Republic
2.73	Hong Kong
3.13	Macao
3.15	Iceland

	City	Population
1763	Seoul	10229262
1421	Mumbai	9925891
2594	São Paulo	9412894
1629	Jakarta	8259266
1251	Shanghai	8205598
1942	Ciudad de México	8092449
443	Moskva	8010954
1725	Tokyo	7843000
1250	Beijing	7362426
1467	Delhi	7206704

	Ethnic_Groups	Population
0	Han Chinese	1.593119e+09
1	Indo-Aryan	7.776357e+08
2	European	6.668512e+08
3	African	2.896678e+08
4	Russian	2.705583e+08
5	Dravidian	2.700124e+08
6	Japanese	2.506371e+08
7	German	1.706358e+08
8	Mestizo	1.666139e+08
9	Javanese	1.413179e+08