In [1]:
from xml.etree import ElementTree as ET
In [2]:
document_tree = ET.parse( 'data/mondial_database_less.xml' )
In [3]:
# print names of all countries
for child in document_tree.getroot():
print (child.find('name').text)
In [4]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
print ('* ' + element.find('name').text + ':',)
capitals_string = ''
for subelement in element.getiterator('city'):
capitals_string += subelement.find('name').text + ', '
print (capitals_string[:-2])
Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
In [5]:
document = ET.parse( './data/mondial_database.xml' )
10 countries with the lowest infant mortality rates
In [6]:
import pandas as pd
In [7]:
# gather all data into a list
root = document.getroot()
df = []
for country in root.findall('country'):
infmort = country.find('infant_mortality')
cname = country.find('name')
if infmort != None:
df.append([cname.text, pd.to_numeric(infmort.text)])
# convert into a dataframe
df = pd.DataFrame(df)
# assign column names
df.columns = ['name', 'infant_mortality']
# sort by infant mortality, ascending=True
pd.options.display.float_format = '{:,.2f}'.format
df = df.sort_values(['infant_mortality'], ascending=[1]).head(10).reset_index(drop=True)
df
Out[7]:
10 cities with the largest population
In [8]:
# gather population tags per country (excluding per city/prov)
df = []
for root in document.getroot():
country = root.find('name').text
for child in root:
if ((root.tag == 'country') & (child.tag=='population')):
yr = child.attrib['year']
pop = child.text
df.append([country, yr, pd.to_numeric(pop)])
# convert into a dataframe
df = pd.DataFrame(df, columns = ['country', 'year', 'population'])
# sort by population, ascending=False
df.population = df.population.astype(float)
pd.options.display.float_format = '{:,.0f}'.format
print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))
10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)
In [ ]:
In [9]:
df_eth_cnt = []
for root in document.getroot():
country = root.find('name').text
df_eth = []
for child in root:
if ((root.tag == 'country') & (child.tag=='population')):
yr = child.attrib['year']
pop = child.text
if ((root.tag == 'country') & (child.tag=='ethnicgroup')):
percent = pd.to_numeric(child.attrib['percentage'])
ethnic = child.text
df_eth.append([percent, ethnic])
for k in range(len(df_eth)):
df_eth_cnt.append([df_eth[k][1], (df_eth[k][0])*(pd.to_numeric(pop)/100)])
#df_pop.append([country, yr, pd.to_numeric(pop), df_eth_cnt])
df = pd.DataFrame(df_eth_cnt, columns = ['ethnicgroup', 'population'])
print(df.sort_values(['population'], ascending=[0]).head(10).reset_index(drop=True))
In [ ]:
name and country of a) longest river
In [10]:
# gather all data into a list
root = document.getroot()
dfcntry = []
for country in root.findall('country'):
dfcntry.append([country.attrib['car_code'], country.find('name').text])
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])
df = []
for river in root.findall('river'):
length = river.find('length')
rivername = river.find('name')
if length != None:
df.append([river.attrib['country'], rivername.text, pd.to_numeric(length.text)])
# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'river_name', 'length'])
# print answer
df = df.sort_values(['length'], ascending=[0]).head(1).reset_index(drop=True)
# extract country names
cntry = []
for i in df['country(ies)'].str.split():
for j in range(len(i)):
cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])
# concat names into a string
str =''
for k in range(len(cntry)):
str += (cntry[k]['country_name'].values[0]) + " "
#remove last space
str = str[:-1]
# update country name
df['country(ies)'] = str
# display answer
df
Out[10]:
name and country of b) largest lake
In [11]:
# gather all data into a list
dfcntry = []
for country in root.findall('country'):
dfcntry.append([country.attrib['car_code'], country.find('name').text])
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])
root = document.getroot()
df = []
for lake in root.findall('lake'):
area = lake.find('area')
lakename = lake.find('name')
if area != None:
df.append([lake.attrib['country'], lakename.text, pd.to_numeric(area.text)])
# convert into a dataframe
df = pd.DataFrame(df, columns = ['country(ies)', 'lake_name', 'area'])
# get largest lake by area
df = df.sort_values(['area'], ascending=[0]).head(1).reset_index(drop=True)
# extract country names
cntry = []
for i in df['country(ies)'].str.split():
for j in range(len(i)):
cntry.append(dfUniq.loc[dfUniq['country_code'] == i[j]])
# concat names into a string
str =''
for k in range(len(cntry)):
str += (cntry[k]['country_name'].values[0]) + " "
#remove last space
str = str[:-1]
# update country name
df['country(ies)'] = str
# display answer
df
Out[11]:
name and country of c) airport at highest elevation
In [12]:
# create a new dataframe with unique project code and name
root = document.getroot()
dfcntry = []
for country in root.findall('country'):
dfcntry.append([country.attrib['car_code'], country.find('name').text])
dfcntry = pd.DataFrame(dfcntry, columns=['country_code','country_name'])
dfUniq = dfcntry[(dfcntry.country_name != '')].drop_duplicates('country_code')
dfUniq = pd.DataFrame(dfUniq, columns=['country_code','country_name'])
# gather all data into a list
df = []
for airport in root.findall('airport'):
area = airport.find('elevation')
airportname = airport.find('name')
if area.text != None:
df.append([airport.attrib['country'], airportname.text, pd.to_numeric(area.text)])
# convert into a dataframe
df = pd.DataFrame(df, columns = ['country_code', 'airport_name', 'elevation'])
# get country with highest elevation
df = df.sort_values(['elevation'], ascending=[0]).head(1).reset_index(drop=True)
# locate the country names from list of unique country codes
dfCntry = dfUniq.loc[dfUniq['country_code'] == df['country_code'].values[0]]
# merge two dataframes, print answer
df = pd.merge(dfCntry, df, on='country_code', how='outer')
# i don't want to display country code
del df['country_code']
df
Out[12]:
In [ ]:
In [ ]: