In [2]:
from xml.etree import ElementTree as ET
In [3]:
document_tree = ET.parse( './data/mondial_database_less.xml' )
In [4]:
# print names of all countries
for child in document_tree.getroot():
print(child.find('name').text)
In [5]:
# print names of all countries and their cities
for element in document_tree.iterfind('country'):
print('* ' + element.find('name').text + ':', end=''),
capitals_string = ''
for subelement in element.getiterator('city'):
capitals_string += subelement.find('name').text + ', '
print(capitals_string[:-2])
Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
In [6]:
document = ET.parse( './data/mondial_database.xml' )
In [7]:
#Question 1: Find 10 countries with the lowest infant mortality rates
import pandas as pd
import numpy as np
#Create a dataframe with columns 'Country' and 'Mortality'
country = []
mortality = []
root = document.getroot()
for element in root.findall('country'):
country.append(element.find('name').text)
if element.find('infant_mortality') is not None and element.find('infant_mortality').text is not None:
mortality.append(float(element.find('infant_mortality').text))
else:
mortality.append(np.NaN)
data = pd.DataFrame(list(zip(country,mortality)))
data.columns = ['Country', 'Mortality']
#Find 10 countries with the lowest infant mortality rates
data.sort_values('Mortality').iloc[:10,]
Out[7]:
In [54]:
#Question 2: Find 10 cities with the largest population
city = []
population = []
year=[]
for element in root.findall('country'):
this_country = element.find('name').text
for subelement in element.iter('city'):
for subelement2 in subelement.findall('population'):
city.append(subelement.find('name').text + ', ' + this_country)
population.append(float(subelement2.text))
year.append(float(subelement2.get('year')))
data_city = pd.DataFrame(list(zip(city, population, year)))
data_city.columns = ['City','Population','Year']
#Based on 'data_city', create a dataframe 'data_city_latest' that only include the rows of the most recent year
max_year = data_city.groupby('City')['Year'].max().to_frame().reset_index()
data_city_latest = pd.merge(max_year, data_city, how='left', on=['City', 'Year'])
#Find 10 cities with the largest population
data_city_latest.sort_values('Population', ascending=False).iloc[:10,]
Out[54]:
In [29]:
#Question 3: Find 10 ethnic groups with the largest overall populations
#Create a dataframe 'data_ethnic' with columns 'Country', 'Ethnicity' and 'Percentage'
country = []
ethnic = []
percentage = []
for element in root.findall('country'):
this_country = element.find('name').text
for subelement in element.findall('ethnicgroup'):
country.append(this_country)
ethnic.append(subelement.text)
percentage.append(float(subelement.get('percentage')))
data_ethnic = pd.DataFrame(list(zip(country, ethnic, percentage)))
data_ethnic.columns = ['Country','Ethnicity','Percentage']
#Create a dataframe 'data_pop' with columns 'Country', 'Population' and 'Year'
country = []
population = []
year = []
for element in root.findall('country'):
this_country = element.find('name').text
for subelement in element.findall('population'):
country.append(this_country)
population.append(float(subelement.text))
year.append(float(subelement.get('year')))
data_pop = pd.DataFrame(list(zip(country, population, year)))
data_pop.columns = ['Country','Population','Year']
#Based on 'data_pop', create a dataframe 'data_pop_latest' that only include the rows of the most recent year
max_year = data_pop.groupby('Country')['Year'].max().to_frame().reset_index()
data_pop_latest = pd.merge(max_year, data_pop, how='left', on=['Country', 'Year'])
#Merge dataframes 'data_ethnic' and 'data_pop_latest'.
#The new dataframe contains columns 'Country','Ethnicity','Percentage','Population', and 'Year'
data = pd.merge(data_ethnic, data_pop_latest, how='left', on=['Country'])
#Find 10 ethnic groups with the largest overall populations
data['Count']=data['Percentage']*data['Population']*0.01
data.groupby('Ethnicity')['Count'].sum().to_frame().reset_index().sort_values(by='Count', ascending=False).iloc[:10,]
Out[29]:
In [11]:
# Question 4 (a) name and country of longest river
# Create a dictionary with country code as the ID and country name as the value
country_dict = {}
for element in root.findall('country'):
this_country = element.find('name').text
this_country_id = element.get('car_code')
country_dict[this_country_id] = this_country
#Create a dataframe with columns 'River', 'Country ID', and 'Length'
river = []
country_ids = []
length = []
for element in root.findall('river'):
river.append(element.find('name').text)
country_ids.append(element.get('country'))
if element.find('length') is not None and element.find('length').text is not None:
length.append(float(element.find('length').text))
else:
length.append(np.NaN)
data_river = pd.DataFrame(list(zip(river, country_ids, length)))
data_river.columns = ['River','Country ID','Length']
#Find the name and country of longest river
this_river = data_river.sort_values(by='Length', ascending=False)
countries_ids = this_river.iloc[:1,1].values[0].split(' ')
countries = [country_dict[c] for c in countries_ids ]
print('The longest river is ' + this_river.iloc[:1,0].values[0] +', which crosses ' + ', '.join(countries))
In [14]:
# Question 4 (b) name and country of largest lake
#Create a dataframe with columns 'Lake', 'Country ID', and 'Area'
lake = []
country_ids = []
area = []
for element in root.findall('lake'):
lake.append(element.find('name').text)
country_ids.append(element.get('country'))
if element.find('area') is not None and element.find('area').text is not None:
area.append(float(element.find('area').text))
else:
area.append(np.NaN)
data_lake = pd.DataFrame(list(zip(lake, country_ids, area)))
data_lake.columns = ['Lake','Country ID','Area']
#Find the name and country of largest lake
this_lake = data_lake.sort_values(by='Area', ascending=False)
countries_ids = this_lake.iloc[:1,1].values[0].split(' ')
countries = [country_dict[c] for c in countries_ids ]
print('The largest lake is ' + this_lake.iloc[:1,0].values[0] +' crosses ' + ', '.join(countries))
In [15]:
# Question 4 (c) name and country of airport at highest elevation
#Create a dataframe with columns 'Airport', 'Country ID', and 'Elevation'
airport = []
country_ids = []
elevation = []
for element in root.findall('airport'):
airport.append(element.find('name').text)
country_ids.append(element.get('country'))
if element.find('elevation') is not None and element.find('elevation').text is not None:
elevation.append(float(element.find('elevation').text))
else:
elevation.append(np.NaN)
data_airport = pd.DataFrame(list(zip(airport, country_ids, elevation)))
data_airport.columns = ['Airport','Country ID','Elevation']
# Find the name and country of airport at highest elevation
this_airport = data_airport.sort_values(by='Elevation', ascending=False)
countries_ids = this_airport.iloc[:1,1].values[0].split(' ')
countries = [country_dict[c] for c in countries_ids ]
print('The airport at highest elevation is ' + this_airport.iloc[:1,0].values[0] +' crosses ' + ', '.join(countries))
In [ ]: