In [1]:
In [3]:
In [ ]:
In [ ]:
Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
In [22]:
# Answer to Exercise 1 (Find 10 countries with the lowest infant mortality rates)
import pandas as pd
import numpy as np
from xml.etree import ElementTree as ET

document_tree = ET.parse('./data/mondial_database.xml')

# Walk every <country> element and collect (country, infant_mortality) records.
# Records are accumulated in a plain list and turned into a DataFrame once,
# instead of growing the DataFrame row by row (which is quadratic and leaves
# the columns with object dtype).
mortality_records = []
for element in document_tree.iterfind('country'):
    country = element.find('name').text
    # Element.iter() replaces Element.getiterator(), which was removed in
    # Python 3.9. A record is only appended when the country actually has an
    # <infant_mortality> element, so a country without one can never inherit
    # the rate of the previously visited country.
    for subelement in element.iter('infant_mortality'):
        infant_mortality = float(subelement.text)
        mortality_records.append((country, infant_mortality))

country_df = pd.DataFrame(mortality_records, columns=["country", "infant_mortality"])

# Sort ascending (the default) and show the ten lowest rates.
country_df.sort_values(by='infant_mortality').head(10)
Out[22]:
In [7]:
# Answer to Exercise 2 (Find 10 cities with largest populations)
import pandas as pd
from xml.etree import ElementTree as ET

document_tree = ET.parse('./data/mondial_database.xml')

# Walk every <country>, then every <city> nested anywhere below it
# (Element.iter searches all descendants, not just direct children), and
# collect one (city, population) record per city.
# Records are accumulated in a list and converted to a DataFrame once rather
# than appending to the DataFrame row by row.
city_records = []
for country in document_tree.iterfind('country'):
    for city in country.iter('city'):
        cityname = city.find('name').text
        populations = city.findall('population')
        if not populations:
            # No population figure for this city: skip it instead of silently
            # reusing the value left over from the previous city.
            continue
        # Use the last <population> entry listed for the city, exactly one row
        # per city. NOTE(review): this assumes the entries are listed in
        # chronological order so the last one is the latest estimate - confirm
        # against the data file.
        population = float(populations[-1].text)
        city_records.append((cityname, population))

city_df = pd.DataFrame(city_records, columns=["city", "population"])

# Sort descending and show the ten most populous cities.
city_df.sort_values(by='population', ascending=False).head(10)
Out[7]:
In [16]:
# Answer to Exercise 3 ( Find 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries))
import pandas as pd
from xml.etree import ElementTree as ET

document_tree = ET.parse('./data/mondial_database.xml')

# Strategy: the XML lists population per country, and for each country the
# ethnic groups with their percentage share. The head count of each ethnic
# group in a country is therefore derived as
#     country population * ethnic group percentage / 100
# and then summed per ethnic group across all countries.
#
# Records are accumulated in a list and converted to a DataFrame once rather
# than appending to the DataFrame row by row.
pop_records = []
for country in document_tree.iterfind('country'):
    countryname = country.find('name').text
    populations = country.findall('population')
    if not populations:
        # Without a population figure the percentages cannot be converted to
        # head counts; skip rather than reuse the previous country's figure.
        continue
    # Use the last <population> entry listed for the country.
    # NOTE(review): assumes entries are in chronological order so the last one
    # is the latest estimate - confirm against the data file.
    countrypop = float(populations[-1].text)
    for ethnicgrp in country.iterfind('ethnicgroup'):
        ethnicgrpname = ethnicgrp.text
        # Formula = country population * ethnic group percentage
        ethnicgrppop = round(float(ethnicgrp.attrib['percentage']) * int(countrypop) * 0.01)
        pop_records.append((countryname, ethnicgrpname, ethnicgrppop))

pop_df = pd.DataFrame(pop_records, columns=["Country", "Ethnicity", "Population"])

# Group by ethnic group irrespective of country and sum only the Population
# column (selecting it explicitly keeps the string Country column out of the
# aggregation), then display the ten largest groups.
pop_df.groupby('Ethnicity')[['Population']].sum().sort_values(by='Population', ascending=False).head(10)
Out[16]:
In [ ]:
In [ ]: