In [74]:
from xml.etree import ElementTree as ET
In [75]:
# Load 'mondial_database_less.xml', the file the two example cells below read.
document_tree = ET.parse('./data/mondial_database_less.xml')
In [76]:
# Print the <name> text of every child of the root element (the countries).
for entry in document_tree.getroot():
    name_node = entry.find('name')
    print(name_node.text)
In [77]:
# Print every country followed by the names of all cities found anywhere
# beneath it, formatted as "* <country>:<city>, <city>, ...".
for country in document_tree.iterfind('country'):
    # Element.iter() walks the whole subtree, so cities nested at any depth
    # are included. It replaces Element.getiterator(), which was removed in
    # Python 3.9.
    city_names = [city.find('name').text for city in country.iter('city')]
    # str.join() yields an empty string for a country without cities, the
    # same result the previous "strip the trailing ', '" slice produced.
    print('* ' + country.find('name').text + ':' + ', '.join(city_names))
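Using the data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find:
- the 10 countries with the lowest infant mortality rates
- the 10 cities with the largest population
- the 10 ethnic groups with the largest overall populations (sum of the best available estimates over all countries)
- the name and country of the longest river, the largest lake, and the airport at the highest elevation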
In [78]:
# Load 'mondial_database.xml', the file every analysis cell below queries.
document = ET.parse('./data/mondial_database.xml')
In [89]:
# print child and attributes
#for child in document.getroot():
# print (child.tag, child.attrib)
In [80]:
import pandas as pd
In [84]:
# Collect (country name, infant mortality rate) pairs.
# A country without an <infant_mortality> element is recorded with rate -1.
country_imr = []
for country in document.getroot().findall('country'):
    imr_node = country.find('infant_mortality')
    raw_rate = -1 if imr_node is None else imr_node.text
    country_imr.append((country.find('name').text, float(raw_rate)))
In [88]:
# Ten countries with the lowest known infant mortality rate.
df = pd.DataFrame(country_imr, columns=['Country', 'Infant_Mortality_Rate'])
# -1 is the placeholder stored for countries without an <infant_mortality>
# element, so those rows are dropped before ranking.
df_unknown_removed = df[df.Infant_Mortality_Rate != -1]
# DataFrame.sort() no longer exists in pandas. sort_index() orders the rows
# by the new index (the rate), which is what the argument-less sort() did.
df_unknown_removed.set_index('Infant_Mortality_Rate').sort_index().head(10)
Out[88]:
In [149]:
def _city_population_record(city):
    """Return ``(name, population)`` for a <city> element, or None if unusable.

    A city is unusable when it has no <name> or <population> child, or when
    the population text cannot be converted to a float.
    """
    name_node = city.find('name')
    # NOTE(review): find() returns the first <population> child; if a city
    # lists several figures, confirm the first one is the one wanted.
    population_node = city.find('population')
    if name_node is None or population_node is None:
        return None
    try:
        return name_node.text, float(population_node.text)
    except (TypeError, ValueError):
        # Empty or non-numeric <population> text.
        return None


# Collect (city name, population) pairs from every country. Cities that lack
# a usable name/population are skipped explicitly instead of being hidden by
# a bare ``except``.
city_population = []
for country in document.iterfind('country'):
    # Cities nested inside provinces come first, then cities that are direct
    # children of the country.
    cities = [
        city
        for state in country.iterfind('province')
        for city in state.iterfind('city')
    ]
    cities.extend(country.iterfind('city'))
    for city in cities:
        record = _city_population_record(city)
        if record is not None:
            city_population.append(record)
In [165]:
# Ten most populous cities.
df = pd.DataFrame(city_population, columns=['City', 'Population'])
# sort_index(by=...) is no longer accepted by pandas; sort_values() is the
# supported way to order rows by a column.
df.sort_values(by='Population', ascending=False).head(10)
Out[165]:
In [229]:
# Estimate the overall head count of each ethnic group by applying every
# country's <ethnicgroup percentage="..."> share to that country's population.
#
# country_population maps country name -> population (float).
# ethnic_population maps ethnic group name -> summed head count (float).
country_population = {}
ethnic_population = {}
for country in document.iterfind('country'):
    name_node = country.find('name')
    # NOTE(review): find() returns the first <population> child; if a country
    # lists several figures, confirm the first one is the one wanted.
    population_node = country.find('population')
    if name_node is None or population_node is None:
        continue  # no usable total for this country
    try:
        total = float(population_node.text)
    except (TypeError, ValueError):
        continue  # empty or non-numeric <population> text

    # Province and city elements are nested inside the country, so their
    # populations are treated as already included in the country-level figure
    # and are not added on top of it (doing so counts the same people twice).
    country_population[name_node.text] = total

    for ethnicgroup in country.iterfind('ethnicgroup'):
        try:
            percentage = float(ethnicgroup.get('percentage'))
        except (TypeError, ValueError):
            continue  # missing or non-numeric percentage attribute
        group = ethnicgroup.text
        ethnic_population[group] = ethnic_population.get(group, 0.0) + total * percentage / 100
In [241]:
# Ten ethnic groups with the largest summed population, biggest first.
ranked_groups = sorted(
    ethnic_population.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
pd.DataFrame(ranked_groups[:10], columns=['Ethnic_Groups', 'Population'])
Out[241]:
In [249]:
# Longest river: one record per <river> that has a <name>, an integer
# <length> and a <located> child carrying a ``country`` attribute.
rivers_list = []
for river in document.iterfind('river'):
    name_node = river.find('name')
    length_node = river.find('length')
    located_node = river.find('located')
    if name_node is None or length_node is None or located_node is None:
        continue  # incomplete record
    country = located_node.get('country')
    if country is None:
        continue
    try:
        # NOTE(review): int() rejects decimal text such as '12.5', so such
        # rivers are skipped -- confirm that is intended.
        length = int(length_node.text)
    except (TypeError, ValueError):
        continue
    rivers_list.append({'name': name_node.text, 'length': length, 'country': country})

# Naming the columns keeps the sort below valid even when no river qualified.
rivers_df = pd.DataFrame(rivers_list, columns=['name', 'length', 'country'])
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
rivers_df.sort_values('length', ascending=False).head(1)
Out[249]:
In [253]:
# Largest lake: one record per <lake> that has a <name>, an integer <area>
# and a <located> child carrying a ``country`` attribute.
lake_list = []
for lake in document.iterfind('lake'):
    name_node = lake.find('name')
    area_node = lake.find('area')
    located_node = lake.find('located')
    if name_node is None or area_node is None or located_node is None:
        continue  # incomplete record
    country = located_node.get('country')
    if country is None:
        continue
    try:
        # NOTE(review): int() rejects decimal text such as '12.5', so such
        # lakes are skipped -- confirm that is intended.
        area = int(area_node.text)
    except (TypeError, ValueError):
        continue
    lake_list.append({'name': name_node.text, 'area': area, 'country': country})

# Naming the columns keeps the sort below valid even when no lake qualified.
lakes_df = pd.DataFrame(lake_list, columns=['name', 'area', 'country'])
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
lakes_df.sort_values('area', ascending=False).head(1)
Out[253]:
In [256]:
# Highest airport: one record per <airport> that has a <name>, an integer
# <elevation> and a ``country`` attribute on the <airport> element itself.
ap_list = []
for ap in document.iterfind('airport'):
    name_node = ap.find('name')
    elevation_node = ap.find('elevation')
    country = ap.get('country')
    if name_node is None or elevation_node is None or country is None:
        continue  # incomplete record
    try:
        # Empty or non-integer <elevation> text cannot be ranked, so the
        # airport is skipped.
        elevation = int(elevation_node.text)
    except (TypeError, ValueError):
        continue
    ap_list.append({'name': name_node.text, 'elevation': elevation, 'country': country})

# Naming the columns keeps the sort below valid even when no airport qualified.
ap_df = pd.DataFrame(ap_list, columns=['name', 'elevation', 'country'])
# DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
ap_df.sort_values('elevation', ascending=False).head(1)
Out[256]:
In [ ]: