In [3]:
from xml.etree import ElementTree as ET

import pandas as pd
In [2]:
# Parse the trimmed Mondial XML file used by the example cells.
LESS_XML_PATH = './data/mondial_database_less.xml'
document_tree = ET.parse(LESS_XML_PATH)
In [3]:
# Show the XML attributes of the first element under the document root.
first_entry = document_tree.getroot()[0]
first_entry.attrib
Out[3]:
In [4]:
# Print the <name> text of every element directly under the document root
# (presumably all countries in this trimmed file -- TODO confirm).
# print() with a single argument behaves the same on Python 2 and Python 3.
for child in document_tree.getroot():
    print(child.find('name').text)
In [5]:
# Print one line per country: "* <country>: <city>, <city>, ...".
# Element.iter() replaces the deprecated getiterator(), and the whole line is
# built before printing so the same code runs on Python 2 and Python 3.
for country in document_tree.iterfind('country'):
    # iter('city') walks the whole subtree, so nested cities are included.
    city_names = [city.find('name').text for city in country.iter('city')]
    print('* ' + country.find('name').text + ': ' + ', '.join(city_names))
In [21]:
# Show the attributes of the first <population> child of the first root entry.
first_population = document_tree.getroot()[0].find('population')
first_population.attrib
Out[21]:
In [24]:
# List every <population> value of the first root entry with its 'year' attribute.
population_entries = document_tree.getroot()[0].findall('population')
for entry in population_entries:
    line = str(entry.text) + " year: " + str(entry.get('year'))
    print(line)
In [16]:
# For each element under the root, print its name next to its
# <infant_mortality> text. Raises AttributeError if either child is missing.
for entry in document_tree.getroot():
    entry_name = entry.find('name').text
    mortality_text = entry.find('infant_mortality').text
    print(entry_name + ' infant : ' + mortality_text)
In [6]:
# Probe each child element of the first root entry for an <infant_mortality>
# sub-element; find() returns None when there is none, so this mostly prints None.
# print() with a single argument behaves the same on Python 2 and Python 3.
for child in document_tree.getroot()[0]:
    print(child.find('infant_mortality'))
In [37]:
# Keep a handle on the root element of the trimmed example document.
testroot = document_tree.getroot()
In [28]:
# Select every element that has an <infant_mortality> child and print its name
# with the rate parsed as a float.
# NOTE(review): `root` is only assigned further down in this notebook; on a
# fresh kernel this cell has to run after that assignment.
for parent in root.findall(".//infant_mortality/.."):
    rate = float(parent.find('infant_mortality').text)
    print(parent.find('name').text + ' infant : ' + str(rate))
In [69]:
# For each country, print its (value, year) pairs for <population> entries
# whose year attribute is '2011'; the list is empty when there are none.
for country in testroot.findall("./country"):
    entries_2011 = country.findall("population[@year='2011']")
    pairs = [(entry.text, entry.get('year')) for entry in entries_2011]
    print(pairs)
In [74]:
# Print the text of each direct <ethnicgroup> child of the first root entry.
first_entry_groups = testroot[0].findall('./ethnicgroup')
for group in first_entry_groups:
    print(group.text)
In [162]:
# Offset of the last 'river' occurrence that lies within the first 2717100
# characters of the serialized tree.
# NOTE(review): `root` is only assigned further down in this notebook.
serialized_tree = ET.tostring(root)
serialized_tree.rfind('river', 0, 2717100)
Out[162]:
In [163]:
# Peek at a 100-character window of the serialized tree.
# NOTE(review): `root` is only assigned further down in this notebook.
serialized_window = ET.tostring(root)
serialized_window[2716850:2716950]
Out[163]:
In [183]:
# Walk the children of the first <river> under the root and, for every child
# not tagged 'located_at', print its tag followed by its own <name> text.
# NOTE(review): `root` is only assigned further down in this notebook.
first_river = root.find('./river')
for part in first_river:
    if part.tag == 'located_at':
        continue
    print(part.tag + ' ' + part.find('name').text)
In [256]:
# Show the child elements of the first ten <airport> elements found directly
# under a <country>.
# print() with a single argument behaves the same on Python 2 and Python 3.
# NOTE(review): `root` is only assigned further down in this notebook.
for airport_node in root.findall('./country/airport')[:10]:
    print(list(airport_node))
In [248]:
# Print the tag of the first ten elements that contain an <airport> child.
# NOTE(review): `root` is only assigned further down in this notebook.
airport_parents = root.findall('.//airport/..')
for parent in airport_parents[:10]:
    print(parent.tag)
In [257]:
# Tag of the first element that has a <gmtOffset> child.
# NOTE(review): `root` is only assigned further down in this notebook.
gmt_offset_parent = root.find('.//gmtOffset/..')
gmt_offset_parent.tag
Out[257]:
In [271]:
# Check that the first airport <latitude> text parses as a float.
# NOTE(review): `root` is only assigned further down in this notebook.
first_latitude = root.find('.//airport/latitude')
float(first_latitude.text)
Out[271]:
In [204]:
# Attributes of the <source> child of the first <river> under the root.
# NOTE(review): `root` is only assigned further down in this notebook.
first_river_node = root.find('./river')
first_river_source = first_river_node.find('source')
first_river_source.attrib
Out[204]:
In [61]:
# Serialize the first <ethnicgroup> child of the seventh root entry to inspect
# its raw XML.
sample_group = testroot[6].find('./ethnicgroup')
ET.tostring(sample_group)
Out[61]:
In [107]:
# Build rows of [ethnic group, percentage, owner name, owner 2011 population].
# The XPath yields every element holding a <population> below an element that
# has an <ethnicgroup>; elements without their own <ethnicgroup> add no rows.
p = []
for owner in testroot.findall('.//ethnicgroup/..//population/..'):
    p.extend([
        [
            group.text,
            float(group.get('percentage')),
            owner.find('name').text,
            float(owner.find("population[@year='2011']").text),
        ]
        for group in owner.findall('ethnicgroup')
    ])
In [108]:
# Display the collected rows to check them by eye.
p
Out[108]:
In [93]:
# Turn the collected rows into a frame, convert each group's percentage of the
# owner's population into a head count, and rank the groups by total head count.
testeth = pd.DataFrame(p, columns=['ethnicgroup', 'percentage', 'county', 'cpop'])
testeth['epop'] = testeth['percentage'] * testeth['cpop'] / 100.0
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
testeth[['ethnicgroup', 'epop']].groupby('ethnicgroup').sum().sort_values('epop', ascending=False)
Out[93]:
Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find
In [4]:
import pandas as pd
In [5]:
# Parse the full Mondial XML file used for the exercise answers.
FULL_XML_PATH = './data/mondial_database.xml'
document = ET.parse(FULL_XML_PATH)
In [6]:
# Root element of the full document; the cells below query it.
root = document.getroot()
In [7]:
# Ten lowest infant-mortality rates.
# The XPath selects every element that has an <infant_mortality> child.
infant_rows = [
    [entry.find('name').text, float(entry.find('infant_mortality').text)]
    for entry in root.findall(".//infant_mortality/..")
]
infant_mortality = pd.DataFrame(infant_rows, columns=['country', 'infant mortality'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
infant_mortality.sort_values('infant mortality').head(10)
Out[7]:
In [8]:
# Ten most populous cities.
# './country//city' also reaches cities nested below other elements of a
# country (e.g. provinces), which './country/city' would skip.
city_rows = []
for city in root.findall('./country//city'):
    measurements = city.findall('population')
    if not measurements:
        # No population recorded for this city; nothing to rank.
        continue
    # Use the last <population> entry, matching the ethnic-group cell;
    # presumably entries are listed oldest-first -- TODO confirm.
    city_rows.append([city.find('name').text, int(measurements[-1].text)])
city_population = pd.DataFrame(city_rows, columns=['city', 'pop'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
city_population.sort_values('pop', ascending=False).head(10)
Out[8]:
In [9]:
# Build rows of [ethnic group, percentage, owner name, owner population],
# taking the last <population> entry of each owner.
# The XPath yields every element holding a <population> below an element that
# has an <ethnicgroup>; elements without their own <ethnicgroup> add no rows.
p = []
for owner in root.findall('.//ethnicgroup/..//population/..'):
    p.extend([
        [
            group.text,
            float(group.get('percentage')),
            owner.find('name').text,
            float(owner.findall("population")[-1].text),
        ]
        for group in owner.findall('ethnicgroup')
    ])
In [ ]:
In [12]:
# Ten largest ethnic groups by estimated head count.
# Each row's head count is the group's percentage of its owner's population;
# the counts are then summed per group across all rows.
eth = pd.DataFrame(p, columns=['ethnic group', 'percentage', 'county', 'cpop'])
eth['epop'] = eth['percentage'] * eth['cpop'] / 100.0
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
eth[['ethnic group', 'epop']].groupby('ethnic group').sum().sort_values('epop', ascending=False).head(10)
Out[12]:
Create CAR code dictionary
In [13]:
# Map each country's 'car_code' attribute to the text of its <name> child.
codedict = dict(
    (country.get('car_code'), country.find('name').text)
    for country in root.findall('./country')
)
Create DF of all rivers and lengths
In [14]:
# One row per <river> under the root that has both a <name> and a <length>:
# river name, length as float, and the 'country' attribute of its <source>.
river_rows = []
for river in root.findall('./river/name/../length/..'):
    river_rows.append([
        river.find('name').text,
        float(river.find('length').text),
        river.find('source').get('country'),
    ])
rivers = pd.DataFrame(river_rows, columns=['rname', 'length', 'scountry'])
Find the river with the longest length
In [15]:
# Pick the row with the greatest length and report it, translating the source
# country code to a name via codedict.
# Series.idxmax() takes no column name and returns an index label, so the row
# is fetched with .loc rather than positional .iloc.
maxriver = rivers.loc[rivers['length'].idxmax()]
print('The longest river is the '+maxriver['rname']+", with it's source located in "+codedict[maxriver['scountry']])
In [16]:
# One row per <lake> under the root that has <name>, <area> and <located>:
# lake name, area as float, and the 'country' attribute of its first <located>.
lake_rows = []
for lake in root.findall('./lake/name/../area/../located/..'):
    lake_rows.append([
        lake.find('name').text,
        float(lake.find('area').text),
        lake.find('located').get('country'),
    ])
lakes = pd.DataFrame(lake_rows, columns=['lname', 'area', 'lco'])
In [17]:
# Pick the row with the greatest area and report it, translating the country
# code to a name via codedict.
# idxmax() returns an index label, so the row is fetched with .loc rather than
# positional .iloc.
maxlake = lakes.loc[lakes['area'].idxmax()]
print('The largest lake is the '+maxlake['lname']+", located in "+codedict[maxlake['lco']])
I don't know how we're supposed to find the airport's country, as the data file doesn't have country info as attributes or children of the airports, and the airports are not children of the root of the tree. They do have latitude and longitude data.
In [64]:
# One row per <airport> under the root that has <name>, <latitude>,
# <longitude> and <elevation> children; all values are kept as raw XML text.
airport_rows = []
for node in root.findall('./airport/name/../latitude/../longitude/../elevation/..'):
    airport_rows.append([
        node.find('name').text,
        node.find('elevation').text,
        node.find('latitude').text,
        node.find('longitude').text,
    ])
airport = pd.DataFrame(airport_rows, columns=['aname', 'elevation', 'latitude', 'longitude'])
In [67]:
# Elevation was read as XML text; convert the column to float so it can be
# compared numerically.
elevation_as_float = airport['elevation'].astype(float)
airport['elevation'] = elevation_as_float
In [66]:
# Pick the row with the greatest elevation and report its name and coordinates
# (latitude and longitude are still the raw text read from the XML).
# idxmax() returns an index label, so the row is fetched with .loc rather than
# positional .iloc.
maxairport = airport.loc[airport['elevation'].idxmax()]
print('The highest airport is '+maxairport['aname']+", located at latitude "+maxairport['latitude']+", longitude "+ maxairport['longitude'])
In [ ]: