XML is designed to support a wide variety of applications.
See the "Origin and Goals" section of the official XML specification.
In [1]:
import xml.etree.ElementTree as ET
import pprint
In [2]:
def get_root(fname):
    """Parse the XML file at ``fname`` and return its root element."""
    return ET.parse(fname).getroot()
In [3]:
article_file = 'exampleResearchArticle.xml'
root = get_root(article_file)
In [4]:
# Print the tag name of each direct child of the root element.
# Single-argument print() works identically under Python 2 and 3,
# unlike the bare Python-2-only "print x" statement form.
for child in root:
    print(child.tag)
We can use XPath expressions to select nested tags.
In [5]:
# XPath-style path: each <au> (author) element under fm/bibl/aug.
for a in root.findall("./fm/bibl/aug/au"):
    email = a.find("email")
    # Not every author has an <email> child; find() returns None then.
    if email is not None:
        # print() function form runs under both Python 2 and 3.
        print(email.text)
Let's try to get all of the author data from the file. We may need to consult the ElementTree documentation.
In [6]:
def _child_text(element, tag):
    """Return the text of `element`'s first `tag` child, or None when absent.

    The original code called `element.find(tag).text` directly, which raises
    AttributeError for any author missing one of the expected child tags.
    """
    child = element.find(tag)
    return child.text if child is not None else None


def get_authors(root):
    """Extract one record per author from the parsed article tree.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of the article document; authors live at ./fm/bibl/aug/au.

    Returns
    -------
    list of dict
        Each dict has keys "fnm", "snm", "email" (child text, or None when
        the child tag is missing) and "insr" (list of the `iid` attribute of
        every <insr> child — authors may have several institution links).
    """
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": _child_text(author, 'fnm'),
            "snm": _child_text(author, 'snm'),
            "email": _child_text(author, 'email'),
            "insr": [insr.attrib['iid'] for insr in author.findall("insr")],
        }
        authors.append(data)
    return authors
In [7]:
def test1():
    """Spot-check get_authors() against known values from the example article."""
    expected = [
        {'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
        {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
        {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
        {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
        {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
        {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
        {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
        {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'},
    ]
    data = get_authors(get_root(article_file))
    assert data[0] == expected[0]
    assert data[1]["fnm"] == expected[1]["fnm"]
    assert data[1]["insr"] == expected[1]["insr"]
In [8]:
test1()  # raises AssertionError if get_authors() output does not match the known values
Let's look at BeautifulSoup for parsing HTML.
In [12]:
from bs4 import BeautifulSoup
In [13]:
def options(soup, id):
    """Return the 'value' attribute of every <option> under the element with the given id."""
    container = soup.find(id=id)
    return [option['value'] for option in container.find_all('option')]
In [15]:
# Parse the saved homepage. A context manager closes the file handle (the
# original leaked it), and naming the parser explicitly ("lxml", as used
# later in this notebook) avoids bs4's "no parser specified" warning and
# makes the result independent of which parsers happen to be installed.
with open('air_trans_home.html') as f:
    soup = BeautifulSoup(f, "lxml")
In [44]:
carrierList = options(soup, 'CarrierList')  # values of the carrier drop-down
In [54]:
airportList = options(soup, 'AirportList')  # values of the airport drop-down
In [56]:
def removeAll(_list):
    """Delete, in place, every entry of `_list` whose text starts with "All"."""
    # Slice assignment mutates the caller's list object, matching the
    # original's remove-while-iterating-over-a-copy behavior.
    _list[:] = [value for value in _list if not value.startswith("All")]
In [57]:
removeAll(carrierList)  # drop aggregate "All..." options; keep individual carriers
removeAll(airportList)  # same for the airport list
In [58]:
# Show how many individual carriers/airports remain after filtering.
# print() function form runs under both Python 2 and 3, unlike "print x".
print(len(carrierList))
print(len(airportList))
Wireshark could be useful for inspecting the HTTP requests the browser actually sends.
In [29]:
def getValueOfId(soup, id):
    """Return the 'value' attribute of the element with the given id."""
    element = soup.find(id=id)
    return element['value']
In [38]:
import requests

# "Data Elements" page on the TranStats site.
transtats_url = "http://www.transtats.bts.gov/Data_Elements.aspx?Data=2"

# Use a Session so cookies from the initial GET carry over to the POST.
s = requests.Session()
r = s.get(transtats_url)
soup = BeautifulSoup(r.text)

# ASP.NET pages reject form POSTs that do not echo back the hidden state
# fields (__VIEWSTATE etc.) scraped from the initial GET response.
r = s.post(transtats_url,
           data={'AirportList': "BOS",
                 'CarrierList': "VX",
                 'Submit': 'Submit',
                 "__VIEWSTATEGENERATOR": getValueOfId(soup, '__VIEWSTATEGENERATOR'),
                 "__EVENTTARGET": "",
                 "__EVENTARGUMENT": "",
                 "__EVENTVALIDATION": getValueOfId(soup, '__EVENTVALIDATION'),
                 "__VIEWSTATE": getValueOfId(soup, '__VIEWSTATE')
                 })

# Context manager ensures the file is flushed and closed (the original
# opened the file and never closed it).
with open('virgin_and_logan_airport.html', 'w') as f:
    f.write(r.text)
In [71]:
# Parse the saved FL-ATL result page; the context manager closes the file
# handle that the original bare open(...) call leaked.
with open("data/FL-ATL.html") as f:
    soup = BeautifulSoup(f, "lxml")
In [75]:
table = soup.find("table", id="DataGrid1")  # the flights-per-month results grid
In [85]:
# Skip the header row, then emit one record per data row.
rows = table.find_all("tr")[1:]
for row in rows:
    # Use a list comprehension instead of map(): Python 3's map() returns a
    # lazy iterator that cannot be indexed, so row_data[1] would raise.
    row_data = [cell.string for cell in row.find_all("td")]
    # Yearly "TOTAL" summary rows carry no month number; skip them.
    if row_data[1] == 'TOTAL':
        continue
    # print() of a single argument behaves the same in Python 2 and 3.
    print({
        'year': int(row_data[0]),
        'month': int(row_data[1]),
        'flights': {
            # Counts are rendered with thousands separators, e.g. "1,234".
            'domestic': int(row_data[2].replace(",", "")),
            'international': int(row_data[3].replace(",", ""))
        }
    })