In [7]:
import requests
from bs4 import BeautifulSoup
import re
import math
from datetime import datetime
In [2]:
# Listing endpoint queried by the scraping loop in this file; requests are
# sent with a 'Year' query parameter and, after the first page, a 'page' one.
url = 'http://aviation-safety.net/database/dblist.php'
In [30]:
def extract_data(soup):
    """Extract one record per usable row of the first table in ``soup``.

    Args:
        soup: Parsed document exposing ``find``; the table and its rows must
            expose ``findAll`` (BeautifulSoup-style API).

    Returns:
        A list of records. Each record is a list of six strings:
        the date from cell 0 reformatted as ``DD-MM-YYYY``, cells 1 and 2
        stripped, cell 3 with commas removed, cell 4 (summed when it has the
        form ``a+b``), and cell 8 stripped. An empty list is returned when
        the document has no table.

    Rows are skipped when they have more than 16 children, fewer than nine
    ``<td>`` cells, or a first cell that is not a ``%d-%b-%Y`` date.
    """
    table = soup.find('table')
    if table is None:
        # Nothing to parse (e.g. an empty result or error page).
        return []

    rows = []
    for row in table.findAll('tr'):
        # NOTE(review): presumably filters out wide layout rows; the
        # threshold of 16 children is kept exactly as in the original.
        if len(row) > 16:
            continue

        cells = row.findAll('td')
        # Cell 8 is read below, so shorter rows (such as header rows without
        # <td> cells) cannot yield a record.
        if len(cells) < 9:
            continue

        try:
            inc_date = datetime.strptime(cells[0].text, '%d-%b-%Y')
        except ValueError:
            # Incomplete or malformed dates are deliberately dropped.
            continue

        val = cells[4].text
        if '+' in val:
            try:
                # Values such as "3+2" are collapsed into their total.
                val = sum(int(part) for part in val.split('+'))
            except ValueError:
                # A non-numeric part: keep the raw text rather than abort
                # the whole extraction.
                pass

        rows.append([
            inc_date.strftime('%d-%m-%Y'),
            cells[1].text.strip(),
            cells[2].text.strip(),
            # Presumably removed because records are later comma-joined.
            cells[3].text.replace(',', ''),
            str(val),
            cells[8].text.strip(),
        ])
    return rows
In [28]:
def write_to_file(fl, data):
    """Write each record in ``data`` to ``fl`` as one comma-separated line.

    Args:
        fl: Writable file object. Text streams (``io.TextIOBase``) receive
            text; any other file object (binary streams, Python 2 ``file``
            objects) receives UTF-8 encoded bytes.
        data: Iterable of records, each an iterable of strings.

    Fields are joined with bare commas, without quoting or escaping, so a
    field that itself contains a comma will shift the columns of its line.
    The stream is flushed once after the last record.
    """
    import io  # local import: only needed to tell text streams from byte streams

    # Formatting encoded bytes with '%s' on Python 3 writes the bytes repr
    # (b'...') into the file, so pick the payload type from the stream kind.
    wants_text = isinstance(fl, io.TextIOBase)
    for line in data:
        text = '%s\n' % ','.join(line)
        if wants_text:
            fl.write(text)
        else:
            fl.write(text.encode(encoding='UTF-8', errors='ignore'))
    fl.flush()
In [31]:
# 1919 - 2015
# 1995 - 2015
# Download every listing page for each year and store the extracted records
# in data/airline/<year>.csv.
# NOTE(review): range() excludes its upper bound, so the last year fetched is
# 2014 - confirm whether 2015 was meant to be included.
for year in range(1995, 2015):
    # `with` closes the CSV even when a request or a parse step raises.
    with open('data/airline/%s.csv' % year, 'w') as data_file:
        payload = {'Year' : year}
        # A timeout keeps a stalled connection from hanging the whole run;
        # raise_for_status() stops on HTTP errors instead of parsing an
        # error page.
        r = requests.get(url, params=payload, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text)
        # The pattern grabs the leading digits plus one following character;
        # int() tolerates trailing whitespace.
        # NOTE(review): presumably the caption starts with the record count.
        records = re.search('[0-9]*.', soup.find('span', 'caption').text).group()
        # True division so ceil() rounds up on Python 2 as well, where
        # int / int would floor first and drop the last partial page.
        pages = int(math.ceil(int(records) / 100.0))
        data = extract_data(soup)
        write_to_file(data_file, data)
        # The first page was fetched above; continue from page 2.
        for page in range(2, pages + 1):
            payload = {'Year' : year, 'page': page}
            r = requests.get(url, params=payload, timeout=30)
            r.raise_for_status()
            soup = BeautifulSoup(r.text)
            data = extract_data(soup)
            write_to_file(data_file, data)