In [7]:
import csv
import math
import re
from datetime import datetime

import requests
from bs4 import BeautifulSoup

In [2]:
# Listing endpoint that the scraping loop in this notebook queries, passing
# `Year` (and `page` for follow-up pages) as query parameters.
url = 'http://aviation-safety.net/database/dblist.php'

In [30]:
def extract_data(soup):
    """Collect the record rows from the first table of a parsed listing page.

    Args:
        soup: parsed page (or any object offering the same ``find`` /
            ``find_all`` interface).

    Returns:
        A list of six-string lists, one per accepted row, built from the
        row's ``td`` cells:
        ``[date as DD-MM-YYYY, cell 1 stripped, cell 2 stripped,
        cell 3 without commas, cell 4 (summed when written as "a+b"),
        cell 8 stripped]``.
        An empty list is returned when the page contains no table.

    Rows are skipped (never raised on) when they have fewer than nine
    ``td`` cells or when the first cell is not a ``%d-%b-%Y`` date, so
    header and filler rows cannot abort a scrape.
    """
    table = soup.find('table')
    if table is None:
        # Nothing to extract from a page without a listing table.
        return []

    rows = []
    for row in table.find_all('tr'):
        # Filter kept from the original code: rows reporting more than 16
        # children are ignored. NOTE(review): presumably this drops a wrapper
        # or layout row; confirm against the live markup.
        if len(row) > 16:
            continue

        cells = row.find_all('td')
        if len(cells) < 9:
            # cells[8] is read below; shorter rows (e.g. headers made of
            # <th>) used to raise IndexError here.
            continue

        try:
            inc_date = datetime.strptime(cells[0].text, '%d-%b-%Y')
        except ValueError:
            # First cell is not a date, so this is not a record row.
            continue

        count_text = cells[4].text
        if '+' in count_text:
            # Values written as "a+b" are collapsed into their total.
            try:
                count_text = sum(int(part) for part in count_text.split('+'))
            except ValueError:
                # A non-numeric part: keep the raw text rather than abort.
                pass

        rows.append([inc_date.strftime('%d-%m-%Y'),
                     cells[1].text.strip(),
                     cells[2].text.strip(),
                     cells[3].text.replace(',', ''),
                     str(count_text),
                     cells[8].text.strip()])
    return rows

In [28]:
def write_to_file(fl, data):
    """Append the rows in ``data`` to the open text file ``fl`` as CSV lines.

    Args:
        fl: writable text-mode file object. Character encoding is whatever
            the handle was opened with; this function writes text, not bytes.
        data: iterable of rows, each row an iterable of field values.

    Fields that contain a comma, a quote or a line break are quoted by the
    ``csv`` module so every line keeps the same number of columns. Each row
    ends with a single newline character, and the handle is flushed once
    after the whole batch has been written.
    """
    # Formatting an encoded bytes object with '%s' writes its repr
    # ("b'...'") on Python 3, so rows are written as text instead.
    writer = csv.writer(fl, lineterminator='\n')
    writer.writerows(data)
    fl.flush()

In [31]:
# Year ranges noted by the original author: 1919 - 2015 and 1995 - 2015.
# NOTE(review): range() excludes its end value, so the loop below stops at
# 2014; confirm whether 2015 was meant to be scraped as well.
RECORDS_PER_PAGE = 100  # page size assumed by the pagination arithmetic
REQUEST_TIMEOUT = 30    # seconds to wait on the server before giving up


def _fetch_page(year, page=None):
    """Download one listing page for ``year`` and return it parsed.

    ``page`` is left out of the query for the first page, exactly as the
    first request has always been made.
    """
    payload = {'Year': year}
    if page is not None:
        payload['page'] = page
    response = requests.get(url, params=payload, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    # NOTE(review): no parser is named, so bs4 picks among the installed
    # ones; pin one if results must be identical across machines.
    return BeautifulSoup(response.text)


def _page_count(soup, year):
    """Derive the number of listing pages for ``year`` from the caption."""
    caption = soup.find('span', 'caption')
    found = None
    if caption is not None:
        # First run of digits anywhere in the caption (thousands separators
        # tolerated), rather than requiring the text to start with digits.
        found = re.search(r'[0-9][0-9,]*', caption.text)
    if found is None:
        raise RuntimeError('no record count found in the caption for year %s' % year)
    records = int(found.group().replace(',', ''))
    # True division before the ceiling so a final, partly filled page is
    # never dropped.
    return int(math.ceil(records / float(RECORDS_PER_PAGE)))


for year in range(1995, 2015):
    # The context manager closes the file even if a request or parse fails.
    # UTF-8 with errors ignored mirrors the encoding the writer always used.
    with open('data/airline/%s.csv' % year, 'w',
              encoding='utf-8', errors='ignore') as data_file:
        first_page = _fetch_page(year)
        pages = _page_count(first_page, year)
        write_to_file(data_file, extract_data(first_page))
        for page in range(2, pages + 1):
            write_to_file(data_file, extract_data(_fetch_page(year, page)))