In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import re
import pdb
import time
import json
import datetime
import random
In [100]:
base_url = "https://en.wikipedia.org"
index_ref = "/wiki/List_of_accidents_and_incidents_involving_commercial_aircraft"
index_html = urlopen(base_url + index_ref)
index = BeautifulSoup(index_html, "lxml")
In [101]:
result = index.find_all('li')
However, there are some elements in our list that contain another list of dates, which look something like this:
<li><b><a href="/wiki/1950_Air_France_multiple_Douglas_DC-4_accidents" title="1950 Air France multiple Douglas DC-4 accidents">1950 Air France multiple Douglas DC-4 accidents</a></b>:
<ul>
<li>June 12 – An Air France Douglas DC-4 (F-BBDE) on a flight from Saigon to Paris crashes in the Arabian Sea while on approach to Bahrain Airport, killing 46 of 52 on board.</li>
<li>June 14 – An Air France Douglas DC-4, F-BBDM, crashes in the Arabian Sea while on approach to Bahrain Airport, killing 40 of 53 on board. This aircraft was operating on the same flight route as F-BBDE.</li>
</ul>
</li>
We'll need to make sure we can handle these cases. Additionally, since there are <li> tags within this block, the <li>June 12...</li> and <li>June 14...</li> will appear again in our ResultSet. We should get rid of these duplicates.
Looking at the HTML above, the duplicated <li> entries won't have both links and date separators in them, so that's one way we can drop them. Then we'll save rows from the interior <li> tags, supplementing link information from the parent if necessary.
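In other words, a bullet is worth keeping only if it has both a date separator and a link. A minimal sketch of that check (the helper name is just for illustration; it's the same condition we apply in the filtering loop further down):
def is_primary_bullet(li):
    # Keep bullets that have a date separator (en dash or hyphen) AND a link;
    # the duplicated inner bullets won't have both.
    has_separator = ' – ' in li.text or ' - ' in li.text
    has_link = li.find('a') is not None
    return has_separator and has_link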
For example, the entry for September 11 is:
In [102]:
result[829]
Out[102]:
But then there are also separate entries for each <li> in the list inside:
In [103]:
result[830:834]
Out[103]:
In [104]:
result[0].find('a').get('href')
Out[104]:
In [105]:
result[0].text
Out[105]:
In [106]:
result[0].text.split(' – ')
Out[106]:
In [107]:
result[0].text.split(' – ')[0]
Out[107]:
In [108]:
result[0].text.split(' – ')[1]
Out[108]:
We need a function that handles the fact that the date separator changes partway through the page.
In [109]:
def get_date_separator(html_fragment):
    # Date separator changes throughout the document, so let's handle both
    if ' – ' in html_fragment.text:
        return '–'
    elif ' - ' in html_fragment.text:
        return '-'
    else:
        return None
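As a quick check, here's the function applied to two hypothetical fragments, one using an en dash and one using a plain hyphen:
en_dash_li = BeautifulSoup('<ul><li>January 5 – Example crash description.</li></ul>', 'lxml').li
hyphen_li = BeautifulSoup('<ul><li>January 5 - Example crash description.</li></ul>', 'lxml').li
get_date_separator(en_dash_li), get_date_separator(hyphen_li)  # ('–', '-')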
Let's write a function that takes an <li> element and extracts the crash details (month, day, link, and text description), making sure to handle the cases where there are several crashes per <li>.
In [110]:
def extract_details(html_fragment):
    # these lists may have one or more elements when returned
    bdates, blinks, bdescrips = [], [], []
    if html_fragment.find_all('li') == []:
        # Then there is only one crash for this bullet
        separator = get_date_separator(html_fragment)
        blinks.append(html_fragment.find('a').get('href'))
        bdates.append(html_fragment.text.split(separator)[0].strip())
        bdescrips.append(html_fragment.text.split(separator)[1].strip())
    else:
        # Then there are multiple crashes for this bullet
        for bullet in html_fragment.find_all('li'):
            # Dates might appear in the current or the parent <li>
            separator = get_date_separator(bullet)
            if separator is not None:
                bdates.append(bullet.text.split(separator)[0].strip())
                bdescrips.append(bullet.text.split(separator)[1].strip())
            else:
                parent_separator = get_date_separator(html_fragment)
                bdates.append(html_fragment.text.split(parent_separator)[0].strip())
                bdescrips.append(bullet.text.strip())
            # Relevant link might appear in the current or the parent <li>
            if bullet.find('a') is None:
                blinks.append(html_fragment.find('a').get('href'))
            else:
                blinks.append(bullet.find('a').get('href'))
    return bdates, blinks, bdescrips
In [111]:
dates_month_day, links, descriptions = [], [], []
for each_li in result:
if (' – ' in each_li.text or ' - ' in each_li.text) and each_li.find('a') != None:
lis_dates, lis_links, lis_descrips = extract_details(each_li)
dates_month_day += lis_dates
links += lis_links
descriptions += lis_descrips
else:
# If neither condition is true, then we hit duplicate or extra links
# elsewhere in the page so we can skip these and throw them away
continue
Sanity check the lengths of each list:
In [112]:
len(dates_month_day), len(links), len(descriptions)
Out[112]:
Looks good! Time to make the DataFrame, which we can do by passing a Python dict:
In [113]:
df = pd.DataFrame({'date': dates_month_day, 'link': links, 'description': descriptions})
Sanity check again:
In [18]:
df.head()
Out[18]:
Let's verify that we handled the weird cases correctly by looking at one of the bullets that had multiple crashes:
In [19]:
df[df.date == 'September 11']
Out[19]:
This looks like exactly what we expected. Now let's follow the links so we can add in the year. We have to follow those links anyway to extract the additional crash details, so we'll grab the years from there at the same time.
In [20]:
df.to_csv('crashes_question_starter.csv')
In [21]:
df[['description', 'link']].to_csv('crashes_no_extra_credit.csv')
In [22]:
def try_request(url):
    # Fetch a page and pause for a second between requests so we don't hammer the server
    html = urlopen(url)
    time.sleep(1)
    return BeautifulSoup(html, "lxml")
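The one-second pause keeps us from hammering Wikipedia's servers. If we wanted the function to live up to its name, a possible variant (a sketch, with an arbitrarily chosen retry count and back-off, not what the code above does) would also retry on transient network errors:
from urllib.error import HTTPError, URLError

def try_request_with_retry(url, attempts=3):
    # Hypothetical helper: retry a couple of times on transient errors,
    # still pausing between requests to stay polite.
    for attempt in range(attempts):
        try:
            html = urlopen(url)
            time.sleep(1)
            return BeautifulSoup(html, "lxml")
        except (HTTPError, URLError):
            time.sleep(2 ** attempt)  # simple exponential back-off
    raise RuntimeError('Failed to fetch {} after {} attempts'.format(url, attempts))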
In [23]:
def extract_summary(trs):
    date_w_year, passengers, crew, fatalities, survivors = '', 0, 0, 0, 0
    registration, origins, destination = 'No data', 'No data', 'No data'
    for each_tr in trs:
        if each_tr.find('th', text=re.compile('Destination')) is not None:
            try:
                destination = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Date')) is not None:
            try:
                date_w_year = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Passengers')) is not None:
            try:
                passengers = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Crew')) is not None:
            try:
                crew = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Fatalities')) is not None:
            try:
                fatalities = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Survivors')) is not None:
            try:
                survivors = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Flight origin')) is not None:
            try:
                origins = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Registration')) is not None:
            try:
                registration = each_tr.td.text
            except:
                pass
        else:
            pass
    return {'destination': destination,
            'date': date_w_year,
            'passengers': passengers,
            'crew': crew,
            'fatalities': fatalities,
            'survivors': survivors,
            'origins': origins,
            'registration': registration}
Inspecting the text of each summary, we can see cases where, in addition to (or instead of) an integer, there is some extraneous text or just a string like "all", "unknown", or "none". Let's handle these special cases in a number-extraction function:
In [89]:
def extract_numbers(td_text, passengers):
    """
    Function that handles table data cells to extract numbers.
    Handles special cases where there are strings like all, none, etc. in the text
    """
    number_regex = re.compile(r'\d+')
    all_regex = re.compile('ll')          # matches 'all' / 'All'
    none_regex = re.compile('one')        # matches 'none' / 'None'
    unknown_regex = re.compile('nknown')  # matches 'unknown' / 'Unknown'
    try:
        data_element = int(number_regex.findall(td_text)[0])
    except:
        if len(all_regex.findall(td_text)) >= 1:
            # "all" means everyone on board, so fall back to the passenger count
            data_element = passengers
        elif len(none_regex.findall(td_text)) >= 1:
            data_element = 0
        elif len(unknown_regex.findall(td_text)) >= 1:
            data_element = 0
        else:
            data_element = 0
    return data_element
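A few hypothetical cell values illustrate the behaviour (the second argument is the passenger count to fall back on when the cell just says "all"):
extract_numbers('46', 52)       # plain integer -> 46
extract_numbers('all', 52)      # "all" -> fall back to the passenger count, 52
extract_numbers('None', 52)     # "none" -> 0
extract_numbers('Unknown', 52)  # "unknown" -> 0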
In [125]:
# Define lists we use to store our results
dates_w_year, passengers, crew, fatalities, survivors = [], [], [], [], []
registration, origins, destination = [], [], []
for row in links:
    # Get HTML of detail page
    summary_html = try_request(base_url + row)
    trs = summary_html.find_all('tr')
    # Extract data from summary HTML
    summary = extract_summary(trs)
    # Save the data for this page in our lists
    dates_w_year.append(summary['date'])
    passengers.append(summary['passengers'])
    crew.append(summary['crew'])
    fatalities.append(summary['fatalities'])
    survivors.append(summary['survivors'])
    origins.append(summary['origins'])
    registration.append(summary['registration'])
    destination.append(summary['destination'])
Let's sanity check the lengths of these lists.
In [126]:
len(destination), len(origins), len(registration), len(dates_w_year), len(passengers), len(crew), len(fatalities), len(survivors)
Out[126]:
In [127]:
df_full = pd.DataFrame({'date': dates_w_year, 'link': links, 'description': descriptions, 'passengers': passengers,
'crew': crew, 'fatalities': fatalities, 'survivors': survivors,
'registration': registration, 'flight origin': origins, 'destination': destination})
# save all this scraped stuff!
df_full.to_csv('all_data_rescraped.csv')
In [128]:
df_full = pd.read_csv('all_data_rescraped.csv')
dates_w_year = df_full['date']
In [129]:
df_full.columns
df_full.drop(['Unnamed: 0'], axis=1, inplace=True)
In [130]:
dates_w_year[0:10]
Out[130]:
Let's remove commas.
In [132]:
cleaned_dates = [str(d).replace(',', '') for d in dates_w_year]
Some dates have the month first and some have the day first. Let's make them consistent, while also getting rid of extraneous information appended to the end of the date (like reference links). We'll write our own date-parsing function because we like to do fun and cool things like that.
In [24]:
import calendar
months = list(calendar.month_name)
days = list(calendar.day_name)
dates = [str(d) for d in list(range(1, 32))]
years = [str(y) for y in list(range(1900, 2017))]
In [25]:
def parse_date_strings(text):
    split_row = text.split()
    month, day, year, date = '', '', '', ''
    for each in split_row[0:4]:
        if each in months:
            month = each
        elif each in days:
            day = each
        elif each in years:
            year = each
        elif each in dates:
            date = each
        else:
            pass
    return {'month': month,
            'day': day,
            'year': year,
            'date': date}

def fix_dates(datecol):
    correctedcol = []
    for row in datecol:
        parsed_date = parse_date_strings(row)
        correctedcol.append('{} {} {}'.format(parsed_date['date'],
                                              parsed_date['month'],
                                              parsed_date['year']))
    return correctedcol
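As a quick sanity check on two hypothetical date strings, one month-first and one day-first (with a stray reference marker appended), both come out in the same day-month-year order:
fix_dates(['June 12 1950', '12 June 1950 [1]'])  # ['12 June 1950', '12 June 1950']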
In [135]:
datescol = fix_dates(cleaned_dates)
In [136]:
datescol[0:5]
Out[136]:
Our dates are now nicely formatted, so we can convert them to datetime objects:
In [137]:
dates_datetime = pd.to_datetime(datescol, format='%d %B %Y', errors='coerce')
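Any date string that doesn't match the format is coerced to NaT rather than raising an error, so it's worth checking how many dates failed to parse:
pd.isna(dates_datetime).sum()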
In [138]:
df_full['date'] = dates_datetime
In [139]:
df_full.head()
Out[139]:
In [155]:
df_full = pd.DataFrame({'date': dates_datetime, 'link': links, 'description': descriptions, 'passengers': passengers,
'crew': crew, 'fatalities': fatalities, 'survivors': survivors,
'registration': registration, 'flight origin': origins, 'destination': destination})
# save all this scraped stuff!
df_full.to_csv('final_dataframe.csv')
I told you to ignore this to keep the assignment simple, but there are in fact multiple summaries on a couple of pages, for example:
https://en.wikipedia.org/wiki/1950_Air_France_multiple_Douglas_DC-4_accidents
These we can separate using the dates.
However, there are also cases where two planes crash into one another. On those pages, the single summary infobox covers both aircraft, with a separate set of rows for each, for example:
https://en.wikipedia.org/wiki/1922_Picardie_mid-air_collision
We could handle this in two ways: sum the numbers for passengers, fatalities, etc., or create two rows for these crashes.
We can handle both cases with the following approach (pseudocode):
if there are multiple summary tables:
    pick the table whose date matches the crash we're looking at and extract its details
else (there is a single table):
    sum the repeated passenger, crew, and fatality rows within it (this is a collision)
We will create a new function extract_summaries() that will implement this approach.
In [147]:
%pdb
In [163]:
def extract_summaries(tables, relevant_date=None):
    if len(tables) == 1:
        result = extract_single_table_summary(tables[0])
    else:
        result = extract_relevant_table_summary(tables, relevant_date)
    return {'destination': result['destination'],
            'date': result['date'],
            'passengers': result['passengers'],
            'crew': result['crew'],
            'fatalities': result['fatalities'],
            'survivors': result['survivors'],
            'origins': result['origins'],
            'registration': result['registration']}

def pick_out_table(tables, relevant_date):
    for table in tables:
        trs = table.find_all('tr')
        for each_tr in trs:
            if each_tr.find('th', text=re.compile('Date')) is not None:
                # Clean and parse the date in this table
                date = each_tr.td.text.replace(',', '')
                parsed_date = parse_date_strings(date)
                if (parsed_date['month'] == relevant_date.split()[0]
                        and parsed_date['date'] == relevant_date.split()[1]):
                    return table
    # Fall back to the first table if no date matched
    return tables[0]
def extract_relevant_table_summary(tables, relevant_date):
    date_w_year, passengers, crew, fatalities, survivors = '', 0, 0, 0, 0
    registration, origins, destination = '', '', ''
    table = pick_out_table(tables, relevant_date)
    trs = table.find_all('tr')
    for each_tr in trs:
        if each_tr.find('th', text=re.compile('Destination')) is not None:
            try:
                destination = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Date')) is not None:
            try:
                date_w_year = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Passengers')) is not None:
            try:
                passengers = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Crew')) is not None:
            try:
                crew = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Fatalities')) is not None:
            try:
                fatalities = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Survivors')) is not None:
            try:
                survivors = extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Flight origin')) is not None:
            try:
                origins = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Registration')) is not None:
            try:
                registration = each_tr.td.text
            except:
                pass
        else:
            continue
    return {'destination': destination.strip(),
            'date': date_w_year,
            'passengers': passengers,
            'crew': crew,
            'fatalities': fatalities,
            'survivors': survivors,
            'origins': origins.strip(),
            'registration': registration.strip()}
def extract_single_table_summary(table):
    date_w_year, passengers, crew, fatalities, survivors = '', 0, 0, 0, 0
    registration, origins, destination = '', '', ''
    trs = table.find_all('tr')
    for each_tr in trs:
        if each_tr.find('th', text=re.compile('Destination')) is not None:
            try:
                destination += ' ' + each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Date')) is not None:
            try:
                date_w_year = each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Passengers')) is not None:
            try:
                passengers += extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Crew')) is not None:
            try:
                crew += extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Fatalities')) is not None:
            try:
                fatalities += extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Survivors')) is not None:
            try:
                survivors += extract_numbers(each_tr.td.text, passengers)
            except:
                pass
        elif each_tr.find('th', text=re.compile('Flight origin')) is not None:
            try:
                origins += ' ' + each_tr.td.text
            except:
                pass
        elif each_tr.find('th', text=re.compile('Registration')) is not None:
            try:
                registration += ' ' + each_tr.td.text
            except:
                pass
        else:
            continue
    return {'destination': destination.strip(),
            'date': date_w_year,
            'passengers': passengers,
            'crew': crew,
            'fatalities': fatalities,
            'survivors': survivors,
            'origins': origins.strip(),
            'registration': registration.strip()}
Let's test with the two URLs from earlier:
In [120]:
test_collision_url = 'https://en.wikipedia.org/wiki/1922_Picardie_mid-air_collision'
summary_html = try_request(test_collision_url)
In [59]:
tables = summary_html.find_all('table', {"class" : "infobox vcard vevent"})
result_updated = extract_summaries(tables)
In [117]:
result_updated
Out[117]:
Looks like we can correctly extract a summary table with multiple aircraft in it.
Now let's try a page that has multiple crashes on different days.
In [150]:
test_multiple_dates_url = 'https://en.wikipedia.org/wiki/1950_Air_France_multiple_Douglas_DC-4_accidents'
summary_html = try_request(test_multiple_dates_url)
In [151]:
first_crash = 'June 12'
second_crash = 'June 14'
In [152]:
tables = summary_html.find_all('table', {"class" : "infobox vcard vevent"})
result_updated = extract_summaries(tables, first_crash)
In [153]:
result_updated
Out[153]:
In [154]:
result_updated = extract_summaries(tables, second_crash)
In [155]:
result_updated
Out[155]:
In [164]:
dates_w_year, passengers, crew, fatalities, survivors = [], [], [], [], []
registration, origins, destination = [], [], []
for num_row in range(len(links)):
    # Get HTML of detail page
    summary_html = try_request(base_url + links[num_row])
    # Get tables that are in these sidebars (mostly one, but sometimes multiple)
    tables = summary_html.find_all('table', {"class" : ["infobox", "vcard"]})
    # Extract data from summary HTML
    summary = extract_summaries(tables, dates_month_day[num_row])
    # Save the data for this page in our lists
    dates_w_year.append(summary['date'])
    passengers.append(summary['passengers'])
    crew.append(summary['crew'])
    fatalities.append(summary['fatalities'])
    survivors.append(summary['survivors'])
    origins.append(summary['origins'])
    registration.append(summary['registration'])
    destination.append(summary['destination'])
In [165]:
# Clean dates
cleaned_dates = [str(d).replace(',', '') for d in dates_w_year]
datescol = fix_dates(cleaned_dates)
dates_datetime = pd.to_datetime(datescol, format='%d %B %Y', errors='coerce')
# Save!
df_summary = pd.DataFrame({'date': dates_datetime, 'link': links, 'description': descriptions, 'passengers': passengers,
'crew': crew, 'fatalities': fatalities, 'survivors': survivors,
'registration': registration, 'flight origin': origins, 'destination': destination})
# save all this scraped stuff!
df_summary.to_csv('final_dataframe_summary.csv')
In [140]:
top_5_crashes = df_full.sort_values('fatalities', ascending=False)[0:5]
Here are the top 5 crashes, with the number of fatalities and the flight origin for each:
In [141]:
top_5_crashes[['fatalities', 'flight origin']]
Out[141]:
Let's look at the descriptions:
In [154]:
top_5_crashes['description']
Out[154]:
It's 2016, so let's take accidents from 1991 and later and see which is the most common flight origin.
This crash is the first one to occur in 1991:
In [148]:
df_full.date[672]
Out[148]:
In [149]:
recent_incidents = df_full[673:]
In [150]:
recent_incidents['flight origin'].value_counts()[0:5]
Out[150]:
Without de-duplication, Bergen Airport, Ninoy Aquino International Airport, and Domodedovo International Airport in Moscow had the highest number of aviation incidents.
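The positional slice above relies on the rows being in chronological order. Since the date column is now a proper datetime, a more robust alternative (a sketch of the same analysis, not what was run above) is to filter on the dates directly:
recent_incidents = df_full[df_full['date'] >= '1991-01-01']
recent_incidents['flight origin'].value_counts()[0:5]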
Out of curiosity, let's do this for the entire dataset:
In [151]:
df_full['flight origin'].value_counts()[0:10]
Out[151]:
London Heathrow and LAX (entered twice as two slightly different strings) come out on top, which is not surprising given the number of flights these airports handle.
Note that one way we could proceed with de-duplication would be to use the fact that the summary tables contain links to each airport's Wikipedia page. We could then group strings that refer to the same airport by their common link.
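A rough sketch of that idea (not wired into the pipeline above; the helper name is hypothetical): when reading the 'Flight origin' cell of an infobox row, also grab the href of its first link and use it as a canonical grouping key.
def origin_with_link(each_tr):
    # Return the origin text plus the href of the first link in the cell (if any),
    # so rows naming the same airport slightly differently can share one key.
    td = each_tr.td
    link = td.find('a')
    key = link.get('href') if link is not None else td.text.strip()
    return td.text.strip(), key
Counting origins by the link-based key instead of the raw string would then fold slightly different spellings of the same airport into one row.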
In [152]:
df_full.to_json('crashes.json')