This is the first step in our project. The code below is a crawler (written the old-school way, with BeautifulSoup) that fetches raw HTML from planecrashinfo.com, extracts the data from its HTML tables, and writes it to a MongoDB instance running on the same machine.
The entire data pipeline is shown below:
In [1]:
__author__ = 'shivam_gaur'
import requests
from bs4 import BeautifulSoup
import re
import os
import pymongo
from pymongo import MongoClient
import datetime
In [2]:
# Site root and the landing page of the accident database.
rooturl = "http://www.planecrashinfo.com"
url = rooturl + "/database.htm"

# Inclusive span of years to crawl; set start_year to 1920 for the whole dataset.
start_year, end_year = 2014, 2016
year_range = range(start_year, end_year + 1)

# Placeholder for the per-year index URL; the crawl loop assigns the real value.
newurl = ""
In [3]:
# Create a client with pymongo's default connection settings, then select the
# database and the collection that crawled records are inserted into.
client = MongoClient()
db = client.get_database("aircrashdb")
collection = db.get_collection("crawled_data")
In [4]:
def getMonth(month):
    """Return the 1-based number of an English month name, or 0 if unknown.

    The comparison is case-insensitive; only full month names are recognised.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )
    try:
        return month_names.index(month.lower()) + 1
    except ValueError:
        # Not a valid month string.
        return 0
In [5]:
def makeBeautifulSoupObject(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. It is passed
            to ``requests`` so that a stalled connection cannot hang the crawl
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the timeout expires).
    """
    # The context manager closes the session and its pooled connections once
    # the page has been downloaded, instead of leaking one session per call.
    with requests.Session() as session:
        # Mount an adapter that retries failed connections up to 5 times,
        # for both plain HTTP and HTTPS URLs.
        for scheme in ("http://", "https://"):
            session.mount(scheme, requests.adapters.HTTPAdapter(max_retries=5))
        response = session.get(url=url, timeout=timeout)
        # Hand the parser UTF-8 bytes re-encoded from the decoded body.
        plain_text = response.text.encode('utf8')
    return BeautifulSoup(plain_text, "lxml")
The `.encode('utf8')` call is necessary because the website uses the windows-1252 character set, which causes some characters to get garbled if the encoding is not changed explicitly.
This is what the HTML table looks like:
In [6]:
# Placeholder the site uses for an unknown value, and what is stored instead.
_UNKNOWN = "?"
_NULL = "NULL"

# Table labels whose value is stored unchanged under the given record key.
_PLAIN_FIELDS = {
    "Location:": "loc",
    "Operator:": "op",
    "Flight#:": "flight",
    "Route:": "route",
    "Registration:": "reg",
    "cn / ln:": "cnln",
    "Ground:": "ground",
    "Summary:": "summary",
}


def _null_if_unknown(value):
    """Return "NULL" for the "?" placeholder, otherwise *value* unchanged."""
    return _NULL if value == _UNKNOWN else value


def _parse_counts(value, prefix):
    """Split a "total (passengers:N crew:M)" cell into three record fields.

    Returns a dict with the keys ``<prefix>_total``, ``<prefix>_passengers``
    and ``<prefix>_crew``. Any part that is "?" or missing is stored as "NULL".
    """
    if value == _UNKNOWN:
        parts = []
    else:
        # Collapse runs of whitespace, drop the parentheses, then split into
        # the positional tokens: total, passengers:N, crew:M.
        normalized = " ".join(value.split())
        parts = normalized.replace("(", "").replace(")", "").split(" ")
    # Pad so that a cell with fewer than three tokens yields "NULL" fields
    # instead of raising IndexError.
    parts = parts + [_UNKNOWN] * (3 - len(parts))
    return {
        prefix + "_total": _null_if_unknown(parts[0]),
        prefix + "_passengers": _null_if_unknown(
            parts[1].replace("passengers:", "")),
        prefix + "_crew": _null_if_unknown(parts[2].replace("crew:", "")),
    }


def push_record_to_mongo(table_):
    """Parse one accident's detail table and insert it into MongoDB.

    Args:
        table_: Sequence of table elements from an accident page. Only the
            first element is read. Its first row is skipped, and every other
            row is read as a (label, value) pair taken from its first two
            cells.

    Side effects:
        Inserts one document into the module-level ``collection``.

    Raises:
        IndexError: If ``table_`` is empty, or the "Date:" cell has fewer
            than three space-separated parts.
        ValueError: If the "Date:" cell cannot be converted to a date.
    """
    record = {}
    # Name the parser explicitly, matching the one used to parse whole pages.
    table = BeautifulSoup(str(table_[0]), "lxml")
    for tr in table.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if len(tds) < 2:
            # A row without both a label and a value cell carries no field.
            continue
        # Column 1 holds the label and column 2 the value. Work on text (not
        # encoded bytes) and strip non-breaking spaces via an explicit escape.
        key = tds[0].get_text()
        value = tds[1].get_text().replace(u"\xa0", u"")

        if key == "Date:":
            # The split expects "<Month> <day>, <year>".
            dat = value.replace(",", "").split(" ")
            record["date"] = datetime.datetime(
                int(dat[2]), getMonth(dat[0]), int(dat[1]))
        elif key == "Time:":
            if value == _UNKNOWN:
                record["time"] = _NULL
            else:
                # Keep only the digits of the time string.
                record["time"] = re.sub(r"[^0-9]", "", value)
        elif key == "Aboard:":
            record.update(_parse_counts(value, "aboard"))
        elif key == "Fatalities:":
            # Unknown fatalities must fill the fatalities_* keys; previously
            # this case overwrote the aboard_* keys instead.
            record.update(_parse_counts(value, "fatalities"))
        elif key in _PLAIN_FIELDS:
            record[_PLAIN_FIELDS[key]] = _null_if_unknown(value)
        else:
            # Unrecognised label: use it, whitespace-free and lower-cased,
            # as the record key.
            fallback_key = "".join(key.split()).lower()
            record[fallback_key] = _null_if_unknown(value)
    collection.insert_one(record)
In [7]:
# Start time of the whole crawl; used for the total printed at the end.
program_start_time = datetime.datetime.utcnow()

for year in year_range:
    year_start = datetime.datetime.utcnow()
    # Index page of the year: <root>/<year>/<year>.htm
    newurl = rooturl + "/" + str(year) + "/" + str(year) + ".htm"
    year_index = makeBeautifulSoupObject(newurl)
    year_tables = year_index.find_all('table')
    print(newurl)
    # Detail pages reuse the index URL without its extension: <...>-<n>.htm
    detail_prefix = newurl.replace(".htm", "") + "-"
    for table in year_tables:
        # Count only the rows that belong to this table itself; the first row
        # is not requested, so record numbers start at 1.
        own_rows = [
            tr for tr in table.find_all('tr')
            if tr.find_parent('table') == table
        ]
        for row_number in range(1, len(own_rows)):
            accident_url = detail_prefix + str(row_number) + ".htm"
            accident_page = makeBeautifulSoupObject(accident_url)
            # Keep only the tables of the page; the first one is stored.
            push_record_to_mongo(accident_page.find_all('table'))
    year_elapsed = datetime.datetime.utcnow() - year_start
    print("Time to crawl year " + str(year) + "-" + str(year_elapsed))

program_end_time = datetime.datetime.utcnow()
print("_____________________________________")
print("Total program time - " + str(program_end_time - program_start_time))