Lesson 2

Data in more complex formats


In [3]:
# set up environment

Carrier list


In [7]:
page = "options.html"
with open(page, "r") as html:
    soup = BeautifulSoup(html, "lxml")

In [38]:
cl = soup.find(id="CarrierList").find_all("option")
print(cl)


[<option selected="selected" value="All">All U.S. and Foreign Carriers</option>, <option value="AllUS">All U.S. Carriers</option>, <option value="AllForeign">All Foreign Carriers</option>, <option value="FL">AirTran Airways</option>, <option value="AS">Alaska Airlines </option>, <option value="AA">American Airlines </option>, <option value="MQ">American Eagle Airlines </option>, <option value="5Y">Atlas Air </option>, <option value="DL">Delta Air Lines </option>, <option value="EV">ExpressJet Airlines </option>, <option value="F9">Frontier Airlines </option>, <option value="HA">Hawaiian Airlines </option>, <option value="B6">JetBlue Airways</option>, <option value="OO">SkyWest Airlines </option>, <option value="WN">Southwest Airlines </option>, <option value="NK">Spirit Air Lines</option>, <option value="US">US Airways </option>, <option value="UA">United Air Lines </option>, <option value="VX">Virgin America</option>]

In [44]:
result = []
for e in cl:
    if (e["value"] == "All") | (e["value"] == "AllUS") | (e["value"] == "AllForeign"):
        next
    else:
        result.append(e["value"])

print(result)


['FL', 'AS', 'AA', 'MQ', '5Y', 'DL', 'EV', 'F9', 'HA', 'B6', 'OO', 'WN', 'NK', 'US', 'UA', 'VX']

In [56]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Please note that the function 'make_request' is provided for your reference only.
You will not be able to to actually use it from within the Udacity web UI.
All your changes should be in the 'extract_carrier' function.
Also note that the html file is a stripped down version of what is actually on
the website.

Your task in this exercise is to get a list of all airlines. Exclude all of the
combination values like "All U.S. Carriers" from the data that you return.
You should return a list of codes for the carriers.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        # parse html pasge
        soup = BeautifulSoup(html, "lxml")
        
        # find list of carries in parsed page
        carrierlist = soup.find(id="CarrierList").find_all("option")
        
        # filter combination values and save valid results to list
        for e in carrierlist:
            if (e["value"] == "All") | (e["value"] == "AllUS") | (e["value"] == "AllForeign"):
                next
            else:
                data.append(e["value"])
        
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': airport,
                          'CarrierList': carrier,
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data

if __name__ == "__main__":
    test()

In [60]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports' function so that it returns a list of airport
codes, excluding any combinations like "All".
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_airports(page):
    data = []
    with open(page, "r") as html:
        # parse html pasge
        soup = BeautifulSoup(html, "lxml")
        
        # find list of airports in parsed page
        airportlist = soup.find(id="AirportList").find_all("option")
        
        # filter combination values and save valid results to list
        for e in airportlist:
            if (e["value"] == "All") | (e["value"] == "AllMajors") | (e["value"] == "AllOthers"):
                next
            else:
                data.append(e["value"])
    
    return data


def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data

if __name__ == "__main__":
    test()


15
['ATL', 'BWI', 'BOS', 'CLT', 'MDW', 'ORD', 'DFW', 'DEN', 'DTW', 'FLL', 'IAH', 'LAS', 'LAX', 'ABR', 'ABI']

In [ ]:
page2 = "FL-ATL.htm"
with open(page2, "r") as html:
    soup = BeautifulSoup(html, "lxml")

In [105]:
table = soup.find("table", class_="dataTDRight")
print(table)


<table border="1" cellpadding="4" cellspacing="1" class="dataTDRight" id="DataGrid1" rules="all" style="width:750px;">
<tr class="libraryTHY2_Center" style="color:White;background-color:#5D95C9;">
<td>Year</td><td>Month</td><td>DOMESTIC</td><td>INTERNATIONAL</td><td>TOTAL</td>
</tr><tr class="dataTDRight">
<td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">2002</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">10</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">815,489</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">92,565</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">908,054</td>
</tr><tr class="dataTDRight" style="background-color:#EFEFEF;">
<td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">2002</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">11</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">766,775</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">91,342</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">858,117</td>
</tr><tr class="dataTDRight">
<td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">2002</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">12</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">782,175</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">96,881</td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;">879,056</td>
</tr><tr class="dataTDRight" style="background-color:LightYellow;">
<td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;"><b>2002</b></td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;"><b>TOTAL</b></td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;"><b>8,085,083</b></td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;"><b>1,023,994</b></td><td style="font-family: Verdana, Geneva, Arial, Helvetica, sans-serif;"><b>9,109,077</b></td>
</tr><tr class="dataTDRight">
</tr>
</table>

In [144]:
data = []
info = {}
result = []
header = ["Year", "Month", "DOMESTIC", "INTERNATIONAL", "TOTAL"]

# loop over rows
for row in table.find_all("tr"):
    
    # loop over columns
    for col in row.find_all("td"):
        
        # filter header strings
        if col.text in header:
            pass
        
        # add data to result and fix thousand separator
        else:
            result.append(col.text.replace(",", ""))
            
# save result data for each row
info["year"] = int(result[0])
info["month"] = int(result[1])
info["flights"] = {}
info["flights"]["domestic"] = int(result[2])
info["flights"]["international"] = int(result[3])
    
# append temp data to final result
print(info)


{'year': 2002, 'month': 10, 'flights': {'international': 92565, 'domestic': 815489}}

In [134]:
info = {}
info["year"] = int(result[0])
info["month"] = int(result[1])
info["flights"] = {}
info["flights"]["domestic"] = int(result[2])
info["flights"]["international"] = int(result[3])

print(info)


{'year': 2002, 'month': 10, 'flights': {'international': 92565, 'domestic': 815489}}

In [149]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
extract the flight data from that table as a list of dictionaries, each
dictionary containing relevant data from the file and table row. This is an
example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file' function.
"""
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

#datadir = "data"
datadir = ""

def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files


def process_file(f):
    """
    This function extracts data from the file given as the function argument in
    a list of dictionaries. This is example of the data structure you should
    return:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]


    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.
    """
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    # Note: create a new dictionary for each entry in the output data list.
    # If you use the info dictionary defined here each element in the list 
    # will be a reference to the same info dictionary.
    with open("{}/{}".format(datadir, f), "r") as html:
        
        # parse html data
        soup = BeautifulSoup(html, "lxml")
        
        # find relevant data
        table = soup.find("table", class_="dataTDRight")
        
        # find all rows
        rows = table.find_all("tr")

        # loop over rows        
        for row in rows:
            
            # for reach row find all cells
            cells = row.find_all("td")
            
            # filter TOTAL
            headers = ["Year", "Month", "DOMESTIC", "INTERNATIONAL", "TOTAL"]
            if (cells[1].text not in headers):
        
                # add information to dict
                info = {
                    "courier" : info["courier"],
                    "airport" : info["airport"],
                    "year" : int(cells[0].text),
                    "month" : int(cells[1].text),
                    "flights" : {
                        "domestic" : int(cells[2].text.replace(",","")), # remove punctuation
                        "international" : int(cells[3].text.replace(",","")) # remove punctuation
                        }  
                    }
   
                # append info dict to data list
                data.append(info)
            
    return data


def test():
    print("Running a simple test...")
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)
        
    assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print("... success!")

if __name__ == "__main__":
    test()


Running a simple test...
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-149-5ff9f47b4317> in <module>()
    135 
    136 if __name__ == "__main__":
--> 137     test()

<ipython-input-149-5ff9f47b4317> in test()
    113 def test():
    114     print("Running a simple test...")
--> 115     open_zip(datadir)
    116     files = process_all(datadir)
    117     data = []

<ipython-input-149-5ff9f47b4317> in open_zip(datadir)
     37 
     38 def open_zip(datadir):
---> 39     with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
     40         myzip.extractall()
     41 

/Users/stefan/anaconda/lib/python3.4/zipfile.py in __init__(self, file, mode, compression, allowZip64)
    921             modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
    922             try:
--> 923                 self.fp = io.open(file, modeDict[mode])
    924             except OSError:
    925                 if mode == 'a':

FileNotFoundError: [Errno 2] No such file or directory: '.zip'

In [ ]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This and the following exercise are using US Patent database. The patent.data
file is a small excerpt of much larger datafiles that are available for
download from US Patent website. These files are pretty large ( >100 MB each).
The original file is ~600MB large, you might not be able to open it in a text
editor.

The data itself is in XML, however there is a problem with how it's formatted.
Please run this script and observe the error. Then find the line that is
causing the error. You can do that by just looking at the datafile in the web
UI, or programmatically. For quiz purposes it does not matter, but as an
exercise we suggest that you try to do it programmatically.

NOTE: You do not need to correct the error - for now, just find where the error
is occurring.
"""

import xml.etree.ElementTree as ET

PATENTS = 'patent.data'

def get_root(fname):

    tree = ET.parse(fname)
    return tree.getroot()


get_root(PATENTS)

In [165]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

import xml.etree.ElementTree as ET
PATENTS = 'patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    As a hint - each patent declaration starts with the same line that was
    causing the error found in the previous exercises.
    
    The new files should be saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.
    """
    
    # open input file
    with open(PATENTS, "r") as f_in:
        
        # read xml tag
        xml_tag = f_in.readline()
        
        # split lines according to xml tag
        lines = f_in.read().split(xml_tag)
        
        # loop over lines
        for n, line in enumerate(lines):
            
            # write lines to separate files
            with open("{}-{}".format(PATENTS, n), "w") as f_out:
                f_out.writelines(xml_tag)
                f_out.write(line.strip())

def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print("You have not split the file {} in the correct boundary!".format(fname))
            f.close()
        except:
            print("Could not find file {}. Check if the filename is correct!".format(fname))


test()