New-style pages (2014 onwards)


In [1]:
import urllib.request
from bs4 import BeautifulSoup as bs

In [69]:
def _extract_titles(movie_table):
    rows = movie_table.find_all('tr')
    content = []
    for row in rows:
        content.append([td.get_text() for td in row.find_all(["td", "th"])])
    titles = []
    for row in content:
        if len(row) == 6:
            titles.append(row[0])
        elif len(row) == 7:
            titles.append(row[1])
        elif len(row) == 8:
            titles.append(row[2])
        else:
            print("unknown length!")
            print(row)
    return titles

In [70]:
def _extract_titles_from_wiki_page(wiki_url):
    """Download a Wikipedia film-list page and collect the titles from its tables.

    Every ``<table class="wikitable">`` on the page is passed through
    ``_extract_titles``; the results are concatenated and entries equal to
    the header text ``"Title"`` are dropped. Progress is printed to stdout.

    Args:
        wiki_url: URL of the page to fetch.

    Returns:
        list of title strings gathered from all tables on the page.
    """
    # Parse inside the `with` so the HTTP response is closed as soon as the
    # document has been read, instead of being left open until garbage
    # collection.
    with urllib.request.urlopen(wiki_url) as response:
        wiki_page = bs(response, "html.parser")

    movies_tables = wiki_page.find_all('table', {'class': 'wikitable'})
    titles = []
    for table in movies_tables:
        print("Extracting a table...")
        titles += _extract_titles(table)
    # NOTE(review): header cells are matched by exact text; confirm get_text()
    # returns "Title" without surrounding whitespace on these pages.
    titles = [title for title in titles if title != "Title"]
    print('{} titles collected.'.format(len(titles)))
    return titles

In [71]:
# URL pattern shared by the per-year "List of American films" pages.
WIKI_LIST_URL_TEMPLATE = 'https://en.wikipedia.org/wiki/List_of_American_films_of_{}'

# Years to scrape with the new-style extractor; add a year here to extend the run.
NEW_STYLE_YEARS = (2014, 2015)

# Maps each year to the URL of its film-list page.
url_map_by_year = {
    year: WIKI_LIST_URL_TEMPLATE.format(year) for year in NEW_STYLE_YEARS
}

In [72]:
# Scraped titles keyed by year; filled in by the loop over url_map_by_year in the next cell.
titles_by_year = {}

In [73]:
# Fetch each configured page and store its list of titles under the matching year.
for year, page_url in url_map_by_year.items():
    titles_by_year[year] = _extract_titles_from_wiki_page(page_url)


Extracting a table...
Extracting a table...
Extracting a table...
Extracting a table...
212 titles collected.
Extracting a table...
Extracting a table...
Extracting a table...
Extracting a table...
129 titles collected.

Old-style pages (before 2014)


In [5]:
# TODO: wrap this fetch (and the row parsing further down) in `_old_extract(wiki_url)`.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2013'
# Parse inside the `with` so the HTTP response is closed once the page is read.
with urllib.request.urlopen(wiki_url) as response:
    wiki_page = bs(response, "html.parser")
# Only the first wikitable on the page is used here.
table = wiki_page.find_all('table', {'class': 'wikitable'})[0]

In [51]:
import re

In [75]:
# Matches "<X>:<X>": a run of word/whitespace characters, a colon, then that
# same run repeated. _parse_title uses it to detect a duplicated title prefix.
MIRROR_REGEX = r"([\w\s]+):\1"

In [91]:
def _parse_title(title):
    if "TheThe" in title:
        title = title[title.rfind('TheThe')+3:]
    elif "AA" in title:
        title = title[title.rfind('AA')+1:]
    else:
        matches = re.findall(MIRROR_REGEX, title)
        if len(matches) > 0:
            title = title[title.rfind(matches[0]):]
    return title.strip()

In [92]:
# Take the first <td> of every row in the table and clean it with _parse_title.
rows = table.find_all('tr')
titles = []
for row in rows:
    cells = row.find_all(["td"])
    if not cells:
        # Rows without any <td> (presumably header rows) carry no title.
        # Checking explicitly, rather than catching IndexError around the
        # whole expression, keeps errors raised while parsing from being
        # silently swallowed.
        continue
    titles.append(_parse_title(cells[0].get_text()))

In [94]:
# titles