In [1]:
import urllib.request
from bs4 import BeautifulSoup as bs
In [69]:
def _extract_titles(movie_table):
rows = movie_table.find_all('tr')
content = []
for row in rows:
content.append([td.get_text() for td in row.find_all(["td", "th"])])
titles = []
for row in content:
if len(row) == 6:
titles.append(row[0])
elif len(row) == 7:
titles.append(row[1])
elif len(row) == 8:
titles.append(row[2])
else:
print("unknown length!")
print(row)
return titles
In [70]:
def _extract_titles_from_wiki_page(wiki_url):
    """Fetch a Wikipedia film-list page and gather titles from all its
    wikitables, dropping the repeated header label 'Title'.
    """
    page = bs(urllib.request.urlopen(wiki_url), "html.parser")
    collected = []
    for movie_table in page.find_all('table', {'class': 'wikitable'}):
        print("Extracting a table...")
        collected.extend(_extract_titles(movie_table))
    # Each table repeats its header row; filter out the column label.
    cleaned = [title for title in collected if title != "Title"]
    print('{} titles collected.'.format(len(cleaned)))
    return cleaned
In [71]:
# Year -> Wikipedia "List of American films" page for that year.
url_map_by_year = dict([
    (2014, 'https://en.wikipedia.org/wiki/List_of_American_films_of_2014'),
    (2015, 'https://en.wikipedia.org/wiki/List_of_American_films_of_2015'),
])
In [72]:
# Accumulator for the scrape below: year -> list of title strings.
titles_by_year = dict()
In [73]:
# Scrape every configured year into the accumulator.
for year, url in url_map_by_year.items():
    titles_by_year[year] = _extract_titles_from_wiki_page(url)
In [5]:
# Exploratory cell (remnant of an earlier `_old_extract` helper):
# fetch the 2013 film list and keep its first wikitable for parsing below.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_2013'
wiki_page = bs(urllib.request.urlopen(wiki_url), "html.parser")
all_tables = wiki_page.find_all('table', {'class': 'wikitable'})
table = all_tables[0]
In [51]:
import re
In [75]:
# Matches a mirrored run of the form "X:X", capturing the repeated
# word/whitespace text X. Used by _parse_title (below) as a fallback to
# strip the duplicated sort-key text Wikipedia tables embed in title cells.
# NOTE(review): the literal ':' means plain duplication without a colon
# ("FooFoo") will never match this pattern — confirm the scraped cells
# actually contain a colon between the copies.
MIRROR_REGEX = r"([\w\s]+):\1"
In [91]:
def _parse_title(title):
    """Strip duplicated sort-key text from a scraped title cell.

    Wikipedia title cells render with the sort key repeated, producing
    e.g. 'TheThe ...' or 'AA ...'. Keep only the text from the second
    copy of the article onward; for other shapes, fall back to the
    MIRROR_REGEX match and cut at its last occurrence.
    """
    if "TheThe" in title:
        # rfind + 3 lands on the second 'The'.
        cut = title.rfind('TheThe') + 3
        title = title[cut:]
    elif "AA" in title:
        # rfind + 1 lands on the second 'A'.
        cut = title.rfind('AA') + 1
        title = title[cut:]
    else:
        mirrored = re.findall(MIRROR_REGEX, title)
        if mirrored:
            title = title[title.rfind(mirrored[0]):]
    return title.strip()
In [92]:
# Parse a title from every data row of the 2013 table. Header rows contain
# only <th> cells, so their <td> list is empty and they are skipped.
titles = []
for row in table.find_all('tr'):
    data_cells = row.find_all(["td"])
    if data_cells:
        titles.append(_parse_title(data_cells[0].get_text()))
In [94]:
# Uncomment to inspect the parsed 2013 titles (left commented to avoid
# dumping the full list into the notebook output):
# titles