In [8]:
from time import sleep
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
def find_next_wikipedia_link(url, delay=30, timeout=30):
    '''Return the first article link found in the paragraphs of a page.

    The page at ``url`` is downloaded and parsed, its first two <table>
    elements are removed (presumably infoboxes/sidebars that precede the
    lead text -- confirm), and every <p> is scanned in document order.
    A link qualifies when its href
      * starts with '/wiki/',
      * is not under '/wiki/Help' (e.g. the IPA pronunciation help pages),
      * does not contain 'cite_note', and
      * does not end in '.ogg' (audio files).
    Disambiguation pages are NOT filtered out explicitly.

    Args:
        url: Absolute URL of the page to fetch.
        delay: Seconds to wait before the request (default 30). Pass 0 or
            None to skip the wait.
        timeout: Seconds ``requests`` waits for the server before giving
            up (default 30).

    Returns:
        The site-relative href of the first qualifying link (for example
        '/wiki/Bird'), or None when no paragraph contains one.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: any other network failure or timeout.
    '''
    # Be polite to Wikipedia. It gives us so much.
    if delay and delay > 0:
        sleep(delay)

    # Fetch the page. Without a timeout a stalled connection would block
    # forever, and without the status check an error page would be parsed
    # as if it were the article.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed on this machine.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Drop (at most) the first two tables so their contents are not scanned.
    for _ in range(2):
        table = soup.find('table')
        if table is None:
            break
        table.decompose()

    # Open up each paragraph and return the first acceptable anchor target.
    for paragraph in soup.find_all('p'):
        for anchor in paragraph.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue
            if not link.startswith('/wiki/'):
                continue
            if link.startswith('/wiki/Help'):
                continue
            if 'cite_note' in link:
                continue
            if link.endswith('.ogg'):
                continue
            return link
    return None
In [9]:
# Spot-check the link finder on a handful of articles, printing the first
# link it reports for each one.
sample_urls = (
    'https://en.wikipedia.org/wiki/Mathematics',
    'https://en.wikipedia.org/wiki/Blacksmith',
    'https://en.wikipedia.org/wiki/Kakapo',
    'https://en.wikipedia.org/wiki/Life',
)
for sample_url in sample_urls:
    print(find_next_wikipedia_link(sample_url))
In [10]:
def find_philosophy(url):
    '''Follow first links from ``url`` until the Philosophy article is hit.

    The theory goes that if you follow the first link on any Wikipedia
    article, you will always find your way back to the main Philosophy
    article. This function tests that theory for one starting link.

    The walk stops when
      * the current URL ends with 'wikipedia.org/wiki/Philosophy',
      * a URL is reached a second time (an infinite loop; the repeated URL
        is appended once more so the caller can see where the cycle closes),
      * or the current page yields no further link (dead end).

    Args:
        url: Absolute URL of the article to start from.

    Returns:
        The list of URLs visited, in order, starting with ``url``.

    Exceptions raised by ``find_next_wikipedia_link`` (e.g. network errors)
    propagate to the caller.
    '''
    # This is a partial URL, so the host's language prefix does not matter.
    PHILOSOPHY_URL = 'wikipedia.org/wiki/Philosophy'

    pages = [url]
    current_url = url
    while not current_url.endswith(PHILOSOPHY_URL):
        link = find_next_wikipedia_link(current_url)
        if not link:
            # Dead end: nothing to follow. Concatenating None onto a base
            # URL would raise TypeError, so report the trail so far instead.
            return pages

        # Resolve the site-relative link against the page we are on, so a
        # walk that starts on a non-English wiki stays on that wiki.
        current_url = urljoin(current_url, link)

        already_seen = current_url in pages
        pages.append(current_url)
        if already_seen:
            return pages
    return pages
In [11]:
def print_philosophy(link_list):
    '''Pretty-print the outcome of a first-link walk.

    Prints a banner saying whether the last URL in ``link_list`` ends with
    '/Philosophy', then the number of entries in the list, then the text
    after the final slash of every URL, one per line.
    '''
    reached_goal = link_list[-1].endswith('/Philosophy')
    banner = '\nPhilosophy found!\n' if reached_goal else '\nPhilosophy not found!\n'
    print(banner)

    count = len(link_list)
    print(str(count) + ' steps:')

    for url in link_list:
        # Keep only the article title, i.e. whatever follows the last slash.
        print(url.rsplit('/', 1)[-1])
# Walk the first-link chain from the Kakapo article and print the trail.
start_url = 'https://en.wikipedia.org/wiki/Kakapo'
pages = find_philosophy(start_url)
print_philosophy(pages)
In [91]:
# Walk the first-link chain from the Doctor Who article and print the trail.
start_url = 'https://en.wikipedia.org/wiki/Doctor_Who'
pages = find_philosophy(start_url)
print_philosophy(pages)
In [93]:
# Walk the first-link chain from the Norman Borlaug article and print the trail.
start_url = 'https://en.wikipedia.org/wiki/Norman_Borlaug'
pages = find_philosophy(start_url)
print_philosophy(pages)
In [94]:
# Walk the first-link chain from the Arya Stark article and print the trail.
start_url = 'https://en.wikipedia.org/wiki/Arya_Stark'
pages = find_philosophy(start_url)
print_philosophy(pages)
In [13]:
# Walk the first-link chain from the Dune (novel) article and print the trail.
# The parentheses in the title are percent-encoded in the URL.
start_url = 'https://en.wikipedia.org/wiki/Dune_%28novel%29'
pages = find_philosophy(start_url)
print_philosophy(pages)
In [ ]: