notebook.community

Edit and run



In [8]:

    
from time import sleep
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests


def find_next_wikipedia_link(url, delay=30):
    '''Return the first article link in the body text of a Wikipedia page.

    The page at `url` is downloaded and parsed as HTML. The first two
    <table> elements are dropped (presumably infobox/sidebar tables whose
    links should not count -- confirm against the pages you test), and then
    every <a> inside every <p> is examined in document order.

    A link is skipped when it:
      * has no href, or does not start with '/wiki/' (external links,
        in-page anchors);
      * starts with '/wiki/Help' (e.g. pronunciation keys);
      * contains 'cite_note';
      * ends with '.ogg' (audio files).

    Parameters:
        url:   full URL of the Wikipedia page to fetch.
        delay: seconds to pause before the request, to be polite to
               Wikipedia. Defaults to 30; pass 0 to skip the pause.

    Returns:
        The site-relative href of the first qualifying link
        (e.g. '/wiki/Ancient_Greek'), or None if the page has none.
    '''
    # Be polite to Wikipedia. It gives us so much.
    if delay > 0:
        sleep(delay)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    html = requests.get(url, timeout=60).text
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so results can differ between machines (and it warns).
    soup = BeautifulSoup(html, 'html.parser')

    # Drop (at most) the first two tables so their links are not considered.
    for _ in range(2):
        table = soup.find('table')
        if table is None:
            break
        table.decompose()

    # Walk the paragraphs in order and return the first acceptable link.
    for paragraph in soup.find_all('p'):
        for anchor in paragraph.find_all('a'):
            link = anchor.get('href')
            if not link or not link.startswith('/wiki/'):
                continue
            if link.startswith('/wiki/Help'):
                continue
            if 'cite_note' in link or link.endswith('.ogg'):
                continue
            return link

    return None



In [9]:

    
# Spot-check the link finder on a handful of articles, printing the first
# qualifying link found on each page.
for sample_url in (
    'https://en.wikipedia.org/wiki/Mathematics',
    'https://en.wikipedia.org/wiki/Blacksmith',
    'https://en.wikipedia.org/wiki/Kakapo',
    'https://en.wikipedia.org/wiki/Life',
):
    print(find_next_wikipedia_link(sample_url))









    



/wiki/Ancient_Greek
/wiki/Metalsmith
/wiki/M%C4%81ori_language
/wiki/Physical_body



In [10]:

    
def find_philosophy(url):
    '''Follow first links from a Wikipedia article until Philosophy is reached.

    The theory goes that if you follow the first link on any Wikipedia
    article, you will always find your way to the main Philosophy article.
    This function tests that theory for one starting page.

    Parameters:
        url: full URL of the article to start from.

    Returns:
        The list of full URLs visited, starting with `url`. The walk ends
        in one of three ways:
          * the last URL ends with 'wikipedia.org/wiki/Philosophy' (success);
          * a page was reached a second time (infinite loop) -- the repeated
            URL is appended once more, so it appears twice in the list;
          * a page had no qualifying link (dead end) -- the list simply
            stops at that page.
    '''
    # Matched as a suffix so that any language subdomain qualifies.
    PHILOSOPHY_URL = 'wikipedia.org/wiki/Philosophy'
    pages = [url]
    current_url = url
    while not current_url.endswith(PHILOSOPHY_URL):
        next_link = find_next_wikipedia_link(current_url)
        if next_link is None:
            # Dead end: nothing to follow, so report the path walked so far
            # instead of failing on `str + None`.
            return pages
        # Resolve the site-relative href against the page it came from, so
        # the walk stays on the same language edition as the starting URL.
        current_url = urljoin(current_url, next_link)
        if current_url in pages:
            # Loop detected: record the repeat so callers can see where the
            # cycle closes, then stop.
            pages.append(current_url)
            return pages
        pages.append(current_url)

    return pages



In [11]:

    
def print_philosophy(link_list):
    '''Pretty-print the path returned by find_philosophy.

    Prints a banner saying whether the last URL in `link_list` ends with
    '/Philosophy', then the number of entries, then the final path segment
    (the text after the last '/') of every URL, one per line.

    Parameters:
        link_list: list of URL strings in visiting order. An empty list is
                   reported as "not found" with 0 steps.
    '''
    # Guard the [-1] lookup so an empty path does not raise IndexError.
    found = bool(link_list) and link_list[-1].endswith('/Philosophy')
    if found:
        print('\nPhilosophy found!\n')
    else:
        print('\nPhilosophy not found!\n')

    print('{} steps:'.format(len(link_list)))
    for link in link_list:
        # Show only the article title portion of each URL.
        print(link.split('/')[-1])


# Walk the first-link chain starting at the Kakapo article and print the path.
pages = find_philosophy('https://en.wikipedia.org/wiki/Kakapo')
print_philosophy(pages)









    



Philosophy found!

23 steps:
Kakapo
M%C4%81ori_language
Eastern_Polynesian_languages
Language_family
Language
Communication
Latin
Classical_language
Literature
Sense
Physiological
Ancient_Greek_language
Greek_language
Modern_Greek
Colloquialism
Word
Linguistics
Science
Knowledge
Awareness
Conscious
Quality_(philosophy)
Philosophy



In [91]:

    
# Walk the first-link chain starting at the Doctor Who article and print the path.
pages = find_philosophy('https://en.wikipedia.org/wiki/Doctor_Who')
print_philosophy(pages)









    



Philosophy found!

21 steps:
Doctor_Who
British_television_science_fiction
BBC
United_Kingdom
Sovereign_state
International_law
State_(polity)
Political_division
Region
Geography
Greek_language
Modern_Greek
Colloquialism
Word
Linguistics
Science
Knowledge
Awareness
Conscious
Quality_(philosophy)
Philosophy



In [93]:

    
# Walk the first-link chain starting at the Norman Borlaug article and print the path.
pages = find_philosophy('https://en.wikipedia.org/wiki/Norman_Borlaug')
print_philosophy(pages)









    



Philosophy found!

8 steps:
Norman_Borlaug
Biologist
Scientist
Knowledge
Awareness
Conscious
Quality_(philosophy)
Philosophy



In [94]:

    
# Walk the first-link chain starting at the Arya Stark article and print the path.
pages = find_philosophy('https://en.wikipedia.org/wiki/Arya_Stark')
print_philosophy(pages)









    



Philosophy found!

22 steps:
Arya_Stark
Fictional_character
Person
Human
Homo_sapiens
Latin
Classical_language
Literature
Sense
Physiological
Ancient_Greek_language
Greek_language
Modern_Greek
Colloquialism
Word
Linguistics
Science
Knowledge
Awareness
Conscious
Quality_(philosophy)
Philosophy



In [13]:

    
# Walk the first-link chain starting at the Dune (novel) article and print the path.
# The parentheses in the title are percent-encoded in this URL.
pages = find_philosophy('https://en.wikipedia.org/wiki/Dune_%28novel%29')
print_philosophy(pages)









    



Philosophy found!

24 steps:
Dune_%28novel%29
Epic_(genre)
Genre
French_language
Romance_languages
Vulgar_Latin
Classical_Latin
Latin
Classical_language
Literature
Sense
Physiological
Ancient_Greek_language
Greek_language
Modern_Greek
Colloquialism
Word
Linguistics
Science
Knowledge
Awareness
Conscious
Quality_(philosophy)
Philosophy



In [ ]: