Read Contents from URL


In [43]:
from urllib.request import urlopen

url_response = urlopen('http://www.py4inf.com/code/romeo.txt')

contents = str(url_response.read())

print(contents)


b'But soft what light through yonder window breaks\nIt is the east and Juliet is the sun\nArise fair sun and kill the envious moon\nWho is already sick and pale with grief\n'

Split Contents Into Lines Using New-line ('\n')


In [44]:
lines = contents.split('\\n')

print(lines)


["b'But soft what light through yonder window breaks", 'It is the east and Juliet is the sun', 'Arise fair sun and kill the envious moon', 'Who is already sick and pale with grief', "'"]

Extract Words from Each Line

Note: You might have to strip out the ' character that seems to be slipping through. My guess is that you'll have to specify a RegEx expression to only accept.


In [47]:
Jword_set = set()

for line in lines:
    # Passing no args to split() will do what you want in this case:
    #   split on all weird characters (aka whitespace characters)
    words = line.split()
    for word in words:
        # Lowercase the word or else alphabetical sort puts capitals ahead
        word = word.lower()
        # Adding to a set (vs list) will automatically de-duplicate
        word_set.add(word)

print(word_set)


{'moon', 'what', 'with', 'the', 'it', 'pale', 'window', 'grief', 'arise', 'fair', 'and', 'soft', 'sun', "b'but", 'envious', 'kill', 'who', 'light', 'through', 'juliet', 'yonder', 'is', 'sick', 'breaks', 'east', 'already', "'"}

In [49]:
sorted_word_set = sorted(word_set)

print(sorted_word_set)


["'", 'already', 'and', 'arise', "b'but", 'breaks', 'east', 'envious', 'fair', 'grief', 'is', 'it', 'juliet', 'kill', 'light', 'moon', 'pale', 'sick', 'soft', 'sun', 'the', 'through', 'what', 'who', 'window', 'with', 'yonder']

In [ ]:


In [ ]: