In [1]:
# The beginning of a crawler...
# Find the URL in the response and use it for the next step

import urllib.request

fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
for line in fhand:
    print(line.strip())


b'<h1>The First Page</h1>'
b'<p>'
b'If you like, you can switch to the'
b'<a href="http://www.dr-chuck.com/page2.htm">'
b'Second Page</a>.'
b'</p>'
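
One way to pull that next URL out of the raw response, before Beautiful Soup enters the picture below, is a regular expression over the decoded HTML; a minimal sketch (the pattern assumes the href values are double-quoted, as they are on the test page):

In [ ]:
# Minimal sketch: pull href values out of raw HTML with a regex.
# Assumes attribute values are double-quoted, as on the test page.
import re
import urllib.request

html = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm').read().decode()
print(re.findall(r'href="(.+?)"', html))  # expected: ['http://www.dr-chuck.com/page2.htm']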

Parsing with Beautiful Soup

BeautifulSoup comes pre-installed with the Anaconda distribution.
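
If the lxml parser is not installed, Beautiful Soup can fall back on the parser in the standard library; a minimal sketch (both parser names are standard BeautifulSoup options):

In [ ]:
# Minimal sketch: pick whichever parser is available.
# 'lxml' is faster; 'html.parser' ships with Python itself.
from bs4 import BeautifulSoup

try:
    import lxml  # only checking availability
    parser = 'lxml'
except ImportError:
    parser = 'html.parser'

soup = BeautifulSoup('<p>hello</p>', parser)
print(soup.p.text)  # hello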


In [ ]:
# Rewritten for Python 3... it fetches all anchor tags

import urllib.request
from bs4 import BeautifulSoup

url = input('Enter - ')

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of anchor tags
# Each tag is like a dictionary of HTML attributes

tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
    
# test on this page: http://www.dr-chuck.com/page1.htm
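
Run against the test page, the loop should print the single link it contains, http://www.dr-chuck.com/page2.htm (visible in the raw output above).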

Doing the first assignment


In [31]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of span tags
# Each tag is like a dictionary of HTML attributes

tags = soup('span')
#print(tags)

sum_of_tags = 0

for tag in tags:
    sum_of_tags += int(tag.contents[0])
    
print(sum_of_tags)


2482
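
The same total can be written as a one-line generator expression; a sketch equivalent to the loop above (reusing the soup object from that cell):

In [ ]:
# Equivalent one-liner: sum the integer contents of every span tag.
print(sum(int(tag.contents[0]) for tag in soup('span')))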

In [33]:
# Example code to retrieve information from tags (reuses soup from the cell above)

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG:',tag)
    print('URL:',tag.get('href', None))
    print('Contents:',tag.contents[0])
    print('Attrs:',tag.attrs)
    
def adding_series():
    # Sum the integers 0 through 10
    summ = 0
    for i in range(11):
        summ += i
    return summ

adding_series()


Out[33]:
55
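
The warm-up function just reproduces the built-in sum over the same range; a one-line check:

In [ ]:
# The built-in sum over range(11) gives the identical result.
print(sum(range(11)))  # 55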

In [ ]:
# Making a function out of the assignment

In [4]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'

def assign1(url):
    """Fetch the page and sum the integer contents of all its span tags."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    tags = soup('span')
    
    sum_of_tags = 0
    for tag in tags:
        sum_of_tags += int(tag.contents[0])
    return sum_of_tags
    
assign1(url)


Out[4]:
2482
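
If a page ever contained a span tag whose contents are not a plain integer, the int() call would raise an exception; a hedged defensive variant (an assumption for illustration, since the assignment pages hold only numbers):

In [ ]:
# Defensive variant: skip span tags whose text is not an integer.
import urllib.request
from bs4 import BeautifulSoup

def assign1_safe(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    total = 0
    for tag in soup('span'):
        try:
            total += int(tag.contents[0])
        except (ValueError, TypeError, IndexError):
            continue  # span was empty or non-numeric
    return total

assign1_safe('http://python-data.dr-chuck.net/comments_371514.html')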

Doing the second assignment


In [28]:
# Training run on the sample data: follow the link at position 3, repeat 4 times

import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
urls = [seed_url]

position = 3
count = 4

pos = position - 1

for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')

    tags = soup('a')
    urls.append(tags[pos].get('href', None))

print(urls)


['http://python-data.dr-chuck.net/known_by_Fikret.html', 'http://python-data.dr-chuck.net/known_by_Montgomery.html', 'http://python-data.dr-chuck.net/known_by_Mhairade.html', 'http://python-data.dr-chuck.net/known_by_Butchi.html', 'http://python-data.dr-chuck.net/known_by_Anayah.html']

In [30]:
# Solution: follow the link at position 18, repeat 7 times

import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Alan.html'
urls = [seed_url]

position = 18
count = 7

pos = position - 1

for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')

    tags = soup('a')
    urls.append(tags[pos].get('href', None))

print(urls[-1])


http://python-data.dr-chuck.net/known_by_Shinay.html
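
Since the training cell and the solution differ only in their parameters, the link-following loop can be factored into one reusable function; a sketch:

In [ ]:
# Factor the link-following loop into a single reusable function.
import urllib.request
from bs4 import BeautifulSoup

def follow_links(seed_url, position, count):
    url = seed_url
    for _ in range(count):
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'lxml')
        # Follow the anchor tag at the given (1-based) position
        url = soup('a')[position - 1].get('href', None)
    return url

print(follow_links('http://python-data.dr-chuck.net/known_by_Alan.html', 18, 7))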
