In [1]:
# The beginning of a crawler...
# Find the URL in the response and use it for the next step

import urllib.request

fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
for line in fhand:
    print(line.strip())


b'<h1>The First Page</h1>'
b'<p>'
b'If you like, you can switch to the'
b'<a href="http://www.dr-chuck.com/page2.htm">'
b'Second Page</a>.'
b'</p>'
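
One way to pull that next URL out of the raw response, before Beautiful Soup enters the picture below, is a regular expression over the decoded HTML; a minimal sketch (the pattern assumes the href values are double-quoted, as they are on the test page):

In [ ]:
# Minimal sketch: pull href values out of raw HTML with a regex.
# Assumes attribute values are double-quoted, as on the test page.
import re
import urllib.request

html = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm').read().decode()
print(re.findall(r'href="(.+?)"', html))  # expected: ['http://www.dr-chuck.com/page2.htm']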

Parsing with Beautiful Soup

BeautifulSoup comes pre-installed with the Anaconda distribution.
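
If the lxml parser is not installed, Beautiful Soup can fall back on the parser in the standard library; a minimal sketch (both parser names are standard BeautifulSoup options):

In [ ]:
# Minimal sketch: pick whichever parser is available.
# 'lxml' is faster; 'html.parser' ships with Python itself.
from bs4 import BeautifulSoup

try:
    import lxml  # only checking availability
    parser = 'lxml'
except ImportError:
    parser = 'html.parser'

soup = BeautifulSoup('<p>hello</p>', parser)
print(soup.p.text)  # hello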


In [ ]:
# Rewritten for Python 3... it fetches all anchor tags

import urllib.request
from bs4 import BeautifulSoup

url = input('Enter - ')

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of anchor tags
# Each tag is like a dictionary of HTML attributes

tags = soup('a')
for tag in tags:
    print(tag.get('href', None))
    
# test on this page: http://www.dr-chuck.com/page1.htm
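
Run against the test page, the loop should print the single link it contains, http://www.dr-chuck.com/page2.htm (visible in the raw output above).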

Doing the first assignment


In [31]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of span tags
# Each tag is like a dictionary of HTML attributes

tags = soup('span')
#print(tags)

sum_of_tags = 0

for tag in tags:
    sum_of_tags += int(tag.contents[0])
    
print(sum_of_tags)


2482
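
The same total can be written as a one-line generator expression; a sketch equivalent to the loop above (reusing the soup object from that cell):

In [ ]:
# Equivalent one-liner: sum the integer contents of every span tag.
print(sum(int(tag.contents[0]) for tag in soup('span')))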

In [33]:
# Example code to retrieve information from tags (reuses soup from the cell above)

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG:',tag)
    print('URL:',tag.get('href', None))
    print('Contents:',tag.contents[0])
    print('Attrs:',tag.attrs)
    
def adding_series():
    # Sum the integers 0 through 10
    summ = 0
    for i in range(11):
        summ += i
    return summ

adding_series()


Out[33]:
55
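
The warm-up function just reproduces the built-in sum over the same range; a one-line check:

In [ ]:
# The built-in sum over range(11) gives the identical result.
print(sum(range(11)))  # 55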

In [ ]:
# Making a function out of the assignment

In [4]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'

def assign1(url):
    """Fetch the page and sum the integer contents of all its span tags."""
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    tags = soup('span')
    
    sum_of_tags = 0
    for tag in tags:
        sum_of_tags += int(tag.contents[0])
    return sum_of_tags
    
assign1(url)


Out[4]:
2482
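
If a page ever contained a span tag whose contents are not a plain integer, the int() call would raise an exception; a hedged defensive variant (an assumption for illustration, since the assignment pages hold only numbers):

In [ ]:
# Defensive variant: skip span tags whose text is not an integer.
import urllib.request
from bs4 import BeautifulSoup

def assign1_safe(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    total = 0
    for tag in soup('span'):
        try:
            total += int(tag.contents[0])
        except (ValueError, TypeError, IndexError):
            continue  # span was empty or non-numeric
    return total

assign1_safe('http://python-data.dr-chuck.net/comments_371514.html')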

Doing the second assignment


In [28]:
# Training run on the sample data: follow the link at position 3, repeat 4 times

import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
urls = [seed_url]

position = 3
count = 4

pos = position - 1

for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')

    tags = soup('a')
    urls.append(tags[pos].get('href', None))

print(urls)


['http://python-data.dr-chuck.net/known_by_Fikret.html', 'http://python-data.dr-chuck.net/known_by_Montgomery.html', 'http://python-data.dr-chuck.net/known_by_Mhairade.html', 'http://python-data.dr-chuck.net/known_by_Butchi.html', 'http://python-data.dr-chuck.net/known_by_Anayah.html']

In [30]:
# Solution: follow the link at position 18, repeat 7 times

import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Alan.html'
urls = [seed_url]

position = 18
count = 7

pos = position - 1

for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')

    tags = soup('a')
    urls.append(tags[pos].get('href', None))

print(urls[-1])


http://python-data.dr-chuck.net/known_by_Shinay.html
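
Since the training cell and the solution differ only in their parameters, the link-following loop can be factored into one reusable function; a sketch:

In [ ]:
# Factor the link-following loop into a single reusable function.
import urllib.request
from bs4 import BeautifulSoup

def follow_links(seed_url, position, count):
    url = seed_url
    for _ in range(count):
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'lxml')
        # Follow the anchor tag at the given (1-based) position
        url = soup('a')[position - 1].get('href', None)
    return url

print(follow_links('http://python-data.dr-chuck.net/known_by_Alan.html', 18, 7))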
