In [1]:
# the beginning of a crawler...
# find the URL in the response and use that for the next step
import urllib.request

fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
for line in fhand:
    # urlopen yields bytes, so decode each line before stripping
    print(line.decode().strip())
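In [ ]:
# A sketch of the "use that for the next step" idea: pull candidate URLs out
# of the fetched page with a regex. The pattern below is an assumption about
# how the links are written (double-quoted, absolute href attributes), not a
# general HTML parser.
import re
import urllib.request

fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
page = fhand.read().decode()
links = re.findall(r'href="(http[s]?://[^"]+)"', page)
print(links)  # the first entry could seed the next fetch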
In [ ]:
# rewritten for Python 3... it fetches the anchor tags
import urllib.request
from bs4 import BeautifulSoup

url = input('Enter - ')
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of anchor tags
# Each tag is like a dictionary of HTML attributes
tags = soup('a')
for tag in tags:
    print(tag.get('href', None))

# test on this page: http://www.dr-chuck.com/page1.htm
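In [ ]:
# If urlopen fails on an https URL with a certificate error, a common
# workaround for throwaway exercises like these is an unverified SSL
# context. A minimal sketch; never disable verification in real code.
import ssl
import urllib.request

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html = urllib.request.urlopen('https://www.dr-chuck.com/page1.htm', context=ctx).read()
print(len(html), 'bytes fetched')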
In [31]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'lxml')

# Retrieve a list of span tags and sum their integer contents
tags = soup('span')
# print(tags)  # uncomment to inspect the raw tags
sum_of_tags = 0
for tag in tags:
    sum_of_tags += int(tag.contents[0])
print(sum_of_tags)
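In [ ]:
# The same sum as a one-liner: sum() over a generator expression.
# Reuses the `soup` object from the cell above.
print(sum(int(tag.contents[0]) for tag in soup('span')))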
In [33]:
# Example code to retrieve information
# Retrieve all of the anchor tags (reuses `soup` from the cell above)
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.contents[0])
    print('Attrs:', tag.attrs)

# Unrelated warm-up: sum the integers 0 through 10
def adding_series():
    summ = 0
    for i in range(11):
        summ += i
    return summ

adding_series()
Out[33]: 55
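In [ ]:
# adding_series() is equivalent to the built-in sum over the same range:
# 0 + 1 + ... + 10 = 55
sum(range(11))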
In [ ]:
# Making a function out of the assignment
In [4]:
import urllib.request
from bs4 import BeautifulSoup

url = 'http://python-data.dr-chuck.net/comments_371514.html'

def assign1(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    tags = soup('span')
    sum_of_tags = 0
    for tag in tags:
        sum_of_tags += int(tag.contents[0])
    return sum_of_tags

assign1(url)
Out[4]:
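In [ ]:
# A slightly more defensive variant of assign1 (a sketch, not part of the
# assignment): skip spans whose contents aren't plain integers instead of
# crashing. The name assign1_safe is mine.
import urllib.request
from bs4 import BeautifulSoup

def assign1_safe(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    total = 0
    for tag in soup('span'):
        try:
            total += int(tag.contents[0])
        except (ValueError, TypeError, IndexError):
            continue  # ignore spans without a plain integer
    return total

assign1_safe('http://python-data.dr-chuck.net/comments_371514.html')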
In [28]:
# practice run: starting from the seed URL, repeatedly follow the link
# at `position` (counting from 1), `count` times
import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Fikret.html'
urls = [seed_url]
position = 3
count = 4
pos = position - 1  # convert the 1-based position to a 0-based list index
for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')
    tags = soup('a')
    urls.append(tags[pos].get('href', None))
print(urls)
In [30]:
# solution
import urllib.request
from bs4 import BeautifulSoup

seed_url = 'http://python-data.dr-chuck.net/known_by_Alan.html'
urls = [seed_url]
position = 18
count = 7
pos = position - 1
for i in range(count):
    html = urllib.request.urlopen(urls[-1]).read()
    soup = BeautifulSoup(html, 'lxml')
    tags = soup('a')
    urls.append(tags[pos].get('href', None))
print(urls[-1])
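In [ ]:
# Wrapping the solution loop in a function, in the spirit of the
# "making a function out of the assignment" cell above. The name
# follow_links is mine.
import urllib.request
from bs4 import BeautifulSoup

def follow_links(seed_url, position, count):
    """Repeatedly open the last URL, take the link at `position`
    (counting from 1), and return the final URL reached."""
    urls = [seed_url]
    for _ in range(count):
        html = urllib.request.urlopen(urls[-1]).read()
        soup = BeautifulSoup(html, 'lxml')
        urls.append(soup('a')[position - 1].get('href', None))
    return urls[-1]

follow_links('http://python-data.dr-chuck.net/known_by_Alan.html', 18, 7)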