In [ ]:
import logging
from os import path, makedirs
from pprint import pprint
from urllib.parse import urljoin

import requests
from requests import RequestException
from bs4 import BeautifulSoup
from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor


def check(link, blackwords):
    """Return True if the link contains none of the blacklisted words."""
    return all(blackword not in link for blackword in blackwords)
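A quick sanity check of the link filter; the example URLs below are made up for illustration:

In [ ]:
print(check('http://www.fdp.de/content/some-release', {'datenschutz', 'impressum'}))  # True
print(check('http://www.fdp.de/impressum', {'datenschutz', 'impressum'}))             # False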
In [ ]:
domain = 'http://www.fdp.de'
keyword = '/content/'
blackwords = {'datenschutz', 'impressum'}

# The press release archive is paginated: the plain overview page
# plus ?page=1 .. ?page=96.
site = 'http://www.fdp.de/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 97)]
pages.append(site)
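# Quick inspection of a few of the generated page URLs
# (this is what the pprint import above is for):
pprint(pages[:3])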
def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'html.parser')
            # Keep only hrefs that point to articles and pass the blackword filter;
            # href=True skips <a> tags without an href attribute.
            links = {link['href'] for link in soup.find_all('a', href=True)
                     if keyword in link['href'] and check(link['href'], blackwords)}
            # urljoin resolves site-relative hrefs against the domain without
            # producing double slashes.
            for article in (urljoin(domain, link) for link in links):
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)
# Make sure the output directory exists, then build the corpus.
makedirs('data', exist_ok=True)
HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP.txt'))
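Since ReadabilityExtractor is imported as well, the same pipeline can run with it in place of ParagraphExtractor. A sketch, assuming it can be constructed without arguments; the output file name here is my own choice:

In [ ]:
HTML2Corpus(get_data(), extractor=ReadabilityExtractor()).save(path.join('data', 'Corpus_FDP_readability.txt'))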