In [ ]:
import logging
import requests
from os import path
from requests import RequestException
from bs4 import BeautifulSoup
from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor

def check(link, blackwords):
    """Return True if none of the blackwords occurs in the link."""
    return all(blackword not in link for blackword in blackwords)
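The check helper is reused by several cells below to drop unwanted links. A quick sanity check with made-up values (hypothetical, purely to illustrate the behaviour):
In [ ]:
# Hypothetical example values, only to illustrate check():
print(check('/presse/pressemitteilungen/feed', ['feed']))        # False: contains a blackword
print(check('/presse/pressemitteilungen/artikel-1', ['feed']))   # True: no blackword present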
News from the website of the party Die Linke: http://www.die-linke.de/nc/die-linke/nachrichten
In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'
site = 'http://www.die-linke.de/nc/die-linke/nachrichten'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 99)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            # Collect every article link on the listing page.
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke.txt'))
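Every cell below repeats this fetch, parse, filter, download pattern; only the domain, the keyword, the paging scheme and the extractor change. A generic variant could look like the following sketch (the crawl function and its parameter names are made up for illustration and are not used by the cells in this notebook):
In [ ]:
def crawl(pages, domain, keyword, blackwords=()):
    """Yield the raw HTML of every article linked from the given listing pages.

    Sketch only: pages, domain, keyword and blackwords stand in for the
    per-site values that the individual cells define themselves.
    """
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {a['href'] for a in soup.find_all('a')
                     if a.get('href') and keyword in a['href']
                     and check(a['href'], blackwords)}
            for href in links:
                yield requests.get('{}/{}'.format(domain, href), timeout=10).content
        except RequestException as error:
            logging.error('Error: %s', error)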
Press releases from the website of the party Die Linke: http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen
In [ ]:
domain = 'http://www.die-linke.de'
keyword = 'artikel'
site = 'http://www.die-linke.de/nc/presse/presseerklaerungen/presseerklaerungen'
pages = ['{}/browse/{}'.format(site, i) for i in range(1, 272)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_PR.txt'))
Press releases from the Die Linke faction: http://www.linksfraktion.de/pressemitteilungen
In [ ]:
domain = 'http://www.linksfraktion.de'
keyword = 'pressemitteilungen'
site = 'http://www.linksfraktion.de/pressemitteilungen'
pages = ['{}/?s={}'.format(site, i) for i in range(1, 1384)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_DieLinke_Fraktion.txt'))
Press releases from the website of the SPD faction: http://www.spdfraktion.de/presse/pressemitteilungen
In [ ]:
domain = 'http://www.spdfraktion.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.spdfraktion.de/presse/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 733)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_SPD_Fraktion.txt'))
Press releases from the website of the SPD's EU group: https://www.spd-europa.de/pressemitteilung
In [ ]:
domain = 'https://www.spd-europa.de'
keyword = '/pressemitteilungen/'
site = 'https://www.spd-europa.de/pressemitteilung'
pages = ['{}?page={}'.format(site, i) for i in range(1, 165)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            # No extra slash between domain and href here (the matched hrefs start with '/').
            links = ['{}{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'SPD_EU.txt'))
Vorwärts, an SPD newspaper: http://www.vorwaerts.de/international
In [ ]:
domain = 'http://www.vorwaerts.de'
keyword = '/artikel/'
blackwords = ['#comment-form']
site = 'http://www.vorwaerts.de/international'
pages = ['{}?page={}'.format(site, i) for i in range(1, 124)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)}
            links = ['{}{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'SPD_Vorwärts.txt'))
Press releases from the website of the Grüne faction: http://www.gruene-bundestag.de/presse_ID_2000127
In [ ]:
domain = 'http://www.gruene-bundestag.de'
keyword = 'presse/pressemitteilungen/'
blackword = 'feed'
site = 'http://www.gruene-bundestag.de/presse_ID_2000127'
pages = ['{}/pb_id/100/seite/{}'.format(site, i) for i in range(2, 1322)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and blackword not in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_Grüne_Fraktion.txt'))
Press releases from the website of the FDP: http://www.fdp.de/pressemitteilungen
In [ ]:
domain = 'http://www.fdp.de'
keyword = '/content/'
blackwords = {'datenschutz', 'impressum'}
site = 'http://www.fdp.de/pressemitteilungen'
pages = ['{}?page={}'.format(site, i) for i in range(1, 97)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP.txt'))
Press releases from the FDP faction: http://www.liberale.de/page/pressemitteilungen
In [ ]:
domain = 'http://www.liberale.de'
keyword = '/content/'
blackwords = {'datenschutz', 'impressum'}
site = 'http://www.liberale.de/page/pressemitteilungen'
pages = ['{}?page=0%2C{}'.format(site, i) for i in range(1, 1063)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_FDP_Fraktion.txt'))
Press releases from the CDU/CSU faction, via presseportal.de: http://www.presseportal.de/nr/7846
In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/7846/'
site = 'http://www.presseportal.de/nr/7846'
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 621)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_Fraktion.txt'))
In [ ]:
domain = 'http://www.presseportal.de'
keyword = '/pm/6518/'
site = 'http://www.presseportal.de/nr/6518'
pages = ['{}/{}'.format(site, i * 27) for i in range(1, 38)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU.txt'))
In [ ]:
domain = 'http://www.cdu-csu-ep.de'
keyword = '/presse/pressemitteilungen/'
blackwords = {'content'}
site = 'http://www.cdu-csu-ep.de/pressearchiv.html'
pages = ['{}?start={}'.format(site, i * 5) for i in range(0, 643)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=150)).save(path.join('data', 'CDU_EU.txt'))
In [ ]:
keyword = '/?p='
site = 'http://aktion-widerstand.de/?page_id=11042'
pages = ['{}&paged={}'.format(site, i) for i in range(2, 335)]
pages.append(site)

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            # No domain prefix here: the matched links are fetched as-is.
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href']}
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ParagraphExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Jung.txt'))
In [ ]:
domain = 'http://www.npd-fraktion-mv.de'
keyword = '&view=article&'
blackwords = {'content'}
site = 'http://www.npd-fraktion-mv.de/index.php?com=news&view=archive'
pages = ['{}&b={}&mid=8'.format(site, i * 50) for i in range(0, 38)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            links = {link['href'] for link in soup.find_all('a')
                     if link.get('href') and keyword in link['href'] and check(link['href'], blackwords)}
            links = ['{}/{}'.format(domain, link) for link in links]
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_MV.txt'))
In [ ]:
domain = 'http://www.npd-fraktion-sachsen.de'
blackwords = {'meldungen', 'category', 'author'}
site = 'http://www.npd-fraktion-sachsen.de/category/meldungen'
pages = ['{}/page/{}'.format(site, i) for i in range(2, 194)]

def get_data():
    for page in pages:
        try:
            req = requests.get(page, timeout=10)
            soup = BeautifulSoup(req.content, 'lxml')
            # Only links inside the blog column are relevant here.
            blog = soup.find('div', id='blog-left')
            if blog is None:
                continue
            links = {link['href'] for link in blog.find_all('a')
                     if link.get('href') and check(link['href'], blackwords)}
            for article in links:
                article_req = requests.get(article, timeout=10)
                yield article_req.content
        except RequestException as error:
            logging.error('Error: %s', error)

HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=100)).save(path.join('data', 'Corpus_NPD_Sachsen.txt'))