In [ ]:
import time
import sqlite3
import logging
import requests
import pandas as pd
from os import path
import datetime as dt
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
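The logging and database cells below expect a logs/ directory and a Data/ directory next to the notebook. A small optional guard, in case they do not exist yet:
In [ ]:
import os
os.makedirs('logs', exist_ok=True)   # target directory for the crawler log
os.makedirs('Data', exist_ok=True)   # target directory for the SQLite archive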
In [ ]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler(path.join('logs', 'spiegel_crawler.log'))
fh.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
In [ ]:
db_file = path.join('Data', 'archive_spiegel.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
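The crawl loop further down calls `cursor.executemany(sql_insert, data)` with five-element tuples of (title, datetime, category, url, html), but neither the table definition nor `sql_insert` is part of this export. A minimal sketch, assuming a table named `articles`:
In [ ]:
# Assumed schema matching the five fields collected per article; the original
# table and column names are not shown in this notebook.
cursor.execute('''CREATE TABLE IF NOT EXISTS articles
                  (title TEXT, published TIMESTAMP, category TEXT, url TEXT, html BLOB)''')
conn.commit()
sql_insert = 'INSERT INTO articles VALUES (?, ?, ?, ?, ?)'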
In [ ]:
def generate_url(url):
    """Turn a relative article link into an absolute spiegel.de URL."""
    if url.startswith('/'):
        return ''.join(['http://www.spiegel.de', url])
    else:
        return url

def extract_category(s):
    """Strip the surrounding parentheses and split on the comma, yielding [category, time]."""
    return s.strip().replace('(', '').replace(')', '').split(',')

def generate_date(date_str, time_str):
    """Combine a 'dd.mm.yyyy' date and an 'HH:MM' time into a datetime object."""
    hours, minutes = time_str.split(':')
    day, month, year = date_str.split('.')
    return dt.datetime(int(year), int(month), int(day), int(hours), int(minutes))
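For reference, the helpers behave as follows; the '(Politik, 18:30)' string is an assumed example of the archive's headline-date format, not taken from the live site:
In [ ]:
generate_url('/politik/some-article.html')   # 'http://www.spiegel.de/politik/some-article.html'
extract_category('(Politik, 18:30)')         # ['Politik', ' 18:30']
generate_date('05.01.2017', '18:30')         # datetime.datetime(2017, 1, 5, 18, 30)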
In [ ]:
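# The loop in the next cell iterates over `dates`, which is not defined anywhere
# in this export. Judging by the archive URL pattern and generate_date, the
# entries are 'dd.mm.yyyy' strings; a possible definition for a single month
# (the range that was actually crawled is unknown):
dates = [d.strftime('%d.%m.%Y') for d in pd.date_range('2017-01-01', '2017-01-31')]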
In [ ]:
for date in dates:
    data = []
    url = 'http://www.spiegel.de/nachrichtenarchiv/artikel-{}.html'.format(date)
    try:
        # fetch the archive page listing all articles published on this day
        day = requests.get(url, timeout=10)
        soup = BeautifulSoup(day.text, 'html.parser')
        articles = soup.find('div', class_='column-wide')
        for article in articles.find_all('li'):
            article_url = generate_url(article.a['href'])
            title = article.a['title']
            category, t = extract_category(article.find('span', class_='headline-date').contents[0])
            dtime = generate_date(date, t)
            try:
                # download the full article HTML for offline parsing later
                html = requests.get(article_url, stream=True, timeout=10).content
                data.append((title, dtime, category, article_url, html))
            except RequestException as error:
                logger.error('ARTICLE FAIL: %s : %s, %s', error, article_url, date)
        cursor.executemany(sql_insert, data)
        conn.commit()
    except RequestException as error:
        logger.error('DAY FAIL: %s : %s, %s', error, url, date)
    except AttributeError as error:
        logger.error('DAY FAIL: %s : %s, %s', error, url, date)
    else:
        logger.info('Successfully crawled articles from: %s', date)
    # be polite to the server between archive pages
    time.sleep(2)
conn.close()
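As a quick sanity check, the stored rows can be read back with pandas; the `articles` table name follows the assumed schema sketched above:
In [ ]:
conn = sqlite3.connect(db_file)
stored = pd.read_sql('SELECT title, published, category, url FROM articles', conn)
print(len(stored), 'articles stored')
conn.close()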