In [ ]:
import time
import sqlite3
import logging
import requests
import pandas as pd
from os import path
import datetime as dt
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
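The logging and database cells below expect a logs/ directory and a Data/ directory next to the notebook. A small optional guard, in case they do not exist yet:
In [ ]:
import os
os.makedirs('logs', exist_ok=True)   # target directory for the crawler log
os.makedirs('Data', exist_ok=True)   # target directory for the SQLite archive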
In [ ]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler(path.join('logs', 'spiegel_crawler.log'))
fh.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
In [ ]:
db_file = path.join('Data', 'archive_spiegel.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
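The crawl loop further down calls `cursor.executemany(sql_insert, data)` with five-element tuples of (title, datetime, category, url, html), but neither the table definition nor `sql_insert` is part of this export. A minimal sketch, assuming a table named `articles`:
In [ ]:
# Assumed schema matching the five fields collected per article; the original
# table and column names are not shown in this notebook.
cursor.execute('''CREATE TABLE IF NOT EXISTS articles
                  (title TEXT, published TIMESTAMP, category TEXT, url TEXT, html BLOB)''')
conn.commit()
sql_insert = 'INSERT INTO articles VALUES (?, ?, ?, ?, ?)'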
In [ ]:
def generate_url(url):
    """Turn a relative article link into an absolute spiegel.de URL."""
    if url.startswith('/'):
        return ''.join(['http://www.spiegel.de', url])
    else:
        return url

def extract_category(s):
    """Strip the surrounding parentheses and split on the comma, yielding [category, time]."""
    return s.strip().replace('(', '').replace(')', '').split(',')

def generate_date(date_str, time_str):
    """Combine a 'dd.mm.yyyy' date and an 'HH:MM' time into a datetime object."""
    hours, minutes = time_str.split(':')
    day, month, year = date_str.split('.')
    return dt.datetime(int(year), int(month), int(day), int(hours), int(minutes))
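For reference, the helpers behave as follows; the '(Politik, 18:30)' string is an assumed example of the archive's headline-date format, not taken from the live site:
In [ ]:
generate_url('/politik/some-article.html')   # 'http://www.spiegel.de/politik/some-article.html'
extract_category('(Politik, 18:30)')         # ['Politik', ' 18:30']
generate_date('05.01.2017', '18:30')         # datetime.datetime(2017, 1, 5, 18, 30)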
In [ ]:
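# The loop in the next cell iterates over `dates`, which is not defined anywhere
# in this export. Judging by the archive URL pattern and generate_date, the
# entries are 'dd.mm.yyyy' strings; a possible definition for a single month
# (the range that was actually crawled is unknown):
dates = [d.strftime('%d.%m.%Y') for d in pd.date_range('2017-01-01', '2017-01-31')]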
In [ ]:
for date in dates:
    data = []
    url = 'http://www.spiegel.de/nachrichtenarchiv/artikel-{}.html'.format(date)
    try:
        # fetch the archive page listing all articles published on this day
        day = requests.get(url, timeout=10)
        soup = BeautifulSoup(day.text, 'html.parser')
        articles = soup.find('div', class_='column-wide')
        for article in articles.find_all('li'):
            article_url = generate_url(article.a['href'])
            title = article.a['title']
            category, t = extract_category(article.find('span', class_='headline-date').contents[0])
            dtime = generate_date(date, t)
            try:
                # download the full article HTML for offline parsing later
                html = requests.get(article_url, stream=True, timeout=10).content
                data.append((title, dtime, category, article_url, html))
            except RequestException as error:
                logger.error('ARTICLE FAIL: %s : %s, %s', error, article_url, date)
        cursor.executemany(sql_insert, data)
        conn.commit()
    except RequestException as error:
        logger.error('DAY FAIL: %s : %s, %s', error, url, date)
    except AttributeError as error:
        logger.error('DAY FAIL: %s : %s, %s', error, url, date)
    else:
        logger.info('Successfully crawled articles from: %s', date)
    # be polite to the server between archive pages
    time.sleep(2)
conn.close()
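As a quick sanity check, the stored rows can be read back with pandas; the `articles` table name follows the assumed schema sketched above:
In [ ]:
conn = sqlite3.connect(db_file)
stored = pd.read_sql('SELECT title, published, category, url FROM articles', conn)
print(len(stored), 'articles stored')
conn.close()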