In [ ]:
import sqlite3
import logging
import requests
from requests.exceptions import RequestException
import pandas as pd
import datetime as dt
from pprint import pprint
from bs4 import BeautifulSoup
from os import path
import time

In [ ]:
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
fh = logging.FileHandler(path.join('..', 'logs', 'stern_crawler.log'))
fh.setLevel(logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)

In [ ]:
db_file = path.join('..', '..', 'Crawler', 'data', 'archive_stern.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

sql_insert = 'INSERT INTO Stern (title, date, tag, url, html) VALUES (?, ?, ?, ?, ?)'
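
The INSERT above assumes that a Stern table already exists in the archive database. If it does not, a minimal schema along the following lines would work; the column types are assumptions derived from the INSERT statement, not taken from the original database.

In [ ]:
# Sketch: create the target table if it is missing.
# Column types are assumptions based on the INSERT statement above.
cursor.execute('''CREATE TABLE IF NOT EXISTS Stern
                  (title TEXT, date TIMESTAMP, tag TEXT, url TEXT, html BLOB)''')
conn.commit()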

In [ ]:
def generate_date(datestring):
    if datestring:
        return dt.datetime.strptime(datestring, '%Y-%m-%d %H:%M')
    else:
        return None
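
A quick sanity check of the helper; the '%Y-%m-%d %H:%M' pattern is taken from the strptime call above and is assumed to match the datetime attribute on the archive pages.

In [ ]:
# Expected behaviour of generate_date (timestamp format is an assumption, see above)
print(generate_date('2015-06-30 14:05'))  # -> 2015-06-30 14:05:00
print(generate_date(''))                  # -> None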

In [ ]:
date = dt.datetime.now()
domain = 'http://www.stern.de/wirtschaft'
tag = 'Wirtschaft'

# One archive URL per month, 2001-2015, in chronological order
urls = ['{}/archiv/?month={}&year={}'.format(domain, month, year)
        for year in range(2001, 2016)
        for month in range(1, 13)]
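
A short look at what the comprehension produces: one archive URL per month for 2001 through 2015.

In [ ]:
print(len(urls))  # 12 months x 15 years = 180 archive pages
pprint(urls[:3])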

In [ ]:
def get_articles(soup):
    """Store every article teaser on the given archive page in the database."""
    data = []
    articles = soup.find('div', class_='article-content')
    for article in articles.find_all('div', class_='o-teaser-catchline'):
        article_url = article.a.get('href')
        title = article.a.get('title')
        date = generate_date(article.time.get('datetime').strip())
        try:
            # Download the full article HTML and collect the row for a bulk insert.
            html = requests.get(article_url, stream=True, timeout=10).content
            data.append((title, date, tag, article_url, html))
        except RequestException as error:
            logger.error('ARTICLE FAIL: %s : %s, %s', error, article_url, date)
    cursor.executemany(sql_insert, data)
    conn.commit()


def scrape_site(url):
    """Scrape one archive page and follow its pagination recursively."""
    req = requests.get(url, timeout=10)
    soup = BeautifulSoup(req.content, 'html.parser')
    get_articles(soup)
    next_page = soup.find('li', class_='page-next')
    if next_page and next_page.a.get('href'):
        scrape_site(next_page.a.get('href'))
        
for url in urls:
    try:
        scrape_site(url)
    except (RequestException, AttributeError) as error:
        logger.error('MONTH FAIL: %s : %s, %s', error, url, date)
    else:
        logger.debug('Successfully crawled articles from: %s', url)
    time.sleep(2)
conn.close()
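
As a final sanity check, a minimal sketch that reopens the database (it was closed above) and inspects a few of the stored rows; it assumes the crawl actually populated the Stern table.

In [ ]:
# Sketch: reopen the archive database and look at the most recently dated rows.
conn = sqlite3.connect(db_file)
overview = pd.read_sql_query('SELECT title, date, tag, url FROM Stern ORDER BY date DESC LIMIT 5', conn)
pprint(overview.to_dict('records'))
conn.close()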