In [ ]:
import sqlite3
import logging
import requests
from requests.exceptions import RequestException
import pandas as pd
import datetime as dt
from pprint import pprint
from bs4 import BeautifulSoup
from os import path
import time

In [ ]:
# Write errors to a log file so failed requests can be inspected after the crawl.
# Note: with the level set to ERROR, the logger.debug() call in the crawl loop
# below is suppressed unless the level is lowered.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
fh = logging.FileHandler(path.join('..', 'logs', 'wiwo_crawler.log'))
fh.setLevel(logging.ERROR)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)

In [ ]:
# Open the SQLite archive database and prepare the insert statement used to
# store one row per crawled article.
db_file = path.join('..', '..', 'Crawler', 'data', 'archive_wirtschaftswoche.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()

sql_insert = 'INSERT INTO WirtschaftsWoche (title, date, tag, url, html) VALUES (?, ?, ?, ?, ?)'
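
In [ ]:
# The INSERT above assumes the WirtschaftsWoche table already exists in the
# archive database. As a minimal sketch, a matching schema could look like
# this (the column types are an assumption, not taken from the original
# database file):
cursor.execute('''
    CREATE TABLE IF NOT EXISTS WirtschaftsWoche (
        title TEXT,
        date TIMESTAMP,
        tag TEXT,
        url TEXT,
        html BLOB
    )
''')
conn.commit()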

In [ ]:
domain = 'http://www.wiwo.de'

# Build one archive URL per day from the start date up to today,
# e.g. http://www.wiwo.de/archiv/2010/05/21, and keep the matching
# datetime objects for the database rows.
start = dt.date(2010, 5, 21)
end = dt.datetime.now()
dates = pd.date_range(start, end)
urls = ['{}/archiv/{}'.format(domain, date.strftime('%Y/%m/%d')) for date in dates]
dates = [date.to_pydatetime() for date in dates]

In [ ]:
def generate_url(url):
    """Turn a relative link from the archive page into an absolute URL."""
    if url.startswith('/'):
        return ''.join([domain, url])
    else:
        return url

def extract_tag(url):
    """Return the section of an article URL, i.e. its first path component."""
    tags = url.split('/')
    return tags[3]
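
In [ ]:
# Quick illustration of the two helpers (the URL below is a made-up example,
# not an actual crawled link): relative links get the domain prepended, and
# the first path component serves as the article's section tag.
example = generate_url('/politik/deutschland/some-article/12345.html')
print(example)               # http://www.wiwo.de/politik/deutschland/some-article/12345.html
print(extract_tag(example))  # politik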

In [ ]:
for url, date in zip(urls, dates):
    data = []
    try:
        # Fetch the archive page for this day and collect all article links.
        req = requests.get(url, timeout=10)
        soup = BeautifulSoup(req.content, 'html.parser')
        articles = soup.find('ul', class_='hcf-headline-list')
        for article in articles.find_all('li'):
            article_url = generate_url(article.a.get('href'))
            tag = extract_tag(article_url)
            title = article.a.get('title')
            try:
                # Download the full article HTML; failures are logged and skipped.
                html = requests.get(article_url, stream=True, timeout=10).content
                data.append((title, date, tag, article_url, html))
            except RequestException as error:
                logger.error('ARTICLE FAIL: %s : %s, %s', error, article_url, date)
        # Store all articles of the day in a single transaction.
        cursor.executemany(sql_insert, data)
        conn.commit()
    except (RequestException, AttributeError) as error:
        # AttributeError covers days where no headline list was found on the page.
        logger.error('DAY FAIL: %s : %s, %s', error, url, date)
    else:
        logger.debug('Successfully crawled articles from: %s', date)
    # Be polite to the server between archive pages.
    time.sleep(2)
conn.close()
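
In [ ]:
# Optional sanity check, sketched under the assumption that the crawl above has
# finished: reopen the database and count how many articles were stored per day.
conn = sqlite3.connect(db_file)
counts = pd.read_sql_query(
    'SELECT date, COUNT(*) AS articles FROM WirtschaftsWoche GROUP BY date ORDER BY date',
    conn)
conn.close()
counts.tail()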