notebook.community

Edit and run



In [ ]:

    
import sqlite3
from os import path
from time import time
from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor



In [ ]:

    
# Run configuration.
# `archive` is used as the table name in the query and (lower-cased) in the
# database file name; `tag` is used in the output file name.
archive, tag = 'Stern', 'Wirtschaft'

# Tags matched by the SQL `IN (...)` filter; currently just the single `tag`.
tags = [tag]



In [ ]:

    
# SQLite database to read from: ../../Crawler/data/archive_<archive lower-cased>.sqlite
db_file = path.join(
    '..', '..', 'Crawler', 'data',
    'archive_{}.sqlite'.format(archive.lower()),
)

# Text file the corpus is saved to: ../data/<archive>_<tag>.txt
file = path.join(
    '..', 'data',
    '{}_{}.txt'.format(archive, tag),
)

# Open the database and keep one module-level cursor; get_data() reads from it.
conn = sqlite3.connect(db_file)
cursor = conn.cursor()



In [ ]:

    
def get_data():
    """Yield the ``html`` column of every row whose tag is listed in ``tags``.

    Uses the module-level ``archive`` (table name), ``tags`` (bound as query
    parameters) and ``cursor``. Because this is a generator, the query is only
    executed once iteration starts, and rows are then pulled from the cursor
    in batches of 100 rather than all at once.
    """
    # One '?' placeholder per tag so the tag values are bound, not interpolated.
    placeholders = ','.join('?' for _ in tags)
    query = 'SELECT title, html FROM {} WHERE tag IN ({})'.format(archive, placeholders)
    cursor.execute(query, tuple(tags))

    while True:
        batch = cursor.fetchmany(100)
        if not batch:
            # An empty batch means the result set is exhausted.
            break
        for _title, html in batch:
            yield html

# Feed the HTML documents yielded by get_data() to HTML2Corpus with a
# ReadabilityExtractor (min_len=150 -- presumably a minimum text length;
# confirm against html2corpus) and save the result to `file`.
#
# The connection is closed in `finally` so that an exception raised while
# extracting or saving does not leave the SQLite connection open.
try:
    HTML2Corpus(get_data(), extractor=ReadabilityExtractor(min_len=150)).save(file)
finally:
    conn.close()