In [ ]:
import re
import sqlite3
from os import path
from time import time
import pandas as pd
from bs4 import BeautifulSoup
import joblib  # standalone package; sklearn.externals.joblib was removed in modern scikit-learn
from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor
In [ ]:
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51',
          '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
          '2016-07', '2016-08']
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
dates = [('2015-10-26', '2015-11-02'),
         ('2015-11-02', '2015-11-09'),
         ('2015-11-09', '2015-11-16'),
         ('2015-11-16', '2015-11-23'),
         ('2015-11-23', '2015-11-30'),
         ('2015-11-30', '2015-12-07'),
         ('2015-12-07', '2015-12-14'),
         ('2015-12-14', '2015-12-21'),
         ('2015-12-21', '2015-12-28'),
         ('2015-12-28', '2016-01-04'),
         ('2016-01-04', '2016-01-11'),
         ('2016-01-11', '2016-01-18'),
         ('2016-01-18', '2016-01-25'),
         ('2016-01-25', '2016-02-01'),
         ('2016-02-01', '2016-02-08'),
         ('2016-02-08', '2016-02-15'),
         ('2016-02-15', '2016-02-22'),
         ('2016-02-22', '2016-02-29')]
db_file = path.join('..', 'Crawler', 'data', 'de_news.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
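A quick, optional sanity check: the queries below assume the crawler database exposes Articles and Sources tables.
In [ ]:
# List the tables in the crawler DB to confirm the expected schema is present.
cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
print(cursor.fetchall())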
In [ ]:
vectorizer = joblib.load(path.join('models', 'classifier', 'Vectorizer.pkl'))
classifier = joblib.load(path.join('models', 'classifier', 'Classifier.pkl'))
ext = ReadabilityExtractor(min_len=150)
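A rough smoke test of the extractor on hypothetical input; extract() is assumed to return an empty string when the readable portion is shorter than min_len, which matches the length check in get_data below:
In [ ]:
# Feed a synthetic article page to the extractor and inspect the output.
sample = '<html><body><p>{}</p></body></html>'.format('Ein langer Artikeltext. ' * 20)
print(ext.extract(sample)[:80])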
Note that the title field contains newlines and other stray characters, so it is cleaned below along with the summary.
In [ ]:
def clean_html(html):
    """Strip links, scripts and whitespace noise from an HTML fragment."""
    soup = BeautifulSoup(html, 'lxml')
    # Drop links and scripts entirely, including their text content.
    for link in soup.find_all('a'):
        link.extract()
    for script in soup.find_all('script'):
        script.extract()
    text = soup.get_text().strip()
    # Collapse newlines, tabs, carriage returns and brackets into single spaces.
    text = re.sub(r'[\n\t\r\[\]]', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text
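
# Quick illustration on hypothetical input: links and scripts are removed,
# control characters and brackets collapse to single spaces.
clean_html('<p>Hello\t<script>x()</script>world\n<a href="/">link</a>!</p>')  # -> 'Hello world !'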

def classify(rows):
    """Predict one label per row; rows must provide a 'text' column."""
    tfidf = vectorizer.transform(rows['text'])
    return classifier.predict(tfidf)

def get_data(kw):
    """Yield cleaned articles for the (start, end) date pair in kw."""
    query = '''SELECT title, summary, site, html
               FROM Articles
               LEFT JOIN Sources ON Articles.source = Sources.id
               WHERE date BETWEEN ? AND ?'''
    cursor.execute(query, kw)
    # Fetch in batches of 100 rows to keep memory usage flat.
    data = cursor.fetchmany(100)
    while len(data) > 0:
        for row in data:
            title, summary, site, html = row
            text = ext.extract(html)
            # Skip pages where no readable text could be extracted.
            if len(text) > 0:
                yield {
                    'title': clean_html(title),
                    'summary': clean_html(summary),
                    'text': text,
                    'site': site
                }
        data = cursor.fetchmany(100)

def process(file, date, i):
    """Classify one week of articles and write the positive hits to CSV."""
    articles = pd.DataFrame([article for article in get_data(date)])
    tags = classify(articles)
    articles['week'] = labels[i]
    articles['tag'] = tags
    # Keep only the articles the classifier tagged with label 1.
    articles.loc[articles['tag'] == 1, ['week', 'title', 'summary', 'text', 'site']].to_csv(
        file, index=False, encoding='utf-8', sep='|')
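A single week can be smoke-tested before running the full export (a hypothetical check, not part of the pipeline):
In [ ]:
# Pull the first extracted article for the first crawl week.
next(get_data(dates[0]))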
In [ ]:
for i, (file, date) in enumerate(zip(files, dates)):
    process(file, date, i)
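The per-week CSVs are '|'-separated, so they can be loaded back for inspection like this:
In [ ]:
# Read one exported week back (assumes the loop above has written files[0]).
pd.read_csv(files[0], sep='|').head()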
In [ ]:
conn.close()