In [ ]:
import re
import sqlite3
from os import path
from time import time
import pandas as pd
from bs4 import BeautifulSoup
import joblib  # standalone package; sklearn.externals.joblib was removed in modern scikit-learn
from html2corpus import HTML2Corpus
from html2corpus.extractors import ReadabilityExtractor, ParagraphExtractor
In [ ]:
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51',
          '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
          '2016-07', '2016-08']
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
dates = [('2015-10-26', '2015-11-02'),
         ('2015-11-02', '2015-11-09'),
         ('2015-11-09', '2015-11-16'),
         ('2015-11-16', '2015-11-23'),
         ('2015-11-23', '2015-11-30'),
         ('2015-11-30', '2015-12-07'),
         ('2015-12-07', '2015-12-14'),
         ('2015-12-14', '2015-12-21'),
         ('2015-12-21', '2015-12-28'),
         ('2015-12-28', '2016-01-04'),
         ('2016-01-04', '2016-01-11'),
         ('2016-01-11', '2016-01-18'),
         ('2016-01-18', '2016-01-25'),
         ('2016-01-25', '2016-02-01'),
         ('2016-02-01', '2016-02-08'),
         ('2016-02-08', '2016-02-15'),
         ('2016-02-15', '2016-02-22'),
         ('2016-02-22', '2016-02-29')]
db_file = path.join('..', 'Crawler', 'data', 'de_news.sqlite')
conn = sqlite3.connect(db_file)
cursor = conn.cursor()
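A quick, optional sanity check: the queries below assume the crawler database exposes Articles and Sources tables.
In [ ]:
# List the tables in the crawler DB to confirm the expected schema is present.
cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
print(cursor.fetchall())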
In [ ]:
vectorizer = joblib.load(path.join('models', 'classifier', 'Vectorizer.pkl'))
classifier = joblib.load(path.join('models', 'classifier', 'Classifier.pkl'))
ext = ReadabilityExtractor(min_len=150)
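A rough smoke test of the extractor on hypothetical input; extract() is assumed to return an empty string when the readable portion is shorter than min_len, which matches the length check in get_data below:
In [ ]:
# Feed a synthetic article page to the extractor and inspect the output.
sample = '<html><body><p>{}</p></body></html>'.format('Ein langer Artikeltext. ' * 20)
print(ext.extract(sample)[:80])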
Note that the title field contains newlines and other stray characters, so it is cleaned below along with the summary.
In [ ]:
def clean_html(html):
    """Strip links, scripts and whitespace noise from an HTML fragment."""
    soup = BeautifulSoup(html, 'lxml')
    # Drop links and scripts entirely, including their text content.
    for link in soup.find_all('a'):
        link.extract()
    for script in soup.find_all('script'):
        script.extract()
    text = soup.get_text().strip()
    # Collapse newlines, tabs, carriage returns and brackets into single spaces.
    text = re.sub(r'[\n\t\r\[\]]', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text
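
# Quick illustration on hypothetical input: links and scripts are removed,
# control characters and brackets collapse to single spaces.
clean_html('<p>Hello\t<script>x()</script>world\n<a href="/">link</a>!</p>')  # -> 'Hello world !'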

def classify(rows):
    """Predict one label per row; rows must provide a 'text' column."""
    tfidf = vectorizer.transform(rows['text'])
    return classifier.predict(tfidf)

def get_data(kw):
    """Yield cleaned articles for the (start, end) date pair in kw."""
    query = '''SELECT title, summary, site, html
               FROM Articles
               LEFT JOIN Sources ON Articles.source = Sources.id
               WHERE date BETWEEN ? AND ?'''
    cursor.execute(query, kw)
    # Fetch in batches of 100 rows to keep memory usage flat.
    data = cursor.fetchmany(100)
    while len(data) > 0:
        for row in data:
            title, summary, site, html = row
            text = ext.extract(html)
            # Skip pages where no readable text could be extracted.
            if len(text) > 0:
                yield {
                    'title': clean_html(title),
                    'summary': clean_html(summary),
                    'text': text,
                    'site': site
                }
        data = cursor.fetchmany(100)

def process(file, date, i):
    """Classify one week of articles and write the positive hits to CSV."""
    articles = pd.DataFrame([article for article in get_data(date)])
    tags = classify(articles)
    articles['week'] = labels[i]
    articles['tag'] = tags
    # Keep only the articles the classifier tagged with label 1.
    articles.loc[articles['tag'] == 1, ['week', 'title', 'summary', 'text', 'site']].to_csv(
        file, index=False, encoding='utf-8', sep='|')
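A single week can be smoke-tested before running the full export (a hypothetical check, not part of the pipeline):
In [ ]:
# Pull the first extracted article for the first crawl week.
next(get_data(dates[0]))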
In [ ]:
for i, (file, date) in enumerate(zip(files, dates)):
    process(file, date, i)
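The per-week CSVs are '|'-separated, so they can be loaded back for inspection like this:
In [ ]:
# Read one exported week back (assumes the loop above has written files[0]).
pd.read_csv(files[0], sep='|').head()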
In [ ]:
conn.close()