In [1]:
import pandas as pd
pd.options.display.max_columns = 500
import numpy as np
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
from pymystem3 import Mystem; mystem = Mystem()
from functools import lru_cache
from tqdm import tqdm
tqdm.pandas()
%matplotlib inline
In [2]:
data = pd.read_csv('../data/DATA_MANY_TOPICS.csv')
data = data[~((data.topic == 'forces') & (data.url.str.contains('https://www.gazeta.ru/')))]
In [3]:
data.drop('index', axis=1, inplace=True)
data.drop(data[data.topic == 'football'].sample(25000, random_state=42).index, inplace=True)
data.drop(data[data.topic == 'hockey'].sample(5000, random_state=42).index, inplace=True)
data.drop(data[data.topic == 'business'].sample(12000, random_state=42).index, inplace=True)
data.drop(data[data.topic == 'politics'].sample(5000, random_state=42).index, inplace=True)
In [4]:
data.sample(5)
Out[4]:
In [5]:
print('Number of news articles:', len(data))
In [6]:
print('Number of topics:', data.topic.nunique())
In [7]:
topics_freq = data.topic.value_counts(normalize=True, ascending=True)
media_freq = data.url.apply(lambda x: x.split('/')[2].replace('www.', '').replace('.ru', '')).value_counts(normalize=True, ascending=True)
font_size=18
fig, ax = plt.subplots(1, 2, figsize=(24, 7))
ax[0].bar(range(len(media_freq)), media_freq.values, color='#6C7A89')
ax[0].set_xticks(range(len(media_freq)))
ax[0].set_xticklabels(media_freq.index, rotation=90)
ax[0].tick_params(labelsize=font_size)
ax[0].yaxis.grid()
ax[1].bar(range(len(topics_freq)), topics_freq.values, color='#6C7A89')
ax[1].set_xticks(range(len(topics_freq)))
ax[1].set_xticklabels(topics_freq.index, rotation=90)
ax[1].tick_params(labelsize=font_size)
ax[1].yaxis.grid()
fig.savefig('../docs/images/media_topi_distr.pdf', bbox_inches='tight')
In [8]:
class Pipeline(object):
    def __init__(self, *args):
        self.transformations = args

    def __call__(self, x):
        res = x
        for f in self.transformations:
            res = f(res)
        return res
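A minimal usage sketch (toy callables, not part of the preprocessing below) showing that Pipeline applies its arguments left to right:
In [ ]:
# Hedged sketch: compose two built-in string methods; order matters.
toy_pipeline = Pipeline(str.lower, str.split)
toy_pipeline('Привет МИР')  # -> ['привет', 'мир']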
In [9]:
from nltk.corpus import stopwords
from stop_words import get_stop_words
en_sw = get_stop_words('en')
ru_sw = get_stop_words('ru')
STOP_WORDS = set(en_sw) | set(ru_sw)
STOP_WORDS = STOP_WORDS | set(stopwords.words('russian')) | set(stopwords.words('english'))
STOP_WORDS = STOP_WORDS | set(['лента', 'новость', 'риа', 'тасс',
'редакция', 'газета', 'корра', 'daily',
'village', 'интерфакс', 'reuters', 'уточняться'])
def remove_ria(text):
    prefix = text[:50]
    ria = 'РИА Новости'
    if ria in prefix:
        text = text[text.find(ria)+len(ria)+1:]
    return text

def remove_tass(text):
    prefix = text[:100]
    return text[max(0, prefix.find('/.')+1):]

def get_lower(text):
    return str(text).lower().strip()

def remove_punctuation(text):
    return ''.join([c if c.isalpha() or c in ['-', "'"] else ' ' for c in text])

@lru_cache(maxsize=None)
def get_word_normal_form(word):
    return ''.join(mystem.lemmatize(word)).strip().replace('ё', 'е').strip('-')

def lemmatize_words(text):
    res = []
    for word in text.split():
        norm_form = get_word_normal_form(word)
        if len(norm_form) > 2 and norm_form not in STOP_WORDS:
            res.append(norm_form)
    return ' '.join(res)
In [10]:
TEXT_PIPELINE = Pipeline(remove_tass, remove_ria, get_lower, remove_punctuation, lemmatize_words)
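A quick, hedged sanity check of the composed pipeline on a made-up sentence; the exact lemmas depend on the installed Mystem model and stop-word lists, so the expected output is only illustrative:
In [ ]:
# Hedged sketch: the dateline prefix should be stripped by remove_ria,
# punctuation removed, and the remaining words lemmatized.
TEXT_PIPELINE('МОСКВА, 1 мая — РИА Новости. Президенты обсудили новые законы.')
# e.g. -> 'президент обсуждать новый закон'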
In [11]:
%%time
data.text = data.text.progress_apply(TEXT_PIPELINE)
In [12]:
%%time
data.title = data.title.progress_apply(TEXT_PIPELINE)
In [13]:
vocabulary = defaultdict(int)
for news in data.itertuples():
    for word in news.text.split():
        vocabulary[word] += 1
print('Unique words in dataset:', len(vocabulary))
In [14]:
n_min_occurance = 2
print('Number of words occurring at least %d times: %d' %
      (n_min_occurance, len(list(filter(lambda x: x[1] >= n_min_occurance, vocabulary.items())))))
In [15]:
top_k = 20
print('Top %d most frequent words:' % top_k)
for i, (word, freq) in enumerate(sorted(vocabulary.items(), key=lambda x: -x[1])[:top_k]):
    print(i+1, word, freq)
In [53]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, make_scorer, classification_report
from sklearn.svm import LinearSVC
from scipy.sparse import hstack
In [17]:
le = LabelEncoder()
data.topic = le.fit_transform(data.topic)
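An optional check (not in the original run) to see which integer id the encoder assigned to each topic:
In [ ]:
# Hedged helper: topic name -> integer id mapping produced by the LabelEncoder.
dict(zip(le.classes_, le.transform(le.classes_)))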
In [18]:
trainX = pd.concat([pd.DataFrame(data.title + ' ' + data.text, columns=['text']), data.url], axis=1)
trainY = data.topic
In [19]:
trainX.head()
Out[19]:
In [20]:
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.3,
stratify=trainY, random_state=42)
In [21]:
%%time
tfidf_vectorizer = TfidfVectorizer(min_df=3, ngram_range=(1,2), lowercase=False).fit(trainX.text)
print(len(tfidf_vectorizer.vocabulary_))
In [22]:
X_train_tf = tfidf_vectorizer.transform(X_train.text)
X_test_tf = tfidf_vectorizer.transform(X_test.text)
In [95]:
# clf = LinearSVC(C=1, max_iter=10000, loss='hinge', random_state=42)
clf = SGDClassifier(loss='hinge', max_iter=70, random_state=42, n_jobs=8, alpha=1e-5)
In [96]:
%%time
clf.fit(X_train_tf, y_train)
svm_prediciton = clf.predict(X_test_tf)
print('F1 score:', f1_score(y_test, svm_prediciton, average='macro'))
print('Accuracy:', accuracy_score(y_test, svm_prediciton))
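As an optional, hedged check of how stable these scores are (not part of the original run), the already imported cross_val_score can be used with a macro-F1 scorer; it refits the model several times, so it is noticeably slower:
In [ ]:
# Hedged sketch: 3-fold macro-F1 on the training split, shown only as an option.
cv_scores = cross_val_score(clf, X_train_tf, y_train, cv=3,
                            scoring=make_scorer(f1_score, average='macro'), n_jobs=4)
print(cv_scores.mean(), cv_scores.std())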
In [99]:
def plot_confusion_matrix(true, pred, class_names, filename=None):
    cm = confusion_matrix(true, pred)
    cm_normalized = cm / cm.sum(axis=1)[:, np.newaxis]
    classes = [c for c in class_names]
    df_cm = pd.DataFrame(np.round(cm_normalized, 2), columns=classes, index=classes)
    sns.set_context("notebook", font_scale=1.4)
    plt.figure(figsize=(20, 20))
    ax = sns.heatmap(df_cm, annot=True, linewidths=.5, fmt='g', cbar=False, square=True, annot_kws={"size": 10})
    ax.xaxis.tick_top()
    ax.set_xlabel('Предсказанный класс', fontsize=16)
    ax.set_ylabel('Истинный класс', fontsize=16)
    ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=90)
    if filename:
        plt.savefig('../docs/images/{}.pdf'.format(filename), bbox_inches='tight')
    return cm_normalized
In [100]:
svm_cm_normalized = plot_confusion_matrix(y_test, svm_prediciton, le.classes_, 'svm_confusion_matrix')
In [50]:
def plot_accuracy_per_class(conf_matrix, classes, filename=None):
    class_scores = {}
    for i in range(conf_matrix.shape[0]):
        class_scores[classes[i]] = conf_matrix[i, i]
    topics = []
    scores = []
    for topic, score in sorted(class_scores.items(), key=lambda x: x[1]):
        topics.append(topic)
        scores.append(score)
    font_size = 15
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    ax.barh(range(len(topics)), scores, color='#6C7A89')
    ax.set_yticks(range(len(topics)))
    ax.set_xticks(np.arange(0, 1.01, 0.1))
    ax.set_yticklabels(topics)
    ax.tick_params(labelsize=font_size)
    ax.xaxis.grid()
    ax.set_xlabel('Accuracy')
    if filename:
        fig.savefig('../docs/images/{}.pdf'.format(filename), bbox_inches='tight')
In [ ]:
plot_accuracy_per_class(svm_cm_normalized, le.classes_, 'svm_accuracy_per_class')
In [ ]:
# for i, (true, pred) in enumerate(zip(y_test, clf_prediciton)):
# if true == le.transform(['technologies'])[0] and pred == le.transform(['realty'])[0]:
# print(X_test[i][1])
In [ ]:
def print_top_K(vectorizer, clf, class_labels, k=10):
    """Returns a DataFrame with the k highest-weighted features per class."""
    feature_names = vectorizer.get_feature_names()
    top_words = {}
    for i, class_label in enumerate(class_labels):
        top_K = np.argsort(clf.coef_[i])[-k:]
        top_words[class_label] = [feature_names[j] for j in top_K]
    return pd.DataFrame(top_words)
In [ ]:
df = print_top_K(tfidf_vectorizer, clf, le.classes_, k=8)
In [ ]:
column_format='c|'+'c'*8
In [ ]:
with open('top_10_svm_words.tex', 'w') as f:
    print(df.transpose().to_latex(escape=False, header=False, column_format=column_format), file=f)
In [ ]:
from gensim.models import Word2Vec
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
corpus = pd.read_csv('../data/DATA_MANY_TOPICS.csv')[['text', 'topic']]
In [ ]:
corpus.text = corpus.text.apply(TEXT_PIPELINE)
In [25]:
min_count = 3
vec_size = 300
window = 5
In [ ]:
%%time
w2v = Word2Vec([text.split() for text in corpus.text],
min_count=min_count, size=vec_size, window=window, hs=1, sg=1, workers=8, iter=10)
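A hedged sanity check on the freshly trained embeddings (not in the original run); the neighbours returned depend entirely on the corpus and random seed:
In [ ]:
# Hedged sketch: nearest neighbours of a lemma that is very likely in the news corpus.
try:
    print(w2v.wv.most_similar('россия', topn=5))
except KeyError:
    print('lemma not in vocabulary')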
In [ ]:
word_vectors = w2v.wv
del w2v
In [23]:
import pickle
# with open('../data/w2v.pickle', 'wb') as f:
# pickle.dump(word_vectors, f, pickle.HIGHEST_PROTOCOL)
with open('../data/w2v.pickle', 'rb') as f:
word_vectors = pickle.load(f)
In [26]:
word2tfidf = dict(zip(tfidf_vectorizer.get_feature_names(), tfidf_vectorizer.idf_))
vectorized_news = np.zeros((trainX.shape[0], vec_size), dtype="float32")
for i in tqdm(range(trainX.shape[0])):
    words = [word for word in trainX.text.iloc[i].split()
             if word in word_vectors and word in word2tfidf]
    if not words:
        # keep an all-zeros vector for documents with no known words
        # (np.average would otherwise fail with empty weights)
        continue
    word_vecs = [word_vectors[word] for word in words]
    word_idfs = [word2tfidf[word] for word in words]
    # idf-weighted average of the word vectors
    vectorized_news[i, :] = np.average(word_vecs, axis=0, weights=word_idfs)
In [27]:
trainX_w2v = pd.DataFrame(vectorized_news)
In [28]:
trainX_w2v.head()
Out[28]:
In [ ]:
trainX_w2v = pd.read_csv('trainX_w2v.csv', index_col=0)
# trainX_w2v.to_csv('./trainX_w2v.csv')
In [29]:
trainX_w2v.shape, trainY.shape
Out[29]:
In [ ]:
import warnings
warnings.simplefilter('ignore')
import artm
In [ ]:
corpus.text = corpus.text.apply(lambda x: ' '.join([w for w in str(x).split() if vocabulary[w] > 1]))
In [ ]:
vw_file_name = '../data/news_for_bigartm.vw'
with open(vw_file_name, 'w') as file:
    for i, doc in enumerate(corpus.text):
        print('doc{} {}'.format(i+1, doc), file=file)
In [ ]:
batch_vectorizer = artm.BatchVectorizer(data_path=vw_file_name, data_format='vowpal_wabbit',
target_folder='../data/bigartm_batches')
In [ ]:
dictionary = artm.Dictionary()
dictionary.gather(data_path='../data/bigartm_batches')
In [ ]:
%%time
model = artm.ARTM(num_topics=150, dictionary=dictionary, cache_theta=True, show_progress_bars=True)
model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.2))
model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=3e5))
model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='SmoothSparseThetaRegularizer', tau=-0.001))
model.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
model.scores.add(artm.TopTokensScore(name='top_tokens_score', dictionary=dictionary))
model.num_tokens = 10  # max. number of top tokens to display per topic
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=50)
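A hedged way (not in the original notebook) to judge whether 50 passes were enough, using the perplexity tracker collected during fitting:
In [ ]:
# Hedged sketch: perplexity per collection pass; a flattening curve suggests convergence.
plt.plot(model.score_tracker['perplexity_score'].value)
plt.xlabel('collection pass')
plt.ylabel('perplexity');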
In [ ]:
saved_top_tokens = model.score_tracker['top_tokens_score'].last_tokens
for topic, top_tokens in saved_top_tokens.items():
    print(topic, *top_tokens)
In [ ]:
def create_batch(documents, vocab):
    '''Creates a batch from a list of documents
    :param: documents - list of documents represented as strings,
            e.g. ['Hello world!', 'How are you? - I am fine, thanks!']
    :param: vocab - dict of term frequencies
    :return: bigARTM batch
    '''
    import uuid
    from collections import defaultdict
    batch = artm.messages.Batch()
    batch.id = str(uuid.uuid4())
    dictionary = {}
    # first step: fill the general batch vocabulary
    for i, token in enumerate(vocab):
        batch.token.append(token)
        dictionary[token] = i
    # second step: fill the items
    for doc in documents:
        item = batch.item.add()
        local_dict = defaultdict(int)
        for token in doc.split():
            local_dict[token] += 1
        for k, v in local_dict.items():
            if k in dictionary:
                item.token_id.append(dictionary[k])
                item.token_weight.append(v)
    return batch
In [ ]:
batch = create_batch(trainX.text.values, vocabulary)
bigartm_feats = model.master.transform(batches=[batch])[1]
top_cols = sorted(saved_top_tokens.keys(), key=lambda x: int(x.split('_')[1]))
In [36]:
# pd.DataFrame(bigartm_feats, columns=top_cols).to_csv('../data/data_bigartm_feats.csv')
# trainX_bigartm = pd.DataFrame(bigartm_feats, columns=top_cols)
trainX_bigartm = pd.read_csv('../data/data_bigartm_feats.csv')
In [37]:
del trainX_bigartm['Unnamed: 0']
In [38]:
trainX_bigartm.shape, trainX_w2v.shape
Out[38]:
In [31]:
import lightgbm as lgb
import warnings
warnings.simplefilter('ignore')
In [39]:
# pd.concat([trainX_w2v, trainX_bigartm], axis=1)
X_train, X_test, y_train, y_test = train_test_split(pd.concat([trainX_w2v, trainX_bigartm], axis=1).values, trainY.values, test_size=0.3,
stratify=trainY.values, random_state=42)
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_test, y_test)
In [40]:
X_train.shape
Out[40]:
In [45]:
lgb_param = {
'colsample_bytree': 0.9,
'is_unbalance': False,
'learning_rate': 0.1,
'max_bin': 255,
'max_depth': 7,
'min_child_samples': 10,
'min_child_weight': 5,
'min_split_gain': 0,
'nthread': 4,
'num_leaves': 255,
'objective': 'multiclass',
'metric': 'multi_error',
'is_training_metric': True,
'num_class': len(le.classes_),
'reg_alpha': 0,
'reg_lambda': 0,
'silent': True,
'subsample': 0.8, #0.8
}
In [46]:
%%time
clf = lgb.train(lgb_param, lgb_train, num_boost_round=10000, valid_sets=[lgb_valid],
early_stopping_rounds=100, verbose_eval=25)
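If early stopping triggers, the returned Booster records the best round; depending on the LightGBM version it may not be applied automatically at predict time, so it can be passed explicitly (hedged sketch, not in the original run):
In [ ]:
# Hedged sketch: inspect the boosting round chosen by early stopping and, if needed,
# pass it explicitly when predicting.
print('best iteration:', clf.best_iteration)
# clf.predict(X_test, num_iteration=clf.best_iteration)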
In [ ]:
# np.save('lgb_prediciton_w2v_only', lgb_prediciton)
# lgb_prediciton = np.load('lgb_prediciton_w2v_only.npy')
In [47]:
lgb_prediciton = clf.predict(X_test).argmax(axis=1)
print('F1 score:', f1_score(y_test, lgb_prediciton, average='macro'))
print('Accuracy:', accuracy_score(y_test, lgb_prediciton))
# w2v only:
# F1 score: 0.848625247727
# Accuracy: 0.842781896702 26 min
# bigARTM only:
# F1 score: 0.825758239533
# Accuracy: 0.837190144537
# bigARTM + w2v:
# F1 score: 0.871120225443
# Accuracy: 0.868743603185
# 51min 7s
In [51]:
lgb_cm_normalized = plot_confusion_matrix(y_test, lgb_prediciton, le.classes_, 'lgb_bigartm_w2v_confusion_matrix')
In [52]:
plot_accuracy_per_class(lgb_cm_normalized, le.classes_, 'lgb_bigartm_w2v_accuracy_per_class')
In [126]:
def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot
    Source: http://stackoverflow.com/a/25074150/395857
    By HYRY
    '''
    pc.update_scalarmappable()
    ax = pc.axes  # pc.get_axes() was removed in newer matplotlib versions
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)
def cm2inch(*tupl):
    '''
    Specify figure size in centimeters in matplotlib
    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    inch = 2.54
    if type(tupl[0]) == tuple:
        return tuple(i/inch for i in tupl[0])
    else:
        return tuple(i/inch for i in tupl)
def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels,
            figure_width=40, figure_height=20, correct_orientation=False, cmap='RdBu'):
    '''
    Inspired by:
    - http://stackoverflow.com/a/16124677/395857
    - http://stackoverflow.com/a/25074150/395857
    '''
    # Plot it out
    fig, ax = plt.subplots()
    # c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap=cmap, vmin=0.5, vmax=1.0)
    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)
    # set tick labels
    # ax.set_xticklabels(np.arange(1, AUC.shape[1]+1), minor=False)
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)
    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    # Remove last blank column
    plt.xlim((0, AUC.shape[1]))
    # Turn off all the ticks
    ax = plt.gca()
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    # Add color bar
    # plt.colorbar(c)
    # Add text in each cell
    show_values(c)
    # Proper orientation (origin at the top left instead of bottom left)
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()
    # resize
    fig = plt.gcf()
    # fig.set_size_inches(cm2inch(40, 20))
    # fig.set_size_inches(cm2inch(40*4, 20*4))
    fig.set_size_inches(cm2inch(figure_width, figure_height))
def plot_classification_report(classification_report, title='', cmap='RdBu'):
    '''
    Plot scikit-learn classification report.
    Extension based on http://stackoverflow.com/a/31689645/395857
    '''
    lines = classification_report.split('\n')
    classes = []
    plotMat = []
    support = []
    class_names = []
    for line in lines[2: (len(lines) - 2)]:
        t = line.strip().split()
        if len(t) < 2:
            continue
        classes.append(t[0])
        v = [float(x) for x in t[1: len(t) - 1]]
        support.append(int(t[-1]))
        class_names.append(t[0])
        # print(v)
        plotMat.append(v)
    # print('plotMat: {0}'.format(plotMat))
    # print('support: {0}'.format(support))
    classes = classes[::-1]
    plotMat = plotMat[::-1]
    support = support[::-1]
    class_names = class_names[::-1]
    xlabel = ''
    ylabel = ''
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup in enumerate(support)]
    figure_width = 25
    figure_height = len(class_names) + 7
    correct_orientation = False
    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels,
            figure_width, figure_height, correct_orientation, cmap=cmap)
In [128]:
plot_classification_report(classification_report(y_test, lgb_prediciton, target_names=le.classes_))
plt.savefig('../docs/images/lgb_w2v_bigartm_classif_report.pdf', bbox_inches='tight')
plt.close()
In [127]:
plot_classification_report(classification_report(y_test, svm_prediciton, target_names=le.classes_))
plt.savefig('../docs/images/svm_classif_report.pdf', bbox_inches='tight')
plt.close()
In [ ]: