In [1]:
import pandas as pd
# pd.options.display.max_colwidth = 200
import numpy as np
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")
from pymystem3 import Mystem; mystem = Mystem()
from functools import lru_cache
from tqdm import tqdm
tqdm.pandas()
%matplotlib inline
In [2]:
data = []
DATA_PATH = '../../NewsParser/data/'
for csv_name in ['recent_news.csv']:
    data.append(pd.read_csv(DATA_PATH + csv_name))
# reset the index so the positional ids used later line up with label-based lookups
data = pd.concat(data, ignore_index=True)
In [3]:
data.head()
Out[3]:
In [4]:
class Pipeline(object):
    def __init__(self, *args):
        self.transformations = args

    def __call__(self, x):
        res = x
        for f in self.transformations:
            res = f(res)
        return res
from nltk.corpus import stopwords
from stop_words import get_stop_words
en_sw = get_stop_words('en')
ru_sw = get_stop_words('ru')
STOP_WORDS = set(en_sw) | set(ru_sw)
STOP_WORDS = STOP_WORDS | set(stopwords.words('russian')) | set(stopwords.words('english'))
STOP_WORDS = STOP_WORDS | set(['лента', 'новость', 'риа', 'тасс', 'редакция'])
def get_lower(text):
    return str(text).lower().strip()
def remove_punctuation(text):
    return ''.join([c if c.isalpha() or c in ['-', "'"] else ' ' for c in text])
@lru_cache(maxsize=None)
def get_word_normal_form(word):
    return ''.join(mystem.lemmatize(word)).strip().replace('ё', 'е').strip('-')
def lemmatize_words(text):
    res = []
    for word in text.split():
        norm_form = get_word_normal_form(word)
        if len(norm_form) > 2 and norm_form not in STOP_WORDS:
            res.append(norm_form)
    return ' '.join(res)
TEXT_PIPELINE = Pipeline(get_lower, remove_punctuation, lemmatize_words)
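A quick sanity check of the composed pipeline on a single string (the headline below is made up for illustration; the exact lemmas depend on Mystem):

print(TEXT_PIPELINE('В Москве открылась новая станция метро!'))
# expected: lowercased, punctuation-free, lemmatized, stop words and short tokens dropped,
# e.g. something like: москва открываться новый станция метро

Since get_word_normal_form is wrapped in lru_cache, calling get_word_normal_form.cache_info() after processing the corpus shows how many Mystem invocations the cache saved.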
In [5]:
%%time
data.text = data.text.progress_apply(TEXT_PIPELINE)
data.title = data.title.apply(lambda x: x.strip())
data['title_norm'] = data.title.progress_apply(TEXT_PIPELINE)
In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
In [7]:
trainX = data['title_norm'] + ' ' + data.text
# print(trainX)
trainX = trainX.values
In [8]:
trainX.shape
Out[8]:
In [9]:
import pickle
In [10]:
with open('count_vectorizer.bin', 'rb') as pickle_file:
    tfidf_vectorizer = pickle.load(pickle_file)
In [11]:
%%time
# tfidf_vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,2), lowercase=False).fit(trainX)
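On a fresh machine the pickle loaded above does not exist yet; the commented line shows the original fit, and the one-time fit-and-save step would look like this (same file name the load cell uses):

# One-time step: fit the vectorizer on the corpus and pickle it for later runs.
tfidf_vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), lowercase=False).fit(trainX)
with open('count_vectorizer.bin', 'wb') as pickle_file:
    pickle.dump(tfidf_vectorizer, pickle_file)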
In [12]:
len(tfidf_vectorizer.vocabulary_)
Out[12]:
In [13]:
tfidf_matrix = tfidf_vectorizer.transform(trainX)
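A quick way to check what the vectorizer extracted is to look at the highest-weighted terms of one document; a sketch (document index 0 is arbitrary):

# Top tf-idf terms of a single document:
feature_names = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
row = tfidf_matrix[0].toarray().ravel()
for idx in row.argsort()[::-1][:10]:
    print(feature_names[idx], round(row[idx], 3))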
In [14]:
from sklearn.cluster import KMeans
# from spherecluster import SphericalKMeans
In [15]:
kmeans = KMeans(n_clusters=30, random_state=42).fit(tfidf_matrix)
# kmeans = SphericalKMeans(n_clusters=K).fit(tfidf_matrix)
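To get a rough human-readable label for each cluster, one can print the heaviest coordinates of every centroid; a minimal sketch under the same 30-cluster setup:

# The top-weighted centroid terms hint at what each cluster is about:
terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() on scikit-learn < 1.0
for c, center in enumerate(kmeans.cluster_centers_):
    top = center.argsort()[::-1][:7]
    print(c, ' '.join(terms[i] for i in top))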
In [16]:
clusters = kmeans.predict(tfidf_matrix)
c_list = [[] for _ in range(30)]
for i, cluster in enumerate(clusters):
    tfidf_news = tfidf_matrix[i, :]
    # keep a document only if it sits close enough to its cluster centroid
    if cosine_similarity(tfidf_news, kmeans.cluster_centers_[cluster].reshape(1, -1))[0][0] > 0.65:
        c_list[cluster].append(i)
In [17]:
c_list
Out[17]:
In [18]:
for i, group in enumerate(c_list):
    if len(group) < 3:
        continue
    print('Topic', i)
    for id_ in group:
        print(data.title[id_], data.url[id_])
    print()
In [17]:
kmeans.cluster_centers_
Out[17]:
In [24]:
cosines = []
for tfidf_news in tfidf_matrix:
    cosine = cosine_similarity(tfidf_news, tfidf_matrix)
    cosines.append(cosine.tolist()[0])
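Since TfidfVectorizer l2-normalizes rows by default, the row-by-row loop above can be collapsed into a single call; a sketch (the result is a dense n_docs x n_docs array, so this only fits in memory for corpora of modest size):

# Equivalent, vectorized pairwise similarities in one call:
cosines_matrix = cosine_similarity(tfidf_matrix)
# or, exploiting the unit-norm rows directly:
# cosines_matrix = (tfidf_matrix @ tfidf_matrix.T).toarray()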
In [31]:
from collections import deque

COS_THRESHOLD = 0.75
# BFS over the similarity graph: two documents are connected if their cosine
# similarity is at least COS_THRESHOLD; each connected component becomes one theme.
themes = [-1 for _ in range(len(cosines))]
themes_ids = [[] for _ in range(len(cosines))]
curr_theme = -1  # start at -1 so theme ids run 0..n-1 and never go out of range
for v, theme in enumerate(themes):
    if theme == -1:
        curr_theme += 1
        Q = deque([v])
        themes[v] = curr_theme
        themes_ids[curr_theme].append(v)
        while Q:
            curr_v = Q.popleft()  # deque gives O(1) pops from the front
            for u, cos in enumerate(cosines[curr_v]):
                if cos >= COS_THRESHOLD and themes[u] == -1:
                    themes[u] = curr_theme
                    themes_ids[curr_theme].append(u)
                    Q.append(u)
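The loop above is exactly connected components on the thresholded similarity graph, so scipy can serve as a cross-check; a minimal sketch, assuming the `cosines` list from In [24] (the labels should match `themes` up to renumbering):

from scipy.sparse.csgraph import connected_components

# An edge exists where cosine >= COS_THRESHOLD; weak components = themes.
adjacency = np.asarray(cosines) >= COS_THRESHOLD
n_components, labels = connected_components(adjacency, directed=False)
# labels[i] is the theme of document i, numbered 0..n_components-1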
In [32]:
# themes_ids
In [33]:
groups = sorted(themes_ids, key=lambda x: -len(x))
In [34]:
for i, group in enumerate(groups):
    if len(group) < 2:
        break
    print('Topic', i)
    for id_ in group:
        print(data.title[id_], data.url[id_])
    print()
In [ ]: