In [1]:
import itertools
import sys
import json
import warnings
import os
warnings.filterwarnings("ignore")

sys.path.append('..') # for importing from parent module
from util import *
from config import VK_TOKEN # only when creating corpora
from scipy.sparse import vstack, csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split
import tqdm

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger


Using TensorFlow backend.

Corpora for training model


In [2]:
sources = {'art': [60114472, 19191317, "https://blog.mann-ivanov-ferber.ru/category/tvorchestvo/"], 
           'politics': [29534144, 23482909, 76982440], 
           'finances': [62438886, 81354264, 18925793],
           'strateg_management': [68241326, "http://www.stplan.ru/", "http://www.30n.ru/2/1.html"], 
           'law': [702176, "https://pravo.ru", 79084019],
           'elaboration': ["http://www.km.ru/category/tegi/nauchnye-issledovaniya-i-otkrytiya-v-rossii"], 
           'industry': [67183197, 66233468, "http://government.ru/department/54/events/"], 
           'education': [30558759, 98643656, 74404187],
           'charity': [133169189, 35966541, 15274000],
           'public_health': [78860407, 61490488, "http://rusrand.ru/analytics/analyticszdravoohranenie-rossii-mify-realnost-reshenija"],
           'agriculture': ["http://www.nsh.ru/", "http://россельхоз.рф/"],
           'government_management': ["http://be5.biz/upravlenie/gosudarstvennoe_upravlenie.html", 97296142],
           'smm': [74686342, 79925455, 40018862],
           'innovations': [98643656, 63337812, 34737049],
           'safety': [37959220, 10933209],
           'military': ["http://www.soldiering.ru", 75106422],
           'corporative_management': ["http://www.cfin.ru/management/", "https://psyera.ru/2984/ponyatie-korporativnogo-menedzhmenta"],
           'social_safety': [49582956, 72388807],
           'building': [30713157, 26978036],
           'entrepreneurship': [69560028, 73537456],
           'sport': [29809500, 128350290],
           'investitions': [37876217, 3800580]
          }

In [3]:
# getting data for corpora

for tag, ids_ in sources.items():
    path = f"corpora/{tag}.txt"
    s = set()
    if not os.path.exists(path):
        with open(path, "w") as f:
            for id_ in ids_: # iterating through available links/ids
                if isinstance(id_, int): # if it's vk public id
                    wall = ParseClass.get_wall_vk(-id_, 1000, VK_TOKEN)
                    for post in tqdm.tqdm(wall):
                        if len(post) and post not in s: # write unique posts
                            s.add(post)
                            _ = f.write(f"{post}\n")
                elif isinstance(id_, str):
                    links = np.random.choice(
                        list(ParseClass.get_all_links(id_)), 666)
                    for link in tqdm.tqdm(links):
                        try:
                            r = requests.get(link)
                        except requests.RequestException:
                            continue
                        if r.ok and r.encoding == "utf-8":
                            page = r.text
                            soup = BeautifulSoup(page, "lxml")
                            for text in soup.text.strip().split("\n"):
                                if len(text) and len(text.split()) > 2 and text not in s:
                                    s.add(text)
                                    _ = f.write(f"{text}\n")

Normalizing file sizes


In [4]:
file_paths = [path for path in os.listdir("corpora") if path.endswith("txt")]
lengths = [os.path.getsize(f"corpora/{filepath}") for filepath in file_paths]

In [5]:
indexes = np.array(lengths).argsort()
first_half = indexes[:len(lengths) // 2]
second_half = indexes[len(lengths) // 2:]

In [6]:
for i, ind_file in enumerate(first_half):
    sym_length = os.path.getsize(f"corpora/{file_paths[ind_file]}") // 2 # length of a file in first_half
    path = f"corpora/{file_paths[second_half[i]]}"
    all_symbols = "".join(open(path))
    with open(path, "w") as f:
        _ = f.write(all_symbols[:sym_length])

In [7]:
categories = list(sources.keys())

In [8]:
labels = ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция", "Исследования и разработки",
          "Промышленность", "Образование", "Благотворительность", "Здравоохранение", "Сельское хозяйство", 
          "Государственное управление", "Реклама и маркетинг", "Инновации и модернизация", "Безопасность", 
          "Военное дело", "Корпоративное управление", "Социальная защита", "Строительство", "Предпринимательство",
          "Спорт", "Инвестиции"]
norm_names = dict(zip(categories, labels))
norm_names_reversed = dict([(a[1], a[0]) for a in norm_names.items()])

Using labeled data from LEADER-ID


In [9]:
# df = pd.read_msgpack("df.msg")
# competencies = pd.read_csv("competencies.csv")
# competencies = competencies.dropna()[['Id', 'Интересы']]
# social = pd.read_excel("assets/social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# accepted = social.fb.dropna().index | social.vk.dropna().index
# social = social.loc[accepted, ['name', 'id', 'vk', 'fb']]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("id=")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("?ref=")[0])
# social.fb = social.fb.replace('nan', np.nan)
# social = social.merge(df[['vk', 'vk_id']], how='outer', on='vk')
# social.vk = social.vk_id.fillna(0)
# social = social.replace(0, np.nan).drop(labels=['vk_id'], axis=1)
# social = social.set_index('id').merge(how='inner', right=competencies.set_index('Id'), left_index=True, right_index=True)
# to_exclude = pd.read_csv("assets/known_users.csv")['Leader-ID'].dropna().astype('int').values
# social = social.loc[~social.index.isin(to_exclude), :]
# social.to_msgpack("assets/social.msg")

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# Нужно замапить screen_name с их vk_id, поэтому придется обрабатывать по одному

social = pd.read_msgpack("social.msg")
d = json.load(open("d.json"))

In [10]:
label_encoder = LabelEncoder()
int_labels = label_encoder.fit_transform(labels)

def f(x):
    b = []
    for a in x.split(","):
        t = a.strip()
        if t == "Частный бизнес" or t == "Социальное предпринимательство":
            b.append("Предпринимательство")
        elif t == "Дошкольное образование/детский отдых":
            b.append("Образование")
        elif t in {'Журналистика', 'Управление персоналом', 'Управление рисками'}:
            pass
        else:
            b.append(t)
    return b[:6]
social.Интересы = social.Интересы.apply(f)
def g(x):
    if not x:
        return np.in1d(int_labels, []).astype('int')
    return np.in1d(int_labels, label_encoder.transform(x)).astype('int')
social['y'] = social.Интересы.apply(g)

Corpora from created texts


In [11]:
corpora_class = CorporaClass()

for filename in categories:
    with open(f"corpora/{filename}.txt") as f:
        corpora_class.add_to_corpora(f, filename)

In [12]:
vectorizer = TfidfVectorizer(tokenizer=corpora_class.full_process, 
                             max_df=0.55, 
                             min_df=5, 
                             sublinear_tf=True, 
                             ngram_range=(1, 2))

docterm_matrix = vectorizer.fit_transform(list(itertools.chain(*(doc for doc in corpora_class.corpora))))

In [13]:
DELIM = 1300
NUM_OF_CLASSES = 22
vector_size = docterm_matrix[0].shape[1]

y for created texts


In [14]:
y = []
for (i, x), cat in zip( zip(range(len(corpora_class.corpora)), corpora_class.corpora), 
                       categories ):
    for _ in range(len(x)):
        y_ = np.zeros(NUM_OF_CLASSES)
        t = np.zeros(NUM_OF_CLASSES)
        for arr in (social.loc[d[norm_names[cat]], 'y'] * 0.016):
            t = t + arr
        y_ = y_ + t
        y_[i] = 1
        y.append(y_)
y = np.array(y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(docterm_matrix, y, test_size=0.2)

In [18]:
X_train = np.expand_dims(X_train.toarray(), axis=2)
X_test = np.expand_dims(X_test.toarray(), axis=2)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-eda054603726> in <module>()
      1 X_train = np.expand_dims(X_train.toarray(), axis=2)
----> 2 X_test = np.expand_dims(X_test.toarray(), axis=2)

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [23]:
STRIDE = 5
classifier = Sequential((
    Conv1D(filters=1, kernel_size=STRIDE, activation='relu', input_shape=(vector_size, 1)),
    MaxPooling1D(),
    Conv1D(filters=1, kernel_size=STRIDE, activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='sigmoid'),
    Dense(NUM_OF_CLASSES, activation='sigmoid')
))

classifier.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [25]:
tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)

classifier.fit(X_train, 
               y_train, 
               validation_data=(X_test, y_test), 
               batch_size=196, 
               epochs=60, 
               class_weight='balanced',
               callbacks=[tbCallBack]
              )


Train on 49947 samples, validate on 12487 samples
Epoch 1/60
49947/49947 [==============================] - 238s 5ms/step - loss: 4.4883 - categorical_accuracy: 0.0730 - val_loss: 4.4483 - val_categorical_accuracy: 0.0765
Epoch 2/60
49947/49947 [==============================] - 235s 5ms/step - loss: 4.4452 - categorical_accuracy: 0.0797 - val_loss: 4.4471 - val_categorical_accuracy: 0.0765
Epoch 3/60
49947/49947 [==============================] - 215s 4ms/step - loss: 4.4440 - categorical_accuracy: 0.0797 - val_loss: 4.4461 - val_categorical_accuracy: 0.0765
Epoch 4/60
49947/49947 [==============================] - 218s 4ms/step - loss: 4.4376 - categorical_accuracy: 0.0797 - val_loss: 4.4280 - val_categorical_accuracy: 0.0765
Epoch 5/60
49947/49947 [==============================] - 237s 5ms/step - loss: 4.2613 - categorical_accuracy: 0.1661 - val_loss: 3.8916 - val_categorical_accuracy: 0.4036
Epoch 6/60
49947/49947 [==============================] - 220s 4ms/step - loss: 3.5539 - categorical_accuracy: 0.5161 - val_loss: 3.3550 - val_categorical_accuracy: 0.5613
Epoch 7/60
49947/49947 [==============================] - 224s 4ms/step - loss: 3.1772 - categorical_accuracy: 0.6223 - val_loss: 3.2010 - val_categorical_accuracy: 0.6080
Epoch 8/60
49947/49947 [==============================] - 218s 4ms/step - loss: 3.0065 - categorical_accuracy: 0.6768 - val_loss: 3.1384 - val_categorical_accuracy: 0.6213
Epoch 9/60
49947/49947 [==============================] - 217s 4ms/step - loss: 2.8986 - categorical_accuracy: 0.7138 - val_loss: 3.1050 - val_categorical_accuracy: 0.6302
Epoch 10/60
49947/49947 [==============================] - 217s 4ms/step - loss: 2.8148 - categorical_accuracy: 0.7441 - val_loss: 3.0900 - val_categorical_accuracy: 0.6381
Epoch 11/60
49947/49947 [==============================] - 217s 4ms/step - loss: 2.7495 - categorical_accuracy: 0.7684 - val_loss: 3.0779 - val_categorical_accuracy: 0.6404
Epoch 12/60
49947/49947 [==============================] - 217s 4ms/step - loss: 2.6907 - categorical_accuracy: 0.7904 - val_loss: 3.0753 - val_categorical_accuracy: 0.6429
Epoch 13/60
49947/49947 [==============================] - 218s 4ms/step - loss: 2.6433 - categorical_accuracy: 0.8082 - val_loss: 3.0741 - val_categorical_accuracy: 0.6453
Epoch 14/60
  196/49947 [..............................] - ETA: 3:46 - loss: 2.5862 - categorical_accuracy: 0.8214
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-25-7e8116829202> in <module>()
     25                epochs=60,
     26                class_weight='balanced',
---> 27                callbacks=[tbCallBack]
     28               )
     29 # classifier.fit_generator(nn_batch_generator(X_train, y_train, 196),

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/keras/models.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
    958                               initial_epoch=initial_epoch,
    959                               steps_per_epoch=steps_per_epoch,
--> 960                               validation_steps=validation_steps)
    961 
    962     def evaluate(self, x, y, batch_size=32, verbose=1,

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1655                               initial_epoch=initial_epoch,
   1656                               steps_per_epoch=steps_per_epoch,
-> 1657                               validation_steps=validation_steps)
   1658 
   1659     def evaluate(self, x=None, y=None,

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
   1211                     batch_logs['size'] = len(batch_ids)
   1212                     callbacks.on_batch_begin(batch_index, batch_logs)
-> 1213                     outs = f(ins_batch)
   1214                     if not isinstance(outs, list):
   1215                         outs = [outs]

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2355         session = get_session()
   2356         updated = session.run(fetches=fetches, feed_dict=feed_dict,
-> 2357                               **self.session_kwargs)
   2358         return updated[:len(self.outputs)]
   2359 

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1321   def _do_call(self, fn, *args):
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:
   1325       message = compat.as_text(e.message)

~/.virtualenvs/vk_text_classifier-CgDYx6B0/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1300           return tf_session.TF_Run(session, options,
   1301                                    feed_dict, fetch_list, target_list,
-> 1302                                    status, run_metadata)
   1303 
   1304     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 

In [26]:
classifier.save("../assets/vk_texts_classifier-dev.h5")

In [19]:
classifier = load_model("../assets/vk_texts_classifier.h5")

Result


In [1]:
import sys
sys.path.append('..')
from util import ResultClass


Using TensorFlow backend.

In [2]:
import redis
redis_obj = redis.Redis()

In [3]:
result_class = ResultClass(redis_obj)

In [4]:
verdict = result_class.get_result(78543018, None)


VK Parsing 78543018
dict_keys(['users_vk:78543018', 'publics_vk:76982440', 'publics_vk:74404187', 'publics_vk:40886007', 'publics_vk:46860100', 'publics_vk:20629724', 'publics_vk:9471321', 'publics_vk:32896153', 'publics_vk:25336774', 'publics_vk:52298374'])
1-th public have been parsed. (76982440)
2-th public have been parsed. (74404187)
3-th public have been parsed. (40886007)
4-th public have been parsed. (46860100)
5-th public have been parsed. (20629724)
6-th public have been parsed. (9471321)
7-th public have been parsed. (32896153)
8-th public have been parsed. (25336774)
9-th public have been parsed. (52298374)
VK Parse completed in 3.0752711296081543 sec.
Added to corpora
Transformed corpora.

In [5]:
verdict


Out[5]:
[('art', 0.12067863),
 ('politics', 0.11904053),
 ('finances', 0.11047247),
 ('strateg_management', 0.24823546),
 ('law', 0.099051803),
 ('elaboration', 0.35977983),
 ('industry', 0.20506588),
 ('education', 0.42244613),
 ('charity', 0.063216761),
 ('public_health', 0.093201287),
 ('agriculture', 0.070799313),
 ('government_management', 0.15874875),
 ('smm', 0.2779991),
 ('innovations', 0.28204551),
 ('safety', 0.16917785),
 ('military', 0.10215277),
 ('corporative_management', 0.23524198),
 ('social_safety', 0.14480668),
 ('building', 0.21360886),
 ('entrepreneurship', 0.35831749),
 ('sport', 0.11745396),
 ('investitions', 0.18645653)]

In [5]:
from config import VK_TOKEN

In [7]:
import vk_api

In [8]:
vk_session = vk_api.VkApi(token=VK_TOKEN)

In [9]:
vk = vk_session.get_api()

In [22]:
pool_data = {}
publics=[-58219172, -28905875, -84926122, -55284725, -139246969, -29534144, -54530371]

In [33]:
with vk_api.VkRequestsPool(vk_session) as pool:
    for owner_id in publics:
        pool_data[owner_id] = pool.method("wall.get", {"owner_id": owner_id, "count": 100})

In [34]:
for pub in publics:
    wall = [a.get('text', '') for a in pool_data[pub].result.get('items', [])]
    print(pub)


-58219172
-28905875
-84926122
-55284725
-139246969
-29534144
-54530371

In [29]:
t = " ".join(corpora_class.corpora[0])
normalize(result_class.classifier.predict(np.sum(result_class.vectorizer.transform([t]), axis=0)).reshape(1, -1))[0]


Out[29]:
array([  1.58050482e-03,   6.46794308e-03,   7.65688455e-05,
         1.75090730e-01,   1.35872543e-01,   8.20550978e-01,
         5.53579675e-03,   4.65691447e-01,   8.31513375e-07,
         3.94662988e-04,   1.56653114e-05,   7.64702559e-02,
         4.29760478e-02,   1.14148781e-01,   3.48257017e-03,
         8.69610987e-04,   1.33026719e-01,   6.09717565e-03,
         5.01470780e-03,   1.34844348e-01,   3.38851464e-06,
         6.28931075e-02], dtype=float32)

In [32]:
%%timeit
normalize(result_class.classifier.predict(np.sum(result_class.vectorizer.transform([" ".join(corpora_class.corpora[0])]), axis=0)).reshape(1, -1))[0]


2.13 s ± 170 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [22]:
norm_categories = np.array(list(norm_names.values()))

In [39]:
dict_for_mean = []

labels_ = np.array(test_labels)
corpora_ = np.array(test_corpora)

acc_d = {}
for col in norm_names.values():
    t = social.Интересы.apply(lambda s: col in s)
    col_labels = np.array(list(set(t[t == True].index).intersection(labels_)))
    c_0 = 0
    c_1 = 0
    c_2 = 0
    for item in corpora_[np.in1d(labels_, col_labels)]:
        t = np.sum(classifier.predict(vectorizer.transform(item).toarray()), axis=0)
        dict_for_mean.append(t)
        pred_categories = norm_categories[t.argsort()[::-1][:8]]
        if col in pred_categories[:2]:
            c_0 += 1
        if col in pred_categories[:5]:
            c_1 += 1
        if col in pred_categories:
            c_2 += 1
    l = len(col_labels)
    if l == 0:
        l = 1
    col, l, c_0 / l, c_1 / l, c_2 / l
    acc_d[col] = c_2 / l
    
    
#     sorted(np.sum(classifier.predict(vectorizer.transform(corpora_user_gen.corpora[5]).toarray()), axis=0), reverse=True)[:3]


Out[39]:
('Искусство', 9, 0.2222222222222222, 0.2222222222222222, 0.3333333333333333)
Out[39]:
('Политика', 17, 0.23529411764705882, 0.23529411764705882, 0.29411764705882354)
Out[39]:
('Финансы', 6, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333)
Out[39]:
('Стратегическое управление',
 27,
 0.25925925925925924,
 0.6666666666666666,
 0.8888888888888888)
Out[39]:
('Юриспруденция',
 6,
 0.3333333333333333,
 0.3333333333333333,
 0.3333333333333333)
Out[39]:
('Исследования и разработки',
 35,
 0.11428571428571428,
 0.45714285714285713,
 0.8)
Out[39]:
('Промышленность', 17, 0.0, 0.11764705882352941, 0.23529411764705882)
Out[39]:
('Образование', 33, 0.3333333333333333, 0.7272727272727273, 0.9090909090909091)
Out[39]:
('Благотворительность', 1, 0.0, 0.0, 0.0)
Out[39]:
('Здравоохранение', 10, 0.4, 0.4, 0.4)
Out[39]:
('Сельское хозяйство', 4, 0.5, 0.5, 0.5)
Out[39]:
('Государственное управление',
 23,
 0.21739130434782608,
 0.30434782608695654,
 0.6086956521739131)
Out[39]:
('Реклама и маркетинг',
 9,
 0.2222222222222222,
 0.3333333333333333,
 0.3333333333333333)
Out[39]:
('Инновации и модернизация',
 29,
 0.10344827586206896,
 0.3448275862068966,
 0.9310344827586207)
Out[39]:
('Безопасность', 9, 0.3333333333333333, 0.4444444444444444, 0.4444444444444444)
Out[39]:
('Военное дело', 4, 0.0, 0.0, 0.25)
Out[39]:
('Корпоративное управление',
 17,
 0.11764705882352941,
 0.4117647058823529,
 0.6470588235294118)
Out[39]:
('Социальная защита',
 3,
 0.3333333333333333,
 0.3333333333333333,
 0.3333333333333333)
Out[39]:
('Строительство', 8, 0.125, 0.125, 0.125)
Out[39]:
('Предпринимательство', 36, 0.25, 0.6666666666666666, 0.9722222222222222)
Out[39]:
('Спорт', 1, 0.0, 0.0, 0.0)
Out[39]:
('Инвестиции',
 21,
 0.19047619047619047,
 0.19047619047619047,
 0.3333333333333333)

In [40]:
np.mean(list(acc_d.values()))


Out[40]:
0.45481123580974059

In [41]:
list(zip(categories, [np.mean(a) for a in np.array([np.array(b) for b in dict_for_mean]).T]))


Out[41]:
[('art', 0.11978178),
 ('politics', 0.2571713),
 ('finances', 0.15092145),
 ('strateg_management', 0.27437153),
 ('law', 0.14367713),
 ('elaboration', 0.2649323),
 ('industry', 0.14514737),
 ('education', 0.29252267),
 ('charity', 0.11408015),
 ('public_health', 0.16153345),
 ('agriculture', 0.21364743),
 ('government_management', 0.26083517),
 ('smm', 0.23419189),
 ('innovations', 0.23632175),
 ('safety', 0.15871054),
 ('military', 0.059539381),
 ('corporative_management', 0.25076938),
 ('social_safety', 0.12933674),
 ('building', 0.14101389),
 ('entrepreneurship', 0.26689848),
 ('sport', 0.21294023),
 ('investitions', 0.18971834)]

In [23]:
# social = pd.read_excel("social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# social.set_index('id', inplace=True)
# social = social[social.vk.notnull()]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# df = social[social.vk.notnull()]
# df = df[['name', 'vk']]

# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# # Нужно замапить screen_name с их vk_id, поэтому придется обрабатывать по одному

# df['vk_id'] = df.vk.progress_apply(get_id)
# df.dropna(inplace=True)
# df.vk_id = df.vk_id.astype('int')
# df.to_msgpack("df.msg")
df = pd.read_msgpack("assets/df.msg")
known = pd.read_csv("assets/known_users.csv")
known = known.merge(df[['vk_id']], left_on='Leader-ID', right_index=True, how='left')

In [83]:
from line_profiler import LineProfiler

In [88]:
lp = LineProfiler()

In [89]:
lp.add_function(corpora_class.full_process)

In [91]:
lp.run("corpora_class.full_process(corpora[0][0])")


Out[91]:
<line_profiler.LineProfiler at 0x12eff8e88>

In [93]:
lp.print_stats()


Timer unit: 1e-06 s

Total time: 0.017674 s
File: /Users/george/Dropbox/Projects/vk_text_classifier/util.py
Function: full_process at line 36

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    36                                               @staticmethod
    37                                               def full_process(text, tokenizer=tokenizer, morph=morph, ru_pattern=ru_pattern):
    38                                                   # Clear text from punctuation etc.'''
    39         1          110    110.0      0.6          tokens = tokenizer.tokenize(text)
    40                                           
    41                                                   # Turn tokens into normal form excluding non-nouns or verbs
    42         1            1      1.0      0.0          processed = []
    43       102           77      0.8      0.4          for token in tokens:
    44       101         8888     88.0     50.3              morphed = morph.parse(token)[0].normal_form
    45       101         7591     75.2     43.0              nf_tag = str(morph.parse(morphed)[0].tag.POS)
    46       101          129      1.3      0.7              if nf_tag in ("NOUN", "ADJF", "INFN", "NUMR") and len(token) < 16:
    47        66          811     12.3      4.6                  if len(morphed) == len(re.findall(ru_pattern, morphed)):
    48        66           67      1.0      0.4                      processed.append(morphed)
    49                                           
    50         1            0      0.0      0.0          return processed


In [109]:
import requests
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']

In [119]:
{"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}


Out[119]:
{'name': '', 'user_vk': 61356}

In [57]:
requests.post("http://78.155.197.212:9999/get_result", json={"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}).json()

In [104]:
import requests
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']

while 1:
    requests.post("http://78.155.197.212:9999/get_result", json={"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}).json()


The slowest run took 34.22 times longer than the fastest. This could mean that an intermediate result is being cached.
19.3 s ± 17.5 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [105]:
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']

In [56]:
transformed


Out[56]:
<1518x19770 sparse matrix of type '<class 'numpy.float64'>'
	with 13149 stored elements in Compressed Sparse Row format>

In [37]:
offset = 0
for index, row in tqdm.tqdm(known.iloc[offset:, :].iterrows(), total=len(known) - offset):
    user_vk = row['vk_id']
    if str(user_vk) == "nan":
        user_vk = None
    user_fb = row['FB']
    if str(user_fb) == "nan":
        user_fb = None
    try:
        verdict = result_class.get_result(user_vk, user_fb)
        result_class.texts = []
        for cat, value in verdict:
            known.loc[index, cat] = value
    except ValueError:
        for cat in categories:
            known.loc[index, cat] = 0
    except IndexError:
        for cat in categories:
            known.loc[index, cat] = 0


  0%|          | 0/44 [00:00<?, ?it/s]
FB Parsing alya.blesk
FB Parse completed.
Added to corpora
Transformed corpora.
  2%|▏         | 1/44 [00:01<00:52,  1.21s/it]
FB Parsing zonovaangelina
FB Parse completed.
Added to corpora
  5%|▍         | 2/44 [00:01<00:38,  1.09it/s]
Transformed corpora.
FB Parsing 100018653177414
FB Parse completed.
  7%|▋         | 3/44 [00:01<00:28,  1.43it/s]
Added to corpora
FB Parsing 1088734051
FB Parse completed.
  9%|▉         | 4/44 [00:02<00:29,  1.35it/s]
Added to corpora
Transformed corpora.
FB Parsing kadilevazm
FB Parse completed.
Added to corpora
 11%|█▏        | 5/44 [00:03<00:33,  1.16it/s]
Transformed corpora.
FB Parsing 100013685044932
 14%|█▎        | 6/44 [00:03<00:26,  1.45it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing i.g.grigorenko
 16%|█▌        | 7/44 [00:04<00:20,  1.77it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing nvkulbyatskaya
FB Parse completed.
Added to corpora
Transformed corpora.
 18%|█▊        | 8/44 [00:05<00:32,  1.10it/s]
FB Parsing 100017461808545
FB Parse completed.
 20%|██        | 9/44 [00:06<00:25,  1.37it/s]
Added to corpora
Transformed corpora.
FB Parsing 1496282246
 23%|██▎       | 10/44 [00:06<00:25,  1.34it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing khoryushin
 25%|██▌       | 11/44 [00:07<00:21,  1.55it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
VK Parsing 222216857.0
VK Parse completed.
FB Parsing andrey.siling
FB Parse completed.
Added to corpora
Transformed corpora.
 27%|██▋       | 12/44 [00:09<00:35,  1.10s/it]
FB Parsing devident
FB Parse completed.
Added to corpora
 30%|██▉       | 13/44 [00:10<00:30,  1.00it/s]
Transformed corpora.
FB Parsing denis.unzhakov
FB Parse completed.
Added to corpora
 32%|███▏      | 14/44 [00:11<00:28,  1.06it/s]
Transformed corpora.
FB Parsing blag54
 34%|███▍      | 15/44 [00:11<00:21,  1.32it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing aksenova0
 36%|███▋      | 16/44 [00:11<00:16,  1.69it/s]
FB Parse completed.
Added to corpora
FB Parsing ekaterina.shakina.73
FB Parse completed.
 39%|███▊      | 17/44 [00:12<00:14,  1.82it/s]
Added to corpora
Transformed corpora.
FB Parsing eugene.colchev
FB Parse completed.
Added to corpora
Transformed corpora.
 41%|████      | 18/44 [00:12<00:16,  1.55it/s]
FB Parsing eugene.kolganov
 43%|████▎     | 19/44 [00:13<00:14,  1.71it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing ekovnir
FB Parse completed.
Added to corpora
Transformed corpora.
 45%|████▌     | 20/44 [00:15<00:26,  1.09s/it]
FB Parsing GeorgeBoissonade
FB Parse completed.
 48%|████▊     | 21/44 [00:16<00:19,  1.16it/s]
Added to corpora
Transformed corpora.
VK Parsing 4842118.0
1-th public have been parsed. (23868023)
2-th public have been parsed. (45938293)
3-th public have been parsed. (88640919)
4-th public have been parsed. (40393408)
5-th public have been parsed. (30525261)
VK Parse completed.
FB Parsing iruzhentsev
FB Parse completed.
Added to corpora
Transformed corpora.
 50%|█████     | 22/44 [00:20<00:40,  1.86s/it]
FB Parsing irina.gordinanevmerzhitskaya
FB Parse completed.
Added to corpora
Transformed corpora.
 52%|█████▏    | 23/44 [00:21<00:35,  1.71s/it]
FB Parsing 100006012883228
FB Parse completed.
 55%|█████▍    | 24/44 [00:21<00:25,  1.29s/it]
Added to corpora
Transformed corpora.
FB Parsing 710434546
 57%|█████▋    | 25/44 [00:22<00:19,  1.02s/it]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing 100007301839238
 59%|█████▉    | 26/44 [00:22<00:13,  1.29it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing ksusha.andreeva.14
FB Parse completed.
Added to corpora
Transformed corpora.
 61%|██████▏   | 27/44 [00:24<00:20,  1.20s/it]
FB Parsing liubov.kirienko
FB Parse completed.
 64%|██████▎   | 28/44 [00:25<00:16,  1.06s/it]
Added to corpora
Transformed corpora.
FB Parsing maria.dolgikh.7
FB Parse completed.
Added to corpora
Transformed corpora.
 66%|██████▌   | 29/44 [00:26<00:18,  1.21s/it]
FB Parsing nikita.lebedev.9484
FB Parse completed.
Added to corpora
 68%|██████▊   | 30/44 [00:28<00:17,  1.28s/it]
Transformed corpora.
FB Parsing oleg.podolskiy
 70%|███████   | 31/44 [00:28<00:12,  1.00it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing olga.potemkina.125
FB Parse completed.
Added to corpora
Transformed corpora.
 73%|███████▎  | 32/44 [00:29<00:12,  1.02s/it]
FB Parsing 100004241860483
FB Parse completed.
 75%|███████▌  | 33/44 [00:30<00:09,  1.20it/s]
Added to corpora
Transformed corpora.
FB Parsing ruslan.karmanny.57
 77%|███████▋  | 34/44 [00:31<00:10,  1.05s/it]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing nacvasin
FB Parse completed.
Added to corpora
 80%|███████▉  | 35/44 [00:32<00:09,  1.04s/it]
Transformed corpora.
FB Parsing tatiana.anisimova.50
 82%|████████▏ | 36/44 [00:33<00:07,  1.07it/s]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing 1660860881
FB Parse completed.
Added to corpora
Transformed corpora.
 84%|████████▍ | 37/44 [00:34<00:07,  1.07s/it]
FB Parsing tretyakov.vasily
FB Parse completed.
Added to corpora
 86%|████████▋ | 38/44 [00:35<00:06,  1.00s/it]
Transformed corpora.
FB Parsing luckashinav
FB Parse completed.
Added to corpora
 89%|████████▊ | 39/44 [00:36<00:04,  1.00it/s]
Transformed corpora.
FB Parsing yulia.gudach
FB Parse completed.
Added to corpora
Transformed corpora.
 91%|█████████ | 40/44 [00:37<00:04,  1.05s/it]
FB Parsing yury.sushinov
FB Parse completed.
Added to corpora
Transformed corpora.
 93%|█████████▎| 41/44 [00:39<00:03,  1.17s/it]
FB Parsing peskov
 95%|█████████▌| 42/44 [00:40<00:02,  1.32s/it]
FB Parse completed.
Added to corpora
Transformed corpora.
FB Parsing 100007131103601
FB Parse completed.
 98%|█████████▊| 43/44 [00:42<00:01,  1.26s/it]
Added to corpora
Transformed corpora.
VK Parsing 2416234.0
1-th public have been parsed. (31920990)
2-th public have been parsed. (1967021)
3-th public have been parsed. (227)
4-th public have been parsed. (8179697)
VK Parse completed.
FB Parsing dmitry.zemtsov
FB Parse completed.
Added to corpora
Transformed corpora.
100%|██████████| 44/44 [00:45<00:00,  1.98s/it]

In [38]:
norm_dict = {"values": []}
for name in known.ФИ:
    results = []
    for col in known[known.ФИ == name].iloc[:, 4:].columns:
        results.append({"name": norm_names[col], "value": float(known.loc[known.ФИ == name, col].values[0])})
    norm_dict['values'].append({"name": name, "results": results})

In [42]:
[(norm_categories[x.argsort()[-1]], norm_categories[x.argsort()[-2]]) for x in known.loc[:, categories].values]


Out[42]:
[('Образование', 'Предпринимательство'),
 ('Строительство', 'Образование'),
 ('Инвестиции', 'Спорт'),
 ('Спорт', 'Предпринимательство'),
 ('Реклама и маркетинг', 'Образование'),
 ('Образование', 'Здравоохранение'),
 ('Государственное управление', 'Корпоративное управление'),
 ('Образование', 'Предпринимательство'),
 ('Образование', 'Промышленность'),
 ('Образование', 'Реклама и маркетинг'),
 ('Искусство', 'Исследования и разработки'),
 ('Спорт', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Спорт', 'Предпринимательство'),
 ('Образование', 'Предпринимательство'),
 ('Инвестиции', 'Спорт'),
 ('Образование', 'Предпринимательство'),
 ('Образование', 'Предпринимательство'),
 ('Социальная защита', 'Образование'),
 ('Реклама и маркетинг', 'Образование'),
 ('Здравоохранение', 'Благотворительность'),
 ('Предпринимательство', 'Образование'),
 ('Предпринимательство', 'Образование'),
 ('Образование', 'Строительство'),
 ('Строительство', 'Исследования и разработки'),
 ('Здравоохранение', 'Безопасность'),
 ('Образование', 'Предпринимательство'),
 ('Предпринимательство', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Предпринимательство', 'Образование'),
 ('Строительство', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Здравоохранение', 'Государственное управление'),
 ('Предпринимательство', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Образование', 'Предпринимательство'),
 ('Благотворительность', 'Предпринимательство'),
 ('Образование', 'Предпринимательство'),
 ('Образование', 'Предпринимательство'),
 ('Реклама и маркетинг', 'Образование'),
 ('Реклама и маркетинг', 'Образование'),
 ('Образование', 'Предпринимательство'),
 ('Образование', 'Исследования и разработки'),
 ('Образование', 'Предпринимательство')]

In [43]:
means = []
for col in known.iloc[:, 4:].columns:
    norm_names[col], known.loc[:, col].median()
    means.append(known.loc[:, col].median())


Out[43]:
('Искусство', 0.11816754192113876)
Out[43]:
('Политика', 0.1563805639743805)
Out[43]:
('Финансы', 0.13835492730140686)
Out[43]:
('Стратегическое управление', 0.2304811328649521)
Out[43]:
('Юриспруденция', 0.07182104140520096)
Out[43]:
('Исследования и разработки', 0.2729760706424713)
Out[43]:
('Промышленность', 0.23073923587799072)
Out[43]:
('Образование', 0.36262887716293335)
Out[43]:
('Благотворительность', 0.059181734919548035)
Out[43]:
('Здравоохранение', 0.16277647018432617)
Out[43]:
('Сельское хозяйство', 0.10010015964508057)
Out[43]:
('Государственное управление', 0.1793888360261917)
Out[43]:
('Реклама и маркетинг', 0.20943303406238556)
Out[43]:
('Инновации и модернизация', 0.237436905503273)
Out[43]:
('Безопасность', 0.11387616395950317)
Out[43]:
('Военное дело', 0.043465837836265564)
Out[43]:
('Корпоративное управление', 0.1961883306503296)
Out[43]:
('Социальная защита', 0.1679375171661377)
Out[43]:
('Строительство', 0.19163548946380615)
Out[43]:
('Предпринимательство', 0.3297472596168518)
Out[43]:
('Спорт', 0.022987650707364082)
Out[43]:
('Инвестиции', 0.20343999564647675)

In [53]:
# t is from user-gen up
means_2 = [(a + b) / 2 for a, b in zip([z[1] for z in t], [b for b in means])]

In [54]:
json.dump(dict(zip(categories, means_2)), open("margins.json", "w"))

In [27]:
known.to_csv("assets/known.csv")
json.dump(norm_dict, open("assets/temporary_result.json", "w"))

In [51]:
tt = []
for i, item in known.iterrows():
    print(item['ФИ'], end=" ")
    accepted_cols = []
    for col, margin in dict(zip(categories, means_2)).items():
        if item[col] > 1.1 * margin:
            accepted_cols.append(col)
    np.array(accepted_cols)[item[accepted_cols].argsort()[::-1]][:5]


Александрина Клюс 
Out[51]:
array(['education', 'smm', 'entrepreneurship', 'strateg_management',
       'politics'],
      dtype='<U18')
Ангелина Зонова 
Out[51]:
array(['education', 'entrepreneurship', 'smm', 'corporative_management',
       'building'],
      dtype='<U22')
Владислав Широков 
Out[51]:
array([], dtype=float64)
Галина Жукова 
Out[51]:
array(['sport'],
      dtype='<U5')
Жанна Кадылева 
Out[51]:
array(['education', 'smm', 'entrepreneurship', 'law',
       'corporative_management'],
      dtype='<U22')
Ирина Горькова 
Out[51]:
array(['elaboration', 'corporative_management', 'strateg_management',
       'education', 'innovations'],
      dtype='<U22')
Ирина Григоренко 
Out[51]:
array(['law', 'education', 'government_management',
       'corporative_management', 'public_health'],
      dtype='<U22')
Наталья Кульбятская 
Out[51]:
array(['education', 'elaboration', 'strateg_management',
       'entrepreneurship', 'corporative_management'],
      dtype='<U22')
Нелли Бадалян 
Out[51]:
array(['industry', 'education', 'elaboration', 'corporative_management',
       'law'],
      dtype='<U22')
Юлия Ханьжина 
Out[51]:
array(['education', 'smm', 'strateg_management', 'corporative_management',
       'finances'],
      dtype='<U22')
Alexey Khoryushin 
Out[51]:
array(['smm', 'strateg_management', 'charity', 'art'],
      dtype='<U18')
Andrey Siling 
Out[51]:
array(['sport', 'art', 'military'],
      dtype='<U8')
Denis  Trunov 
Out[51]:
array(['education', 'entrepreneurship', 'elaboration', 'industry',
       'innovations'],
      dtype='<U16')
Denis Unzhakov 
Out[51]:
array(['sport', 'charity', 'industry', 'military'],
      dtype='<U8')
Dima Blaginin 
Out[51]:
array(['charity', 'politics', 'education', 'law', 'industry'],
      dtype='<U11')
Ekaterina Aksenova 
Out[51]:
array([], dtype=float64)
Ekaterina Shakina 
Out[51]:
array(['education', 'corporative_management', 'strateg_management',
       'elaboration', 'entrepreneurship'],
      dtype='<U22')
Eugene Colchev 
Out[51]:
array(['corporative_management', 'education', 'strateg_management',
       'entrepreneurship', 'law'],
      dtype='<U22')
Eugene Kolganov 
Out[51]:
array(['entrepreneurship', 'social_safety', 'elaboration', 'finances',
       'investitions'],
      dtype='<U16')
Evgeny Kovnir 
Out[51]:
array(['smm', 'education', 'corporative_management', 'industry'],
      dtype='<U22')
George Boissonade 
Out[51]:
array(['charity', 'public_health'],
      dtype='<U13')
Igor  Ruzhentsev 
Out[51]:
array(['sport', 'charity'],
      dtype='<U7')
Irina Gordina-Nevmerzhitskaya 
Out[51]:
array(['smm', 'entrepreneurship', 'strateg_management',
       'corporative_management', 'industry'],
      dtype='<U22')
Ivan  Aristov 
Out[51]:
array(['sport'],
      dtype='<U5')
Katerina  Novikova 
Out[51]:
array(['entrepreneurship', 'elaboration', 'sport', 'industry',
       'agriculture'],
      dtype='<U16')
Kirill Konev 
Out[51]:
array(['safety', 'elaboration', 'innovations', 'building'],
      dtype='<U11')
Ksusha Andreeva 
Out[51]:
array(['education', 'elaboration', 'entrepreneurship', 'smm', 'innovations'],
      dtype='<U16')
Liubov Kirienko 
Out[51]:
array(['entrepreneurship', 'strateg_management', 'elaboration', 'politics',
       'innovations'],
      dtype='<U18')
Maria Dolgikh 
Out[51]:
array(['politics', 'smm', 'industry', 'art'],
      dtype='<U8')
Nikita Lebedev 
Out[51]:
array(['charity', 'smm', 'sport', 'social_safety'],
      dtype='<U13')
Oleg Podolskiy 
Out[51]:
array(['building', 'social_safety', 'entrepreneurship', 'agriculture',
       'investitions'],
      dtype='<U16')
Olga Potemkina 
Out[51]:
array(['smm', 'strateg_management', 'corporative_management', 'sport',
       'finances'],
      dtype='<U22')
Olya  Zaytseva 
Out[51]:
array(['strateg_management', 'entrepreneurship', 'smm',
       'corporative_management', 'innovations'],
      dtype='<U22')
Ruslan Karmannyy 
Out[51]:
array(['strateg_management', 'smm', 'entrepreneurship',
       'corporative_management', 'innovations'],
      dtype='<U22')
Sergey Nakvasin 
Out[51]:
array(['education', 'entrepreneurship', 'elaboration',
       'corporative_management', 'strateg_management'],
      dtype='<U22')
Tatiana Anisimova 
Out[51]:
array(['sport', 'strateg_management', 'entrepreneurship', 'smm', 'charity'],
      dtype='<U18')
Tatyana Mazhutis 
Out[51]:
array(['charity', 'law', 'finances', 'social_safety'],
      dtype='<U13')
Tretyakov Vasily 
Out[51]:
array(['education', 'elaboration', 'strateg_management',
       'entrepreneurship', 'smm'],
      dtype='<U22')
Varvara  Lukashina 
Out[51]:
array(['education', 'entrepreneurship', 'charity', 'elaboration',
       'corporative_management'],
      dtype='<U22')
Yulia  Gudach 
Out[51]:
array(['smm', 'education', 'politics', 'elaboration', 'industry'],
      dtype='<U11')
Yury Sushinov 
Out[51]:
array(['smm', 'education', 'strateg_management', 'entrepreneurship',
       'industry'],
      dtype='<U18')
Песков Дмитрий 
Out[51]:
array(['smm', 'education', 'strateg_management', 'entrepreneurship',
       'corporative_management'],
      dtype='<U22')
Гнитько Ксения 
Out[51]:
array(['charity', 'education', 'law', 'smm', 'industry'],
      dtype='<U9')
Дмитрий Земцов 
Out[51]:
array(['strateg_management', 'entrepreneurship', 'elaboration', 'smm',
       'innovations'],
      dtype='<U18')

In [ ]:


In [ ]:


In [ ]:
import vk_api
from config import VK_TOKEN

In [ ]:
vk = vk_api.VkApi(token=VK_TOKEN)
vk = vk.get_api()

In [ ]:
t = [a['text'] for a in vk.newsfeed.search(q="корпоративное управление", count=200)['items']]

In [ ]:
with open('t.txt', 'w') as f:
    for line in t:
        _ = f.write(f'{line}\n')

In [ ]: