In [1]:
import itertools
import sys
import json
import warnings
import os
warnings.filterwarnings("ignore")
sys.path.append('..') # for importing from parent module
from util import *
from config import VK_TOKEN # only when creating corpora
from scipy.sparse import vstack, csr_matrix, coo_matrix
from sklearn.model_selection import train_test_split
import tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from sklearn.utils import class_weight
from keras.layers import *
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard, CSVLogger
import requests  # used below when crawling non-VK sources
from bs4 import BeautifulSoup  # used below when parsing crawled pages
from sklearn.preprocessing import normalize  # used in the evaluation cells below
In [2]:
sources = {'art': [60114472, 19191317, "https://blog.mann-ivanov-ferber.ru/category/tvorchestvo/"],
'politics': [29534144, 23482909, 76982440],
'finances': [62438886, 81354264, 18925793],
'strateg_management': [68241326, "http://www.stplan.ru/", "http://www.30n.ru/2/1.html"],
'law': [702176, "https://pravo.ru", 79084019],
'elaboration': ["http://www.km.ru/category/tegi/nauchnye-issledovaniya-i-otkrytiya-v-rossii"],
'industry': [67183197, 66233468, "http://government.ru/department/54/events/"],
'education': [30558759, 98643656, 74404187],
'charity': [133169189, 35966541, 15274000],
'public_health': [78860407, 61490488, "http://rusrand.ru/analytics/analyticszdravoohranenie-rossii-mify-realnost-reshenija"],
'agriculture': ["http://www.nsh.ru/", "http://россельхоз.рф/"],
'government_management': ["http://be5.biz/upravlenie/gosudarstvennoe_upravlenie.html", 97296142],
'smm': [74686342, 79925455, 40018862],
'innovations': [98643656, 63337812, 34737049],
'safety': [37959220, 10933209],
'military': ["http://www.soldiering.ru", 75106422],
'corporative_management': ["http://www.cfin.ru/management/", "https://psyera.ru/2984/ponyatie-korporativnogo-menedzhmenta"],
'social_safety': [49582956, 72388807],
'building': [30713157, 26978036],
'entrepreneurship': [69560028, 73537456],
'sport': [29809500, 128350290],
'investitions': [37876217, 3800580]
}
In [3]:
# getting data for corpora
for tag, ids_ in sources.items():
    path = f"corpora/{tag}.txt"
    s = set()
    if not os.path.exists(path):
        with open(path, "w") as f:
            for id_ in ids_:  # iterating through available links/ids
                if isinstance(id_, int):  # if it's a VK public id
                    wall = ParseClass.get_wall_vk(-id_, 1000, VK_TOKEN)
                    for post in tqdm.tqdm(wall):
                        if len(post) and post not in s:  # write unique posts only
                            s.add(post)
                            _ = f.write(f"{post}\n")
                elif isinstance(id_, str):  # otherwise it's a URL to crawl
                    links = np.random.choice(
                        list(ParseClass.get_all_links(id_)), 666)
                    for link in tqdm.tqdm(links):
                        try:
                            r = requests.get(link)
                        except requests.RequestException:
                            continue
                        if r.ok and r.encoding == "utf-8":
                            page = r.text
                            soup = BeautifulSoup(page, "lxml")
                            for text in soup.text.strip().split("\n"):
                                if len(text) and len(text.split()) > 2 and text not in s:
                                    s.add(text)
                                    _ = f.write(f"{text}\n")
Normalizing corpus file sizes: each file from the larger half of the corpora is truncated to half the size of its counterpart from the smaller half, so the categories stay roughly balanced by text volume.
In [4]:
file_paths = [path for path in os.listdir("corpora") if path.endswith("txt")]
lengths = [os.path.getsize(f"corpora/{filepath}") for filepath in file_paths]
In [5]:
indexes = np.array(lengths).argsort()
first_half = indexes[:len(lengths) // 2]
second_half = indexes[len(lengths) // 2:]
In [6]:
for i, ind_file in enumerate(first_half):
    # target length: half the size (in bytes) of the i-th file from first_half
    sym_length = os.path.getsize(f"corpora/{file_paths[ind_file]}") // 2
    path = f"corpora/{file_paths[second_half[i]]}"
    all_symbols = "".join(open(path))
    with open(path, "w") as f:
        _ = f.write(all_symbols[:sym_length])
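An optional sanity check (illustrative, assuming the truncation above has already run): the per-category files should now be closer in size.
In [ ]:
sorted(os.path.getsize(f"corpora/{p}") for p in file_paths)  # ascending byte sizes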
In [7]:
categories = list(sources.keys())
In [8]:
labels = ["Искусство", "Политика", "Финансы", "Стратегическое управление", "Юриспруденция", "Исследования и разработки",
"Промышленность", "Образование", "Благотворительность", "Здравоохранение", "Сельское хозяйство",
"Государственное управление", "Реклама и маркетинг", "Инновации и модернизация", "Безопасность",
"Военное дело", "Корпоративное управление", "Социальная защита", "Строительство", "Предпринимательство",
"Спорт", "Инвестиции"]
norm_names = dict(zip(categories, labels))
norm_names_reversed = {v: k for k, v in norm_names.items()}
Using labeled data from LEADER-ID
In [9]:
# df = pd.read_msgpack("df.msg")
# competencies = pd.read_csv("competencies.csv")
# competencies = competencies.dropna()[['Id', 'Интересы']]
# social = pd.read_excel("assets/social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# accepted = social.fb.dropna().index | social.vk.dropna().index
# social = social.loc[accepted, ['name', 'id', 'vk', 'fb']]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("/")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("id=")[-1])
# social.fb = social.fb.apply(lambda x: str(x).split("?ref=")[0])
# social.fb = social.fb.replace('nan', np.nan)
# social = social.merge(df[['vk', 'vk_id']], how='outer', on='vk')
# social.vk = social.vk_id.fillna(0)
# social = social.replace(0, np.nan).drop(labels=['vk_id'], axis=1)
# social = social.set_index('id').merge(how='inner', right=competencies.set_index('Id'), left_index=True, right_index=True)
# to_exclude = pd.read_csv("assets/known_users.csv")['Leader-ID'].dropna().astype('int').values
# social = social.loc[~social.index.isin(to_exclude), :]
# social.to_msgpack("assets/social.msg")
# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# We need to map each screen_name to its vk_id, so the profiles have to be processed one at a time
social = pd.read_msgpack("social.msg")
d = json.load(open("d.json"))
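A quick peek at the loaded table (optional):
In [ ]:
social.head()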
In [10]:
label_encoder = LabelEncoder()
int_labels = label_encoder.fit_transform(labels)
def f(x):
    """Clean the comma-separated interest tags: merge synonyms, drop unusable ones, keep at most 6."""
    b = []
    for a in x.split(","):
        t = a.strip()
        if t == "Частный бизнес" or t == "Социальное предпринимательство":
            b.append("Предпринимательство")
        elif t == "Дошкольное образование/детский отдых":
            b.append("Образование")
        elif t in {'Журналистика', 'Управление персоналом', 'Управление рисками'}:
            pass
        else:
            b.append(t)
    return b[:6]
social.Интересы = social.Интересы.apply(f)
def g(x):
    """Encode a list of interest labels as a multi-hot vector over the 22 categories."""
    if not x:
        return np.in1d(int_labels, []).astype('int')
    return np.in1d(int_labels, label_encoder.transform(x)).astype('int')
social['y'] = social.Интересы.apply(g)
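A quick illustrative check of the multi-hot encoding produced by g (any labels from the labels list would do here):
In [ ]:
example = g(["Финансы", "Спорт"])
example.sum(), np.array(labels)[example == 1]  # expect 2 and the two chosen labels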
In [11]:
corpora_class = CorporaClass()
for filename in categories:
    with open(f"corpora/{filename}.txt") as f:
        corpora_class.add_to_corpora(f, filename)
In [12]:
vectorizer = TfidfVectorizer(tokenizer=corpora_class.full_process,
                             max_df=0.55,
                             min_df=5,
                             sublinear_tf=True,
                             ngram_range=(1, 2))
docterm_matrix = vectorizer.fit_transform(list(itertools.chain.from_iterable(corpora_class.corpora)))
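An optional look at what the vectorizer learned (illustrative): the document-term matrix should have one row per document and one column per uni-/bi-gram feature.
In [ ]:
len(vectorizer.vocabulary_), docterm_matrix.shape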
In [13]:
DELIM = 1300
NUM_OF_CLASSES = 22
vector_size = docterm_matrix[0].shape[1]
In [14]:
y = []
for (i, x), cat in zip(enumerate(corpora_class.corpora), categories):
    for _ in range(len(x)):  # one target vector per document of this category
        y_ = np.zeros(NUM_OF_CLASSES)
        t = np.zeros(NUM_OF_CLASSES)
        for arr in (social.loc[d[norm_names[cat]], 'y'] * 0.016):
            t = t + arr
        y_ = y_ + t
        y_[i] = 1
        y.append(y_)
y = np.array(y)
In [15]:
X_train, X_test, y_train, y_test = train_test_split(docterm_matrix, y, test_size=0.2)
In [18]:
X_train = np.expand_dims(X_train.toarray(), axis=2)
X_test = np.expand_dims(X_test.toarray(), axis=2)
In [23]:
STRIDE = 5
classifier = Sequential([
    Conv1D(filters=1, kernel_size=STRIDE, activation='relu', input_shape=(vector_size, 1)),
    MaxPooling1D(),
    Conv1D(filters=1, kernel_size=STRIDE, activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='sigmoid'),
    Dense(NUM_OF_CLASSES, activation='sigmoid')
])
classifier.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
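A quick way to inspect the compiled architecture (optional):
In [ ]:
classifier.summary()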
In [25]:
tbCallBack = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)
classifier.fit(X_train,
               y_train,
               validation_data=(X_test, y_test),
               batch_size=196,
               epochs=60,
               class_weight='balanced',
               callbacks=[tbCallBack])
In [26]:
classifier.save("../assets/vk_texts_classifier-dev.h5")
In [19]:
classifier = load_model("../assets/vk_texts_classifier.h5")
In [1]:
import sys
sys.path.append('..')
from util import ResultClass
In [2]:
import redis
redis_obj = redis.Redis()
In [3]:
result_class = ResultClass(redis_obj)
In [4]:
verdict = result_class.get_result(78543018, None)
In [5]:
verdict
Out[5]:
In [5]:
from config import VK_TOKEN
In [7]:
import vk_api
In [8]:
vk_session = vk_api.VkApi(token=VK_TOKEN)
In [9]:
vk = vk_session.get_api()
In [22]:
pool_data = {}
publics = [-58219172, -28905875, -84926122, -55284725, -139246969, -29534144, -54530371]
In [33]:
with vk_api.VkRequestsPool(vk_session) as pool:
    for owner_id in publics:
        pool_data[owner_id] = pool.method("wall.get", {"owner_id": owner_id, "count": 100})
In [34]:
for pub in publics:
    wall = [a.get('text', '') for a in pool_data[pub].result.get('items', [])]
    print(pub)
In [29]:
t = " ".join(corpora_class.corpora[0])
normalize(result_class.classifier.predict(np.sum(result_class.vectorizer.transform([t]), axis=0)).reshape(1, -1))[0]
Out[29]:
In [32]:
%%timeit
normalize(result_class.classifier.predict(np.sum(result_class.vectorizer.transform([" ".join(corpora_class.corpora[0])]), axis=0)).reshape(1, -1))[0]
In [22]:
norm_categories = np.array(list(norm_names.values()))
In [39]:
dict_for_mean = []
labels_ = np.array(test_labels)
corpora_ = np.array(test_corpora)
acc_d = {}
for col in norm_names.values():
    t = social.Интересы.apply(lambda s: col in s)
    col_labels = np.array(list(set(t[t == True].index).intersection(labels_)))
    c_0 = 0
    c_1 = 0
    c_2 = 0
    for item in corpora_[np.in1d(labels_, col_labels)]:
        t = np.sum(classifier.predict(vectorizer.transform(item).toarray()), axis=0)
        dict_for_mean.append(t)
        pred_categories = norm_categories[t.argsort()[::-1][:8]]
        if col in pred_categories[:2]:
            c_0 += 1
        if col in pred_categories[:5]:
            c_1 += 1
        if col in pred_categories:
            c_2 += 1
    l = len(col_labels)
    if l == 0:
        l = 1  # avoid division by zero for categories with no test users
    col, l, c_0 / l, c_1 / l, c_2 / l
    acc_d[col] = c_2 / l
# sorted(np.sum(classifier.predict(vectorizer.transform(corpora_user_gen.corpora[5]).toarray()), axis=0), reverse=True)[:3]
In [40]:
np.mean(list(acc_d.values()))
Out[40]:
In [41]:
list(zip(categories, [np.mean(a) for a in np.array([np.array(b) for b in dict_for_mean]).T]))
Out[41]:
In [23]:
# social = pd.read_excel("social.xlsx", names=['id', 'name', 'fb', 'ok', 'vk', 'email', 'twitter'])
# social.set_index('id', inplace=True)
# social = social[social.vk.notnull()]
# social.vk = social.vk.apply(lambda x: str(x).split("/")[-1])
# df = social[social.vk.notnull()]
# df = df[['name', 'vk']]
# def get_id(screen_name):
#     try:
#         item = vk.users.get(user_ids=screen_name)
#         if 'deactivated' not in item:
#             return item[0]['id']
#     except Exception as e:
#         print(e.args)
# # We need to map each screen_name to its vk_id, so the profiles have to be processed one at a time
# df['vk_id'] = df.vk.progress_apply(get_id)
# df.dropna(inplace=True)
# df.vk_id = df.vk_id.astype('int')
# df.to_msgpack("df.msg")
df = pd.read_msgpack("assets/df.msg")
known = pd.read_csv("assets/known_users.csv")
known = known.merge(df[['vk_id']], left_on='Leader-ID', right_index=True, how='left')
In [83]:
from line_profiler import LineProfiler
In [88]:
lp = LineProfiler()
In [89]:
lp.add_function(corpora_class.full_process)
In [91]:
lp.run("corpora_class.full_process(corpora[0][0])")
Out[91]:
In [93]:
lp.print_stats()
In [109]:
import requests
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']
In [119]:
{"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}
Out[119]:
In [57]:
requests.post("http://78.155.197.212:9999/get_result", json={"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}).json()
In [104]:
import requests
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']
while 1:
    requests.post("http://78.155.197.212:9999/get_result", json={"name": "", "user_vk": int(np.random.choice(vk.friends.get(user_id=np.random.choice(t))['items']))}).json()
In [105]:
import vk_api
vk_s = vk_api.VkApi()
vk = vk_s.get_api()
t = vk.friends.get(user_id=134070307)['items']
In [56]:
transformed
Out[56]:
In [37]:
offset = 0
for index, row in tqdm.tqdm(known.iloc[offset:, :].iterrows(), total=len(known) - offset):
    user_vk = row['vk_id']
    if str(user_vk) == "nan":
        user_vk = None
    user_fb = row['FB']
    if str(user_fb) == "nan":
        user_fb = None
    try:
        verdict = result_class.get_result(user_vk, user_fb)
        result_class.texts = []
        for cat, value in verdict:
            known.loc[index, cat] = value
    except (ValueError, IndexError):
        for cat in categories:
            known.loc[index, cat] = 0
In [38]:
norm_dict = {"values": []}
for name in known.ФИ:
    results = []
    for col in known[known.ФИ == name].iloc[:, 4:].columns:
        results.append({"name": norm_names[col], "value": float(known.loc[known.ФИ == name, col].values[0])})
    norm_dict['values'].append({"name": name, "results": results})
In [42]:
[(norm_categories[x.argsort()[-1]], norm_categories[x.argsort()[-2]]) for x in known.loc[:, categories].values]
Out[42]:
In [43]:
means = []
for col in known.iloc[:, 4:].columns:
    norm_names[col], known.loc[:, col].median()
    means.append(known.loc[:, col].median())
In [53]:
# t comes from the user-generated evaluation above
means_2 = [(a + b) / 2 for a, b in zip([z[1] for z in t], means)]
In [54]:
json.dump(dict(zip(categories, means_2)), open("margins.json", "w"))
In [27]:
known.to_csv("assets/known.csv")
json.dump(norm_dict, open("assets/temporary_result.json", "w"))
In [51]:
tt = []
for i, item in known.iterrows():
print(item['ФИ'], end=" ")
accepted_cols = []
for col, margin in dict(zip(categories, means_2)).items():
if item[col] > 1.1 * margin:
accepted_cols.append(col)
np.array(accepted_cols)[item[accepted_cols].argsort()[::-1]][:5]
In [ ]:
import vk_api
from config import VK_TOKEN
In [ ]:
vk = vk_api.VkApi(token=VK_TOKEN)
vk = vk.get_api()
In [ ]:
t = [a['text'] for a in vk.newsfeed.search(q="корпоративное управление", count=200)['items']]
In [ ]:
with open('t.txt', 'w') as f:
    for line in t:
        _ = f.write(f'{line}\n')