In [10]:
# Analysis of strategic planning indicators for monotowns


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the data
xl_file = pd.ExcelFile("D:/Projects/Aspirantura/GASU_SP/Показатели моногородов.xlsx")
# Alternative: load every sheet at once
# dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
ds = xl_file.parse("Лист1")
print(ds.isnull().sum())  # missing values per column
print("Number of indicators: ", len(ds))
# Earlier variant: read the same data from a pipe-separated export
# ds = pd.read_csv("D:/projects/Aspirantura/GASU_SP/20161017_gasu_sp_exp01.dsv", encoding='cp1251', sep='|', lineterminator='\r')
# ds.describe()
# ds.INDS.value_counts()  # number of indicators per government level


OKTMO                  0
OKTMO_NAME             0
OKTMO_SHORT_NAME       0
ID_MONOCITY            0
MONOCITY_NAME          0
MONOCITY_FULLNAME      0
MONOCITY_CATEGORY      0
MONOCITY_NUM_IN_ACT    0
DOCLEVELCODE           0
DOCLEVELNAME           0
IND_TYPE               0
IND_NAME               0
dtype: int64
Number of indicators:  19995

In [7]:
# Setup
# pymorphy2 is used below for lemmatization (reduction to dictionary form)
import pymorphy2
import nltk
import pickle

morph = pymorphy2.MorphAnalyzer()
nltk.download('punkt')      # idempotent: if already present, NLTK only checks it is up to date
nltk.download('stopwords')  # idempotent: if already present, NLTK only checks it is up to date
g_path = "D:/Projects/Aspirantura/GASU_SP/"
print("Done")


[nltk_data] Downloading package punkt to
[nltk_data]     D:\Users\PDudarin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     D:\Users\PDudarin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Done
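
For reference: pymorphy2 performs lemmatization (reduction to the dictionary form) rather than stemming, which is what the cells below rely on. A minimal sketch, assuming the morph analyzer created above:

In [ ]:
# Sketch: normal_form returns the dictionary form of an inflected word.
print(morph.parse('использованных')[0].normal_form)  # expected: 'использованный'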

In [12]:
import collections

ind_names = ds.IND_NAME.str.upper().tolist()
for k in range(5):
    print("Indicators before lemmatization", ind_names[k])

# TODO: it would be nice to materialize the lemmatized names as a new
# IND_NAME_TOKENIZED column on the dataframe, but that does not work yet
# (see the sketch after this cell's output)

inds = {}
i = 0
words = collections.Counter()

# punctuation is replaced with spaces before tokenization
chars_to_remove = [u'«', u'»', u'!', u'<', u'>', u'?', u',', u'.', u'-', u'(', u')', u'[', u']', u'"']
dd = {ord(c): ' ' for c in chars_to_remove}

for j in ind_names:
    # lemmatize every token of the indicator name
    inds[i] = [morph.parse(x)[0].normal_form for x in nltk.word_tokenize(j.translate(dd))]
    for w in inds[i]:
        words[w] += 1
    i += 1
    # TODO: find out which rows, if any, fail to tokenize

for k in range(5):
    print("Indicators after lemmatization", inds[k])

print("Top 10 words: ", words.most_common(10))
print("Total distinct words: ", len(words))


Indicators before lemmatization ДОЛЯ НАСЕЛЕНИЯ ИМЕЮЩЕГО ДЕНЕЖНЫЕ ДОХОДЫ НИЖЕ ВЕЛИЧИНЫ ПРОЖИТОЧНОГО МИНИМУМА В ОБЩЕЙ ЧИСЛЕННОСТИ НАСЕЛЕНИЯ АЛЕЙСКОГО РАЙОНА АЛТАЙСКОГО КРАЯ
Indicators before lemmatization КОЛИЧЕСТВО ИСТОРИКОПАТРИОТИЧЕСКИХ ГЕРОИКОПАТРИОТИЧЕСКИХ И ВОЕННОПАТРИОТИЧЕСКИХ МУЗЕЕВ ИЛИ МУЗЕЙНЫХ КОМНАТ В ОБЩЕОБРАЗОВАТЕЛЬНЫХ УЧРЕЖДЕНИЯХ
Indicators before lemmatization ОХВАТ ПРОФИЛАКТИЧЕСКИМИ МЕРАМИ ПОДРОСТКОВ И МОЛОДЕЖИ В ВОЗРАСТЕ ОТ ДО ЛЕТ
Indicators before lemmatization УРОВЕНЬ ЗАНЯТОСТИ НАСЕЛЕНИЯ
Indicators before lemmatization УДЕЛЬНЫЙ ВЕС РАБОТНИКОВ С ПРОФЕССИОНАЛЬНЫМ ОБРАЗОВАНИЕМ В ОБЩЕЙ ЧИСЛЕННОСТИ ЗАНЯТЫХ В ЭКОНОМИКЕ
Indicators after lemmatization ['доля', 'население', 'иметь', 'денежный', 'доход', 'ниже', 'величина', 'прожиточный', 'минимум', 'в', 'общий', 'численность', 'население', 'алейский', 'район', 'алтайский', 'край']
Indicators after lemmatization ['количество', 'историкопатриотический', 'героикопатриотический', 'и', 'военнопатриотический', 'музей', 'или', 'музейный', 'комната', 'в', 'общеобразовательный', 'учреждение']
Indicators after lemmatization ['охват', 'профилактический', 'мера', 'подросток', 'и', 'молодёжь', 'в', 'возраст', 'от', 'до', 'год']
Indicators after lemmatization ['уровень', 'занятость', 'население']
Indicators after lemmatization ['удельный', 'вес', 'работник', 'с', 'профессиональный', 'образование', 'в', 'общий', 'численность', 'занятой', 'в', 'экономика']
Top 10 words:  [('в', 11499), ('и', 7217), ('количество', 5445), ('доля', 4409), ('на', 4254), ('общий', 3971), ('муниципальный', 3746), ('по', 2962), ('с', 2079), ('от', 1925)]
Total distinct words:  6766
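
The TODO above (materializing the lemmatized names as a dataframe column) could look like the sketch below, assuming ds, morph, nltk and the dd translation table from the cell above are in scope; the helper name normalize_name is ours:

In [ ]:
# Sketch: store the lemmatized tokens as a new IND_NAME_TOKENIZED column.
def normalize_name(text):
    return [morph.parse(t)[0].normal_form
            for t in nltk.word_tokenize(text.upper().translate(dd))]

ds['IND_NAME_TOKENIZED'] = ds.IND_NAME.apply(normalize_name)
print(ds.IND_NAME_TOKENIZED.head())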

In [13]:
# Load stop words
from nltk.corpus import stopwords

xl_file = pd.ExcelFile("D:/Projects/Aspirantura/GASU_SP/Стоп слова.xlsx")
ds_stop_words = xl_file.parse("Лист1")

# lemmatize the custom stop words, then add NLTK's built-in Russian list
stop_list = [morph.parse(x)[0].normal_form for x in ds_stop_words.STOP_WORDS.str.upper().tolist()]
stop_list.extend(stopwords.words('russian'))

print("Number of stop words: ", len(stop_list))

# Remove stop words from the word counter
for w in list(words):
    if w in stop_list:
        del words[w]

print("Top 10 words after cleaning: ", words.most_common(10))
print("Total distinct words after cleaning: ", len(words))


Number of stop words:  372
Top 10 words after cleaning:  [('население', 1852), ('ребёнок', 1519), ('услуга', 1447), ('городской', 1416), ('мероприятие', 1314), ('округа', 1215), ('гражданин', 1103), ('бюджет', 1054), ('город', 1038), ('территория', 1018)]
Total distinct words after cleaning:  6574
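
Note that the cell above removes stop words only from the global counter; the per-indicator token lists in inds still contain them. A sketch that filters them as well (stop_set is ours; a set makes the membership test O(1) instead of scanning the list):

In [ ]:
# Sketch: drop stop words from every tokenized indicator, not just the counter.
stop_set = set(stop_list)
inds = {k: [w for w in v if w not in stop_set] for k, v in inds.items()}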

In [10]:
# Create a dictionary of all word pairs (similarity values are fetched from
# RusVectores in later cells)
import pickle
import time

start_time = time.time()

i = 0
d = {}

# one key per unordered pair; the 'w1 > w2' comparison keeps a single
# canonical ordering for every pair
for w1 in words:
    for w2 in words:
        if w1 > w2:
            i += 1
            d[w1 + '__' + w2] = None

output = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities.pkl', 'wb')
pickle.dump(d, output)
output.close()
print("Записано пар слов: ", i)

end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))


Word pairs written:  21869191
--- 138.5899999141693 seconds ---
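
The nested loop visits every ordered pair and keeps one canonical ordering via the w1 > w2 comparison; itertools.combinations expresses the same enumeration directly. A sketch of an equivalent construction:

In [ ]:
import itertools

# Each unordered pair appears exactly once; sorting the vocabulary and
# flipping the tuple preserves the 'greater__smaller' key convention above.
d = {b + '__' + a: None for a, b in itertools.combinations(sorted(words), 2)}
print("Word pairs written: ", len(d))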

In [3]:
# Carry word-pair values over from the old dictionary into the new one
import time
from time import gmtime, strftime
import pickle

print("--- Start time %s ---" % strftime("%a, %d %b %Y %H:%M:%S", gmtime()))
start_time = time.time()

pkl_file_old = open('D:/Projects/Python/20161210_StratPlanClusters/words_sem_dict2.pkl', 'rb')
d_old = pickle.load(pkl_file_old)
pkl_file_old.close()

pkl_file_new = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'rb')
d_new = pickle.load(pkl_file_new)
pkl_file_new.close()

for w2, v in d_old.items():
    if w2 in d_new:
        d_new[w2] = v

output = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'wb')
pickle.dump(d_new, output)
output.close()

l = {k for k, v in d_new.items() if v is not None}  # all pairs that already have a value
print("Already processed pair count: ", len(l))

end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))


--- Start time Fri, 17 Feb 2017 10:18:03 ---
Already processed pair count:  770661
--- 508.2660000324249 seconds ---
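
Copying the shared keys can also be written with a dictionary-view intersection, which avoids the explicit membership test. A sketch equivalent to the loop above:

In [ ]:
# Sketch: transfer values over the intersection of the two key sets.
d_new.update({k: d_old[k] for k in d_new.keys() & d_old.keys()})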

In [2]:
# test new dictionary 
import pickle

# pkl_file_new = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'rb')
# d_new = pickle.load(pkl_file_new)
# pkl_file_new.close()

l = {k for k, v in d_new.items() if v is not None}  # all pairs that already have a value
print("Already processed pair count: ", len(l))
j = 0

# print a few processed pairs as a sanity check
for i in l:
    j += 1
    print("Sample ", i, " - ", d_new.get(i))
    if j > 10:
        break


Already processed pair count:  770661
Sample  индивидуальный__бюджетный  -  0.0902280646743
Sample  природный__автоматический  -  -0.0183039430446
Sample  модернизация__дело  -  0.0323260518363
Sample  коммуникационный__квалифицировать  -  0.00406434728715
Sample  оздоровление__зарубежный  -  0.0369522816396
Sample  теплов__магистральный  -  -0.0187530035771
Sample  профилактика__проезд  -  0.0581372491988
Sample  течение__объединение  -  0.0761837203231
Sample  потребность__вещество  -  0.043745834564
Sample  энергоэффективность__технически  -  0.0175944903677
Sample  рекультивированный__признанный  -  0.00087841042564

In [8]:
# Prepare the functions for subsequent parallel processing

import requests
import time
from time import gmtime, strftime
import pickle
import nltk

print("--- Start time %s ---" % strftime("%a, %d %b %Y %H:%M:%S", gmtime()))
start_time = time.time()

#---------------------------------------------------------

def fill_sem_dict(p_words):
    l_dict = {}
    for w in p_words:
        # time.sleep(0.1)  # optional delay, in seconds
        # replace 'ё' with 'е' (the service appears not to know 'ё');
        # retry up to 3 times on a non-200 response
        for it in range(0, 3):
            resp = requests.get('http://ling.go.mail.ru/dsm/ruwikiruscorpora/'+w.replace("ё","е")+'/api/similarity/')
            if resp.status_code == 200:
                break
        if resp.status_code != 200:
            # something went wrong on the service side
            print("Error with service RusVectores ", w)
            l_dict[w] = 'Error'
        else:
            # the similarity value is the first token of the response body
            l_dict[w] = nltk.word_tokenize(resp.text)[0]
    return l_dict

#---------------------------------------------------------

def merge_sem_dicts(p_sem_dict, p_sem_dict_list):
    # copy every (key, value) from each dictionary in the list into p_sem_dict
    for d in p_sem_dict_list:
        for w, v in d.items():
            p_sem_dict[w] = v

#---------------------------------------------------------

def split_word_dict(p_word_dict_source, p_list_count, p_list_len):
    # carve up to p_list_count lists of p_list_len words out of the source
    # dictionary, marking the taken words as 'Processed'
    i = 0
    j = 0
    l_word_lists = []
    l_word_list = []
    for w,v in p_word_dict_source.items():
        l_word_list.append(w)
        p_word_dict_source[w] = 'Processed'
        j +=1
        if j >= p_list_len:
            l_word_lists.append(l_word_list)
            l_word_list = []
            j = 0
            i += 1
        if i >= p_list_count:
            break

    if j != 0:
        l_word_lists.append(l_word_list)

    p_word_dict_source = {k:v for k, v in p_word_dict_source.items() if v is None }

    return [p_word_dict_source, l_word_lists]


#---------------------------------------------------------
#   Test functions block
#---------------------------------------------------------

def test_fill_sem_dict():
    sem_dict = fill_sem_dict(["муж__пьяница", "жена__муж", "вино__водка"])
    print("Test fill_sem_dict results:  ", sem_dict)
    res = {'вино__водка': '0.724427524288', 'муж__пьяница': '0.298472315507', 'жена__муж': '0.904485693336'}
    if sem_dict == res:
        print("Test: Passed")
    else:
        print("Test: !!! FAILED !!!")

def test_merge_sem_dicts():
    sem_dict = {}
    sem_dict_list = [{"я_ты":0.5},{"он_она":0.9, "я_она":0.1}]
    merge_sem_dicts(sem_dict, sem_dict_list)
    print("Test merge_sem_dicts results:  ", sem_dict)
    res =  {'он_она': 0.9, 'я_она': 0.1, 'я_ты': 0.5}
    if sem_dict == res:
        print("Test: Passed")
    else:
        print("Test: !!! FAILED !!!")

def test_split_word_dict():
    word_dict_source = {'он_она': 0.9, 'я_она': 0.1, 'я_ты': 0.5}
    word_lists = {}
    [word_dict_source, word_lists] = split_word_dict(word_dict_source, 2, 2)
    print("Test split_word_dict results (word_dict_source):  ", word_dict_source)
    print("Test split_word_dict results (word_lists):  ", word_lists)
    # NOTE: this expected value assumes a particular dict iteration order,
    # which Python does not guarantee, so the test can fail spuriously
    res =  [['я_она', 'я_ты'], ['он_она']]
    if (word_lists == res)and(len(word_dict_source)==0):
        print("Test: Passed")
    else:
        print("Test: !!! FAILED !!!")

    
#---------------------------------------------------------
# Testing....

test_fill_sem_dict()
test_merge_sem_dicts()
test_split_word_dict()

#---------------------------------------------------------

print("Done")
end_time = time.time()
print("--- %s seconds ---" % (end_time - start_time))


--- Start time Thu, 30 Mar 2017 06:09:32 ---
Error with service RusVectores  муж__пьяница
Error with service RusVectores  жена__муж
Error with service RusVectores  вино__водка
Test fill_sem_dict results:   {'муж__пьяница': 'Error', 'жена__муж': 'Error', 'вино__водка': 'Error'}
Test: !!! FAILED !!!
Test merge_sem_dicts results:   {'он_она': 0.9, 'я_ты': 0.5, 'я_она': 0.1}
Test: Passed
Test split_word_dict results (word_dict_source):   {}
Test split_word_dict results (word_lists):   [['он_она', 'я_она'], ['я_ты']]
Test: !!! FAILED !!!
Done
--- 0.7840783596038818 seconds ---
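
Two of the failures above are environmental rather than logical: test_fill_sem_dict failed because the RusVectores service was unreachable at run time, and test_split_word_dict failed because its expected value assumes a fixed dict iteration order, which Python does not guarantee. A sketch of an order-insensitive variant (the function name is ours):

In [ ]:
# Sketch: compare the union of the produced chunks instead of their order.
def test_split_word_dict_unordered():
    src = {'он_она': 0.9, 'я_она': 0.1, 'я_ты': 0.5}
    remaining, chunks = split_word_dict(src, 2, 2)
    flat = sorted(w for chunk in chunks for w in chunk)
    ok = (flat == ['он_она', 'я_она', 'я_ты']) and (len(remaining) == 0)
    print("Test:", "Passed" if ok else "!!! FAILED !!!")

test_split_word_dict_unordered()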

In [9]:
# Union: merge the partial (small) dictionaries back into the main one

print("Union process started...")
pkl_file = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem_small_14.pkl', 'rb')
d_in_2 = pickle.load(pkl_file)
pkl_file.close()

# ... repeated for the other small chunk files (_small_7, _small_8, _small_9),
# loaded into d_in_3, d_in_4, d_in_5 as needed


pkl_file = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'rb')
d_out = pickle.load(pkl_file)
pkl_file.close()


merge_sem_dicts(d_out, [d_in_2])

output = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'wb')
pickle.dump(d_out, output)
output.close()

l_sem_dict_not_empty = {k:v for k, v in d_out.items() if v is not None } #all not empty
print("Already processed pair count: ", len(l_sem_dict_not_empty))
print("Union process finished.")


Union process started...
Already processed pair count:  21869191
Union process finished.
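
For a single source dictionary, merge_sem_dicts reduces to the built-in dict.update; a sketch of the equivalent call:

In [ ]:
# Sketch: equivalent to merge_sem_dicts(d_out, [d_in_2]).
d_out.update(d_in_2)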

In [6]:
#---------------------------------------------------------
# Slice the semantic dictionary file into small chunks
import time
from time import gmtime, strftime
import pickle

def split_sem_dictionary(p_in_dict_file_name, p_out_dict_file_name, p_num_start, p_dict_cnt, p_dict_len):

    print("--- Start time %s ---" % strftime("%a, %d %b %Y %H:%M:%S", gmtime()))
    start_time = time.time()


    pkl_file = open(p_in_dict_file_name, 'rb')
    d = pickle.load(pkl_file)
    pkl_file.close()

    l_small_dict = {}
    i = 0
    j = p_num_start
    for k, v in d.items():

        if v is None:
            i += 1
            l_small_dict[k] = v

        if i>=p_dict_len:
            output = open(p_out_dict_file_name+str(j), 'wb')
            pickle.dump(l_small_dict, output)
            output.close()
            print("p_out_dict_file_name "+str(j)+" length: ", len(l_small_dict))
            l_small_dict = {}
            i = 0
            j += 1
            if j >= p_dict_cnt+p_num_start:
                break

    if len(l_small_dict) > 0:
        output = open(p_out_dict_file_name+str(j), 'wb')
        pickle.dump(l_small_dict, output)
        output.close()
        print("p_out_dict_file_name "+str(j)+"length: ", len(l_small_dict))

    end_time = time.time()
    print("Done")
    print("--- %s seconds ---" % (end_time - start_time))


#---------------------------------------------------------

split_sem_dictionary('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl',
                     'D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem_small_.pkl',
                     13,
                     1,
                     3000000
                    )



#---------------------------------------------------------
#---------------------------------------------------------
#---------------------------------------------------------


--- Start time Tue, 28 Mar 2017 08:31:55 ---
p_out_dict_file_name 13 length:  721150
Done
--- 108.36269521713257 seconds ---
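
A quick way to check how much of the main dictionary remains unprocessed, reusing the pickle file from above; a sketch:

In [ ]:
# Sketch: count pairs that still have no similarity value.
import pickle

pkl_file = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem.pkl', 'rb')
d = pickle.load(pkl_file)
pkl_file.close()
print("Unprocessed pair count: ", sum(1 for v in d.values() if v is None))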

In [3]:
# Fetch word-pair similarities from the RusVectores service in parallel

import requests
import time
from time import gmtime, strftime
import pickle
from multiprocessing.dummy import Pool as ThreadPool


print("--- Start time %s ---" % strftime("%a, %d %b %Y %H:%M:%S", gmtime()))
total_start_time = time.time()

#---------------------------------------------------------

def get_semm_words_from_rus_vectores(p_dict_file_name, p_iteration_cnt,
                                     p_thread_cnt, p_word_list_cnt, p_word_list_len):


    pkl_file = open(p_dict_file_name, 'rb')
    d = pickle.load(pkl_file)
    pkl_file.close()

    l_sem_dict_empty = {k:v for k, v in d.items() if v is None}  # pairs not yet processed
    print("Already processed pair count: ", len(d)-len(l_sem_dict_empty))

    pool = ThreadPool(p_thread_cnt)

    for j in range(1, p_iteration_cnt+1):
        print("--- Iteration # %s ---" % j)
        start_time = time.time()

        l_sem_dict_empty_cnt1 = len(l_sem_dict_empty)
        if len(l_sem_dict_empty)==0:
            break
        split_start_time = time.time()
        [l_sem_dict_empty, l_word_lists] = split_word_dict(l_sem_dict_empty, p_word_list_cnt,
                                                           p_word_list_len)
        split_end_time = time.time()
        print("--- Split words time %s seconds ---" % (split_end_time - split_start_time))
        l_sem_dict_empty_cnt2 = len(l_sem_dict_empty)
        print("Будет записано пар слов: ", l_sem_dict_empty_cnt1-l_sem_dict_empty_cnt2)

        pool_start_time = time.time()
        l_sem_dict_list = pool.map(fill_sem_dict, l_word_lists)
        pool_end_time = time.time()
        print("--- Pool word time %s seconds ---" % (pool_end_time - pool_start_time))

        merge_sem_dicts(d, l_sem_dict_list)

        output = open(p_dict_file_name, 'wb')
        pickle.dump(d, output)
        output.close()
        end_time = time.time()
        print("iteration successed")
        print("--- %s seconds ---" % (end_time - start_time))

#---------------------------------------------------------

print("Start get_semm_words_from_rus_vectores(..., iter=3, thread=4, list_cnt=16, list_len=1000)")
get_semm_words_from_rus_vectores('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem_small_9.pkl'
                                 , 100,
                                 4, 8, 1000)

#---------------------------------------------------------

print("Done")
total_end_time = time.time()
print("--- %s seconds ---" % (total_end_time - total_start_time))


--- Start time Mon, 13 Mar 2017 20:06:10 ---
Start get_semm_words_from_rus_vectores(..., iter=100, thread=4, list_cnt=8, list_len=1000)
Already processed pair count:  0
--- Iteration # 1 ---
--- Split words time 0.7690439224243164 seconds ---
Word pairs to be written:  8000
--- Pool word time 646.3829710483551 seconds ---
iteration succeeded
--- 649.442146062851 seconds ---

[... iterations 2-53 omitted: each one split the words in ~0.8 s, wrote 8000 pairs, and finished in ~640-670 s ...]

--- Iteration # 54 ---
--- Split words time 0.7680437564849854 seconds ---
Word pairs to be written:  8000
Error with service RusVectores  размещениен__многообразие
Error with service RusVectores  понеснный__инвалидовколясочник
---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
    141             conn = connection.create_connection(
--> 142                 (self.host, self.port), self.timeout, **extra_kw)
    143 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\connection.py in create_connection(address, timeout, source_address, socket_options)
     74 
---> 75     for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
     76         af, socktype, proto, canonname, sa = res

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\socket.py in getaddrinfo(host, port, family, type, proto, flags)
    731     addrlist = []
--> 732     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    733         af, socktype, proto, canonname, sa = res

gaierror: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

NewConnectionError                        Traceback (most recent call last)
D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    594                                                   body=body, headers=headers,
--> 595                                                   chunked=chunked)
    596 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    362         else:
--> 363             conn.request(method, url, **httplib_request_kw)
    364 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\http\client.py in request(self, method, url, body, headers)
   1105         """Send a complete request to the server."""
-> 1106         self._send_request(method, url, body, headers)
   1107 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers)
   1150             body = _encode(body, 'body')
-> 1151         self.endheaders(body)
   1152 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\http\client.py in endheaders(self, message_body)
   1101             raise CannotSendHeader()
-> 1102         self._send_output(message_body)
   1103 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\http\client.py in _send_output(self, message_body)
    933 
--> 934         self.send(msg)
    935         if message_body is not None:

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\http\client.py in send(self, data)
    876             if self.auto_open:
--> 877                 self.connect()
    878             else:

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in connect(self)
    166     def connect(self):
--> 167         conn = self._new_conn()
    168         self._prepare_conn(conn)

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connection.py in _new_conn(self)
    150             raise NewConnectionError(
--> 151                 self, "Failed to establish a new connection: %s" % e)
    152 

NewConnectionError: <requests.packages.urllib3.connection.HTTPConnection object at 0x000000002943B4E0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

MaxRetryError                             Traceback (most recent call last)
D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    422                     retries=self.max_retries,
--> 423                     timeout=timeout
    424                 )

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, **response_kw)
    639             retries = retries.increment(method, url, error=e, _pool=self,
--> 640                                         _stacktrace=sys.exc_info()[2])
    641             retries.sleep()

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    286         if new_retry.is_exhausted():
--> 287             raise MaxRetryError(_pool, url, error or ResponseError(cause))
    288 

MaxRetryError: HTTPConnectionPool(host='ling.go.mail.ru', port=80): Max retries exceeded with url: /dsm/ruwikiruscorpora/%D1%8D%D1%84%D1%84%D0%B5%D0%BA%D1%82%D0%B8%D0%B2%D0%BD%D0%BE%D0%B8%D1%81%D0%BF%D0%BE%D0%BB%D1%8C%D0%B7%D0%BE%D0%B2%D0%B0%D1%82%D1%8C__%D1%81%D0%BE%D1%81%D0%BE%D1%82%D0%BE%D1%8F%D1%89%D0%B8%D0%B9/api/similarity/ (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x000000002943B4E0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
<ipython-input-3-0c85a0f9546c> in <module>()
     60 get_semm_words_from_rus_vectores('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem_small_9.pkl'
     61                                  , 100,
---> 62                                  4, 8, 1000)
     63 
     64 #---------------------------------------------------------

<ipython-input-3-0c85a0f9546c> in get_semm_words_from_rus_vectores(p_dict_file_name, p_iteration_cnt, p_thread_cnt, p_word_list_cnt, p_word_list_len)
     42 
     43         pool_start_time = time.time()
---> 44         l_sem_dict_list = pool.map(fill_sem_dict, l_word_lists)
     45         pool_end_time = time.time()
     46         print("--- Pool word time %s seconds ---" % (pool_end_time - pool_start_time))

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py in map(self, func, iterable, chunksize)
    258         in a list that is returned.
    259         '''
--> 260         return self._map_async(func, iterable, mapstar, chunksize).get()
    261 
    262     def starmap(self, func, iterable, chunksize=None):

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py in get(self, timeout)
    606             return self._value
    607         else:
--> 608             raise self._value
    609 
    610     def _set(self, i, obj):

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
    117         job, i, func, args, kwds = task
    118         try:
--> 119             result = (True, func(*args, **kwds))
    120         except Exception as e:
    121             if wrap_exception:

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\multiprocessing\pool.py in mapstar(args)
     42 
     43 def mapstar(args):
---> 44     return list(map(*args))
     45 
     46 def starmapstar(args):

<ipython-input-2-ef1307a2f166> in fill_sem_dict(p_words)
     16         #time.sleep(0.1) # delays in secs
     17         for it in range(0,3):
---> 18           resp = requests.get('http://ling.go.mail.ru/dsm/ruwikiruscorpora/'+w+'/api/similarity/')
     19           if resp.status_code == 200:
     20               break

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     68 
     69     kwargs.setdefault('allow_redirects', True)
---> 70     return request('get', url, params=params, **kwargs)
     71 
     72 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     54     # cases, and look like a memory leak in others.
     55     with sessions.Session() as session:
---> 56         return session.request(method=method, url=url, **kwargs)
     57 
     58 

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    473         }
    474         send_kwargs.update(settings)
--> 475         resp = self.send(prep, **send_kwargs)
    476 
    477         return resp

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    594 
    595         # Send the request
--> 596         r = adapter.send(request, **kwargs)
    597 
    598         # Total elapsed time of the request (approximately)

D:\Users\PDudarin\AppData\Local\Continuum\Anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    485                 raise ProxyError(e, request=request)
    486 
--> 487             raise ConnectionError(e, request=request)
    488 
    489         except ClosedPoolError as e:

ConnectionError: HTTPConnectionPool(host='ling.go.mail.ru', port=80): Max retries exceeded with url: /dsm/ruwikiruscorpora/%D1%8D%D1%84%D1%84%D0%B5%D0%BA%D1%82%D0%B8%D0%B2%D0%BD%D0%BE%D0%B8%D1%81%D0%BF%D0%BE%D0%BB%D1%8C%D0%B7%D0%BE%D0%B2%D0%B0%D1%82%D1%8C__%D1%81%D0%BE%D1%81%D0%BE%D1%82%D0%BE%D1%8F%D1%89%D0%B8%D0%B9/api/similarity/ (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x000000002943B4E0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))
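
The run above died on a transient DNS failure ([Errno 11004] getaddrinfo failed) inside one worker thread, which aborted the whole pool.map. A sketch of a retry wrapper (get_with_retries is ours) that catches connection errors, so a single failed lookup can degrade to an 'Error' entry instead of killing the iteration:

In [ ]:
import time
import requests

def get_with_retries(url, attempts=3, delay=5):
    # Retry transient network failures; return None if every attempt fails.
    for _ in range(attempts):
        try:
            resp = requests.get(url, timeout=30)
            if resp.status_code == 200:
                return resp
        except requests.exceptions.ConnectionError:
            pass
        time.sleep(delay)
    return None

# fill_sem_dict could then treat a None response like a non-200 status
# and record 'Error' for the pair instead of raising.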

In [4]:
# Checks and corrections (temporary)

pkl_file = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem1.pkl', 'rb')
d = pickle.load(pkl_file)
pkl_file.close()

#----------------------------------------------------------------------------
# find all pairs containing 'ё' (e.g. "ребёнок") whose lookup failed and reset
# them to unprocessed, so they can be re-fetched with 'ё' replaced by 'е'
i = 0
for k,v in d.items():
    if ("ё" in k) and (v in ["Unknown","Error"]):
        i += 1
        d[k] = None

print("Word pairs with 'ё' >> ", i)
output = open('D:/Projects/Aspirantura/GASU_SP/words_dict_monocities_sem1.pkl', 'wb')
pickle.dump(d, output)
output.close()

#----------------------------------------------------------------------------

# pkl_file = open(g_path+'monocity_words_ds_'+str(g_max_inds_count)+'.pkl', 'rb')
# words_ds = pickle.load(pkl_file)
# pkl_file.close()
# print("words_ds len: ", len(words_ds))


# pkl_file = open(g_path+'monocity_words_'+str(g_max_inds_count)+'.pkl', 'rb')
# words = pickle.load(pkl_file)
# pkl_file.close()
# print("words len: ", len(words))

# l_cnt = 0
# # for ws in words_ds:
# #   if max(w for w in ws if w != 1)  < 0.001:
# #       l_cnt +=1
# print("l_cnt >> ", l_cnt)
# print(l_words_sem_dist_dict["ребёнок__девочка"])
# # print(l_words_sem_dist_dict["девочка__ребёнок"])
# print(l_words_sem_dist_dict["факт__оборот"])
# print(l_words_sem_dist_dict["оборот__факт"])
# print("None cnt >> ", sum(1 for k,v in l_words_sem_dist_dict.items() if v==None))
# print("Error cnt >> ",sum(1 for k,v in l_words_sem_dist_dict.items() if v=='Error'))
# print("Unknown cnt >> ", sum(1 for k,v in l_words_sem_dist_dict.items() if v=='Unknown'))

# i = 0
# print("Unknown pairs:")
# for k,v in l_words_sem_dist_dict.items():
#     if v == "Unknown":
#       print(k)
#       i += 1
#     if i > 10:
#         break

print("Done")


Word pairs with 'ё' >>  662010
Done
