In [11]:
import os
import sys
import glob
from itertools import islice, chain
from collections import defaultdict
from tqdm import tqdm
import pycrfsuite
import pymorphy2
import opencorpora

m = pymorphy2.MorphAnalyzer()

In [32]:
!mkdir -p models
!mkdir -p results

In [4]:
!opencorpora download -d -o annot.opcorpora.no_ambig.xml


Creating annot.opcorpora.no_ambig.xml from http://opencorpora.org/files/export/annot/annot.opcorpora.no_ambig.xml.bz2
...
Done.

In [5]:
!opencorpora download


Creating annot.opcorpora.xml from http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2
............................................................................................
Done.

In [6]:
corpus_noambig = opencorpora.CorpusReader('annot.opcorpora.no_ambig.xml')
corpus_full = opencorpora.CorpusReader('annot.opcorpora.xml', use_cache=True)

In [7]:
list(islice(corpus_full.iter_parsed_sents(), 6, 7))[0][:10]


Out[7]:
[('Впрочем', [('впрочем', 'CONJ,Prnt')]),
 (',', [(',', 'PNCT')]),
 ('на', [('на', 'PREP')]),
 ('канале', [('канал', 'NOUN,inan,masc,sing,loct')]),
 ('«', [('«', 'PNCT')]),
 ('Культура', [('культура', 'NOUN,inan,femn,sing,nomn')]),
 ('»', [('»', 'PNCT')]),
 ('в', [('в', 'PREP')]),
 ('роли', [('роль', 'NOUN,inan,femn,sing,loct')]),
 ('телеведущих', [('телеведущая', 'NOUN,anim,femn,plur,gent')])]

In [5]:
# sents_opcorpora = [sent for sent in corpus_noambig.iter_tagged_sents() 
#                    if not any(tag=='UNKN' for tok, tag in sent)]
# sents_opcorpora = [[(tok, m.TagClass(tag)) for tok, tag in sent if tag != 'PNCT'] for sent in sents_opcorpora]

In [6]:
# len(sents_opcorpora)

In [8]:
MICROCORPUS_GLOB = '../../microcorpus/data/done/*.txt'

def load_microcorpus(path, out_func, remove_pnct=True):
    for fn in glob.glob(path):
        with open(fn, 'r', encoding='utf8') as f:
            sent = []
            for line in f:
                token, tag = line.split(None, 1)
                if remove_pnct and tag.strip() == 'PNCT':
                    continue                    
                tag = m.TagClass(tag.strip())                
                sent.append((token, out_func(tag)))
            yield sent
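
The done/*.txt files are assumed (judging from the parsing code above) to hold
one tagged sentence per file, one whitespace-separated "token TAG" pair per
line, roughly like:

Мама    NOUN,anim,femn sing,nomn
мыла    VERB,impf,tran femn,sing,past,indc
раму    NOUN,inan,femn sing,accs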

In [9]:
def get_outvalue(tag):
    case = str(tag.case or 'NA')
    case = m.TagClass.RARE_CASES.get(case, case)
    return case
    
#     number = str(tag.number or 'NA')
#     return '%s,%s' % (case, number)

#     return case
#     return str(tag.POS or 'OTHER')
#     return str(tag.gender or 'NA')
#     case = str(tag.case or '')
#     case = m.TagClass.RARE_CASES.get(case, case)
#     if not case:
#         return str(tag._POS)    
#     return "%s,%s" % (tag._POS, case)

def _has_tag(parsed_sent, tag):
    return any(tag == ptag for token, parses in parsed_sent for norm, ptag in parses)

def _has_unkn(parsed_sent):
    return _has_tag(parsed_sent, 'UNKN')

def get_unambig_sents(parsed_sents, out_func, remove_pnct=True):
    for sent in parsed_sents:
        if _has_unkn(sent):
            continue

        processed_sent = []
        for token, parses in sent:
            if remove_pnct and any(tag=='PNCT' for norm, tag in parses):
                continue                
            seen_y = list(set(out_func(m.TagClass(tag)) for norm, tag in parses))
            if len(seen_y) != 1:
                # ambiguous
                break
#             if any('Init' in tag for tag in m.tag(token)):
#                 # Initials are not handled in corpora yet,
#                 # so pymorphy2 probability estimates are wrong
#                 # for them.
#                 break
            processed_sent.append((token, seen_y[0]))
        else:
            if processed_sent:
                yield processed_sent

def getY(sents):
    return [[out for token, out in sent] for sent in sents]
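
A quick illustration of the rare-case normalization (a hypothetical check; any
tag string carrying a rare case would do):

get_outvalue(m.TagClass('NOUN,inan,masc sing,loc2'))  # -> 'loct', since loc2 is folded into loct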

In [10]:
for sent in get_unambig_sents(islice(corpus_full.iter_parsed_sents(), 200, 230), get_outvalue):
    print(sent, end='\n\n')


[('А', 'NA'), ('приходится', 'NA')]

[('А', 'NA'), ('может', 'NA'), ('и', 'NA'), ('не', 'NA'), ('увлекается', 'NA'), ('а', 'NA'), ('просто', 'NA'), ('резвится', 'NA')]

[('Соблазняет', 'NA')]

[('Шалит', 'NA')]

[('Фрик', 'nomn')]

[('Ради', 'NA'), ('него', 'gent'), ('стоит', 'NA'), ('смотреть', 'NA'), ('эту', 'accs'), ('не', 'NA'), ('самую', 'accs'), ('интересную', 'accs'), ('картину', 'accs')]


In [14]:
%%time
sents_opcorpora_raw = list(
    get_unambig_sents(tqdm(corpus_full.iter_parsed_sents(), total=0), get_outvalue, remove_pnct=False)
)


CPU times: user 1min 42s, sys: 2.22 s, total: 1min 44s
Wall time: 1min 46s

In [15]:
sents_opcorpora = [s for s in sents_opcorpora_raw if len(s) > 1]
len(sents_opcorpora_raw), len(sents_opcorpora)


Out[15]:
(7668, 7305)

In [16]:
# TRAIN_SIZE = 60

sents_train = sents_opcorpora
sents_test = list(load_microcorpus(MICROCORPUS_GLOB, get_outvalue, remove_pnct=False))

y_train = getY(sents_train)
y_test = getY(sents_test)

# sents = list(load_microcorpus(MICROCORPUS_GLOB))
# sents_train, sents_test = sents[:TRAIN_SIZE], sents[TRAIN_SIZE:]

# y = [[get_case(tag) for token, tag in sent] for sent in sents]
# y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]

In [17]:
set(chain(*y_train)), set(chain(*y_test)), len(y_train), len(y_test), m.TagClass.RARE_CASES


Out[17]:
({'NA', 'ablt', 'accs', 'datv', 'gent', 'loct', 'nomn'},
 {'NA', 'ablt', 'accs', 'datv', 'gent', 'loct', 'nomn'},
 7305,
 94,
 {'acc2': 'accs',
  'gen2': 'gent',
  'loc2': 'loct',
  'gen1': 'gent',
  'voct': 'nomn',
  'loc1': 'loct',
  'acc1': 'accs'})

In [18]:
THRESH = 0.1
FORBIDDEN_GRAMMEMES = set()  # e.g. {'inan', 'anim', 'sing', 'plur', 'masc', 'femn', 'neut'}
# ALLOWED_GRAMMEMES = (m.TagClass.CASES | m.TagClass.PARTS_OF_SPEECH |
#                      m.TagClass.TRANSITIVITY | m.TagClass.VOICES | m.TagClass.ASPECTS |
#                      m.TagClass.INVOLVEMENT | m.TagClass.MOODS)

def _add_grammeme_features(token, namespace, features, k=1):
    for p in m.parse(token):
        if p.score < THRESH:
            continue
        for grammeme in p.tag.grammemes:
            if grammeme in FORBIDDEN_GRAMMEMES:
                continue
            key = "%s:%s" % (namespace, grammeme)
            features[key] = max(p.score*k, features[key])
            if p.score == 1:
                features['unambig:'+key] = 1
            
            seen = set()
            for grammeme2 in p.tag.grammemes:
                if grammeme2 == grammeme:
                    continue                
                if grammeme2 in FORBIDDEN_GRAMMEMES:
                    continue
                    
                if grammeme > grammeme2:
                    key2 = "%s:%s,%s" % (namespace, grammeme, grammeme2)
                else:
                    key2 = "%s:%s,%s" % (namespace, grammeme2, grammeme)
                    
                if key2 in seen:
                    continue
                seen.add(key2)
                features[key2] = max(p.score*k, features[key2])
            
                if p.score == 1:
                    features['unambig:'+key2] = 1
            
    

def get_features(tokens, i):
    token = tokens[i]
    features = defaultdict(float)
    features['bias'] = 1
    
    features['i:token:%s' % token.lower()] = 1
    _add_grammeme_features(token, "i", features, k=2)  # current token's grammemes get double weight
            
    if i > 0:
        token = tokens[i-1]
        features['i-1:token:%s' % token.lower()] = 1
#         features['i-1:bigram:%s/%s' % (token.lower(), tokens[i][0].lower())] = 1
        _add_grammeme_features(token, "i-1", features)                
    else:
        features['BOS'] = 1
                
    if i > 1:
        token = tokens[i-2]
        features['i-2:token:%s' % token.lower()] = 1
#         features['i-2:bigram:%s/%s' % (token.lower(), tokens[i-1][0].lower())] = 1
        _add_grammeme_features(token, "i-2", features)

    if i < len(tokens) - 1:
        token = tokens[i+1]
        features['i+1:token:%s' % token.lower()] = 1
        _add_grammeme_features(token, "i+1", features)                
    else:
        features['EOS'] = 1

#     if i < len(tokens) - 2:
#         token = tokens[i+2]
#         features['i+2:token:%s' % token.lower()] = 1
#         _add_grammeme_features(token, "i+2", features)                

    return dict(features)

def tokens2features(tokens):
    return [get_features(tokens, i) for i in range(len(tokens))]

def sent2features(sent):
    return tokens2features(sent2tokens(sent))

def sent2tokens(sent):
    return [item[0] for item in sent]

def sent2y(sent):
    return [item[1] for item in sent]

def getX(sents):
    return [sent2features(s) for s in sents]

X_train, X_test = getX(sents_train), getX(sents_test)
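
As a quick sanity check (a hypothetical cell, not part of the original run),
the token and context features for a single position can be printed directly:

feats = get_features(['мама', 'мыла', 'раму'], 1)
for key in sorted(feats):
    if key.startswith(('bias', 'i:token', 'i-1:token', 'i+1:token')):
        print(key, feats[key])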

In [19]:
import pycrfsuite

class LessNoisyTrainer(pycrfsuite.Trainer):
    
    def on_iteration(self, log, info):
        if 'avg_precision' in info:
            print(("Iter {num:<3} "
                   "time={time:<5.2f} "
                   "loss={loss:<8.2f} "
                   "active={active_features:<5} "
                   # "feature_norm={feature_norm:<8.2f} "
                   "precision={avg_precision:0.3f}  "
                   "recall={avg_recall:0.3f}  "
                   "F1={avg_f1:0.3f}  "
                   "accuracy(item/instance)="
                   "{item_accuracy_float:0.3f} {instance_accuracy_float:0.3f}"
                ).format(**info).strip())
        else:
            print(("Iter {num:<3} "
                   "time={time:<5.2f} "
                   "loss={loss:<8.2f} "
                   "active={active_features:<5} "
                   "feature_norm={feature_norm:<8.2f} "
                ).format(**info).strip())
            

trainer = LessNoisyTrainer('lbfgs')

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
    
for xseq, yseq in zip(X_test, y_test):
    trainer.append(xseq, yseq, 1)  # group 1: held out for per-iteration evaluation (see train() below)

# for xseq, yseq in zip(X_test, y_test):
#     trainer.append(xseq, yseq)

trainer.set_params({
    'max_iterations': 80,
#     'epsilon': 1e-6,
#     'c': 10,
#     'type': 2,
   'c1': 0.5,
   'c2': 0.01,
#     'feature.minfreq': 0.05,
})

In [27]:
%%time
trainer.train('models/model-12.crfsuite', 1)  # holdout=1: evaluate on the test group each iteration


Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 76645
Seconds required: 1.216

L-BFGS optimization
c1: 0.500000
c2: 0.010000
num_memories: 6
max_iterations: 80
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.42  loss=40921.22 active=73390 precision=0.177  recall=0.194  F1=0.171  accuracy(item/instance)=0.586 0.064
Iter 2   time=0.23  loss=23429.97 active=63574 precision=0.422  recall=0.282  F1=0.279  accuracy(item/instance)=0.659 0.064
Iter 3   time=0.24  loss=13310.97 active=47397 precision=0.709  recall=0.457  F1=0.530  accuracy(item/instance)=0.732 0.106
Iter 4   time=0.23  loss=7219.86  active=39084 precision=0.870  recall=0.620  F1=0.691  accuracy(item/instance)=0.811 0.191
Iter 5   time=0.28  loss=4519.11  active=32259 precision=0.859  recall=0.710  F1=0.765  accuracy(item/instance)=0.845 0.202
Iter 6   time=0.26  loss=3097.77  active=26547 precision=0.855  recall=0.758  F1=0.800  accuracy(item/instance)=0.864 0.213
Iter 7   time=0.23  loss=2222.36  active=21103 precision=0.869  recall=0.799  F1=0.828  accuracy(item/instance)=0.886 0.245
Iter 8   time=0.22  loss=1746.62  active=18454 precision=0.862  recall=0.816  F1=0.838  accuracy(item/instance)=0.888 0.255
Iter 9   time=0.23  loss=1520.39  active=17731 precision=0.868  recall=0.832  F1=0.847  accuracy(item/instance)=0.900 0.298
Iter 10  time=0.24  loss=1370.45  active=16627 precision=0.865  recall=0.847  F1=0.855  accuracy(item/instance)=0.904 0.319
Iter 11  time=0.30  loss=1221.69  active=15143 precision=0.868  recall=0.844  F1=0.854  accuracy(item/instance)=0.905 0.319
Iter 12  time=0.23  loss=1117.62  active=14421 precision=0.869  recall=0.851  F1=0.859  accuracy(item/instance)=0.908 0.340
Iter 13  time=0.23  loss=1029.34  active=13468 precision=0.864  recall=0.851  F1=0.857  accuracy(item/instance)=0.907 0.351
Iter 14  time=0.22  loss=942.45   active=12090 precision=0.872  recall=0.854  F1=0.862  accuracy(item/instance)=0.911 0.340
Iter 15  time=0.22  loss=879.99   active=11667 precision=0.869  recall=0.860  F1=0.863  accuracy(item/instance)=0.910 0.362
Iter 16  time=0.23  loss=840.40   active=11069 precision=0.872  recall=0.861  F1=0.866  accuracy(item/instance)=0.914 0.372
Iter 17  time=0.22  loss=791.03   active=10102 precision=0.870  recall=0.860  F1=0.864  accuracy(item/instance)=0.914 0.372
Iter 18  time=0.23  loss=744.57   active=9211  precision=0.876  recall=0.858  F1=0.867  accuracy(item/instance)=0.917 0.394
Iter 19  time=0.22  loss=713.31   active=8547  precision=0.881  recall=0.865  F1=0.872  accuracy(item/instance)=0.917 0.394
Iter 20  time=0.23  loss=683.46   active=7801  precision=0.888  recall=0.872  F1=0.879  accuracy(item/instance)=0.922 0.404
Iter 21  time=0.22  loss=652.13   active=6520  precision=0.883  recall=0.871  F1=0.877  accuracy(item/instance)=0.919 0.404
Iter 22  time=0.43  loss=647.02   active=6158  precision=0.887  recall=0.876  F1=0.881  accuracy(item/instance)=0.922 0.404
Iter 23  time=0.23  loss=634.15   active=6245  precision=0.886  recall=0.874  F1=0.880  accuracy(item/instance)=0.922 0.415
Iter 24  time=0.23  loss=627.79   active=6004  precision=0.886  recall=0.875  F1=0.880  accuracy(item/instance)=0.922 0.404
Iter 25  time=0.24  loss=617.15   active=5564  precision=0.884  recall=0.874  F1=0.878  accuracy(item/instance)=0.920 0.415
Iter 26  time=0.23  loss=604.20   active=4981  precision=0.882  recall=0.875  F1=0.878  accuracy(item/instance)=0.920 0.394
Iter 27  time=0.44  loss=599.76   active=4840  precision=0.888  recall=0.875  F1=0.880  accuracy(item/instance)=0.922 0.415
Iter 28  time=0.22  loss=593.38   active=4738  precision=0.885  recall=0.875  F1=0.880  accuracy(item/instance)=0.922 0.415
Iter 29  time=0.22  loss=589.30   active=4496  precision=0.892  recall=0.878  F1=0.884  accuracy(item/instance)=0.925 0.426
Iter 30  time=0.22  loss=584.80   active=4277  precision=0.888  recall=0.876  F1=0.881  accuracy(item/instance)=0.922 0.415
Iter 31  time=0.23  loss=580.34   active=4050  precision=0.894  recall=0.879  F1=0.885  accuracy(item/instance)=0.925 0.426
Iter 32  time=0.23  loss=575.62   active=3827  precision=0.888  recall=0.876  F1=0.881  accuracy(item/instance)=0.922 0.415
Iter 33  time=0.27  loss=571.76   active=3594  precision=0.900  recall=0.879  F1=0.889  accuracy(item/instance)=0.925 0.426
Iter 34  time=0.28  loss=568.21   active=3453  precision=0.898  recall=0.879  F1=0.888  accuracy(item/instance)=0.925 0.426
Iter 35  time=0.24  loss=565.30   active=3303  precision=0.897  recall=0.877  F1=0.886  accuracy(item/instance)=0.924 0.415
Iter 36  time=0.23  loss=562.01   active=3110  precision=0.897  recall=0.878  F1=0.887  accuracy(item/instance)=0.924 0.426
Iter 37  time=0.30  loss=559.37   active=2944  precision=0.897  recall=0.879  F1=0.887  accuracy(item/instance)=0.925 0.426
Iter 38  time=0.23  loss=557.21   active=2850  precision=0.898  recall=0.881  F1=0.889  accuracy(item/instance)=0.925 0.426
Iter 39  time=0.23  loss=555.39   active=2785  precision=0.897  recall=0.879  F1=0.887  accuracy(item/instance)=0.925 0.426
Iter 40  time=0.23  loss=553.34   active=2710  precision=0.898  recall=0.881  F1=0.889  accuracy(item/instance)=0.925 0.426
Iter 41  time=0.22  loss=551.68   active=2585  precision=0.898  recall=0.882  F1=0.889  accuracy(item/instance)=0.926 0.426
Iter 42  time=0.24  loss=550.01   active=2550  precision=0.897  recall=0.883  F1=0.890  accuracy(item/instance)=0.926 0.426
Iter 43  time=0.31  loss=548.81   active=2532  precision=0.898  recall=0.882  F1=0.889  accuracy(item/instance)=0.926 0.426
Iter 44  time=0.25  loss=547.65   active=2446  precision=0.897  recall=0.883  F1=0.890  accuracy(item/instance)=0.926 0.426
Iter 45  time=0.25  loss=546.56   active=2398  precision=0.898  recall=0.882  F1=0.889  accuracy(item/instance)=0.926 0.426
Iter 46  time=0.26  loss=545.62   active=2343  precision=0.898  recall=0.884  F1=0.891  accuracy(item/instance)=0.927 0.426
Iter 47  time=0.22  loss=544.64   active=2321  precision=0.899  recall=0.883  F1=0.891  accuracy(item/instance)=0.927 0.426
Iter 48  time=0.23  loss=543.84   active=2282  precision=0.897  recall=0.883  F1=0.890  accuracy(item/instance)=0.926 0.426
Iter 49  time=0.26  loss=543.07   active=2244  precision=0.899  recall=0.885  F1=0.892  accuracy(item/instance)=0.927 0.426
Iter 50  time=0.22  loss=542.40   active=2183  precision=0.898  recall=0.884  F1=0.891  accuracy(item/instance)=0.927 0.426
Iter 51  time=0.22  loss=541.50   active=2159  precision=0.898  recall=0.884  F1=0.891  accuracy(item/instance)=0.927 0.426
Iter 52  time=0.22  loss=540.97   active=2129  precision=0.898  recall=0.884  F1=0.891  accuracy(item/instance)=0.927 0.426
Iter 53  time=0.23  loss=540.25   active=2105  precision=0.897  recall=0.883  F1=0.890  accuracy(item/instance)=0.926 0.426
Iter 54  time=0.23  loss=539.88   active=2053  precision=0.899  recall=0.886  F1=0.892  accuracy(item/instance)=0.927 0.426
Iter 55  time=0.22  loss=539.13   active=2027  precision=0.900  recall=0.886  F1=0.893  accuracy(item/instance)=0.928 0.426
Iter 56  time=0.22  loss=538.63   active=1988  precision=0.899  recall=0.886  F1=0.892  accuracy(item/instance)=0.927 0.426
Iter 57  time=0.29  loss=538.09   active=1958  precision=0.903  recall=0.887  F1=0.894  accuracy(item/instance)=0.929 0.436
Iter 58  time=0.23  loss=537.78   active=1929  precision=0.899  recall=0.886  F1=0.892  accuracy(item/instance)=0.927 0.426
Iter 59  time=0.23  loss=537.12   active=1908  precision=0.900  recall=0.886  F1=0.893  accuracy(item/instance)=0.928 0.426
Iter 60  time=0.27  loss=536.73   active=1891  precision=0.899  recall=0.886  F1=0.892  accuracy(item/instance)=0.927 0.426
Iter 61  time=0.23  loss=536.29   active=1878  precision=0.904  recall=0.888  F1=0.896  accuracy(item/instance)=0.930 0.426
Iter 62  time=0.23  loss=535.98   active=1869  precision=0.901  recall=0.886  F1=0.893  accuracy(item/instance)=0.928 0.426
Iter 63  time=0.24  loss=535.39   active=1850  precision=0.905  recall=0.887  F1=0.896  accuracy(item/instance)=0.930 0.426
Iter 64  time=0.28  loss=535.23   active=1833  precision=0.900  recall=0.887  F1=0.893  accuracy(item/instance)=0.928 0.426
Iter 65  time=0.23  loss=534.68   active=1817  precision=0.904  recall=0.887  F1=0.895  accuracy(item/instance)=0.929 0.415
Iter 66  time=0.23  loss=534.45   active=1803  precision=0.904  recall=0.889  F1=0.896  accuracy(item/instance)=0.930 0.426
Iter 67  time=0.23  loss=533.98   active=1792  precision=0.904  recall=0.887  F1=0.895  accuracy(item/instance)=0.929 0.415
Iter 68  time=0.22  loss=533.63   active=1761  precision=0.904  recall=0.889  F1=0.897  accuracy(item/instance)=0.930 0.415
Iter 69  time=0.23  loss=533.29   active=1724  precision=0.904  recall=0.888  F1=0.895  accuracy(item/instance)=0.929 0.415
Iter 70  time=0.32  loss=533.10   active=1708  precision=0.904  recall=0.889  F1=0.897  accuracy(item/instance)=0.930 0.415
Iter 71  time=0.22  loss=532.53   active=1703  precision=0.904  recall=0.889  F1=0.896  accuracy(item/instance)=0.929 0.404
Iter 72  time=0.22  loss=532.19   active=1707  precision=0.904  recall=0.889  F1=0.897  accuracy(item/instance)=0.930 0.415
Iter 73  time=0.27  loss=531.82   active=1693  precision=0.903  recall=0.888  F1=0.895  accuracy(item/instance)=0.928 0.404
Iter 74  time=0.23  loss=531.55   active=1690  precision=0.903  recall=0.888  F1=0.895  accuracy(item/instance)=0.928 0.404
Iter 75  time=0.22  loss=531.16   active=1665  precision=0.903  recall=0.888  F1=0.895  accuracy(item/instance)=0.928 0.404
Iter 76  time=0.23  loss=530.91   active=1631  precision=0.902  recall=0.887  F1=0.894  accuracy(item/instance)=0.928 0.404
Iter 77  time=0.31  loss=530.51   active=1628  precision=0.903  recall=0.888  F1=0.895  accuracy(item/instance)=0.928 0.404
Iter 78  time=0.25  loss=530.37   active=1612  precision=0.902  recall=0.887  F1=0.894  accuracy(item/instance)=0.928 0.404
Iter 79  time=0.26  loss=529.92   active=1598  precision=0.903  recall=0.888  F1=0.895  accuracy(item/instance)=0.928 0.404
Iter 80  time=0.28  loss=529.88   active=1600  precision=0.904  recall=0.886  F1=0.895  accuracy(item/instance)=0.927 0.404
L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 19.811

Storing the model
Number of active features: 1600 (76645)
Number of active attributes: 999 (53363)
Number of active labels: 7 (7)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.006

CPU times: user 20.9 s, sys: 109 ms, total: 21 s
Wall time: 21 s

In [28]:
trainer.get_params()


Out[28]:
{'delta': 1e-05,
 'c2': 0.01,
 'max_linesearch': 20,
 'period': 10,
 'feature.possible_transitions': False,
 'epsilon': 1e-05,
 'linesearch': 'MoreThuente',
 'c1': 0.5,
 'max_iterations': 80,
 'feature.possible_states': False,
 'feature.minfreq': 0.0,
 'num_memories': 6}

In [29]:
tagger = pycrfsuite.Tagger()
tagger.open('models/model-12.crfsuite')


Out[29]:
<contextlib.closing at 0x11eadcf98>

In [30]:
info = tagger.info()
from collections import Counter

cnt_trans = Counter(info.transitions)
cnt_state = Counter(info.state_features)

print(len(cnt_state))

# Counter(info.transitions).most_common()
# info.state_features
[v for v in cnt_state.most_common() if 'i+1' in v[0][0]]
# cnt_trans.most_common()


1572
Out[30]:
[(('i+1:token:нет', 'gent'), 2.356573),
 (('i+1:plur,loct', 'loct'), 2.261851),
 (('i+1:token:верующих', 'nomn'), 1.54064),
 (('i+1:token:гойи', 'accs'), 1.439646),
 (('i+1:token:удалось', 'accs'), 1.390432),
 (('i+1:PRED', 'accs'), 1.315811),
 (('i+1:pres,PRED', 'accs'), 1.315811),
 (('i+1:gent,femn', 'gent'), 1.258567),
 (('i+1:tran,past', 'accs'), 1.067378),
 (('i+1:plur,NOUN', 'datv'), 1.047563),
 (('i+1:sing,neut', 'nomn'), 1.002115),
 (('i+1:nomn,inan', 'accs'), 0.978682),
 (('i+1:token:сми', 'accs'), 0.852168),
 (('i+1:token:долларов', 'accs'), 0.814067),
 (('i+1:tran,sing', 'nomn'), 0.753328),
 (('i+1:token:!', 'nomn'), 0.742419),
 (('i+1:plur,intr', 'nomn'), 0.738442),
 (('i+1:sing,intr', 'gent'), 0.686234),
 (('i+1:sing,ablt', 'ablt'), 0.659065),
 (('i+1:ablt', 'ablt'), 0.639773),
 (('i+1:token:отца', 'nomn'), 0.607304),
 (('i+1:token:»', 'gent'), 0.601045),
 (('i+1:ablt,NOUN', 'ablt'), 0.590321),
 (('i+1:token:делятся', 'nomn'), 0.577587),
 (('i+1:tran,perf', 'datv'), 0.560743),
 (('i+1:gent', 'nomn'), 0.560131),
 (('i+1:nomn,NPRO', 'accs'), 0.549427),
 (('unambig:i+1:nomn,NPRO', 'accs'), 0.541925),
 (('unambig:i+1:nomn,NOUN', 'nomn'), 0.528403),
 (('i+1:token:разместить', 'datv'), 0.506042),
 (('unambig:i+1:perf,past', 'gent'), 0.489177),
 (('unambig:i+1:PREP', 'nomn'), 0.488829),
 (('i+1:token:долой', 'accs'), 0.484933),
 (('i+1:token:во', 'nomn'), 0.468113),
 (('i+1:token:(', 'nomn'), 0.450778),
 (('i+1:token:слова', 'nomn'), 0.450686),
 (('unambig:i+1:PREP', 'ablt'), 0.448912),
 (('unambig:i+1:tran,past', 'accs'), 0.434714),
 (('i+1:sing,gent', 'nomn'), 0.431974),
 (('i+1:token:к', 'accs'), 0.413905),
 (('i+1:token:состоят', 'nomn'), 0.413064),
 (('i+1:datv,NOUN', 'datv'), 0.412396),
 (('unambig:i+1:NPRO', 'accs'), 0.407181),
 (('i+1:token:полномочий', 'datv'), 0.363526),
 (('i+1:token:сказал', 'accs'), 0.350134),
 (('i+1:sing,femn', 'datv'), 0.34992),
 (('unambig:i+1:sing,NPRO', 'accs'), 0.338477),
 (('i+1:token:,', 'NA'), 0.33034),
 (('i+1:femn', 'datv'), 0.322719),
 (('i+1:perf,VERB', 'accs'), 0.318513),
 (('i+1:sing', 'NA'), 0.316287),
 (('unambig:i+1:plur,neut', 'datv'), 0.315943),
 (('i+1:token:хватит', 'gent'), 0.311031),
 (('unambig:i+1:ADVB', 'accs'), 0.305085),
 (('i+1:token:,', 'accs'), 0.286457),
 (('i+1:masc', 'nomn'), 0.285596),
 (('i+1:pres', 'accs'), 0.284843),
 (('unambig:i+1:tran', 'accs'), 0.280615),
 (('i+1:token:обмена', 'accs'), 0.277288),
 (('unambig:i+1:plur,NOUN', 'datv'), 0.271313),
 (('i+1:pres,plur', 'nomn'), 0.270597),
 (('i+1:anim', 'nomn'), 0.26911),
 (('i+1:masc,NOUN', 'NA'), 0.264862),
 (('i+1:plur,neut', 'datv'), 0.263478),
 (('i+1:tran', 'accs'), 0.254611),
 (('i+1:sing,masc', 'NA'), 0.241759),
 (('unambig:i+1:masc', 'loct'), 0.2415),
 (('i+1:NOUN', 'NA'), 0.240124),
 (('i+1:sing,masc', 'nomn'), 0.239752),
 (('i+1:inan,gent', 'gent'), 0.23949),
 (('i+1:token:?', 'nomn'), 0.236229),
 (('i+1:impf,INFN', 'NA'), 0.236111),
 (('unambig:i+1:ablt,NOUN', 'ablt'), 0.234202),
 (('i+1:PRCL', 'gent'), 0.228267),
 (('i+1:masc,anim', 'nomn'), 0.216911),
 (('unambig:i+1:impf,INFN', 'NA'), 0.214285),
 (('unambig:i+1:tran,VERB', 'accs'), 0.207618),
 (('unambig:i+1:inan,ablt', 'ablt'), 0.203286),
 (('i+1:indc,impf', 'nomn'), 0.202959),
 (('unambig:i+1:perf,VERB', 'accs'), 0.201431),
 (('i+1:token:–', 'nomn'), 0.193149),
 (('i+1:token:-', 'nomn'), 0.192502),
 (('unambig:i+1:PNCT', 'nomn'), 0.190981),
 (('i+1:PNCT', 'nomn'), 0.190981),
 (('i+1:neut', 'nomn'), 0.183389),
 (('i+1:masc,inan', 'accs'), 0.180286),
 (('i+1:token:отогнали', 'accs'), 0.179278),
 (('i+1:plur,impf', 'nomn'), 0.171809),
 (('i+1:token:.', 'gent'), 0.171082),
 (('i+1:token:—', 'nomn'), 0.164745),
 (('i+1:impf,VERB', 'nomn'), 0.157235),
 (('i+1:tran,perf', 'accs'), 0.157043),
 (('i+1:nomn', 'accs'), 0.146734),
 (('unambig:i+1:Vpre,PREP', 'nomn'), 0.14144),
 (('unambig:i+1:Vpre', 'nomn'), 0.14144),
 (('i+1:nomn', 'NA'), 0.138998),
 (('i+1:tran,VERB', 'accs'), 0.131631),
 (('i+1:token:никто', 'accs'), 0.131386),
 (('unambig:i+1:sing,perf', 'gent'), 0.130392),
 (('i+1:token:на', 'gent'), 0.125683),
 (('i+1:token:о', 'accs'), 0.123115),
 (('i+1:token:и', 'gent'), 0.116002),
 (('i+1:CONJ', 'gent'), 0.10932),
 (('unambig:i+1:past,intr', 'gent'), 0.108652),
 (('unambig:i+1:tran,indc', 'accs'), 0.108089),
 (('i+1:token:даровали', 'datv'), 0.107199),
 (('i+1:sing,indc', 'nomn'), 0.104212),
 (('i+1:sing', 'nomn'), 0.103788),
 (('i+1:NOUN', 'nomn'), 0.103169),
 (('i+1:sing,nomn', 'NA'), 0.098963),
 (('i+1:sing,NOUN', 'nomn'), 0.096725),
 (('i+1:masc,NOUN', 'accs'), 0.094044),
 (('i+1:token:;', 'accs'), 0.087348),
 (('unambig:i+1:indc,impf', 'nomn'), 0.084118),
 (('i+1:past,intr', 'gent'), 0.078038),
 (('i+1:tran,plur', 'accs'), 0.077591),
 (('unambig:i+1:perf,femn', 'gent'), 0.072393),
 (('unambig:i+1:impf,VERB', 'nomn'), 0.07135),
 (('i+1:plur,indc', 'accs'), 0.0664),
 (('i+1:tran,indc', 'accs'), 0.065061),
 (('unambig:i+1:plur,inan', 'datv'), 0.060588),
 (('i+1:token:так', 'gent'), 0.059531),
 (('i+1:perf,femn', 'gent'), 0.058142),
 (('unambig:i+1:sing,masc', 'loct'), 0.052204),
 (('i+1:sing,past', 'nomn'), 0.05028),
 (('i+1:gent,NOUN', 'nomn'), 0.049235),
 (('i+1:sing,nomn', 'accs'), 0.048679),
 (('unambig:i+1:indc,3per', 'nomn'), 0.047553),
 (('unambig:i+1:VERB,3per', 'nomn'), 0.047553),
 (('unambig:i+1:past', 'gent'), 0.046964),
 (('unambig:i+1:perf,indc', 'accs'), 0.046232),
 (('i+1:plur,VERB', 'accs'), 0.045803),
 (('i+1:sing,PRTS', 'loct'), 0.045537),
 (('i+1:token:на', 'datv'), 0.045274),
 (('i+1:perf,indc', 'accs'), 0.042365),
 (('i+1:inan,ablt', 'ablt'), 0.041699),
 (('i+1:pssv,PRTS', 'loct'), 0.039612),
 (('i+1:PRTS', 'loct'), 0.039612),
 (('i+1:past,PRTS', 'loct'), 0.039258),
 (('i+1:perf,PRTS', 'loct'), 0.039235),
 (('unambig:i+1:past,femn', 'gent'), 0.037858),
 (('unambig:i+1:tran,sing', 'nomn'), 0.037193),
 (('unambig:i+1:neut,NOUN', 'datv'), 0.035916),
 (('unambig:i+1:neut,inan', 'datv'), 0.035916),
 (('unambig:i+1:VERB', 'accs'), 0.033907),
 (('i+1:token:.', 'ablt'), 0.033752),
 (('unambig:i+1:femn', 'gent'), 0.033553),
 (('i+1:Vpre,PREP', 'nomn'), 0.032885),
 (('i+1:Vpre', 'nomn'), 0.032885),
 (('i+1:token:!', 'datv'), 0.025857),
 (('i+1:pssv,past', 'loct'), 0.024425),
 (('i+1:sing,pssv', 'loct'), 0.024399),
 (('i+1:pssv,perf', 'loct'), 0.024372),
 (('i+1:femn', 'NA'), 0.022798),
 (('i+1:intr,impf', 'nomn'), 0.021993),
 (('i+1:femn', 'gent'), 0.02141),
 (('i+1:plur', 'datv'), 0.020684),
 (('unambig:i+1:sing,PRTS', 'loct'), 0.020351),
 (('i+1:pssv', 'loct'), 0.019679),
 (('i+1:indc', 'nomn'), 0.018176),
 (('i+1:indc,VERB', 'nomn'), 0.018176),
 (('i+1:token:обновления', 'accs'), 0.017283),
 (('unambig:i+1:sing,pssv', 'loct'), 0.016183),
 (('unambig:i+1:pssv', 'loct'), 0.015943),
 (('unambig:i+1:pssv,perf', 'loct'), 0.015803),
 (('unambig:i+1:pssv,past', 'loct'), 0.015803),
 (('unambig:i+1:PNCT', 'ablt'), 0.01579),
 (('i+1:PNCT', 'ablt'), 0.01579),
 (('i+1:sing,VERB', 'nomn'), 0.015325),
 (('unambig:i+1:perf,PRTS', 'loct'), 0.015011),
 (('unambig:i+1:pssv,PRTS', 'loct'), 0.015011),
 (('unambig:i+1:PRTS', 'loct'), 0.015011),
 (('unambig:i+1:past,PRTS', 'loct'), 0.015011),
 (('i+1:PREP', 'nomn'), 0.014867),
 (('i+1:token:и', 'nomn'), 0.011713),
 (('unambig:i+1:PNCT', 'gent'), 0.007293),
 (('i+1:PNCT', 'gent'), 0.007293),
 (('unambig:i+1:sing', 'loct'), 0.007193),
 (('i+1:neut,NOUN', 'datv'), 0.004845),
 (('i+1:neut,inan', 'datv'), 0.004795),
 (('unambig:i+1:nomn', 'loct'), 0.003677),
 (('i+1:neut,PRTS', 'loct'), 0.00342),
 (('i+1:past,VERB', 'accs'), 0.003281),
 (('i+1:past,indc', 'accs'), 0.003281),
 (('i+1:sing,futr', 'gent'), 0.002553),
 (('unambig:i+1:sing,nomn', 'loct'), 0.002458),
 (('i+1:pssv,neut', 'loct'), 0.002322),
 (('i+1:inan', 'accs'), 0.002035),
 (('unambig:i+1:sing,NOUN', 'ablt'), 0.000951),
 (('unambig:i+1:sing', 'accs'), 0.000523),
 (('i+1:sing,NOUN', 'NA'), 0.000388),
 (('unambig:i+1:sing,past', 'gent'), 0.000166),
 (('i+1:indc,impf', 'datv'), 0.000129),
 (('i+1:sing,perf', 'gent'), 0.000102),
 (('i+1:impf,VERB', 'datv'), 6.1e-05),
 (('i+1:tran,impf', 'accs'), 2.7e-05),
 (('unambig:i+1:past,indc', 'accs'), 1.2e-05),
 (('unambig:i+1:past,VERB', 'accs'), 1.2e-05),
 (('i+1:token:найдено', 'loct'), 1e-05),
 (('unambig:i+1:neut,PRTS', 'loct'), 1e-05),
 (('unambig:i+1:pssv,neut', 'loct'), 1e-05),
 (('unambig:i+1:tran,impf', 'accs'), 6e-06),
 (('i+1:perf,neut', 'datv'), -2e-06),
 (('i+1:past,neut', 'datv'), -4e-06),
 (('i+1:sing,PRTS', 'datv'), -9e-06),
 (('i+1:PRTS', 'datv'), -9e-06),
 (('i+1:pssv,PRTS', 'datv'), -9e-06),
 (('i+1:past,PRTS', 'datv'), -9e-06),
 (('i+1:sing,pssv', 'datv'), -1e-05),
 (('i+1:pssv,past', 'datv'), -1.1e-05),
 (('i+1:intr,3per', 'gent'), -2e-05),
 (('i+1:token:сказал', 'NA'), -2.9e-05),
 (('i+1:gent', 'accs'), -3.6e-05),
 (('i+1:token:в', 'accs'), -7.9e-05),
 (('i+1:token:о', 'NA'), -9.3e-05),
 (('unambig:i+1:sing,past', 'NA'), -0.000102),
 (('unambig:i+1:sing,inan', 'NA'), -0.000134),
 (('unambig:i+1:LATN', 'datv'), -0.00016),
 (('i+1:LATN', 'datv'), -0.000161),
 (('i+1:plur,intr', 'gent'), -0.000547),
 (('unambig:i+1:femn', 'nomn'), -0.001089),
 (('i+1:tran', 'nomn'), -0.001423),
 (('i+1:token:–', 'NA'), -0.001457),
 (('i+1:CONJ', 'accs'), -0.002139),
 (('i+1:inan', 'nomn'), -0.002213),
 (('i+1:inan,gent', 'accs'), -0.003399),
 (('unambig:i+1:perf', 'nomn'), -0.005383),
 (('i+1:inan,femn', 'accs'), -0.005393),
 (('i+1:pssv', 'datv'), -0.005869),
 (('unambig:i+1:PREP', 'gent'), -0.006238),
 (('i+1:pres', 'datv'), -0.007175),
 (('i+1:token:.', 'datv'), -0.008835),
 (('unambig:i+1:sing,past', 'datv'), -0.011123),
 (('unambig:i+1:perf,past', 'nomn'), -0.01883),
 (('i+1:token:"', 'gent'), -0.019928),
 (('unambig:i+1:pres', 'gent'), -0.024705),
 (('unambig:i+1:perf,past', 'datv'), -0.02713),
 (('i+1:sing,femn', 'nomn'), -0.036203),
 (('unambig:i+1:sing,neut', 'datv'), -0.036779),
 (('i+1:sing', 'accs'), -0.038389),
 (('i+1:masc,Name', 'nomn'), -0.041616),
 (('i+1:pres', 'gent'), -0.042937),
 (('unambig:i+1:sing,perf', 'datv'), -0.044127),
 (('i+1:sing,past', 'datv'), -0.045249),
 (('unambig:i+1:impf,INFN', 'nomn'), -0.04589),
 (('i+1:femn,NOUN', 'accs'), -0.046297),
 (('unambig:i+1:PNCT', 'datv'), -0.04727),
 (('i+1:PNCT', 'datv'), -0.04727),
 (('unambig:i+1:sing,nomn', 'gent'), -0.04728),
 (('unambig:i+1:intr', 'accs'), -0.048956),
 (('i+1:neut,inan', 'accs'), -0.053531),
 (('i+1:neut,NOUN', 'accs'), -0.053734),
 (('unambig:i+1:plur,gent', 'nomn'), -0.05745),
 (('i+1:plur', 'gent'), -0.057674),
 (('i+1:intr,VERB', 'accs'), -0.060756),
 (('i+1:sing,intr', 'accs'), -0.061011),
 (('unambig:i+1:sing,masc', 'accs'), -0.06645),
 (('i+1:intr,indc', 'accs'), -0.074615),
 (('unambig:i+1:impf', 'gent'), -0.074893),
 (('unambig:i+1:tran', 'nomn'), -0.074995),
 (('i+1:sing,neut', 'datv'), -0.075119),
 (('i+1:impf,INFN', 'nomn'), -0.079353),
 (('i+1:tran,perf', 'NA'), -0.082573),
 (('i+1:loct', 'gent'), -0.084693),
 (('i+1:intr,impf', 'accs'), -0.088355),
 (('i+1:inan,NOUN', 'ablt'), -0.088529),
 (('unambig:i+1:sing,femn', 'nomn'), -0.088958),
 (('i+1:inan', 'ablt'), -0.088959),
 (('unambig:i+1:LATN', 'nomn'), -0.09604),
 (('i+1:LATN', 'nomn'), -0.096137),
 (('i+1:plur,femn', 'accs'), -0.097238),
 (('i+1:tran,3per', 'accs'), -0.100565),
 (('i+1:token:,', 'datv'), -0.100701),
 (('unambig:i+1:masc', 'accs'), -0.107425),
 (('i+1:token:во', 'gent'), -0.107614),
 (('unambig:i+1:ADVB', 'gent'), -0.114803),
 (('unambig:i+1:tran,INFN', 'nomn'), -0.118744),
 (('unambig:i+1:tran,perf', 'NA'), -0.125487),
 (('i+1:sing,perf', 'datv'), -0.134353),
 (('i+1:sing,Name', 'nomn'), -0.137249),
 (('i+1:Name', 'nomn'), -0.13794),
 (('i+1:Name,NOUN', 'nomn'), -0.13794),
 (('i+1:anim,Name', 'nomn'), -0.13794),
 (('i+1:tran,INFN', 'nomn'), -0.141119),
 (('i+1:token:,', 'nomn'), -0.143049),
 (('i+1:gent', 'gent'), -0.157299),
 (('i+1:neut', 'gent'), -0.159759),
 (('i+1:ADVB', 'nomn'), -0.161109),
 (('i+1:token:и', 'accs'), -0.170186),
 (('unambig:i+1:plur,inan', 'nomn'), -0.180146),
 (('i+1:masc,inan', 'nomn'), -0.185068),
 (('unambig:i+1:past', 'datv'), -0.192722),
 (('i+1:PNCT', 'accs'), -0.21089),
 (('unambig:i+1:PNCT', 'accs'), -0.21089),
 (('unambig:i+1:NPRO', 'nomn'), -0.216683),
 (('i+1:intr', 'accs'), -0.231986),
 (('i+1:sing', 'ablt'), -0.235879),
 (('unambig:i+1:neut', 'gent'), -0.239524),
 (('i+1:token:.', 'accs'), -0.246141),
 (('i+1:sing,3per', 'accs'), -0.285583),
 (('i+1:PREP', 'NA'), -0.290563),
 (('unambig:i+1:tran,plur', 'nomn'), -0.296729),
 (('unambig:i+1:inan,gent', 'nomn'), -0.300236),
 (('unambig:i+1:gent,NOUN', 'nomn'), -0.301395),
 (('unambig:i+1:gent', 'nomn'), -0.310038),
 (('i+1:femn', 'accs'), -0.314944),
 (('i+1:datv,NOUN', 'ablt'), -0.318586),
 (('i+1:tran,plur', 'nomn'), -0.326114),
 (('i+1:token:!', 'gent'), -0.33844),
 (('unambig:i+1:INFN', 'nomn'), -0.356457),
 (('i+1:token:;', 'nomn'), -0.367099),
 (('i+1:pres,intr', 'gent'), -0.378179),
 (('i+1:INFN', 'nomn'), -0.405895),
 (('i+1:impf', 'gent'), -0.419514),
 (('i+1:pres,PRED', 'datv'), -0.434409),
 (('i+1:PRED', 'datv'), -0.434409),
 (('i+1:NOUN', 'ablt'), -0.462978),
 (('i+1:token:!', 'accs'), -0.473565),
 (('i+1:token:.', 'nomn'), -0.48155),
 (('unambig:i+1:nomn,masc', 'accs'), -0.515454),
 (('i+1:token::', 'accs'), -0.547935),
 (('unambig:i+1:PREP', 'datv'), -0.569685),
 (('i+1:3per', 'accs'), -0.569835),
 (('i+1:token:и', 'NA'), -0.637569),
 (('i+1:token:сми', 'gent'), -0.655599),
 (('i+1:token:?', 'NA'), -0.694424),
 (('i+1:token:гойи', 'gent'), -0.749931),
 (('unambig:i+1:femn', 'accs'), -0.798648),
 (('i+1:token:»', 'nomn'), -0.853486),
 (('i+1:token:…', 'nomn'), -0.935337),
 (('i+1:token:к', 'gent'), -1.054533),
 (('i+1:plur', 'ablt'), -1.675221),
 (('i+1:token:»', 'NA'), -1.772177),
 (('i+1:token:(', 'gent'), -1.826416)]
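
Transition weights can be inspected the same way; a hypothetical follow-up to
the commented-out cnt_trans.most_common() line above:

for (label_from, label_to), weight in cnt_trans.most_common(5):
    print('%s -> %s: %s' % (label_from, label_to, weight))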

In [33]:
def _best_parse(tok):
    try:
        return m.parse(tok)[0]
    except IndexError:
        return m.parse('FOO')[0]
    
def _partial_tag(y):
    return y
    # return [tag.split(',')[1] for tag in y]  # disabled alternative for composite out values like 'case,number'

def print_results(fp):
    fp.write("token                true           crf                pymorphy2        result\n")
    fp.write("="*80 + '\n')
    crf_win, crf_err, pym_win, pym_err, fails = 0, 0, 0, 0, 0
    for sent in sents_test:
        tokens = sent2tokens(sent)
        y_true = sent2y(sent)

        y_pymorphy2 = [get_outvalue(_best_parse(tok).tag) for tok, _ in sent]
        pymorphy2_score = [_best_parse(tok).score for tok, _ in sent]

        y_tagger = tagger.tag(sent2features(sent))
        tagger_score = [tagger.marginal(y, pos) for pos, y in enumerate(y_tagger)]
        
        y_true = _partial_tag(y_true)
        y_pymorphy2 = _partial_tag(y_pymorphy2)
        y_tagger = _partial_tag(y_tagger)

        for tok in zip(tokens, y_true, y_tagger, tagger_score, y_pymorphy2, pymorphy2_score):
            res = ''
            
            if tok[1] != tok[2]:
                crf_err += 1
            
            if tok[1] != tok[4]:
                pym_err += 1
                
            if tok[1] == tok[2] and tok[1] != tok[4]:
                res = '+crf'                
                crf_win += 1
            elif tok[1] == tok[4] and tok[1] != tok[2]:
                res = '+pym'
                pym_win += 1
            elif tok[1] != tok[4] and tok[1] != tok[2]:
                res = '!'
                fails += 1
            
            tag_scored = "%-10s %0.3f" % (tok[2], tok[3])
            pym_scored = "%-10s %0.3f" % (tok[4], tok[5])
            fp.write("%-20s %-14s %-18s %-16s %-14s\n" % (tok[0], tok[1], tag_scored, pym_scored, res))
        fp.write("CRF: %s err, fix %s (+%s), pymorphy2: %s err, fix %s, failed: %s\n" % (
                    crf_err, crf_win, crf_win-pym_win, pym_err, pym_win, fails))
        fp.write("-"*70 + '\n')
    
    print("CRF: %s err, fix %s (+%s), pymorphy2: %s err, fix %s, failed: %s\n" % (
                crf_err, crf_win, crf_win-pym_win, pym_err, pym_win, fails))
    
with open('results/res20.txt', 'wt', encoding='utf8') as f:
    print_results(f)


CRF: 118 err, fix 97 (+70), pymorphy2: 188 err, fix 27, failed: 91


In [34]:
from pymorphy2.tokenizers import simple_word_tokenize


def tag_text(tokens):
    features = tokens2features(tokens)
    
#     print("gent")
#     for feat, w in features[2].items():
#         if (feat, 'ablt') in info.state_features:
#             print("%s %s" % (feat, w * info.state_features[(feat, 'ablt')]))
            
#     print("\naccs")
#     for feat, w in features[2].items():
#         if (feat, 'accs') in info.state_features:
#             print("%s %s" % (feat, w * info.state_features[(feat, 'accs')]))
    IDX = 0
        
    print("ACTIVE:")
    for feat, w in features[IDX].items():
        state_ablt = (feat, 'ablt')
        state_datv = (feat, 'datv')
        
        if state_ablt in info.state_features:
            print("%s %s" % (state_ablt, w * info.state_features[state_ablt]))
        if state_datv in info.state_features:
            print("%s %s" % (state_datv, w * info.state_features[state_datv]))
                

#     print("\nINACTIVE:")
#     for feat, w in features[2].items():
#         if feat in info.attributes:
#             print(feat, w)
                
    
    res = tagger.tag(features)
    print(tagger.marginal('ablt', IDX))
    print(tagger.marginal('datv', IDX))
    return res

def tag_text_pymorphy2(tokens):
    return [get_outvalue(m.parse(tok)[0].tag) for tok in tokens]


def demo(text):    
    tokens = simple_word_tokenize(text)
    for tok, tag_tag, pym_tag in zip(tokens, tag_text(tokens), tag_text_pymorphy2(tokens)):
        print(tok, tag_tag, pym_tag, sep='/', end=' ')
    
#demo('''потея пью киндзмараули кричу пернатым гули-гули''')
#demo("не видит будущего")
# demo('''невозможно упрекнуть в отношении к возможным российско-азербайджанским закулисным договоренностям''')
demo('В поселке Дальний Иркутской области сгорели 22 жилых дома')


ACTIVE:
('i+1:inan,NOUN', 'ablt') -0.088529
('BOS', 'ablt') -0.08872
('BOS', 'datv') 0.700913
('i+1:sing', 'ablt') -0.235879
('i+1:inan', 'ablt') -0.088959
('i+1:NOUN', 'ablt') -0.462978
('bias', 'ablt') -0.583812
('bias', 'datv') -0.106407
('unambig:i+1:sing,NOUN', 'ablt') 0.000951
6.664286729561451e-06
4.23518938255024e-05
В/NA/NA поселке/loct/loct Дальний/accs/accs Иркутской/loct/gent области/loct/loct сгорели/NA/NA 22/NA/NA жилых/gent/gent дома/gent/gent 

In [35]:
class DisambigCase(object):
    def __init__(self, morph, tagger_path):
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(tagger_path)
        self.morph = morph
        
    def tag(self, tokens):
        if isinstance(tokens, str):
            tokens = simple_word_tokenize(tokens)
        
        unkn = self.morph.parse('FOO')[0]
        pym_parses = [self.morph.parse(tok) for tok in tokens]
        pym_parses0 = [p[0] if p else unkn for p in pym_parses]

        # NOTE: the CRF tagger is opened but never consulted here yet;
        # this still returns the top pymorphy2 parse for each token.
        return pym_parses0
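
The tag() method above stops at the pymorphy2 baseline. A minimal sketch of the
missing CRF step (an assumption, reusing tokens2features and get_outvalue from
the cells above) might be a method like:

    def tag_crf(self, tokens):
        # Hypothetical continuation: for each token, prefer the best-scored
        # pymorphy2 parse whose normalized case matches the CRF prediction.
        # Note: tokens2features uses the module-level analyzer `m`, not self.morph.
        if isinstance(tokens, str):
            tokens = simple_word_tokenize(tokens)
        unkn = self.morph.parse('FOO')[0]
        pym_parses = [self.morph.parse(tok) or [unkn] for tok in tokens]
        cases = self.tagger.tag(tokens2features(tokens))
        result = []
        for parses, case in zip(pym_parses, cases):
            matching = [p for p in parses if get_outvalue(p.tag) == case]
            result.append(matching[0] if matching else parses[0])
        return result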

In [36]:
disambig = DisambigCase(m, 'models/model-12.crfsuite')
disambig.tag("потея, пью киндзмараули кричу пернатым гули-гули")


Out[36]:
[Parse(word='потея', tag=OpencorporaTag('GRND,impf,intr pres'), normal_form='потеть', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'потея', 14, 67),)),
 Parse(word=',', tag=OpencorporaTag('PNCT'), normal_form=',', score=1.0, methods_stack=((<PunctuationAnalyzer>, ','),)),
 Parse(word='пью', tag=OpencorporaTag('VERB,impf,tran sing,1per,pres,indc'), normal_form='пить', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'пью', 442, 1),)),
 Parse(word='киндзмараули', tag=OpencorporaTag('NOUN,inan,neut,Sgtm,Fixd sing,nomn'), normal_form='киндзмараули', score=0.16666666666666666, methods_stack=((<DictionaryAnalyzer>, 'киндзмараули', 146, 0),)),
 Parse(word='кричу', tag=OpencorporaTag('VERB,impf,intr sing,1per,pres,indc'), normal_form='кричать', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'кричу', 534, 1),)),
 Parse(word='пернатым', tag=OpencorporaTag('ADJF,Qual masc,sing,ablt'), normal_form='пернатый', score=0.2, methods_stack=((<DictionaryAnalyzer>, 'пернатым', 4, 5),)),
 Parse(word='гули-гули', tag=OpencorporaTag('INTJ'), normal_form='гули-гули', score=1.0, methods_stack=((<DictionaryAnalyzer>, 'гули-гули', 20, 0),))]

In [ ]: