Define the tagger by hand, because the tagger supplied with NLTK is too slow.
In [1]:
import pandas as pd
import numpy as np
import re
import string
import time
from bs4 import BeautifulSoup
import nltk, itertools
from nltk.tag import brill
import nltk.stem.wordnet
from nltk.corpus import wordnet
In [ ]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    The tag is matched on its leading letter:

    * ``J...`` -> ``wordnet.ADJ``
    * ``V...`` -> ``wordnet.VERB``
    * ``N...`` -> ``wordnet.NOUN``
    * ``R...`` -> ``wordnet.ADV``

    Any other tag (including the empty string) yields ``''``.
    """
    # Checked in the same order as the original if/elif chain.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # No WordNet equivalent for this tag.
    return ''
In [ ]:
def braubt_Tagger():
    """Train and return a Brill tagger stacked on a regexp/affix/n-gram chain.

    Training data is the first 1500 tagged sentences of
    ``nltk.corpus.treebank``.  The initial tagger is a backoff chain built
    innermost-first: RegexpTagger <- AffixTagger <- UnigramTagger <-
    BigramTagger <- TrigramTagger (each tagger falls back to the one on its
    left).  A Brill trainer then learns up to 100 correction rules with a
    minimum score of 3 on the same training sentences.

    Returns:
        The tagger object produced by ``trainer.train(...)``.

    NOTE(review): ``brill.SymmetricProximateTokensTemplate``,
    ``brill.ProximateTokensTemplate``, ``brill.Proximate*Rule`` and
    ``brill.FastBrillTaggerTrainer`` appear to be NLTK 2.x names; confirm the
    installed NLTK version still provides them.
    """
    train_sents = list(nltk.corpus.treebank.tagged_sents()[:1500])

    def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
        """Chain ``tagger_classes`` so each one backs off to the previous.

        If no ``backoff`` is supplied, the first class is trained without a
        backoff and becomes the base of the chain.  The caller's list is
        never modified.
        """
        # Work on a copy: the original implementation deleted the first
        # element from the list the caller passed in.
        remaining = list(tagger_classes)
        if backoff is None:
            backoff = remaining.pop(0)(tagged_sents)
        for cls in remaining:
            backoff = cls(tagged_sents, backoff=backoff)
        return backoff

    # Last-resort patterns, tried in order; the first match wins.
    word_patterns = [
        # The decimal point is escaped so that e.g. '1a5' is not tagged CD.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'.*ould$', 'MD'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        # NOTE(review): 'PREP' is kept from the original; it does not look
        # like a Penn Treebank tag -- confirm whether 'DT' was intended.
        (r'^a$', 'PREP'),
    ]

    raubt_tagger = backoff_tagger(
        train_sents,
        [
            nltk.tag.AffixTagger,
            nltk.tag.UnigramTagger,
            nltk.tag.BigramTagger,
            nltk.tag.TrigramTagger,
        ],
        backoff=nltk.tag.RegexpTagger(word_patterns),
    )

    # Same templates, in the same order, as the original hand-written list:
    # four symmetric windows for tag rules, then the same four for word
    # rules, followed by the two "previous and next token" templates.
    rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)
    windows = ((1, 1), (2, 2), (1, 2), (1, 3))
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_cls, window)
        for rule_cls in rule_classes
        for window in windows
    ]
    templates.extend(
        brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
        for rule_cls in rule_classes
    )

    trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
    return braubt_tagger