Define the tagger by hand, as the tagger supplied with NLTK is too slow.

Taken from: http://streamhacker.com/2008/11/03/part-of-speech-tagging-with-nltk-part-1/ , http://streamhacker.com/2008/11/10/part-of-speech-tagging-with-nltk-part-2/ and http://streamhacker.com/2008/12/03/part-of-speech-tagging-with-nltk-part-3/



In [1]:

    
import pandas as pd
import numpy as np
import re
import string
import time
from bs4 import BeautifulSoup
import nltk, itertools
from nltk.tag import brill
import nltk.stem.wordnet
from nltk.corpus import wordnet



In [ ]:

    
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Only the leading letter of ``treebank_tag`` is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.

    Returns:
        The matching ``wordnet`` constant, or the empty string ``''`` when
        the tag starts with none of the letters above (the match is
        case-sensitive).
    """
    # (leading letter, attribute name on ``wordnet``) pairs, checked in order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if treebank_tag.startswith(prefix):
            # Resolve the constant only on a match, as the original did.
            return getattr(wordnet, attr_name)
    return ''



In [ ]:

    
def braubt_Tagger():
    """Train and return a Brill tagger layered over a backoff tagger chain.

    Training data is the first 1500 tagged sentences of
    ``nltk.corpus.treebank``.  The initial tagger is a backoff chain built as
    RegexpTagger <- AffixTagger <- UnigramTagger <- BigramTagger <-
    TrigramTagger (each tagger backs off to the one on its left).  A Brill
    trainer then learns up to 100 transformation rules (minimum score 3) on
    top of that chain.

    NOTE(review): ``brill.FastBrillTaggerTrainer`` and the
    ``*ProximateTokensTemplate`` / ``Proximate*Rule`` classes appear to be the
    NLTK 2.x Brill API -- confirm against the installed NLTK version.

    Returns:
        The tagger object returned by ``FastBrillTaggerTrainer.train``.
    """
    train_sents = list(nltk.corpus.treebank.tagged_sents()[:1500])

    def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
        """Chain ``tagger_classes`` so each one backs off to the previous.

        When no ``backoff`` is given, the first class is trained without a
        backoff and becomes the base of the chain.  Returns the last tagger
        built (the outermost link of the chain).
        """
        # Copy so the caller's sequence is never modified.
        remaining = list(tagger_classes)
        if backoff is None:
            backoff = remaining.pop(0)(tagged_sents)

        for cls in remaining:
            backoff = cls(tagged_sents, backoff=backoff)

        return backoff

    # Last-resort patterns for tokens none of the trained taggers can tag.
    # The first matching pattern wins, so order matters.
    word_patterns = [
        # NOTE: the '.' is unescaped, so any single character is accepted as
        # the separator (e.g. '3.14' and '3,000' both match).
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        (r'.*ould$', 'MD'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        # NOTE(review): 'PREP' does not look like a Penn Treebank tag; kept
        # unchanged from the original -- confirm intent.
        (r'^a$', 'PREP'),
    ]

    raubt_tagger = backoff_tagger(
        train_sents,
        [
            nltk.tag.AffixTagger,
            nltk.tag.UnigramTagger,
            nltk.tag.BigramTagger,
            nltk.tag.TrigramTagger,
        ],
        backoff=nltk.tag.RegexpTagger(word_patterns),
    )

    # Rule templates: every (rule class, window) combination, tag rules
    # first, followed by one "immediately before and after" template per
    # rule class.  The resulting order matches the hand-written list.
    rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)
    windows = ((1, 1), (2, 2), (1, 2), (1, 3))
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_cls, window)
        for rule_cls, window in itertools.product(rule_classes, windows)
    ]
    templates.extend(
        brill.ProximateTokensTemplate(rule_cls, (-1, -1), (1, 1))
        for rule_cls in rule_classes
    )

    trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
    braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)

    return braubt_tagger