In [1]:
from nltk.corpus import conll2000

In [44]:
print conll2000.chunked_sents('train.txt')[0]

import nltk
print nltk.chunk.tree2conlltags(conll2000.chunked_sents('train.txt')[0])


(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)
[(u'Confidence', u'NN', u'B-NP'), (u'in', u'IN', u'B-PP'), (u'the', u'DT', u'B-NP'), (u'pound', u'NN', u'I-NP'), (u'is', u'VBZ', u'B-VP'), (u'widely', u'RB', u'I-VP'), (u'expected', u'VBN', u'I-VP'), (u'to', u'TO', u'I-VP'), (u'take', u'VB', u'I-VP'), (u'another', u'DT', u'B-NP'), (u'sharp', u'JJ', u'I-NP'), (u'dive', u'NN', u'I-NP'), (u'if', u'IN', u'O'), (u'trade', u'NN', u'B-NP'), (u'figures', u'NNS', u'I-NP'), (u'for', u'IN', u'B-PP'), (u'September', u'NNP', u'B-NP'), (u',', u',', u'O'), (u'due', u'JJ', u'O'), (u'for', u'IN', u'B-PP'), (u'release', u'NN', u'B-NP'), (u'tomorrow', u'NN', u'B-NP'), (u',', u',', u'O'), (u'fail', u'VB', u'B-VP'), (u'to', u'TO', u'I-VP'), (u'show', u'VB', u'I-VP'), (u'a', u'DT', u'B-NP'), (u'substantial', u'JJ', u'I-NP'), (u'improvement', u'NN', u'I-NP'), (u'from', u'IN', u'B-PP'), (u'July', u'NNP', u'B-NP'), (u'and', u'CC', u'I-NP'), (u'August', u'NNP', u'I-NP'), (u"'s", u'POS', u'B-NP'), (u'near-record', u'JJ', u'I-NP'), (u'deficits', u'NNS', u'I-NP'), (u'.', u'.', u'O')]

In [39]:
def conll_transform_sentence(sentence, i):
    """
    i is the word position in the sentence.
    """
    if i < 2:
        no = "<start>"
        nopos = "<startPOS>"
        result = {'word_t1':no, 'pos_t1':nopos, 'word_t2':no, 'pos_t2':nopos, 'curr_word': sentence[i][0], 'curr_pos': sentence[i][1]}
    else:
        word_t1 = sentence[i-1][0]
        pos_t1 = sentence[i-1][1]
        word_t2 = sentence[i-2][0]
        pos_t2 = sentence[i-2][1]
        result = {'word_t1':word_t1, 'pos_t1':pos_t1, 'word_t2':word_t2, 'pos_t2':pos_t2, 'curr_word': sentence[i][0], 'curr_pos': sentence[i][1]}
    return result

def get_label(token):
    return token[2]

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_score, recall_score
class SequentialTrainer(object):
    """sequentially train"""
    def __init__(self, model, xform, label_func):
        self._model = model
        self._xform = xform
        self._label_func = label_func
        
        self.enc = DictVectorizer()
        self.lbls_enc = LabelEncoder()
    
    def train(self, training_sentences):
        transformed_train_x = []
        train_y = []
        for sent in training_sentences:
            for i in xrange(len(sent)):
                label = self._label_func(sent[i])
                temp = self._xform(sent, i)
                transformed_train_x.append(temp)
                train_y.append(label)

        labels = self.lbls_enc.fit_transform(train_y)
        encoded = self.enc.fit_transform(transformed_train_x)
        
        self._model.fit(encoded, labels)
        self._model.enc = self.enc
        self._model.lbls_enc = self.lbls_enc
        self._model.xform = self._xform
        
        return self._model
        
    def eval(self, testing):
        transformed_test_x = []
        test_y = []
        for sent in testing:
            for i in xrange(len(sent)):
                label = self._label_func(sent[i])
                temp = self._xform(sent, i)
                transformed_test_x.append(temp)
                test_y.append(label)

        labels = self.lbls_enc.transform(test_y)
        encoded = self.enc.transform(transformed_test_x)
        
        predict = self._model.predict(encoded)
        precision = precision_score(labels, predict)
        recall = recall_score(labels, predict)
        return precision, recall

In [40]:
training = (nltk.chunk.tree2conlltags(sent) for sent in conll2000.chunked_sents('train.txt'))

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
st = SequentialTrainer(lr, conll_transform_sentence, get_label)
st.train(training)

In [41]:
testing = (nltk.chunk.tree2conlltags(sent) for sent in conll2000.chunked_sents('test.txt'))

p,r = st.eval(testing)


/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1082: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1172: DeprecationWarning: The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".
  sample_weight=sample_weight)

In [42]:
print p
print r


0.930700475702
0.930641450493