# In [1]:
from nltk.corpus import conll2000

# In [44]:
# Show one training sentence as an NLTK chunk tree, then as CoNLL
# (word, POS, IOB-chunk-tag) triples via tree2conlltags.
print(conll2000.chunked_sents('train.txt')[0])
import nltk
print(nltk.chunk.tree2conlltags(conll2000.chunked_sents('train.txt')[0]))
def conll_transform_sentence(sentence, i):
    """Build a feature dict for the token at position *i* of *sentence*.

    Parameters
    ----------
    sentence : sequence of (word, pos, ...) tuples
        A CoNLL-style tagged sentence.
    i : int
        The word position in the sentence.

    Returns
    -------
    dict
        The current word/POS plus the two preceding words/POS
        (``word_t1``/``pos_t1`` and ``word_t2``/``pos_t2``), with sentinel
        values for positions that fall before the start of the sentence.
    """
    pad_word = "<start>"
    pad_pos = "<startPOS>"
    # Pad each lag independently: the original code fell back to sentinels
    # for BOTH lags whenever i < 2, discarding the real i-1 context at i == 1.
    if i >= 1:
        word_t1, pos_t1 = sentence[i - 1][0], sentence[i - 1][1]
    else:
        word_t1, pos_t1 = pad_word, pad_pos
    if i >= 2:
        word_t2, pos_t2 = sentence[i - 2][0], sentence[i - 2][1]
    else:
        word_t2, pos_t2 = pad_word, pad_pos
    return {
        'word_t1': word_t1, 'pos_t1': pos_t1,
        'word_t2': word_t2, 'pos_t2': pos_t2,
        'curr_word': sentence[i][0], 'curr_pos': sentence[i][1],
    }
def get_label(token):
    """Return the target label stored in the third slot of a CoNLL token.

    For (word, pos, chunk_tag) triples this is the IOB chunk tag.
    """
    LABEL_INDEX = 2
    return token[LABEL_INDEX]
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import precision_score, recall_score
class SequentialTrainer(object):
    """Fit a token classifier on per-token features from tagged sentences.

    Parameters
    ----------
    model : estimator
        sklearn-style object with ``fit``/``predict``.
    xform : callable
        ``xform(sentence, i)`` -> feature dict for the token at position i.
    label_func : callable
        ``label_func(token)`` -> the token's target label.
    """

    def __init__(self, model, xform, label_func):
        self._model = model
        self._xform = xform
        self._label_func = label_func
        self.enc = DictVectorizer()     # feature dicts -> feature matrix
        self.lbls_enc = LabelEncoder()  # string labels -> integer ids

    def _extract(self, sentences):
        """Flatten *sentences* into parallel (feature-dict, label) lists.

        Shared by train() and eval(), which previously duplicated this loop.
        """
        features = []
        labels = []
        for sent in sentences:
            for i, token in enumerate(sent):
                features.append(self._xform(sent, i))
                labels.append(self._label_func(token))
        return features, labels

    def train(self, training_sentences):
        """Fit the encoders and the model on *training_sentences*.

        Returns the fitted model, with the fitted encoders and the feature
        transform attached so it can be used stand-alone.
        """
        features, raw_labels = self._extract(training_sentences)
        labels = self.lbls_enc.fit_transform(raw_labels)
        encoded = self.enc.fit_transform(features)
        self._model.fit(encoded, labels)
        self._model.enc = self.enc
        self._model.lbls_enc = self.lbls_enc
        self._model.xform = self._xform
        return self._model

    def eval(self, testing):
        """Return (precision, recall) of the trained model on *testing*.

        Must be called after train(): reuses the fitted encoders.
        """
        features, raw_labels = self._extract(testing)
        labels = self.lbls_enc.transform(raw_labels)
        encoded = self.enc.transform(features)
        predict = self._model.predict(encoded)
        # NOTE(review): with multiclass labels, modern sklearn requires an
        # explicit ``average=`` for these scorers — confirm the sklearn
        # version this targets before upgrading.
        precision = precision_score(labels, predict)
        recall = recall_score(labels, predict)
        return precision, recall
# In [40]:
# Train a logistic-regression chunk tagger on the CoNLL-2000 training split.
training = (nltk.chunk.tree2conlltags(sent) for sent in conll2000.chunked_sents('train.txt'))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
st = SequentialTrainer(lr, conll_transform_sentence, get_label)
st.train(training)

# In [41]:
# Evaluate on the held-out test split.
testing = (nltk.chunk.tree2conlltags(sent) for sent in conll2000.chunked_sents('test.txt'))
p, r = st.eval(testing)

# In [42]:
print(p)
print(r)