This notebook was put together by [Roman Prokofyev](http://prokofyev.ch) at the [eXascale Infolab](http://exascale.info/). Source and license info is on [GitHub](https://github.com/dragoon/kilogram/).
This notebook is part of a bigger tutorial on fixing grammatical edits.
You will need the following Python packages to run the notebook: `pandas`, `scikit-learn`, `pydot`, and `kilogram`.
Download the Stanford POS Tagger; at the time of writing, the latest version is 3.5: http://nlp.stanford.edu/software/stanford-postagger-2014-10-26.zip. It requires Java 8 to run.
Start the POS tagger as a network service:
java -mx300m -cp stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTaggerServer -model models/english-bidirectional-distsim.tagger -port 2020 -tokenize false
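Once the tagger is running, you can verify that the service is reachable with a quick connection check (a minimal sketch, assuming the tagger listens on localhost:2020 as in the command above):

import socket
# Open a TCP connection to the tagger service; raises socket.error if it is down
socket.create_connection(('localhost', 2020), timeout=5).close()
print 'POS tagger service is reachable'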
In [1]:
from __future__ import division
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import StratifiedShuffleSplit, ShuffleSplit
from kilogram import NgramService
from kilogram import EditNgram
# Load the set of prepositions we consider as substitution candidates
PREPS = set(open('../extra/preps.txt').read().split('\n'))
PREPS_SORTED = sorted(PREPS)
# Point the n-gram service at the MongoDB and HBase backends
NgramService.configure(PREPS_SORTED, mongo_host=('localhost', '27017'), hbase_host=('diufpc301', 9090))
ALLOWED_TYPES = {1, 2, 3}
In [2]:
from kilogram import extract_edits
# Extract preposition edits from the FCE edits file
prep_edits = extract_edits('/home/roman/fce_edits.tsv', substitutions=PREPS)
total_error = len([1 for x in prep_edits if x.is_error])
print 'Total errors:', total_error
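Before building features, it helps to eyeball a few of the extracted edits. A minimal sketch that relies only on the `Edit` objects' string representation:

In [ ]:
# Print the first few edits that are actual errors
for edit in [x for x in prep_edits if x.is_error][:5]:
    print edit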
In [3]:
from kilogram.edit import EditCollection
collection = EditCollection(prep_edits)
In [4]:
import re
from kilogram.edit import Edit
from kilogram.lang.tokenize import default_tokenize_func

# Matches annotated edits of the form "(wrong*/correct)"
EDIT_RE = re.compile(r'\(\w+\*/(\w+)\)')

test_col = []
conll_data = open('../extra/data/conll-test.txt').read().splitlines()
for line in conll_data:
    tokens = default_tokenize_func(line, set('!",:;<=>?[]{}.?'))
    # Replace every annotated edit with its correction to get a clean context
    context = EDIT_RE.sub(r'\1', ' '.join(tokens))
    for i, token in enumerate(tokens):
        if EDIT_RE.match(token):
            # Annotated error: split "(wrong*/correct)" into its two sides
            edit1, edit2 = token[1:-1].split('*/')
            test_col.append(Edit(edit1, edit2, context, context, (i, i+1), (i, i+1)))
        elif token in PREPS:
            # Correct preposition: record it as a non-error edit
            test_col.append(Edit(token, token, context, context, (i, i+1), (i, i+1)))
print 'Test collection size:', len(test_col)
print 'Total errors:', len([1 for x in test_col if x.is_error])
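To make the annotation format concrete, here is what `EDIT_RE` does on an invented example line, where "(on*/in)" marks an erroneous "on" corrected to "in":

In [ ]:
sample = 'He arrived (on*/in) the morning'
print EDIT_RE.sub(r'\1', sample)  # -> He arrived in the morning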
In [5]:
import numpy as np
import kilogram
kilogram.DEBUG = False
# Port of the Stanford tagger service started above
kilogram.edit.ST_PORT = 2020

res = []
for i in xrange(1):  # single round here; increase the range for more validation runs
    train_data, train_labels, feature_names = collection.balance_features(PREPS_SORTED, class1_k=1, class0_k=3046./60097)
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    print 'Fitting...'
    clf.fit(train_data, train_labels)
    print sorted(zip(list(clf.feature_importances_), feature_names), key=lambda x: x[0], reverse=True)
    print 'Validating...'
    score = collection.test_validation(PREPS_SORTED, clf, test_col)
    print score
    res.append(score)
print np.mean([x['f1'] for x in res]), np.std([x['f1'] for x in res])
print np.mean([x['precision'] for x in res]), np.std([x['precision'] for x in res])
print np.mean([x['recall'] for x in res]), np.std([x['recall'] for x in res])
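The raw importance dump above is hard to scan; a small sketch that lists only the ten most important features, reusing the fitted classifier:

In [ ]:
# Ten most important features, one per line
for importance, name in sorted(zip(clf.feature_importances_, feature_names), reverse=True)[:10]:
    print '%.4f  %s' % (importance, name)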
In [6]:
from IPython.display import Image  # needed to render the image in the notebook
import StringIO, pydot  # needed to convert dot format to png
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Fit a small, interpretable tree on the same training data
clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10)
clf.fit(train_data, train_labels)
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data, feature_names=feature_names)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
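To keep the rendered tree around outside the notebook, the same `pydot` graph can also be written straight to disk (the filename is arbitrary):

In [ ]:
# Save the rendered decision tree to a PNG file
graph.write_png('decision_tree.png')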
In [7]:
# Association measure for a skip-gram with a determiner (DT) slot
EditNgram(u'budget of <SKIP:DT> community'.split(), 1).association()
Out[7]:
In [8]:
a = prep_edits[3].get_single_feature(PREPS_SORTED, collection.TOP_POS_TAGS, collection.reverse_confusion_matrix())
# string representation of the first feature of this edit
str(a[0][0])
Out[8]:
In [9]:
# Inspect classification errors on the test collection
for x in collection.test_errors:
    print x[0], x[1]
In [ ]: