This notebook was put together by [Roman Prokofyev](http://prokofyev.ch) at the [eXascale Infolab](http://exascale.info/). Source and license info is on [GitHub](https://github.com/dragoon/kilogram/).
This notebook is part of a bigger tutorial on fixing grammatical edits.
You will need the following Python packages to run the notebook: `pandas`, `scikit-learn`, `pydot`, and `kilogram`.
Download the Stanford POS Tagger; at the time of writing, the latest version is 3.5: http://nlp.stanford.edu/software/stanford-postagger-2014-10-26.zip. It requires Java 8 to run.
Start the POS tagger as a network service:
java -mx300m -cp stanford-postagger.jar edu.stanford.nlp.tagger.maxent.MaxentTaggerServer -model models/english-bidirectional-distsim.tagger -port 2020 -tokenize false
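Once the tagger is running, you can verify that the service is reachable with a quick connection check (a minimal sketch, assuming the tagger listens on localhost:2020 as in the command above):

import socket
# Open a TCP connection to the tagger service; raises socket.error if it is down
socket.create_connection(('localhost', 2020), timeout=5).close()
print 'POS tagger service is reachable'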
In [1]:
from __future__ import division
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import StratifiedShuffleSplit, ShuffleSplit
from kilogram import NgramService
from kilogram import EditNgram
# Load the set of prepositions we consider as substitution candidates
PREPS = set(open('../extra/preps.txt').read().split('\n'))
PREPS_SORTED = sorted(PREPS)
# Point the n-gram service at the MongoDB and HBase backends
NgramService.configure(PREPS_SORTED, mongo_host=('localhost', '27017'), hbase_host=('diufpc301', 9090))
ALLOWED_TYPES = {1, 2, 3}
In [2]:
from kilogram import extract_edits
# Extract preposition edits from the FCE edits file
prep_edits = extract_edits('/home/roman/fce_edits.tsv', substitutions=PREPS)
total_error = len([1 for x in prep_edits if x.is_error])
print 'Total errors:', total_error
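Before building features, it helps to eyeball a few of the extracted edits. A minimal sketch that relies only on the `Edit` objects' string representation:

In [ ]:
# Print the first few edits that are actual errors
for edit in [x for x in prep_edits if x.is_error][:5]:
    print edit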
In [3]:
from kilogram.edit import EditCollection
collection = EditCollection(prep_edits)
In [4]:
import re
from kilogram.edit import Edit
from kilogram.lang.tokenize import default_tokenize_func

# Matches annotated edits of the form "(wrong*/correct)"
EDIT_RE = re.compile(r'\(\w+\*/(\w+)\)')

test_col = []
conll_data = open('../extra/data/conll-test.txt').read().splitlines()
for line in conll_data:
    tokens = default_tokenize_func(line, set('!",:;<=>?[]{}.?'))
    # Replace every annotated edit with its correction to get a clean context
    context = EDIT_RE.sub(r'\1', ' '.join(tokens))
    for i, token in enumerate(tokens):
        if EDIT_RE.match(token):
            # Annotated error: split "(wrong*/correct)" into its two sides
            edit1, edit2 = token[1:-1].split('*/')
            test_col.append(Edit(edit1, edit2, context, context, (i, i+1), (i, i+1)))
        elif token in PREPS:
            # Correct preposition: record it as a non-error edit
            test_col.append(Edit(token, token, context, context, (i, i+1), (i, i+1)))
print 'Test collection size:', len(test_col)
print 'Total errors:', len([1 for x in test_col if x.is_error])
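To make the annotation format concrete, here is what `EDIT_RE` does on an invented example line, where "(on*/in)" marks an erroneous "on" corrected to "in":

In [ ]:
sample = 'He arrived (on*/in) the morning'
print EDIT_RE.sub(r'\1', sample)  # -> He arrived in the morning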
In [5]:
import numpy as np
import kilogram
kilogram.DEBUG = False
# Port of the Stanford tagger service started above
kilogram.edit.ST_PORT = 2020

res = []
for i in xrange(1):  # single round here; increase the range for more validation runs
    train_data, train_labels, feature_names = collection.balance_features(PREPS_SORTED, class1_k=1, class0_k=3046./60097)
    clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
    print 'Fitting...'
    clf.fit(train_data, train_labels)
    print sorted(zip(list(clf.feature_importances_), feature_names), key=lambda x: x[0], reverse=True)
    print 'Validating...'
    score = collection.test_validation(PREPS_SORTED, clf, test_col)
    print score
    res.append(score)
print np.mean([x['f1'] for x in res]), np.std([x['f1'] for x in res])
print np.mean([x['precision'] for x in res]), np.std([x['precision'] for x in res])
print np.mean([x['recall'] for x in res]), np.std([x['recall'] for x in res])
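The raw importance dump above is hard to scan; a small sketch that lists only the ten most important features, reusing the fitted classifier:

In [ ]:
# Ten most important features, one per line
for importance, name in sorted(zip(clf.feature_importances_, feature_names), reverse=True)[:10]:
    print '%.4f  %s' % (importance, name)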
In [6]:
from IPython.display import Image  # needed to render the image in the notebook
import StringIO, pydot  # needed to convert dot format to png
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Fit a small, interpretable tree on the same training data
clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10)
clf.fit(train_data, train_labels)
dot_data = StringIO.StringIO()
export_graphviz(clf, out_file=dot_data, feature_names=feature_names)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
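To keep the rendered tree around outside the notebook, the same `pydot` graph can also be written straight to disk (the filename is arbitrary):

In [ ]:
# Save the rendered decision tree to a PNG file
graph.write_png('decision_tree.png')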
In [7]:
# Association measure for a skip-gram with a determiner (DT) slot
EditNgram(u'budget of <SKIP:DT> community'.split(), 1).association()
Out[7]:
In [8]:
a = prep_edits[3].get_single_feature(PREPS_SORTED, collection.TOP_POS_TAGS, collection.reverse_confusion_matrix())
# string representation of the first feature of this edit
str(a[0][0])
Out[8]:
In [9]:
# Inspect classification errors on the test collection
for x in collection.test_errors:
    print x[0], x[1]
In [ ]: