TODO: create infostat training data

OntoNotes: Corpus & Annotation Scheme

  • test corpus: WSJ section of OntoNotes
  • annotation scheme:
    • APPOS: linking attributive/appositive NPs
    • IDENT: (other) coreference links
  • extracted all NPs with IDENT relation
    • marked them as anaphoric (AN), if they had an antecedent,
      i.e. an expression to its left referring to the same ID
    • otherwise marked as non-anaphoric (NON)

In [8]:
import cPickle as pickle
from itertools import chain, izip
import os
import re

import discoursegraphs as dg
import discourseinfostat as di

In [2]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split

from sklearn.svm import LinearSVC

vec = DictVectorizer()

In [3]:
# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)

In [37]:
def pickles2list(pickle_filepaths):
    for pickle_filepath in pickle_filepaths:
        with open(pickle_filepath) as pickle_file:
            yield pickle.load(pickle_file)

def corpus_stats2sklearn_input(corpus_stats):
    """
    Parameters
    ----------
    corpus_stats : list(tuple(str, str))
    list of (features pickle path, labels pickle path) tuples
    """
    feat_files, label_files = zip(*corpus_stats)
    samples = pickles2list(feat_files)
    labels = pickles2list(label_files)
    return chain(*samples), chain(*labels)

In [5]:
%%time
corpus = dg.read_exportxml(di.infostat.TUEBADZ8_FILE)
infostat = di.Infostat(corpus)


CPU times: user 56.2 s, sys: 812 ms, total: 57.1 s
Wall time: 57.2 s

In [40]:
# infostat._reset_corpus_iterator()

In [39]:
%%time
corpus_stats = infostat.corpus_stats()


CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 11 µs

In [41]:
%%time
samples, y = corpus_stats2sklearn_input(corpus_stats)


Document 'text_922' has maximum recursion depth exceeded
Document 'text_2877' has maximum recursion depth exceeded
Document 'text_3003' has maximum recursion depth exceeded
Document 'text_3175' has maximum recursion depth exceeded
CPU times: user 12min 10s, sys: 6.22 s, total: 12min 16s
Wall time: 12min 17s

In [42]:
%%time
X = vec.fit_transform(samples)


CPU times: user 23.3 s, sys: 1.16 s, total: 24.4 s
Wall time: 26.2 s

In [43]:
y_bool_array = np.array(list(y), dtype=bool)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y_bool_array, test_size=0.1, random_state=23)

Training a linear SVC w/out hyperparam tuning, no linguistic features


In [45]:
clf = LinearSVC()

In [46]:
%%time
clf.fit(X_train, y_train)


CPU times: user 4min 13s, sys: 9.08 ms, total: 4min 13s
Wall time: 4min 13s
Out[46]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [47]:
# clf.predict(X_test)

In [48]:
clf.score(X_test, y_test)


Out[48]:
0.83496788585054416

In [49]:
from sklearn.externals import joblib

In [52]:
classifier_path = os.path.expanduser('~/repos/discourseinfostat/results')

joblib.dump(clf, os.path.join(classifier_path, 'linear_svc_clf.pkl'))


Out[52]:
['/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_03.npy']

In [53]:
joblib.dump(X_train, os.path.join(classifier_path, 'X_train.pkl'))


Out[53]:
['/home/arne/repos/discourseinfostat/results/X_train.pkl',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_03.npy']

In [54]:
joblib.dump(X_test, os.path.join(classifier_path, 'X_test.pkl'))


Out[54]:
['/home/arne/repos/discourseinfostat/results/X_test.pkl',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_03.npy']

In [55]:
joblib.dump(y_train, os.path.join(classifier_path, 'y_train.pkl'))


Out[55]:
['/home/arne/repos/discourseinfostat/results/y_train.pkl',
 '/home/arne/repos/discourseinfostat/results/y_train.pkl_01.npy']

In [56]:
joblib.dump(y_test, os.path.join(classifier_path, 'y_test.pkl'))


Out[56]:
['/home/arne/repos/discourseinfostat/results/y_test.pkl',
 '/home/arne/repos/discourseinfostat/results/y_test.pkl_01.npy']

In [58]:
joblib.dump(X, os.path.join(classifier_path, 'X.pkl'))


Out[58]:
['/home/arne/repos/discourseinfostat/results/X.pkl',
 '/home/arne/repos/discourseinfostat/results/X.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X.pkl_03.npy']

In [59]:
joblib.dump(y, os.path.join(classifier_path, 'y.pkl'))


Out[59]:
['/home/arne/repos/discourseinfostat/results/y.pkl']

In [ ]: