TODO: create infostat training data

OntoNotes: Corpus & Annotation Scheme

test corpus: WSJ section of OntoNotes
annotation scheme:
- APPOS: linking attributive/appositive NPs
- IDENT: (other) coreference links
extracted all NPs with IDENT relation
- marked them as anaphoric (AN), if they had an antecedent,
  i.e. an expression to its left referring to the same ID
- otherwise marked as non-anaphoric (NON)



In [8]:

    
import cPickle as pickle
from itertools import chain, izip
import os
import re

import discoursegraphs as dg
import discourseinfostat as di



In [2]:

    
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.cross_validation import train_test_split

from sklearn.svm import LinearSVC

vec = DictVectorizer()



In [3]:

    
# from sklearn import svm
# clf = svm.SVC(gamma=0.001, C=100.)



In [37]:

    
def pickles2list(pickle_filepaths):
    for pickle_filepath in pickle_filepaths:
        with open(pickle_filepath) as pickle_file:
            yield pickle.load(pickle_file)

def corpus_stats2sklearn_input(corpus_stats):
    """
    Parameters
    ----------
    corpus_stats : list(tuple(str, str))
    list of (features pickle path, labels pickle path) tuples
    """
    feat_files, label_files = zip(*corpus_stats)
    samples = pickles2list(feat_files)
    labels = pickles2list(label_files)
    return chain(*samples), chain(*labels)



In [5]:

    
%%time
corpus = dg.read_exportxml(di.infostat.TUEBADZ8_FILE)
infostat = di.Infostat(corpus)









    



CPU times: user 56.2 s, sys: 812 ms, total: 57.1 s
Wall time: 57.2 s



In [40]:

    
# infostat._reset_corpus_iterator()



In [39]:

    
%%time
corpus_stats = infostat.corpus_stats()









    



CPU times: user 6 µs, sys: 1 µs, total: 7 µs
Wall time: 11 µs



In [41]:

    
%%time
samples, y = corpus_stats2sklearn_input(corpus_stats)









    



Document 'text_922' has maximum recursion depth exceeded
Document 'text_2877' has maximum recursion depth exceeded
Document 'text_3003' has maximum recursion depth exceeded
Document 'text_3175' has maximum recursion depth exceeded
CPU times: user 12min 10s, sys: 6.22 s, total: 12min 16s
Wall time: 12min 17s



In [42]:

    
%%time
X = vec.fit_transform(samples)









    



CPU times: user 23.3 s, sys: 1.16 s, total: 24.4 s
Wall time: 26.2 s



In [43]:

    
y_bool_array = np.array(list(y), dtype=bool)



In [44]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y_bool_array, test_size=0.1, random_state=23)

Training a linear SVC w/out hyperparam tuning, no linguistic features



In [45]:

    
clf = LinearSVC()



In [46]:

    
%%time
clf.fit(X_train, y_train)









    



CPU times: user 4min 13s, sys: 9.08 ms, total: 4min 13s
Wall time: 4min 13s






    Out[46]:





LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



In [47]:

    
# clf.predict(X_test)



In [48]:

    
clf.score(X_test, y_test)









    Out[48]:





0.83496788585054416



In [49]:

    
from sklearn.externals import joblib



In [52]:

    
classifier_path = os.path.expanduser('~/repos/discourseinfostat/results')

joblib.dump(clf, os.path.join(classifier_path, 'linear_svc_clf.pkl'))









    Out[52]:





['/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/linear_svc_clf.pkl_03.npy']



In [53]:

    
joblib.dump(X_train, os.path.join(classifier_path, 'X_train.pkl'))









    Out[53]:





['/home/arne/repos/discourseinfostat/results/X_train.pkl',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X_train.pkl_03.npy']



In [54]:

    
joblib.dump(X_test, os.path.join(classifier_path, 'X_test.pkl'))









    Out[54]:





['/home/arne/repos/discourseinfostat/results/X_test.pkl',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X_test.pkl_03.npy']



In [55]:

    
joblib.dump(y_train, os.path.join(classifier_path, 'y_train.pkl'))









    Out[55]:





['/home/arne/repos/discourseinfostat/results/y_train.pkl',
 '/home/arne/repos/discourseinfostat/results/y_train.pkl_01.npy']



In [56]:

    
joblib.dump(y_test, os.path.join(classifier_path, 'y_test.pkl'))









    Out[56]:





['/home/arne/repos/discourseinfostat/results/y_test.pkl',
 '/home/arne/repos/discourseinfostat/results/y_test.pkl_01.npy']



In [58]:

    
joblib.dump(X, os.path.join(classifier_path, 'X.pkl'))









    Out[58]:





['/home/arne/repos/discourseinfostat/results/X.pkl',
 '/home/arne/repos/discourseinfostat/results/X.pkl_01.npy',
 '/home/arne/repos/discourseinfostat/results/X.pkl_02.npy',
 '/home/arne/repos/discourseinfostat/results/X.pkl_03.npy']



In [59]:

    
joblib.dump(y, os.path.join(classifier_path, 'y.pkl'))









    Out[59]:





['/home/arne/repos/discourseinfostat/results/y.pkl']



In [ ]: