Pipeline for classifying texts

The classifier expects input text files containing lines of the form:
sentence id[tab]sentence[tab]None
etc.

The sentences should be tokenized, and tokens should be separated by a space.

It is best to have a single file for each text for which labels should be predicted.

The text files should be put together in a single directory.

Use notebook 00_CreateClassifiers to create the classifier and 01_CreateDataForPrediction to generate data in the correct format.


In [2]:
import os

# Root directory of the embem data set; every input/output path in this
# notebook is built relative to this location.
embem_data_dir = '/home/jvdzwaan/data/embem/'

In [3]:
# Annotation
# NOTE: run exactly ONE of the dataset cells (Annotation / Corpus big /
# Ceneton / EDBO) before continuing — each cell overwrites data_dir and out_dir.

# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/annotation-for_prediction-normalized/')

# specify the path where output should be written
out_dir = '~/tmp/annotation-predicted-heem-normalized/'

In [4]:
# Corpus big
# NOTE: run exactly ONE of the dataset cells (Annotation / Corpus big /
# Ceneton / EDBO) before continuing — each cell overwrites data_dir and out_dir.

# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/corpus_big-for_prediction-normalized/')

# specify the path where output should be written
out_dir = '~/tmp/corpus_big-predicted-heem-normalized/'

In [5]:
# Ceneton data
# NOTE: run exactly ONE of the dataset cells (Annotation / Corpus big /
# Ceneton / EDBO) before continuing — each cell overwrites data_dir and out_dir.

# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/ceneton-for_prediction-normalized/')

# specify the path where output should be written
out_dir = '~/tmp/ceneton-predicted-heem-normalized/'

In [7]:
# EDBO data
# NOTE: run exactly ONE of the dataset cells (Annotation / Corpus big /
# Ceneton / EDBO) before continuing — each cell overwrites data_dir and out_dir.

# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/edbo-for_prediction-normalized/')

# specify the path where output should be written
out_dir = '~/tmp/edbo-predicted-heem-normalized/'

In [8]:
import os

# BUG FIX: out_dir is configured with a leading '~', which os.path.exists /
# os.makedirs do NOT expand — the original code silently created a literal
# './~/tmp/...' directory. Expand the user home directory first.
out_dir = os.path.expanduser(out_dir)

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# classifier file
classifier = '/home/jvdzwaan/data/classifier/classifier.pkl'

# train file
train_file = os.path.join(embem_data_dir, 'ml/all_spellingnormalized.txt')

In [9]:
from sklearn.externals import joblib
import codecs
from utils import get_data, load_data

# Restore the trained classifier from disk.
clf = joblib.load(classifier)

# Every .txt file in the input directory is classified separately.
text_files = [name for name in os.listdir(data_dir) if name.endswith('.txt')]
num_files = len(text_files)

for idx, text_file in enumerate(text_files, start=1):
    in_file = os.path.join(data_dir, text_file)
    print('({} of {}) {}'.format(idx, num_files, text_file))

    # Vectorize the training data and this text together so the
    # feature spaces line up.
    X_train, X_data, Y_train, Y_data, classes_ = get_data(train_file, in_file)

    # classify the sentences of this text
    pred = clf.predict(X_data)

    # output file keeps the same name as the input file
    out_file = os.path.join(out_dir, text_file)

    # Re-read the raw input lines so sentence ids can be written out
    # next to the predicted labels.
    X_data_with_ids, Y_data = load_data(in_file)

    with codecs.open(out_file, 'wb', 'utf8') as f:
        for line, label in zip(X_data_with_ids, pred):
            # Multi-label predictions are joined with '_'; an empty
            # prediction is written as 'None'.
            f.write(u'{}\t{}\n'.format(line.decode('utf8'),
                                       '_'.join(classes_[label]) or 'None'))


(1 of 67) Ff48fdcd8588e.txt
(2 of 67) F1bdd71564d85.txt
(3 of 67) F168fa8f9842e.txt
(4 of 67) F5a1168654ba9.txt
(5 of 67) Feb376834078a.txt
(6 of 67) F934371ae3480.txt
(7 of 67) Fde99457f05b2.txt
(8 of 67) Ff1277be87709.txt
(9 of 67) F2def6fc1991d.txt
(10 of 67) Fafac452c47f2.txt
(11 of 67) F1fc4e2f26f83.txt
(12 of 67) F00e07fe03042.txt
(13 of 67) Fe2261ffc9608.txt
(14 of 67) Fdec68314024d.txt
(15 of 67) Fde9ccd6aba49.txt
(16 of 67) Ff181f3aadced.txt
(17 of 67) F5758b6f36ced.txt
(18 of 67) Fc2ab541f9310.txt
(19 of 67) F16f680783b48.txt
(20 of 67) F0c180c461248.txt
(21 of 67) F237437d6c466.txt
(22 of 67) F1d3f48d0974d.txt
(23 of 67) F9d26211403b2.txt
(24 of 67) Fdf230a240d87.txt
(25 of 67) Fc053722ef59a.txt
(26 of 67) F572e994794fd.txt
(27 of 67) F88892c3a86b1.txt
(28 of 67) F4b5458704e16.txt
(29 of 67) F8888dddfd2dc.txt
(30 of 67) Fa252607e6eef.txt
(31 of 67) Fbfac05bb416b.txt
(32 of 67) Fba52678c1f75.txt
(33 of 67) Fe0a7e4c6ab5f.txt
(34 of 67) F78efb1028513.txt
(35 of 67) Ff899faf1f27a.txt
(36 of 67) F3638a5322877.txt
(37 of 67) F62f2a9465577.txt
(38 of 67) Fd52ac420d560.txt
(39 of 67) F6e2ee00f7923.txt
(40 of 67) Fe5ba27c960ef.txt
(41 of 67) Ffc4c9ad7e26b.txt
(42 of 67) Fc3b2e33a908e.txt
(43 of 67) Fc88c1f5207f6.txt
(44 of 67) F5621245aa9ff.txt
(45 of 67) F9f53f91c8b33.txt
(46 of 67) F1efc22bbaaef.txt
(47 of 67) F25bc6d62c587.txt
(48 of 67) Fea3a73500ba3.txt
(49 of 67) F8d8b1f0c2db8.txt
(50 of 67) F2956ed0af5d1.txt
(51 of 67) F5dce36e5dbe0.txt
(52 of 67) F2dd6e857dc91.txt
(53 of 67) Ff019d344615a.txt
(54 of 67) Fa105fea365b2.txt
(55 of 67) F84a0273f4636.txt
(56 of 67) F2326ec0fa910.txt
(57 of 67) Ffd48e027871e.txt
(58 of 67) Fcc6232440c73.txt
(59 of 67) F5386951ff0d9.txt
(60 of 67) F3d0b532583ca.txt
(61 of 67) F584bfe7ebaa7.txt
(62 of 67) F99490d2f3f26.txt
(63 of 67) Fccb23dad4e3b.txt
(64 of 67) F508fe5d4fd32.txt
(65 of 67) Fb167a7fd9cf3.txt
(66 of 67) F75ad7db0a874.txt
(67 of 67) Ffac41d4f3021.txt

In [6]:
# make unnormalized version of predicted labels (needed before expanding body part labels)
# Usage: merge_data_and_labels.py <predicted-normalized dir> <for_prediction dir> <output dir>
# Uncomment (only) the line matching the dataset that was classified above.

%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/annotation-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/annotation-for_prediction/ /home/jvdzwaan/data/embem/txt/annotation-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/corpus_big-for_prediction/ /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/ceneton-for_prediction/ /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/edbo-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/edbo-for_prediction/ /home/jvdzwaan/data/embem/txt/edbo-predicted-heem


(1 of 29) vond001gysb04.txt
(2 of 29) ross006zing01.txt
(3 of 29) huyd001achi01.txt
(4 of 29) hoof001gran01.txt
(5 of 29) stee033adag01.txt
(6 of 29) rivi001jeug01.txt
(7 of 29) fres003pefr01.txt
(8 of 29) bidl001nede01.txt
(9 of 29) hare003agon01.txt
(10 of 29) alew001puit01.txt
(11 of 29) hoof001achi01.txt
(12 of 29) lijn002vlug01.txt
(13 of 29) bred001moor01.txt
(14 of 29) stee033tham01.txt
(15 of 29) alew001besl01.txt
(16 of 29) vinc001pefr02.txt
(17 of 29) bren001scha01.txt
(18 of 29) lang020chph01.txt
(19 of 29) bren001goud01.txt
(20 of 29) meij001verl01.txt
(21 of 29) vond001jose05.txt
(22 of 29) vond001pala01.txt
(23 of 29) rivi001vero01.txt
(24 of 29) pels001verw02.txt
(25 of 29) focq001mini02.txt
(26 of 29) noms001mich01.txt
(27 of 29) weye002holl01.txt
(28 of 29) vos_002kluc01.txt
(29 of 29) ling001ontd01.txt

In [7]:
# Expand body parts
# Usage: classify_body_parts.py <body_part_mapping.json> <predicted dir> <output dir> <summary csv>
# Uncomment (only) the line matching the dataset that was classified above.
# NOTE(review): the cell output reports a UnicodeWarning from
# classify_body_parts.py:34 (byte string compared to unicode keys) — the
# comparison silently fails for non-ASCII words; confirm the script decodes
# its input before matching against word2cat.

%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/annotation-predicted-heem/ /home/jvdzwaan/data/embem/txt/annotation-predicted-heem-expanded_body_parts  /home/jvdzwaan/data/embem/dict/annotation_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem/ /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem-expanded_body_parts  /home/jvdzwaan/data/embem/dict/corpus_big_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem/ /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem-expanded_body_parts  /home/jvdzwaan/data/embem/dict/ceneton_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/edbo-predicted-heem/ /home/jvdzwaan/data/embem/txt/edbo-predicted-heem-expanded_body_parts  /home/jvdzwaan/data/embem/dict/edbo_heem_expanded_body_parts.csv


ignored: rose-kaken (cheeks)
classify_body_parts.py:34: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if w in word2cat.keys():

The next step is to look at the results!