The classifier expects input text files containing lines of the form:
sentence id[tab]sentence[tab]None
etc.
The sentences should be tokenized, and tokens should be separated by a space.
It is best to have a single file for each text for which labels should be predicted.
The text files should be put together in a single directory.
Use notebook 00_CreateClassifiers to create the classifier and 01_CreateDataForPrediction to generate data in the correct format.
In [2]:
import os
# Base directory holding the EMBEM data; the txt/, ml/ and dict/ paths used
# below are all joined onto this root.
embem_data_dir = '/home/jvdzwaan/data/embem/'
In [3]:
# Annotation
# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/annotation-for_prediction-normalized/')
# specify the path where output should be written
# ('~' is expanded here because os.path.exists()/os.makedirs()/open() further
# down treat it as a literal directory name and would create './~/tmp/...')
out_dir = os.path.expanduser('~/tmp/annotation-predicted-heem-normalized/')
In [4]:
# Corpus big
# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/corpus_big-for_prediction-normalized/')
# specify the path where output should be written
# ('~' is expanded here because os.path.exists()/os.makedirs()/open() further
# down treat it as a literal directory name and would create './~/tmp/...')
out_dir = os.path.expanduser('~/tmp/corpus_big-predicted-heem-normalized/')
In [5]:
# Ceneton data
# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/ceneton-for_prediction-normalized/')
# specify the path where output should be written
# ('~' is expanded here because os.path.exists()/os.makedirs()/open() further
# down treat it as a literal directory name and would create './~/tmp/...')
out_dir = os.path.expanduser('~/tmp/ceneton-predicted-heem-normalized/')
In [7]:
# EDBO data
# path to the input data
data_dir = os.path.join(embem_data_dir, 'txt/edbo-for_prediction-normalized/')
# specify the path where output should be written
# ('~' is expanded here because os.path.exists()/os.makedirs()/open() further
# down treat it as a literal directory name and would create './~/tmp/...')
out_dir = os.path.expanduser('~/tmp/edbo-predicted-heem-normalized/')
In [8]:
import os

# Expand a leading '~' in out_dir: os.path.exists() and os.makedirs() do NOT
# expand it and would otherwise create a literal './~/tmp/...' directory.
# This is a no-op (idempotent) when out_dir is already an absolute path.
out_dir = os.path.expanduser(out_dir)
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# classifier file (pickled model loaded with joblib below)
classifier = '/home/jvdzwaan/data/classifier/classifier.pkl'
# train file — passed to get_data() together with each input file,
# presumably to reproduce the feature space the classifier was fitted on
train_file = os.path.join(embem_data_dir, 'ml/all_spellingnormalized.txt')
In [9]:
# NOTE(review): sklearn.externals.joblib was deprecated in scikit-learn 0.21
# and removed in 0.23; fall back to the standalone joblib package so the
# notebook keeps working on newer environments.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
import codecs

from utils import get_data, load_data

# load the trained classifier from disk
clf = joblib.load(classifier)

text_files = [fi for fi in os.listdir(data_dir) if fi.endswith('.txt')]
for i, text_file in enumerate(text_files, start=1):
    in_file = os.path.join(data_dir, text_file)
    print('({} of {}) {}'.format(i, len(text_files), text_file))

    # load data (train_file is supplied so get_data can build the same
    # feature representation the classifier was trained on)
    X_train, X_data, Y_train, Y_data, classes_ = get_data(train_file, in_file)

    # classify
    pred = clf.predict(X_data)

    # save results: one '<sentence data>\t<labels>' line per input sentence;
    # multiple predicted labels are joined with '_', no labels -> 'None'
    out_file = os.path.join(out_dir, text_file)
    X_data_with_ids, Y_data = load_data(in_file)
    with codecs.open(out_file, 'wb', 'utf8') as f:
        for x, y in zip(X_data_with_ids, pred):
            # load_data appears to yield bytes (the original always called
            # .decode); handle str too in case it already returns unicode
            # — TODO confirm against utils.load_data
            text = x.decode('utf8') if isinstance(x, bytes) else x
            labels = '_'.join(classes_[y]) or 'None'
            f.write(u'{}\t{}\n'.format(text, labels))
In [6]:
# make unnormalized version of predicted labels (needed before expanding body part labels)
# Arguments: <predicted-normalized dir> <unnormalized for_prediction dir> <output dir>
# Only the 'annotation' corpus is active; uncomment one of the lines below
# (and comment this one out) to process the other corpora instead.
%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/annotation-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/annotation-for_prediction/ /home/jvdzwaan/data/embem/txt/annotation-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/corpus_big-for_prediction/ /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/ceneton-for_prediction/ /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem
#%run merge_data_and_labels.py /home/jvdzwaan/data/embem/txt/edbo-predicted-heem-normalized/ /home/jvdzwaan/data/embem/txt/edbo-for_prediction/ /home/jvdzwaan/data/embem/txt/edbo-predicted-heem
In [7]:
# Expand body parts
# Arguments: <body part mapping json> <predicted-heem dir> <expanded output dir> <csv output>
# Only the 'annotation' corpus is active; uncomment one of the lines below
# (and comment this one out) to process the other corpora instead.
%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/annotation-predicted-heem/ /home/jvdzwaan/data/embem/txt/annotation-predicted-heem-expanded_body_parts /home/jvdzwaan/data/embem/dict/annotation_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem/ /home/jvdzwaan/data/embem/txt/corpus_big-predicted-heem-expanded_body_parts /home/jvdzwaan/data/embem/dict/corpus_big_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem/ /home/jvdzwaan/data/embem/txt/ceneton-predicted-heem-expanded_body_parts /home/jvdzwaan/data/embem/dict/ceneton_heem_expanded_body_parts.csv
#%run classify_body_parts.py /home/jvdzwaan/data/embem/dict/body_part_mapping.json /home/jvdzwaan/data/embem/txt/edbo-predicted-heem/ /home/jvdzwaan/data/embem/txt/edbo-predicted-heem-expanded_body_parts /home/jvdzwaan/data/embem/dict/edbo_heem_expanded_body_parts.csv
The next step is to look at the results!