In [1]:
relation_ids = {
    "Other": 0,
    "Cause-Effect(e1,e2)": 1,
    "Instrument-Agency(e1,e2)": 2,
    "Product-Producer(e1,e2)": 3,
    "Content-Container(e1,e2)": 4,
    "Entity-Origin(e1,e2)": 5,
    "Entity-Destination(e1,e2)": 6,
    "Component-Whole(e1,e2)": 7,
    "Member-Collection(e1,e2)": 8,
    "Message-Topic(e1,e2)": 9,
}
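
Since the classifier later works with numeric class ids, it is handy to also keep the inverse mapping around; a one-line sketch derived from the dictionary above:

# Maps a numeric id back to its relation name, e.g. 1 -> "Cause-Effect(e1,e2)"
id_to_relation = {rel_id: name for name, rel_id in relation_ids.items()}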

In [2]:
from os.path import join

def find_between(s, a, b):
    """Return the stripped text between the markers a and b in s."""
    # Spaces inside the entity are kept here, so that multi-word
    # entities can be joined with underscores further below
    return s.split(a)[1].split(b)[0].strip()

def preprocess_data(path, out_path, save_opposites=False, num_rel=None):
    """Split the raw relation file at `path` into one word-pair file per
    relation type. `out_path` must contain a %s placeholder for the
    relation name; `num_rel` limits how many records are read."""
    print("Loading data from", path, "and saving preprocessed files to", out_path)
    in_file = open(path, encoding="utf-8")

    relation_files = {}
    for relation_name in relation_ids:
        relation_files[relation_name] = open(
            out_path % relation_name, "w", encoding="utf-8")

    while True:
        line = in_file.readline()

        if num_rel is not None:
            num_rel -= 1
            if num_rel < 0:
                break

        if not line:
            break

        text_id, text = line.split("\t", 1)

        word_a = find_between(text, "<e1>", "</e1>")
        word_b = find_between(text, "<e2>", "</e2>")

        relation = in_file.readline().strip()
        comment = in_file.readline()  # comment line, unused

        # Swap the words if the relation is annotated in (e2,e1) order,
        # so that word_a is always the first argument of the relation
        if relation != "Other" and "(e1,e2)" not in relation:
            word_a, word_b = word_b, word_a
            relation = relation.replace("(e2,e1)", "(e1,e2)")

        # Join multi-word entities with underscores to match phrase tokens
        word_a = word_a.replace(" ", "_").lower()
        word_b = word_b.replace(" ", "_").lower()

        relation_files[relation].write("%s %s\n" % (word_a, word_b))

        if save_opposites:
            relation_files["Other"].write("%s %s\n" % (word_b, word_a))

        # Skip the blank line that separates records
        in_file.readline()

    in_file.close()
    for relation_file in relation_files.values():
        relation_file.close()

train_file = "TRAIN_FILE.txt"
test_file = "TEST_FILE_FULL.txt"
out_path = "."

# Train

preprocess_data(train_file, join(
    out_path, "train_%s_1000.csv"), num_rel=1000)
preprocess_data(train_file, join(
    out_path, "train_%s_2000.csv"), num_rel=2000)
preprocess_data(train_file, join(
    out_path, "train_%s_4000.csv"), num_rel=4000)
preprocess_data(train_file, join(
    out_path, "train_%s_8000.csv"), num_rel=8000)

# Test
preprocess_data(test_file, join(out_path, "test_%s.csv"))


Loading data from TRAIN_FILE.txt and saving preprocessed files to .\train_%s_1000.csv
Loading data from TRAIN_FILE.txt and saving preprocessed files to .\train_%s_2000.csv
Loading data from TRAIN_FILE.txt and saving preprocessed files to .\train_%s_4000.csv
Loading data from TRAIN_FILE.txt and saving preprocessed files to .\train_%s_8000.csv
Loading data from TEST_FILE_FULL.txt and saving preprocessed files to .\test_%s.csv
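
The parser above assumes the four-line record layout of the SemEval-2010 Task 8 files: a tab-separated sentence id and entity-tagged sentence, the relation label, a comment line, and a blank separator. For reference, a record looks roughly like this:

8	"The <e1>author</e1> of a keygen uses a <e2>disassembler</e2> to look at the raw assembly code."
Instrument-Agency(e2,e1)
Comment: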

In [3]:
from ontokom.embeddings import create_relation_dataset, DataFrameEmbeddings
from glob import glob

embeddings = DataFrameEmbeddings("embeddings_acm_wiki_glove_300.h5")
embeddings.load()

relation_paths_train = glob("train_*_8000.csv")
relation_paths_test = glob("test_*.csv")

create_relation_dataset(embeddings, "relations_train_8000.h5", "labels_train_8000.h5", relation_paths_train,
                        unknown_word="<unk>")
create_relation_dataset(embeddings, "relations_test.h5", "labels_test.h5", relation_paths_test,
                        unknown_word="<unk>")


Processing relations at train_Cause-Effect(e1,e2)_8000.csv
1003it [00:01, 711.14it/s]
Processing relations at train_Component-Whole(e1,e2)_8000.csv
941it [00:00, 3034.99it/s]
Processing relations at train_Content-Container(e1,e2)_8000.csv
540it [00:00, 3120.99it/s]
Processing relations at train_Entity-Destination(e1,e2)_8000.csv
845it [00:00, 3001.40it/s]
Processing relations at train_Entity-Origin(e1,e2)_8000.csv
716it [00:00, 3254.14it/s]
Processing relations at train_Instrument-Agency(e1,e2)_8000.csv
504it [00:00, 3251.19it/s]
Processing relations at train_Member-Collection(e1,e2)_8000.csv
690it [00:00, 3121.77it/s]
Processing relations at train_Message-Topic(e1,e2)_8000.csv
634it [00:00, 2934.83it/s]
Processing relations at train_Other_8000.csv
1410it [00:00, 3074.86it/s]
Processing relations at train_Product-Producer(e1,e2)_8000.csv
717it [00:00, 3334.32it/s]
-- Relations statistics
	 8000 total relations
	 8000 found relations (100.00%)
	 0 invalid relations (0.00%)
	 0 unavailable relations (0.00%)
Relation embeddings count: 7664
Saving embeddings to relations_train_8000.h5
Saving labels to labels_train_8000.h5
F:\Anaconda3\lib\site-packages\pandas\io\pytables.py:280: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis1] [items->None]

  f(store)
Processing relations at test_Cause-Effect(e1,e2).csv
328it [00:00, 2954.40it/s]
Processing relations at test_Component-Whole(e1,e2).csv
312it [00:00, 3318.75it/s]
Processing relations at test_Content-Container(e1,e2).csv
192it [00:00, 3199.02it/s]
Processing relations at test_Entity-Destination(e1,e2).csv
292it [00:00, 3139.05it/s]
Processing relations at test_Entity-Origin(e1,e2).csv
258it [00:00, 3089.09it/s]
Processing relations at test_Instrument-Agency(e1,e2).csv
156it [00:00, 3088.05it/s]
Processing relations at test_Member-Collection(e1,e2).csv
233it [00:00, 3085.33it/s]
Processing relations at test_Message-Topic(e1,e2).csv
261it [00:00, 3069.80it/s]
Processing relations at test_Other.csv
454it [00:00, 2828.11it/s]
Processing relations at test_Product-Producer(e1,e2).csv
231it [00:00, 3207.48it/s]
-- Relations statistics
	 2717 total relations
	 2717 found relations (100.00%)
	 0 invalid relations (0.00%)
	 0 unavailable relations (0.00%)
Relation embeddings count: 2671
Saving embeddings to relations_test.h5
Saving labels to labels_test.h5
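
create_relation_dataset stores one row per word pair. Judging from the reshape layer in the model summary below, each row is simply the two 300-dimensional word vectors stacked into one 600-dimensional vector; a minimal numpy sketch of that layout, with get_vector as a hypothetical embedding lookup standing in for the DataFrameEmbeddings API:

import numpy as np

def pair_embedding(get_vector, word_a, word_b):
    # Concatenate the two 300-d word vectors into one 600-d row,
    # matching the (2, 300, 1) reshape the classifier applies later
    return np.concatenate([get_vector(word_a), get_vector(word_b)])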

In [7]:
from ontokom.classification import RelationClassifier, load_relations, load_labels
import numpy as np
from sklearn.metrics import classification_report

train_relations = load_relations("relations_train_8000.h5")
train_labels = load_labels("labels_train_8000.h5")
assert train_relations.shape[0] == train_labels.shape[0]

test_relations = load_relations("relations_test.h5")
test_labels = load_labels("labels_test.h5")
test_labels = np.argmax(test_labels, 1)
assert test_relations.shape[0] == test_labels.shape[0]

classifier = RelationClassifier()
classifier.new(train_relations.shape[1], train_labels.shape[1], one_hot=True,
               filters=64, max_filters=256,
               optimizer="rmsprop", learn_rate=0.01,
               dropout=0.0, kernel_size=5)

classifier.train(train_relations, train_labels,
                 epochs=20, validation_split=0, verbose=0)

predicted_labels = np.argmax(classifier.predict(test_relations), 1)

print(classification_report(test_labels, predicted_labels))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
reshape_5 (Reshape)          (None, 2, 300, 1)         0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 1, 147, 64)        960       
_________________________________________________________________
batch_normalization_17 (Batc (None, 1, 147, 64)        256       
_________________________________________________________________
reshape_6 (Reshape)          (None, 147, 64)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 74, 128)           41088     
_________________________________________________________________
batch_normalization_18 (Batc (None, 74, 128)           512       
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 37, 256)           164096    
_________________________________________________________________
batch_normalization_19 (Batc (None, 37, 256)           1024      
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 19, 256)           327936    
_________________________________________________________________
batch_normalization_20 (Batc (None, 19, 256)           1024      
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 10, 256)           327936    
_________________________________________________________________
batch_normalization_21 (Batc (None, 10, 256)           1024      
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 5, 256)            327936    
_________________________________________________________________
batch_normalization_22 (Batc (None, 5, 256)            1024      
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 3, 256)            327936    
_________________________________________________________________
batch_normalization_23 (Batc (None, 3, 256)            1024      
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 1, 256)            196864    
_________________________________________________________________
batch_normalization_24 (Batc (None, 1, 256)            1024      
_________________________________________________________________
flatten_3 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                2570      
=================================================================
Total params: 1,724,234
Trainable params: 1,720,778
Non-trainable params: 3,456
_________________________________________________________________
None
             precision    recall  f1-score   support

          0       0.66      0.86      0.74       325
          1       0.69      0.66      0.67       302
          2       0.60      0.61      0.61       183
          3       0.39      0.38      0.38       292
          4       0.40      0.51      0.44       251
          5       0.56      0.40      0.46       156
          6       0.71      0.70      0.71       226
          7       0.67      0.72      0.69       261
          8       0.40      0.29      0.34       450
          9       0.63      0.63      0.63       225

avg / total       0.56      0.56      0.56      2671
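
The report above indexes the classes by id. sklearn's classification_report also accepts target_names, so the rows can carry the relation names directly; a small follow-up using the mapping from the first cell:

# Relation names ordered by their numeric id, aligned with the label ids
target_names = [name for name, _ in sorted(relation_ids.items(), key=lambda kv: kv[1])]
print(classification_report(test_labels, predicted_labels, target_names=target_names))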

