In [1]:
import os
import sys
import glob
import csv
import ctypes

# Pairs of editions that will be compared later on
cmp_pairs = [('azp1552', 'azp1556'),
             ('azp1556', 'azp1573')]

segmented_dir = "./data/manual"
# Raise the CSV field size limit to the largest value that fits in a C long
# (individual segments can be very long)
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

segmented = {}

for file in glob.glob(segmented_dir + "/*_seg.csv"):
    # Edition name = file name without extension and without the "_seg" suffix
    edition = os.path.splitext(os.path.basename(file))[0][:-4]
    segmented[edition] = {}

    with open(file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            # Each row maps a segment id (column 0) to its text (column 1)
            segmented[edition][row[0]] = row[1]

editions = list(segmented.keys())

print ("Parsed {} segmented editions: {}\n".format(len(segmented), editions))

for ed in editions:
    print("{} has {:n} segments ({} ...).".format(ed, len(segmented[ed]), list(segmented[ed].keys())[0:5]))


Parsed 3 segmented editions: ['azp1552_ch17', 'azp1556_ch17', 'azp1573_ch17']

azp1552_ch17 has 387 segments (['azp1552_ch17_0', 'azp1552_ch17_1', 'azp1552_ch17_2', 'azp1552_ch17_3', 'azp1552_ch17_4'] ...).
azp1556_ch17 has 591 segments (['azp1556_ch17_0', 'azp1556_ch17_1', 'azp1556_ch17_2', 'azp1556_ch17_3', 'azp1556_ch17_4'] ...).
azp1573_ch17 has 5 segments (['azp1573_ch17_0', 'azp1573_ch17_1', 'azp1573_ch17_2', 'azp1573_ch17_3', 'azp1573_ch17_4'] ...).
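
As a quick sanity check of the structure just built, the `cmp_pairs` list defined above can be lined up against the parsed editions. The prefix matching below is a minimal sketch, assuming each comparison prefix corresponds to exactly one parsed chapter (keys like 'azp1552_ch17'):

# Match each edition prefix in cmp_pairs to its parsed chapter
for ed_a, ed_b in cmp_pairs:
    segs_a = next(v for k, v in segmented.items() if k.startswith(ed_a))
    segs_b = next(v for k, v in segmented.items() if k.startswith(ed_b))
    print("{} vs {}: {} x {} segments".format(ed_a, ed_b, len(segs_a), len(segs_b)))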

In [2]:
# Check pytorch install
from __future__ import print_function
import torch
x = torch.rand(5, 3)
print(x)
torch.cuda.is_available()


tensor([[0.3425, 0.6921, 0.4903],
        [0.4602, 0.3310, 0.5553],
        [0.4410, 0.0047, 0.1565],
        [0.4233, 0.8458, 0.0175],
        [0.2623, 0.3609, 0.0075]])
Out[2]:
False
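
torch.cuda.is_available() returns False here, so everything below runs on the CPU. A standard device-selection line (not part of the original check) keeps later cells portable in case a GPU becomes available:

import torch

# Fall back to the CPU when no GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)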

Use universal embeddings, i.e. embeddings that are pre-trained on a large (multilingual) corpus, and transfer them to our editions instead of training a model from scratch (transfer learning).
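
The idea, sketched below, is to reuse a pre-trained encoder's fixed representations and compare segments by similarity rather than train anything on this small corpus. The helper name `embed` and the mean-pooling step are illustrative choices, not a fixed design; the encoder interface assumed is the fairseq XLM-R hub model used in the next cell.

import torch
import torch.nn.functional as F

# Illustrative helper: encode a string with a pre-trained encoder and
# mean-pool the token features into one fixed-size vector.
def embed(encoder, text):
    tokens = encoder.encode(text)                  # sub-word ids (1-D tensor)
    with torch.no_grad():
        feats = encoder.extract_features(tokens)   # [1, n_tokens, hidden]
    return feats.mean(dim=1).squeeze(0)            # [hidden]

# Usage, with the XLM-R hub model loaded as in the next cell:
# xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large'); xlmr.eval()
# sim = F.cosine_similarity(embed(xlmr, seg_a), embed(xlmr, seg_b), dim=0)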


In [5]:
import torch
# from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import XLMTokenizer, XLMWithLMHeadModel
# xlm-roberta-base
# xlm-roberta-large
# bert-base-multilingual-cased

# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)


# XLM-R model
xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
xlmr.eval()  # disable dropout (or leave in train mode to finetune)

# Encode an example Latin sentence into sub-word ids
la_tokens = xlmr.encode('Salve, munde!')

# Extract the last layer's features
last_layer_features = xlmr.extract_features(la_tokens)
assert last_layer_features.size(-1) == 1024  # hidden size of xlmr.large; the middle dimension is the token count

# Extract all layers' features (layer 0 is the embedding layer)
all_layers = xlmr.extract_features(la_tokens, return_all_hiddens=True)
assert len(all_layers) == 25
assert torch.all(all_layers[-1] == last_layer_features)

# Load the pre-trained XLM tokenizer (vocabulary) and the matching model with an LM head
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model     = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
model.eval()  # inference only




# --- Carried over from the BERT masked-LM quickstart; not used further below ---
# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
# The following assert only holds for BertTokenizer('bert-base-uncased');
# the XLM tokenizer loaded above produces different sub-word tokens.
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated with the 1st and 2nd sentences (BERT-specific, see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor    = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

input_ids        = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # encode input context



# Set language(s)
print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}

language_id = tokenizer.lang2id['en']  # 0
langs       = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

# We reshape it to be of size (batch_size, sequence_length)
langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)

# You can then feed it all as input to your model
outputs = model(input_ids, langs=langs)


Using cache found in /dev/shm/torch/hub/pytorch_fairseq_master
INFO:fairseq.file_utils:loading archive file http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz from cache at /dev/shm/torch/pytorch_fairseq/3f864e15bb396f062dd37494309dbc4238416edd1f8ef293df18b1424813f2fe.cf46c7deb6b9eaa3e47c17b9fc181669c52bc639c165fbc69166a61487662ac9
INFO:fairseq.tasks.multilingual_masked_lm:dictionary: 250001 types
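
Once the encoder and tokenizer load without errors, the same `extract_features` call can be pointed at the manually segmented editions from the first cell. The sketch below assumes `xlmr` and `segmented` are still in memory and uses simple mean-pooling over token features to obtain one vector per segment; this is an illustration, not a step prescribed by the notebook so far.

import torch

segment_vectors = {}

with torch.no_grad():
    for ed, segments in segmented.items():
        segment_vectors[ed] = {}
        for seg_id, text in segments.items():
            tokens = xlmr.encode(text)             # sub-word ids for the segment
            # Note: very long segments may exceed the model's maximum sequence
            # length and would need truncation or chunking first.
            feats = xlmr.extract_features(tokens)  # [1, n_tokens, 1024]
            segment_vectors[ed][seg_id] = feats.mean(dim=1).squeeze(0)

print("Embedded segments for {} editions".format(len(segment_vectors)))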