In [1]:
import os
import sys
import glob
import csv
import ctypes
# Pairs of editions to compare with each other
cmp_pairs = [('azp1552', 'azp1556'),
             ('azp1556', 'azp1573')]
segmented_dir = "./data/manual"

# Raise the CSV field size limit, since single segments can be very long
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))

# Read the manually segmented editions: one "<edition>_seg.csv" file per edition,
# each row holding a segment id and the segment text
segmented = {}
for file in glob.glob(segmented_dir + "/*_seg.csv"):
    edition = os.path.splitext(os.path.basename(file))[0][:-4]  # strip the "_seg" suffix
    segmented[edition] = {}
    with open(file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            segmented[edition][row[0]] = row[1]

editions = list(segmented.keys())
print("Parsed {} segmented editions: {}\n".format(len(segmented), editions))
for ed in editions:
    print("{} has {:n} segments ({} ...).".format(ed, len(segmented[ed]), list(segmented[ed].keys())[0:5]))
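Side note: `cmp_pairs` and the parsed `segmented` dictionaries can be tied together to walk over aligned segments. A minimal sketch, assuming that segment ids are shared between the two editions of a pair (the helper `aligned_segments` is a made-up name for illustration):

In [ ]:
def aligned_segments(pair, segmented):
    """Yield (segment_id, text_a, text_b) for ids present in both editions of a pair."""
    ed_a, ed_b = pair
    for seg_id in segmented[ed_a]:
        if seg_id in segmented[ed_b]:
            yield seg_id, segmented[ed_a][seg_id], segmented[ed_b][seg_id]

# Example usage: count the aligned segments for each comparison pair
for pair in cmp_pairs:
    n = sum(1 for _ in aligned_segments(pair, segmented))
    print("{} / {}: {} shared segment ids".format(pair[0], pair[1], n))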
In [2]:
# Check the PyTorch installation and whether a CUDA GPU is available
from __future__ import print_function
import torch
x = torch.rand(5, 3)
print(x)
torch.cuda.is_available()
Out[2]:
Use universal embeddings: embeddings pre-trained on a large corpus can be transferred to our texts without task-specific training (transfer learning).
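As a minimal, self-contained sketch of what that transfer looks like (the XLM-R checkpoint is the same one the next cell loads; mean pooling and the example sentences are illustrative choices, not taken from the editions):

In [ ]:
import torch
import torch.nn.functional as F

xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
xlmr.eval()

def embed(sentence):
    """Mean-pool XLM-R's last-layer token features into one sentence vector."""
    with torch.no_grad():
        feats = xlmr.extract_features(xlmr.encode(sentence))  # (1, n_tokens, 1024)
    return feats.mean(dim=1).squeeze(0)

# Similar sentences should end up close together, even across languages
print(F.cosine_similarity(embed("The king died."), embed("El rey murió."), dim=0).item())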
In [5]:
import torch
# from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import XLMTokenizer, XLMWithLMHeadModel
# Candidate multilingual checkpoints:
# xlm-roberta-base
# xlm-roberta-large
# bert-base-multilingual-cased
# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)
# XLM-R model
xlmr = torch.hub.load('pytorch/fairseq', 'xlmr.large')
xlmr.eval() # disable dropout (or leave in train mode to finetune)
# Encode a sample Latin sentence into BPE token ids (placeholder text, not from the editions)
la_tokens = xlmr.encode('In principio erat verbum.')
# Extract the last layer's features: shape (batch, n_tokens, 1024) for xlmr.large
last_layer_features = xlmr.extract_features(la_tokens)
assert last_layer_features.size(0) == 1 and last_layer_features.size(-1) == 1024
# Extract all layers' features (layer 0 is the embedding layer)
all_layers = xlmr.extract_features(la_tokens, return_all_hiddens=True)
assert len(all_layers) == 25
assert torch.all(all_layers[-1] == last_layer_features)
# Load pre-trained model tokenizer (vocabulary)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
# Tokenize input (the [CLS]/[SEP] markers stem from the original BERT example)
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)
# Mask a token that we will try to predict back
masked_index = 8
tokenized_text[masked_index] = tokenizer.mask_token  # XLM's mask token is '<special1>', not BERT's '[MASK]'
# The following expectation only holds for the (commented-out) BertTokenizer, not for XLMTokenizer:
# assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to the 1st and 2nd sentence (see the BERT paper);
# note that these BERT-style segment ids are not passed to the XLM model below
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0) # encode input context
# Set language(s)
print(tokenizer.lang2id) # {'en': 0, 'fr': 1}
language_id = tokenizer.lang2id['en'] # 0
langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, ..., 0])
# We reshape it to be of size (batch_size, sequence_length)
langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
# You can then feed it all as input to your model (which must be loaded first)
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
model.eval()
outputs = model(input_ids, langs=langs)
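`XLMWithLMHeadModel` returns language-modelling logits; for comparing edition segments, the hidden states of the base `XLMModel` are arguably the more useful output. A sketch under that assumption (mean pooling is one choice among several; `xlm_sentence_vector` is a made-up helper name):

In [ ]:
from transformers import XLMModel
import torch.nn.functional as F

xlm_base = XLMModel.from_pretrained("xlm-clm-enfr-1024")
xlm_base.eval()

def xlm_sentence_vector(sentence, lang='en'):
    """Mean-pool the last hidden state of the base XLM model into one vector."""
    ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    lang_ids = torch.full_like(ids, tokenizer.lang2id[lang])
    with torch.no_grad():
        hidden = xlm_base(ids, langs=lang_ids)[0]  # (1, seq_len, 1024)
    return hidden.mean(dim=1).squeeze(0)

# Example usage with two placeholder sentences
v1 = xlm_sentence_vector("Jim Henson was a puppeteer")
v2 = xlm_sentence_vector("Jim Henson created the Muppets")
print(F.cosine_similarity(v1, v2, dim=0).item())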