In [ ]:
from gensim.models.keyedvectors import KeyedVectors
# wordvectors_file_vec = './fasttext-sbwc.3.6.e20.vec'
# cantidad = 100000
# wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec, limit=cantidad)
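If the cell above is uncommented and run, a quick nearest-neighbour query is an easy sanity check on the loaded vectors (a sketch; the query word is arbitrary):
In [ ]:
# sketch: nearest neighbours of a sample word (requires the load above to have been run)
# wordvectors.most_similar('gato', topn=5)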
In [44]:
from torchtext import data
import torch
import random
import spacy
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# tokenizer function using spacy
nlp = spacy.load('es')
def tokenizer(s):
    # lowercased spaCy tokens after cleaning (tweet_clean is defined earlier in the notebook)
    return [w.text.lower() for w in nlp(tweet_clean(s))]

def set_up_torch_text(path='cache/', path_test_csv='test_data.csv', path_valid_csv='valid_data.csv', path_train_csv='train_data.csv'):
    random.seed(SEED)
    # the text field must be sequential so the spaCy tokenizer is actually applied
    txt_field = data.Field(sequential=True, use_vocab=True, tokenize=tokenizer)
    label_field = data.Field(sequential=False, use_vocab=True, pad_token=None, unk_token=None)
    val_fields = [('text', txt_field), ('tag', label_field)]
    train, valds, test = data.TabularDataset.splits(path=path, format='csv', train=path_train_csv, validation=path_valid_csv, test=path_test_csv, fields=val_fields)
    # test = data.TabularDataset(path=path+path_test_csv, format='csv', skip_header=True, fields=val_fields)
    txt_field.build_vocab(train, valds, test, max_size=25000)
    label_field.build_vocab(train, valds, test, max_size=25000)
    return train, valds, test, txt_field, label_field
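A quick way to sanity-check the tokenizer (a sketch; `tweet_clean` is assumed to be defined earlier in the notebook and the example tweet is made up):
In [ ]:
# sketch: tokenize a sample tweet; expects lowercased spaCy tokens after cleaning
tokenizer('Me encantó la película de ayer!!')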
In [45]:
train_data, valid_data, test_data, txt_field, label_field = set_up_torch_text()
In [47]:
len(txt_field.vocab)
Out[47]:
In [49]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),  # TabularDataset defines no default sort_key
    device = device)
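Before building the model it can help to pull one batch from the iterator and confirm the tensor shapes (a sketch; the field names `text` and `tag` come from `val_fields` above):
In [ ]:
# sketch: batch.text should be [sent len, batch size], batch.tag should be [batch size]
batch = next(iter(train_iterator))
print(batch.text.shape, batch.tag.shape)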
In [53]:
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels = 1,
                      out_channels = n_filters,
                      kernel_size = (fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        #text = [sent len, batch size]
        text = text.permute(1, 0)
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat(pooled, dim = 1))
        #cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)
In [54]:
INPUT_DIM = len(txt_field.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(label_field.vocab)
DROPOUT = 0.5
PAD_IDX = txt_field.vocab.stoi[txt_field.pad_token]
model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
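A minimal shape check, using a dummy batch of random token indices (sentence length 50, batch size 2, both arbitrary); the output should be [batch size, OUTPUT_DIM]:
In [ ]:
# sketch: forward a dummy batch of token indices to verify the output shape
dummy = torch.randint(0, INPUT_DIM, (50, 2))
print(model(dummy).shape)  # expected: torch.Size([2, OUTPUT_DIM])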
In [55]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')
In [56]:
pretrained_embeddings = txt_field.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
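Note that `txt_field.vocab.vectors` is only populated when `build_vocab` receives a `vectors` argument; as written above it is `None`, so the `copy_` call has nothing to copy. A minimal sketch of wiring in the fastText SBWC file from the first cell (file name and cache directory are the notebook's own; `EMBEDDING_DIM` must match the dimensionality of those vectors):
In [ ]:
# sketch: load the .vec file with torchtext and rebuild the vocab with vectors attached
# from torchtext.vocab import Vectors
# fasttext_sbwc = Vectors(name='fasttext-sbwc.3.6.e20.vec', cache='cache/')
# txt_field.build_vocab(train_data, valid_data, test_data, max_size=25000, vectors=fasttext_sbwc)
# model.embedding.weight.data.copy_(txt_field.vocab.vectors)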