In [1]:
%install-location $cwd/swift-install
%install-swiftpm-flags -c release
%install '.package(url: "https://github.com/tensorflow/swift-models", .branch("master"))' ModelSupport Datasets
In [2]:
import TensorFlow
import Foundation
import ModelSupport
import Datasets
First things first, we will need to download the data somewhere. We use DatasetUtilities for this. You just need to split the URL of the archive into the filename, the extension and the rest of the host URL, then specify the folder where you want it downloaded. The .downloadResource function will then automatically download the archive (if needed) and inflate it (creating the folder you specified if it didn't already exist).
In [3]:
let cwdURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath)
let dataFolder = DatasetUtilities.downloadResource(
    filename: "wikitext-2",
    fileExtension: "tgz",
    remoteRoot: URL(string: "https://s3.amazonaws.com/fast-ai-nlp/")!,
    localStorageDirectory: cwdURL.appendingPathComponent("data/", isDirectory: true)
)
In [4]:
var trainTexts = try! String(contentsOf: dataFolder.appendingPathComponent("train.csv"), encoding: .utf8)
In [5]:
public func readCSV(in file: URL) -> [String] {
    let rawText = try! String(contentsOf: file, encoding: .utf8)
    var rows = rawText.components(separatedBy: "\"\n\"")
    // Remove the initial "
    rows[0] = String(rows[0].dropFirst())
    // Remove the trailing "\n
    rows[rows.indices.last!] = String(rows.last!.dropLast(2))
    return rows
}
In [6]:
let trainTexts = readCSV(in: dataFolder.appendingPathComponent("train.csv"))
let validTexts = readCSV(in: dataFolder.appendingPathComponent("test.csv"))
In [7]:
trainTexts[0]
Out[7]:
A model won't be able to train on raw texts like the one above. We will need to convert them into numbers first. To do this, there are two different steps: transforming a text into a list of words (called tokens) and then transforming those words into numbers. Those steps are usually called tokenization and numericalization in NLP.
Tokenizing a text means converting it into a list of meaningful tokens. There are several ways to do this: at the character level (split the text into individual characters), at the word level (split on spaces or with more complex rules), or at the subword level (split words into smaller units, for instance with byte-pair encoding, BPE).
While character-level tokenization is pretty straightforward, the two other kinds are a bit trickier. How do you split a word like "don't", for instance, which is actually "do not"? In our case, we don't have to worry about that since wikitext-2 has been pre-tokenized, so we can just split on spaces. (Alternatively, we could train a BPE tokenizer on those texts.)
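Just to make the difference concrete, here is a quick illustration of the first two strategies on a made-up sentence (the sentence is purely an example, not taken from the dataset):
// Character-level: one token per character (spaces included).
let sample = "The quick brown fox"
let charTokens = sample.map { String($0) }            // ["T", "h", "e", " ", "q", ...]
// Word-level: split on spaces, which is all we need for pre-tokenized text.
let wordTokens = sample.components(separatedBy: " ")  // ["The", "quick", "brown", "fox"]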
In [8]:
func easyTokenize(_ text: String) -> [String] {
    return text.components(separatedBy: " ")
}
In [9]:
let trainTokenizedTexts = trainTexts.map(easyTokenize)
let validTokenizedTexts = validTexts.map(easyTokenize)
Once our texts are split into tokens, we can build a mapping from token to unique index and use it to convert them into numbers. We usually try to limit the size of the vocabulary by keeping only the most common tokens, or by removing the tokens that appear fewer than a given number of times. All tokens that are not part of the vocabulary will be changed to <unk> (for unknown).
So first, let's count how many times each token is used in our texts. We also save the length of each text since we will need that later on.
In [10]:
func countTokens(_ texts: [[String]]) -> ([Int], [String:Int]) {
    var counts: [String:Int] = [:]
    var lengths: [Int] = []
    for tokens in texts {
        lengths.append(tokens.count)
        for token in tokens {
            counts[token] = (counts[token] ?? 0) + 1
        }
    }
    return (lengths, counts)
}
We only use the training set to build our vocabulary.
In [11]:
let (trainLengths, trainCounts) = countTokens(trainTokenizedTexts)
Then the following function will create a vocabulary containing the most frequent words, keeping at most maxCount of them and requiring a minimum frequency of minFrequency (NB: a language model can barely learn anything about words that rarely appear in the dataset). We return a tuple with the two mappings, int to string and string to int (often called itos and stoi in NLP).
In [12]:
func makeVocabulary(
    _ counts: [String:Int],
    minFrequency: Int = 2,
    maxCount: Int = 60000
) -> (itos: [Int:String], stoi: [String:Int]) {
    // Leave out the special tokens, then sort the rest by descending frequency.
    let withoutSpec = counts.filter { $0.0 != "xxunk" && $0.0 != "xxpad" }
    let sorted = withoutSpec.sorted { $0.1 > $1.1 }
    var itos: [Int:String] = [0: "xxunk", 1: "xxpad"]
    var stoi: [String:Int] = ["xxunk": 0, "xxpad": 1]
    for (i, x) in sorted.enumerated() {
        if i + 2 >= maxCount || x.1 < minFrequency { break }
        itos[i + 2] = x.0
        stoi[x.0] = i + 2
    }
    return (itos: itos, stoi: stoi)
}
Let's use our previous counts to build a vocabulary:
In [13]:
let vocabulary = makeVocabulary(trainCounts)
And then we can use it to numericalize our tokenized texts. Let's just check the index of the unknown token, so we can use it for words that are out of vocabulary.
In [14]:
vocabulary.stoi["<unk>"]
Out[14]:
In [15]:
func numericalize(_ tokens: [String], with stoi: [String:Int]) -> [Int] {
    // Tokens that are not in the vocabulary get the index of "<unk>" found above.
    return tokens.map { stoi[$0] ?? 6 }
}
And we can apply it to all our tokenized texts:
In [16]:
let trainNumericalizedTexts = trainTokenizedTexts.map{ numericalize($0, with: vocabulary.stoi) }
let validNumericalizedTexts = validTokenizedTexts.map{ numericalize($0, with: vocabulary.stoi) }
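As a quick sanity check (just an illustration, not a required step), we can decode the first few indices of the first text back to tokens with itos; any word that fell out of the vocabulary comes back as the token at index 6, the "<unk>" index we looked up above:
// Decode the first ten indices of the first training text back to tokens.
print(trainNumericalizedTexts[0].prefix(10).map { vocabulary.itos[$0] ?? "xxunk" })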
A language model's task is to guess the next word in a stream of text. When we have a list of tokenized and numericalized texts, we usually concatenate them all together into one big stream, split it into the desired number of batches (which are batchSize chunks of contiguous text), then read through those chunks sequenceLength tokens at a time.
Let's look at an example:
In [17]:
let items = [[0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22]]
In [18]:
var dataset = LanguageModelDataset(batchSize: 4, sequenceLength: 3, numericalizedTexts: items)
Here our stream is the sequence of integers from 0 to 22. With a batch size of 4, we split it into four chunks, which are:
0,1,2,3,4
5,6,7,8,9
10,11,12,13,14
15,16,17,18,19
The last three elements of the stream (20, 21 and 22) are thrown away because the stream length is not a multiple of 4.
Then, if we read with a sequenceLength of 3, the first batch has as input:
0,1,2
5,6,7
10,11,12
15,16,17
and as target the next words:
1,2,3
6,7,8
11,12,13
16,17,18
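We can re-derive those chunks and that first input/target pair by hand with a few lines of plain Swift (this is just a sanity check of the description above, not part of the LanguageModelDataset API):
// Concatenate the texts into one stream and split it into 4 chunks,
// dropping the leftover elements at the end.
let stream = items.flatMap { $0 }            // 0 through 22
let nChunks = 4, seqLen = 3
let chunkLength = stream.count / nChunks     // 23 / 4 = 5
let chunks = (0..<nChunks).map { Array(stream[($0 * chunkLength)..<(($0 + 1) * chunkLength)]) }
// First batch: the first seqLen tokens of each chunk as input...
let firstInput = chunks.map { Array($0[0..<seqLen]) }            // [[0,1,2], [5,6,7], ...]
// ...and the same windows shifted one token to the right as target.
let firstTarget = chunks.map { Array($0[1..<(seqLen + 1)]) }     // [[1,2,3], [6,7,8], ...]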
Let's put our dataset in batches to check it does all of this for us:
In [19]:
let inBatches = dataset.inBatches(of: 4).lazy.map {
    (
        data: Tensor<Int32>($0.map(\.first)),
        label: Tensor<Int32>($0.map(\.second))
    )
}
In [20]:
for x in inBatches { print(x) }
The first batch is as expected, and the second one has only a sequence length of 2 because our big chunks of text have a length of 5 here.
Behind the scenes, LanguageModelDataset implements a new collection with the proper length and subscript, returning input/target pairs of text (and not the raw texts of varying lengths).
When shuffling is enabled, the texts are shuffled before being concatenated to form the stream.
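Conceptually, that shuffle happens at the text level, before concatenation (a plain-Swift illustration of the idea, not the library's internal code):
// Shuffle the texts first, then concatenate them into the stream that gets chunked as before.
let shuffledStream = items.shuffled().flatMap { $0 }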
In [21]:
dataset.shuffle()
let inBatches = dataset.inBatches(of: 4).lazy.map {
    (
        data: Tensor<Int32>($0.map(\.first)),
        label: Tensor<Int32>($0.map(\.second))
    )
}
In [22]:
for x in inBatches { print(x) }
We can create a LanguageModelDataset from all our texts. Since it needs the length of every sample to work, we can provide the array of lengths directly to speed up the init (if we don't, it will make a pass over the dataset to compute them).
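Since we already saved trainLengths in countTokens, we can pass it along; if we hadn't, recomputing the lengths is a one-liner (essentially the pass over the dataset the init would otherwise make):
// Recompute the lengths from the numericalized texts; this matches what
// countTokens returned, since numericalization preserves the token count.
let recomputedLengths = trainNumericalizedTexts.map { $0.count }
assert(recomputedLengths == trainLengths)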
In [23]:
let trainSet = LanguageModelDataset(
    batchSize: 64,
    sequenceLength: 72,
    numericalizedTexts: trainNumericalizedTexts,
    lengths: trainLengths
)
In [24]:
let validSet = LanguageModelDataset(
    batchSize: 64,
    sequenceLength: 72,
    numericalizedTexts: validNumericalizedTexts
)
And we can batch our samples with the Epochs API. Let's start by defining some typealiases for convenience.
In [25]:
typealias Samples = LanguageModelDataset<[[Int]]>
typealias LabeledTextBatch = (data: Tensor<Int32>, label: Tensor<Int32>)
typealias Batches = Slices<Sampling<Samples, ArraySlice<Int>>>
typealias Training = LazyMapSequence<
    TrainingEpochs<Samples, SystemRandomNumberGenerator>,
    LazyMapSequence<Batches, LabeledTextBatch>
>
typealias Validation = LazyMapSequence<
    Slices<Samples>,
    LabeledTextBatch
>
In [26]:
let training: Training = TrainingEpochs(
    samples: trainSet,
    batchSize: 64,
    entropy: SystemRandomNumberGenerator()
).lazy.map { (batches: Batches) -> LazyMapSequence<Batches, LabeledTextBatch> in
    batches.lazy.map {
        (
            data: Tensor<Int32>($0.map(\.first)),
            label: Tensor<Int32>($0.map(\.second))
        )
    }
}
In [27]:
let validation: Validation = validSet.inBatches(of: 64).lazy.map {
    (
        data: Tensor<Int32>($0.map(\.first)),
        label: Tensor<Int32>($0.map(\.second))
    )
}
To iterate through our training batches, we just use .enumerated(). Here, let's check that we read through the texts in order by storing the first five batches.
In [28]:
var sampleTrainingBatches: [LabeledTextBatch] = []
for (epoch, epochBatches) in training.prefix(1).enumerated() {
    for batch in epochBatches {
        sampleTrainingBatches.append(batch)
        if sampleTrainingBatches.count >= 5 {
            break
        }
    }
}
Iterating through our validation batches is even easier:
In [29]:
var sampleValidationBatches: [LabeledTextBatch] = []
for batch in validation {
    sampleValidationBatches.append(batch)
    if sampleValidationBatches.count >= 5 {
        break
    }
}
To display one row of our tensors as text, we will use this function:
In [30]:
func showText(_ x: Tensor<Int32>) -> String {
    let tokens = x.scalars.map { vocabulary.itos[Int($0)]! }
    return tokens.joined(separator: " ")
}
Now let's look at the first row of our first batch:
In [31]:
showText(sampleTrainingBatches[0].data[0])
Out[31]:
The targets are just shifted one word to the right:
In [32]:
showText(sampleTrainingBatches[0].label[0])
Out[32]:
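We can also check that shift programmatically by comparing the raw scalars of the two rows (just a verification, not part of the training pipeline):
// Dropping the first input token and the last target token should leave
// identical sequences if the target is the input shifted by one.
let inputScalars = sampleTrainingBatches[0].data[0].scalars
let labelScalars = sampleTrainingBatches[0].label[0].scalars
print(Array(inputScalars.dropFirst()) == Array(labelScalars.dropLast()))  // should print true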
Since the validation set is not shuffled, we should be able to pick a pair of texts where the second one starts exactly where the first one stopped:
In [33]:
showText(sampleValidationBatches[0].data[1])
Out[33]:
In [34]:
showText(sampleValidationBatches[1].data[1])
Out[34]: