Example pipeline with wav2letter #632
New file: the example's README (`@@ -0,0 +1,45 @@`)
This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, along with the Wav2Letter model trained on LibriSpeech; see [Wav2Letter: an End-to-End ConvNet-based Speech Recognition System](https://arxiv.org/pdf/1609.03193.pdf). Wav2Letter and LibriSpeech are both available in torchaudio.

### Usage

More information about each command line parameter is available with the `--help` option. An example can be invoked as follows.
```
python main.py \
    --reduce-lr-valid \
    --dataset-train train-clean-100 train-clean-360 train-other-500 \
    --dataset-valid dev-clean \
    --batch-size 128 \
    --learning-rate .6 \
    --momentum .8 \
    --weight-decay .00001 \
    --clip-grad 0. \
    --gamma .99 \
    --hop-length 160 \
    --n-hidden-channels 2000 \
    --win-length 400 \
    --n-bins 13 \
    --normalize \
    --optimizer adadelta \
    --scheduler reduceonplateau \
    --epochs 30
```
With these default parameters, we get a character error rate of 13.8% on dev-clean after 30 epochs.

### Output

The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one JSON object per line, e.g.

> **Review:** Isn't it […]
>
> **Reply:** oops :) thx for pointing this out

```
| {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 23317.0, "total chars": 23317.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 4446.0, "total words": 4446.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 2453, "dataset length": 128.0, "iteration": 1.0, "loss": 8.712121963500977, "cumulative loss": 8.712121963500977, "average loss": 8.712121963500977, "iteration time": 41.46276903152466, "epoch time": 41.46276903152466} | ||
| {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 46005.0, "total chars": 46005.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 8762.0, "total words": 8762.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1703, "dataset length": 256.0, "iteration": 2.0, "loss": 8.918599128723145, "cumulative loss": 17.63072109222412, "average loss": 8.81536054611206, "iteration time": 1.2905676364898682, "epoch time": 42.753336668014526} | ||
| {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 70030.0, "total chars": 70030.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 13348.0, "total words": 13348.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1713, "dataset length": 384.0, "iteration": 3.0, "loss": 8.550191879272461, "cumulative loss": 26.180912971496582, "average loss": 8.726970990498861, "iteration time": 1.2109291553497314, "epoch time": 43.96426582336426} | ||
```

One way to load the output into Python with pandas is to save the standard output to a file and then use `pandas.read_json(filename, lines=True)`.
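For instance, assuming the output above was redirected to a file named `train.log` (the filename is illustrative):

```python
import pandas as pd

# Each line of the log is one JSON object, so lines=True parses the whole file.
log = pd.read_json("train.log", lines=True)

# e.g. the running average training loss at the end of each epoch
train = log[log["name"] == "train"]
print(train.groupby("epoch")["average loss"].last())
```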
## Structure of pipeline

* `main.py` -- the entry point
* `ctc_decoders.py` -- the greedy CTC decoder
* `datasets.py` -- the functions to split and process LibriSpeech, and a collate factory function
* `languagemodels.py` -- a class to encode and decode strings
* `metrics.py` -- the Levenshtein edit distance
* `utils.py` -- functions to log metrics, save checkpoints, and count parameters
New file: `ctc_decoders.py` (`@@ -0,0 +1,15 @@`)
```python
from torch import topk


class GreedyDecoder:
    def __call__(self, outputs):
        """Greedy Decoder. Returns highest probability of class labels for each timestep

        Args:
            outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank))

        Returns:
            torch.Tensor: class labels per time step.
        """
        _, indices = topk(outputs, k=1, dim=-1)
        return indices[..., 0]
```

> **Review (on `class GreedyDecoder`):** You could generalize this file to be called "decoders.py" and also fold in things such as compute_error_rates
>
> **Review:** This class is stateless. Can it be a function?
>
> **Reply:** It could be functional, corresponding to a transform, but really it's a step towards our beamsearch work
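For illustration, a quick sketch of how the decoder is called (the shapes here are made up; 29 classes stands in for 28 labels plus blank):

```python
import torch

decoder = GreedyDecoder()

# Fake network output: 100 time steps, batch of 2, 29 classes.
outputs = torch.randn(100, 2, 29)

indices = decoder(outputs)
print(indices.shape)  # torch.Size([100, 2]): one class label per time step
```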
New file: `datasets.py` (`@@ -0,0 +1,113 @@`)
```python
import torch
from torchaudio.datasets import LIBRISPEECH


class MapMemoryCache(torch.utils.data.Dataset):
    """
    Wrap a dataset so that, whenever a new item is returned, it is saved to memory.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self._cache = [None] * len(dataset)

    def __getitem__(self, n):
        if self._cache[n] is not None:
            return self._cache[n]

        item = self.dataset[n]
        self._cache[n] = item

        return item

    def __len__(self):
        return len(self.dataset)
```

> **Review (on `__getitem__`):** This can be simplified.
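One possible reading of that simplification (a behavior-preserving sketch, assuming cached items are never `None` themselves):

```python
def __getitem__(self, n):
    # Fill the cache on first access, then always serve from it.
    if self._cache[n] is None:
        self._cache[n] = self.dataset[n]
    return self._cache[n]
```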
```python
class Processed(torch.utils.data.Dataset):
    def __init__(self, dataset, transforms, encode):
        self.dataset = dataset
        self.transforms = transforms
        self.encode = encode

    def __getitem__(self, key):
        item = self.dataset[key]
        return self.process_datapoint(item)

    def __len__(self):
        return len(self.dataset)

    def process_datapoint(self, item):
        transformed = item[0]
        target = item[2].lower()

        transformed = self.transforms(transformed)
        transformed = transformed[0, ...].transpose(0, -1)

        target = self.encode(target)
        target = torch.tensor(target, dtype=torch.long, device=transformed.device)

        return transformed, target
```

> **Review (on `process_datapoint`):** This operation is not generic and requires a specific item type, and since it uses index slicing it is very difficult to understand what it does. Please add a docstring.
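A sketch of what the requested docstring could say, inferred from the code above (the item layout is that of torchaudio's LIBRISPEECH, whose items are `(waveform, sample_rate, utterance, ...)`):

```python
def process_datapoint(self, item):
    """Turn one LIBRISPEECH item into a (features, target) pair.

    ``item`` is a LIBRISPEECH tuple whose first element is the waveform,
    shaped (channel, time), and whose third element is the transcript
    string. The waveform is run through ``self.transforms``, the first
    channel is selected and transposed so that time comes first, and the
    lowercased transcript is encoded into a LongTensor of label indices
    on the same device as the features.
    """
    ...
```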
```python
def split_process_librispeech(
    datasets, transforms, language_model, root, folder_in_archive,
):
    def create(tags, cache=True):

        if isinstance(tags, str):
            tags = [tags]
        if isinstance(transforms, list):
            transform_list = transforms
        else:
            transform_list = [transforms]

        data = torch.utils.data.ConcatDataset(
            [
                Processed(
                    LIBRISPEECH(
                        root, tag, folder_in_archive=folder_in_archive, download=False,
                    ),
                    transform,
                    language_model.encode,
                )
                for tag, transform in zip(tags, transform_list)
            ]
        )

        data = MapMemoryCache(data)
        return data

    # For performance, we cache all datasets
    return tuple(create(dataset) for dataset in datasets)
```

> **Review (on the type handling in `create`):** Since this is example code and all the helper functions exist to make the main code simpler, making the helper functions more specific helps with maintainability. Instead of allowing multiple types, it's simpler to allow only one type and do the equivalent type conversion in client code.
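Along those lines, a hypothetical `create` that accepts only lists (callers would do the `str` to `[str]` and single-transform to list conversions themselves; this is a sketch of the suggestion, not the merged code):

```python
def create(tags, transform_list, cache=True):
    # tags and transform_list are assumed to already be lists of equal length.
    data = torch.utils.data.ConcatDataset(
        [
            Processed(
                LIBRISPEECH(
                    root, tag, folder_in_archive=folder_in_archive, download=False,
                ),
                transform,
                language_model.encode,
            )
            for tag, transform in zip(tags, transform_list)
        ]
    )
    return MapMemoryCache(data) if cache else data
```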
```python
def collate_factory(model_length_function, transforms=None):

    if transforms is None:
        transforms = torch.nn.Sequential()

    def collate_fn(batch):

        tensors = [transforms(b[0]) for b in batch if b]

        tensors_lengths = torch.tensor(
            [model_length_function(t) for t in tensors],
            dtype=torch.long,
            device=tensors[0].device,
        )

        tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
        tensors = tensors.transpose(1, -1)

        targets = [b[1] for b in batch if b]
        target_lengths = torch.tensor(
            [target.shape[0] for target in targets],
            dtype=torch.long,
            device=tensors.device,
        )
        targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True)

        return tensors, targets, tensors_lengths, target_lengths

    return collate_fn
```

> **Review (on the default `transforms`):** I'm not a fan of the declarative `nn.Sequential` approach and would write a custom function whose pointer I'd pass around, but I can see it being nice to aggregate transforms based on a sequence of decisions.
>
> **Review (on `tensors = [transforms(b[0]) for b in batch if b]`):** It is very difficult to understand what is being transformed here. Why is there a case that one item in a batch (denoted as […])?
>
> **Review (on `pad_sequence`):** A wrapped / generalized version of this could form a useful torchaudio function.
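Stepping back, a sketch of how the factory is wired into a `DataLoader`. The fake dataset and the length function here are stand-ins: the real data comes from `split_process_librispeech`, and the real length function in `main.py` (not shown in this diff) may also account for the model's striding.

```python
import torch

def length_fn(tensor):
    # Stand-in: number of time frames in a (time, n_bins) feature tensor.
    return tensor.shape[0]

# A tiny fake dataset standing in for the processed LibriSpeech datasets:
# each item is (features of shape (time, n_bins), target label indices).
fake_data = [
    (torch.randn(50, 13), torch.tensor([1, 2, 3])),
    (torch.randn(70, 13), torch.tensor([4, 5])),
]

loader = torch.utils.data.DataLoader(
    fake_data, batch_size=2, collate_fn=collate_factory(length_fn)
)

tensors, targets, tensor_lengths, target_lengths = next(iter(loader))
print(tensors.shape)  # torch.Size([2, 13, 70]): (batch, n_bins, padded time)
```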
New file: `languagemodels.py` (`@@ -0,0 +1,38 @@`)
```python
import collections
import itertools


class LanguageModel:
    def __init__(self, labels, char_blank, char_space):

        self.char_space = char_space
        self.char_blank = char_blank

        labels = [l for l in labels]
        self.length = len(labels)

        enumerated = list(enumerate(labels))
        flipped = [(sub[1], sub[0]) for sub in enumerated]

        d1 = collections.OrderedDict(enumerated)
        d2 = collections.OrderedDict(flipped)
        self.mapping = {**d1, **d2}

    def encode(self, iterable):
        if isinstance(iterable, list):
            return [self.encode(i) for i in iterable]
        else:
            return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable]

    def decode(self, tensor):
        if len(tensor) > 0 and isinstance(tensor[0], list):
            return [self.decode(t) for t in tensor]
        else:
            # not idempotent, since clean string
            x = (self.mapping[i] for i in tensor)
            x = "".join(i for i, _ in itertools.groupby(x))
            x = x.replace(self.char_blank, "")
            # x = x.strip()
            return x

    def __len__(self):
        return self.length
```

> **Review (on `labels = [l for l in labels]`):** Cannot be […]?
>
> **Reply:** Good catch. Yes, it's just a string.
>
> **Review (on `self.length`):** Having […]
>
> **Review (on `encode`):** What if I pass an iterable that yields lists? What's the base-case type here? Maybe that's an easier case to branch on. Also, as a very minor nit, I actually like using returns to avoid "else". So you could write […]
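A round-trip sketch under assumed inputs (the actual label set and blank character live in `main.py` and are not part of this diff; putting the blank at index 0 makes the `+ self.mapping[self.char_blank]` offset in `encode` a no-op):

```python
import string

char_blank = "*"
char_space = " "
labels = char_blank + char_space + string.ascii_lowercase

lm = LanguageModel(labels, char_blank, char_space)

encoded = lm.encode("hello")
print(encoded)             # [9, 6, 13, 13, 16]: indices into labels
print(lm.decode(encoded))  # "helo": adjacent repeats collapse, as in CTC decoding
```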
> **Review:** I would write the log to a separate file alongside the saved model; otherwise users have to redirect all the time, which is not very convenient.
>
> **Reply:** i must admit i do like the standard output a lot -- but i can see users preferring writing to a file, so i'll add the option to choose :)
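A sketch of the option the author mentions (the flag name `--log-file` is hypothetical):

```python
import argparse
import json
import sys

parser = argparse.ArgumentParser()
parser.add_argument(
    "--log-file",
    default=None,
    help="write per-iteration JSON lines here instead of standard output",
)
args = parser.parse_args()

log_stream = open(args.log_file, "w") if args.log_file else sys.stdout

# Wherever a metrics record is reported:
record = {"name": "train", "epoch": 0}  # illustrative
print(json.dumps(record), file=log_stream, flush=True)
```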