From b6592ca3d30df2a649dd7cb250ccc93e980226a9 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 12 May 2020 12:04:27 -0700 Subject: [PATCH 001/129] example pipeline, initial commit. --- examples/pipeline/wav2letter.py | 1556 +++++++++++++++++++++++++++++++ 1 file changed, 1556 insertions(+) create mode 100644 examples/pipeline/wav2letter.py diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py new file mode 100644 index 0000000000..f2de936f81 --- /dev/null +++ b/examples/pipeline/wav2letter.py @@ -0,0 +1,1556 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +# https://github.com/pytorch/pytorch/issues/13883 +import torch.multiprocessing as mp + +if __name__ == '__main__': + mp.set_start_method('forkserver') + + +# In[ ]: + + +import argparse +import collections +import cProfile +import hashlib +import itertools +import math +import os +import pprint +import pstats +import random +import re +import shutil +import signal +import statistics +import string +from array import array +from collections import defaultdict +from datetime import datetime +from io import StringIO +from typing import Optional + +import matplotlib +import torch +import torch.distributed as dist +import torchaudio +from matplotlib import pyplot as plt +from tabulate import tabulate +from torch import nn, topk +from torch.optim import SGD, Adadelta, Adam +from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau +from torch.utils.data import DataLoader +from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS +from torchaudio.datasets.utils import bg_iterator, diskcache_iterator +from torchaudio.transforms import MFCC, Resample +from tqdm.notebook import tqdm as tqdm + +print("start time: {}".format(str(datetime.now())), flush=True) + +try: + get_ipython().run_line_magic('matplotlib', 'inline') + in_notebook = True +except NameError: + matplotlib.use("Agg") + in_notebook = False + +# Empty CUDA cache +torch.cuda.empty_cache() + +# Profiling performance +pr = cProfile.Profile() +pr.enable() + + +# In[ ]: + + +# Create argument parser +parser = argparse.ArgumentParser() + +parser.add_argument('--workers', default=0, type=int, + metavar='N', help='number of data loading workers') +parser.add_argument('--resume', default='', type=str, + metavar='PATH', help='path to latest checkpoint') +parser.add_argument('--figures', default='', type=str, + metavar='PATH', help='folder path to save figures') + +parser.add_argument('--epochs', default=200, type=int, + metavar='N', help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, + metavar='N', help='manual epoch number') +parser.add_argument('--print-freq', default=10, type=int, + metavar='N', help='print frequency in epochs') + +parser.add_argument('--arch', metavar='ARCH', default='wav2letter', + choices=["wav2letter", "lstm"], help='model architecture') +parser.add_argument('--batch-size', default=64, type=int, + metavar='N', help='mini-batch size') + +parser.add_argument('--learning-rate', default=1., type=float, + metavar='LR', help='initial learning rate') +parser.add_argument('--gamma', default=.96, type=float, + metavar='GAMMA', help='learning rate exponential decay constant') +# parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') +parser.add_argument('--weight-decay', default=1e-5, + type=float, metavar='W', help='weight decay') +parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) +parser.add_argument("--rho", 
metavar='RHO', type=float, default=.95) + +parser.add_argument('--n-bins', default=13, type=int, + metavar='N', help='number of bins in transforms') + +parser.add_argument('--world-size', default=1, type=int, + help='number of distributed processes') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', + type=str, help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', + type=str, help='distributed backend') +parser.add_argument('--distributed', action="store_true") + +parser.add_argument('--dataset', default='librispeech', type=str) +parser.add_argument('--gradient', action="store_true") +parser.add_argument('--jit', action="store_true") +parser.add_argument('--viterbi-decoder', action="store_true") + +if in_notebook: + args, _ = parser.parse_known_args() +else: + args = parser.parse_args() + + +# In[ ]: + + +if args.learning_rate < 0.: + args.learning_rate = 10 ** random.uniform(-3, 1) + +if args.weight_decay < 0.: + args.weight_decay = 10 ** random.uniform(-6, 0) + +if args.gamma < 0.: + args.gamma = random.uniform(.95, 1.) + + +# In[ ]: + + +args.batch_size = 32 +args.model = "wav2letter" +args.dataset = "speechcommand" +args.print_freq = 1 + + +# # Checkpoint + +# In[ ]: + + +MAIN_PID = os.getpid() +CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' +CHECKPOINT_tempfile = CHECKPOINT_filename + '.temp' +HALT_filename = CHECKPOINT_filename + '.HALT' +SIGNAL_RECEIVED = False + +# HALT file is used as a sign of job completion. +# Make sure no HALT file left from previous runs. +if os.path.isfile(HALT_filename): + os.remove(HALT_filename) + +# Remove CHECKPOINT_tempfile, in case the signal arrives in the +# middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename +if os.path.isfile(CHECKPOINT_tempfile): + os.remove(CHECKPOINT_tempfile) + + +def SIGTERM_handler(a, b): + print('received sigterm') + pass + + +def signal_handler(a, b): + global SIGNAL_RECEIVED + print('Signal received', a, datetime.now().strftime( + "%y%m%d.%H%M%S"), flush=True) + SIGNAL_RECEIVED = True + + # If HALT file exists, which means the job is done, exit peacefully. + if os.path.isfile(HALT_filename): + print('Job is done, exiting') + exit(0) + + return + + +def trigger_job_requeue(): + # Submit a new job to resume from checkpoint. + if os.path.isfile(CHECKPOINT_filename) and os.environ['SLURM_PROCID'] == '0' and os.getpid() == MAIN_PID: + print('pid: ', os.getpid(), ' ppid: ', os.getppid(), flush=True) + print('time is up, back to slurm queue', flush=True) + command = 'scontrol requeue ' + os.environ['SLURM_JOB_ID'] + print(command) + if os.system(command): + raise RuntimeError('requeue failed') + print('New job submitted to the queue', flush=True) + exit(0) + + +# Install signal handler +signal.signal(signal.SIGUSR1, signal_handler) +signal.signal(signal.SIGTERM, SIGTERM_handler) +print('Signal handler installed', flush=True) + + +def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): + """ + Save the model to a temporary file first, + then copy it to filename, in case the signal interrupts + the torch.save() process. 
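+    The final os.rename is atomic on POSIX filesystems, so an interrupted
+    save never leaves a partially written checkpoint behind.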
+ """ + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + torch.save(state, CHECKPOINT_tempfile) + if os.path.isfile(CHECKPOINT_tempfile): + os.rename(CHECKPOINT_tempfile, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + print("Checkpoint: saved") + + +# # Distributed + +# In[ ]: + + +# Use #nodes as world_size +if 'SLURM_NNODES' in os.environ: + args.world_size = int(os.environ['SLURM_NNODES']) + +args.distributed = args.distributed or args.world_size > 1 + +if args.distributed: + os.environ['RANK'] = os.environ['SLURM_PROCID'] + os.environ['WORLD_SIZE'] = str(args.world_size) + print('in distributed', os.environ['RANK'], + os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'], flush=True) + dist.init_process_group(backend=args.dist_backend, + init_method=args.dist_url, world_size=args.world_size) + + print('init process', flush=True) + + +# # Parameters + +# In[ ]: + + +if not args.distributed or os.environ['SLURM_PROCID'] == '0': + print(pprint.pformat(vars(args)), flush=True) + + +# In[ ]: + + +audio_backend = "soundfile" +torchaudio.set_audio_backend(audio_backend) + +root = "/datasets01/" +folder_in_archive = "librispeech/062419/" + +device = "cuda" if torch.cuda.is_available() else "cpu" +num_devices = torch.cuda.device_count() +# num_devices = 1 +print(num_devices, "GPUs", flush=True) + +# max number of sentences per batch +batch_size = args.batch_size +# batch_size = 2048 +# batch_size = 512 +# batch_size = 256 +# batch_size = 64 +# batch_size = 1 + +training_percentage = 90. +validation_percentage = 5. + +data_loader_training_params = { + "num_workers": args.workers, + "pin_memory": True, + "shuffle": True, + "drop_last": True, +} +data_loader_validation_params = data_loader_training_params.copy() +data_loader_validation_params["shuffle"] = False + +non_blocking = True + + +# text preprocessing + +char_blank = "*" +char_space = " " +char_apostrophe = "'" + +labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase + +# excluded_dir = ["_background_noise_"] +# folder_speechcommands = './SpeechCommands/speech_commands_v0.02' +# labels = [char_blank, char_space] + [d for d in next(os.walk(folder_speechcommands))[1] if d not in excluded_dir] + + +# audio + +sample_rate_original = 16000 +sample_rate_new = 8000 + +n_bins = args.n_bins # 13, 128 +melkwargs = { + 'n_fft': 512, + 'n_mels': 20, + 'hop_length': 80, # (160, 80) +} + +transforms = nn.Sequential( + # torchaudio.transforms.Resample(sample_rate_original, sample_rate_new), + # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), + torchaudio.transforms.MelSpectrogram( + sample_rate=sample_rate_original, n_mels=n_bins), + # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), + # torchaudio.transforms.TimeMasking(time_mask_param=35) +) + + +# Optimizer + +optimizer_params_adadelta = { + "lr": args.learning_rate, + "eps": args.eps, + "rho": args.rho, + "weight_decay": args.weight_decay, +} + +optimizer_params_adam = { + "lr": args.learning_rate, + "eps": args.eps, + "weight_decay": args.weight_decay, +} + +optimizer_params_sgd = { + "lr": args.learning_rate, + "weight_decay": args.weight_decay, +} + +optimizer_params_adadelta = { + "lr": args.learning_rate, + "eps": args.eps, + "rho": args.rho, + "weight_decay": args.weight_decay, +} + +Optimizer = Adadelta +optimizer_params = optimizer_params_sgd + +# Model + +num_features = n_bins if n_bins else 1 + +lstm_params = { + "hidden_size": 800, + "num_layers": 5, + 
"batch_first": False, + "bidirectional": False, + "dropout": 0., +} + +clip_norm = 0. # 10. + +zero_infinity = False + + +# # Text encoding + +# In[ ]: + + +class Coder: + def __init__(self, labels): + labels = [l for l in labels] + self.length = len(labels) + enumerated = list(enumerate(labels)) + flipped = [(sub[1], sub[0]) for sub in enumerated] + + d1 = collections.OrderedDict(enumerated) + d2 = collections.OrderedDict(flipped) + self.mapping = {**d1, **d2} + + def encode(self, iterable): + if isinstance(iterable, list): + return [self.encode(i) for i in iterable] + else: + return [self.mapping[i] + self.mapping[char_blank] for i in iterable] + + def decode(self, tensor): + if isinstance(tensor[0], list): + return [self.decode(t) for t in tensor] + else: + # not idempotent, since clean string + x = (self.mapping[i] for i in tensor) + x = ''.join(i for i, _ in itertools.groupby(x)) + x = x.replace(char_blank, "") + # x = x.strip() + return x + + +coder = Coder(labels) +encode = coder.encode +decode = coder.decode +vocab_size = coder.length +print("vocab_size", vocab_size, flush=True) + + +# # Model +# +# [Wav2Letter](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb) + +# In[ ]: + + +def weight_init(m): + if isinstance(m, nn.Linear): + size = m.weight.size() + fan_out = size[0] # number of rows + fan_in = size[1] # number of columns + variance = math.sqrt(2.0/(fan_in + fan_out)) + m.weight.data.normal_(0.0, variance) + + +class PrintLayer(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + print(x, flush=True) + return x + + +class Wav2Letter(nn.Module): + """Wav2Letter Speech Recognition model + https://arxiv.org/pdf/1609.03193.pdf + This specific architecture accepts mfcc or power spectrums speech signals + + Args: + num_features (int): number of mfcc features + num_classes (int): number of unique grapheme class labels + """ + + def __init__(self, num_features, num_classes): + super().__init__() + + # Conv1d(in_channels, out_channels, kernel_size, stride) + self.layers = nn.Sequential( + nn.Conv1d(in_channels=num_features, out_channels=250, + kernel_size=48, stride=2, padding=23), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=250, + kernel_size=7, stride=1, padding=3), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=250, out_channels=2000, + kernel_size=32, stride=1, padding=16), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=2000, + kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + nn.Conv1d(in_channels=2000, out_channels=num_classes, + kernel_size=1, stride=1, padding=0), + nn.ReLU(inplace=True), + ) + + def forward(self, batch): + """Forward pass through Wav2Letter network than + takes log probability of output + Args: + batch (int): mini batch of data + shape (batch, num_features, frame_len) + 
Returns: + Tensor with shape (batch_size, num_classes, output_len) + """ + # batch: (batch_size, num_features, seq_len) + y_pred = self.layers(batch) + # y_pred: (batch_size, num_classes, output_len) + y_pred = y_pred.transpose(-1, -2) + # y_pred: (batch_size, output_len, num_classes) + return nn.functional.log_softmax(y_pred, dim=-1) + + +# In[ ]: + + +class LSTMModel(nn.Module): + + def __init__(self, num_features, num_classes, hidden_size, num_layers, bidirectional, dropout, batch_first): + super().__init__() + + directions = bidirectional + 1 + + self.layer = nn.LSTM( + num_features, hidden_size=hidden_size, + num_layers=num_layers, bidirectional=bidirectional, dropout=dropout, batch_first=batch_first + ) + # self.activation = nn.ReLU(inplace=True) + self.hidden2class = nn.Linear(directions*hidden_size, num_classes) + + def forward(self, batch): + self.layer.flatten_parameters() + # print("forward", flush=True) + # batch: batch, num_features, seq_len + # print(batch.shape, flush=True) + batch = batch.transpose(-1, -2).contiguous() + # batch: batch, seq_len, num_features + # print(batch.shape, flush=True) + outputs, _ = self.layer(batch) + # outputs = self.activation(outputs) + # outputs: batch, seq_len, directions*num_features + outputs = self.hidden2class(outputs) + # outputs: batch, seq_len, num_features + # print(outputs.shape, flush=True) + return nn.functional.log_softmax(outputs, dim=-1) + + +# In[ ]: + + +if args.arch == "wav2letter": + model = Wav2Letter(num_features, vocab_size) + + def model_length_function(tensor): + return int(tensor.shape[0])//2 + 1 + +elif args.arch == "lstm": + model = LSTMModel(num_features, vocab_size, **lstm_params) + + def model_length_function(tensor): + return int(tensor.shape[0]) + + +# # Dataset + +# In[ ]: + + +class IterableMemoryCache: + + def __init__(self, iterable): + self.iterable = iterable + self._iter = iter(iterable) + self._done = False + self._values = [] + + def __iter__(self): + if self._done: + return iter(self._values) + return itertools.chain(self._values, self._gen_iter()) + + def _gen_iter(self): + for new_value in self._iter: + self._values.append(new_value) + yield new_value + self._done = True + + def __len__(self): + return len(self._iterable) + + +class MapMemoryCache(torch.utils.data.Dataset): + """ + Wrap a dataset so that, whenever a new item is returned, it is saved to memory. 
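+    Assumes the dataset fits in host memory: each item is cached on first
+    access and served from the cache afterwards.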
+ """ + + def __init__(self, dataset): + self.dataset = dataset + self._cache = [None] * len(dataset) + + def __getitem__(self, n): + if self._cache[n]: + return self._cache[n] + + item = self.dataset[n] + self._cache[n] = item + + return item + + def __len__(self): + return len(self.dataset) + + +class Processed(torch.utils.data.Dataset): + + def __init__(self, process_datapoint, dataset): + self.process_datapoint = process_datapoint + self.dataset = dataset + + def __getitem__(self, n): + try: + item = self.dataset[n] + return self.process_datapoint(item) + except (FileNotFoundError, RuntimeError): + return None + + def __next__(self): + try: + item = next(self.dataset) + return self.process_datapoint(item) + except (FileNotFoundError, RuntimeError): + return self.__next__() + + def __len__(self): + return len(self.dataset) + + +# In[ ]: + + +# mfcc = mfcc.to(device) +# resample = resample.to(device) + +# @torch.jit.script + + +def process_datapoint(item): + transformed = item[0] # .to(device, non_blocking=non_blocking) + target = item[2].lower() + + transformed = transforms(transformed) + + transformed = transformed[0, ...].transpose(0, -1) + + target = " " + target + " " + target = encode(target) + target = torch.tensor(target, dtype=torch.long, device=transformed.device) + + transformed = transformed # .to("cpu") + target = target # .to("cpu") + return transformed, target + + +# In[ ]: + + +def datasets_librispeech(): + + def create(tag): + + if isinstance(tag, str): + data = LIBRISPEECH( + root, tag, folder_in_archive=folder_in_archive, download=False) + else: + data = torch.utils.data.ConcatDataset([LIBRISPEECH( + root, t, folder_in_archive=folder_in_archive, download=False) for t in tag]) + + data = Processed(process_datapoint, data) + # data = diskcache_iterator(data) + data = MapMemoryCache(data) + return data + + return create("train-clean-100"), create("dev-clean"), None + # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None + + +# In[ ]: + + +def which_set(filename, validation_percentage, testing_percentage): + """Determines which data partition the file should belong to. + + We want to keep files in the same training, validation, or testing sets even + if new ones are added over time. This makes it less likely that testing + samples will accidentally be reused in training when long runs are restarted + for example. To keep this stability, a hash of the filename is taken and used + to determine which set it should belong to. This determination only depends on + the name and the set proportions, so it won't change as other files are added. + + It's also useful to associate particular files as related (for example words + spoken by the same person), so anything after '_nohash_' in a filename is + ignored for set determination. This ensures that 'bobby_nohash_0.wav' and + 'bobby_nohash_1.wav' are always in the same set, for example. + + Args: + filename: File path of the data sample. + validation_percentage: How much of the data set to use for validation. + testing_percentage: How much of the data set to use for testing. + + Returns: + String, one of 'training', 'validation', or 'testing'. + """ + + MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M + + base_name = os.path.basename(filename) + + # We want to ignore anything after '_nohash_' in the file name when + # deciding which set to put a wav in, so the data set creator has a way of + # grouping wavs that are close variations of each other. 
+ hash_name = re.sub(r'_nohash_.*$', '', base_name).encode("utf-8") + + # This looks a bit magical, but we need to decide whether this file should + # go into the training, testing, or validation sets, and we want to keep + # existing files in the same set even if more files are subsequently + # added. + # To do that, we need a stable way of deciding based on just the file name + # itself, so we do a hash of that and then use that to generate a + # probability value that we use to assign it. + hash_name_hashed = hashlib.sha1(hash_name).hexdigest() + percentage_hash = ((int(hash_name_hashed, 16) % ( + MAX_NUM_WAVS_PER_CLASS + 1)) * (100.0 / MAX_NUM_WAVS_PER_CLASS)) + + if percentage_hash < validation_percentage: + result = 'validation' + elif percentage_hash < (testing_percentage + validation_percentage): + result = 'testing' + else: + result = 'training' + + return result + + +def filter_speechcommands(tag, training_percentage, data): + if training_percentage < 100.: + testing_percentage = ( + 100. - training_percentage - validation_percentage) + + def which_set_filter(x): return which_set( + x, validation_percentage, testing_percentage) == tag + data._walker = list(filter(which_set_filter, data._walker)) + return data + + +def datasets_speechcommands(): + + root = "./" + + def create(tag): + data = SPEECHCOMMANDS(root, download=True) + data = filter_speechcommands(tag, training_percentage, data) + data = Processed(process_datapoint, data) + # data = diskcache_iterator(data) + data = MapMemoryCache(data) + return data + + return create("training"), create("validation"), create("testing") + + +# In[ ]: + + +if args.dataset == "librispeech": + training, validation, _ = datasets_librispeech() +elif args.dataset == "speechcommand": + training, validation, _ = datasets_speechcommands() + + +# In[ ]: + + +if False: + + from collections import Counter + from collections import OrderedDict + + training_unprocessed = SPEECHCOMMANDS("./", download=True) + training_unprocessed = filter_speechcommands( + training_percentage, training_unprocessed) + + counter = Counter([t[2] for t in training_unprocessed]) + counter = OrderedDict(counter.most_common()) + + plt.bar(counter.keys(), counter.values(), align='center') + + if resample is not None: + waveform, sample_rate = training_unprocessed[0][0], training_unprocessed[0][1] + + fn = "sound.wav" + torchaudio.save(fn, waveform, sample_rate_new) + ipd.Audio(fn) + + +# # Word Decoder + +# In[ ]: + + +def greedy_decode(outputs): + """Greedy Decoder. Returns highest probability of class labels for each timestep + + Args: + outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) + + Returns: + torch.Tensor: class labels per time step. 
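+            Shape: (input length, batch size), the argmax class index per time step.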
+ """ + _, indices = topk(outputs, k=1, dim=-1) + return indices[..., 0] + + +# In[ ]: + + +def build_transitions(): + + from collections import Counter + + c = None + + for _, label in training: + # Count bigrams + count = [((a.item(), b.item())) for (a, b) in zip(label, label[1:])] + count = Counter(count) + if c is None: + c = count + else: + c = c + count + + # Encode as transition matrix + + ind = torch.tensor(list(zip(*[a for (a, b) in c.items()]))) + val = torch.tensor([b for (a, b) in c.items()], dtype=torch.float) + + transitions = torch.sparse_coo_tensor(indices=ind, values=val, size=[ + vocab_size, vocab_size]).coalesce().to_dense() + transitions = (transitions/torch.max(torch.tensor(1.), + transitions.max(dim=1)[0]).unsqueeze(1)) + + return transitions + + +if args.viterbi_decoder: + print("transitions: building", flush=True) + transitions = build_transitions() + print("transitions: done", flush=True) + + +# In[ ]: + + +# https://gist.github.com/PetrochukM/afaa3613a99a8e7213d2efdd02ae4762 +# https://github.com/napsternxg/pytorch-practice/blob/master/Viterbi%20decoding%20and%20CRF.ipynb + + +def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + Parameters + ---------- + tag_sequence : torch.Tensor, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + transition_matrix : torch.Tensor, required. + A tensor of shape (num_tags, num_tags) representing the binary potentials + for transitioning between a given pair of tags. + top_k : int, required. + Integer defining the top number of paths to decode. + Returns + ------- + viterbi_path : List[int] + The tag indices of the maximum likelihood tag sequence. + viterbi_score : float + The score of the viterbi path. + """ + sequence_length, num_tags = tag_sequence.size() + + path_scores = [] + path_indices = [] + # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) + # to allow for 1 permutation. + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + # assert path_scores[0].size() == (n_permutations, num_tags) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) + summed_potentials = path_scores[timestep - + 1].unsqueeze(2) + transition_matrix + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. + max_k = min(summed_potentials.size()[0], top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + # assert scores.size() == (n_permutations, num_tags) + # assert paths.size() == (n_permutations, num_tags) + + scores = tag_sequence[timestep, :] + scores + # assert scores.size() == (n_permutations, num_tags) + path_scores.append(scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. 
+ path_scores = path_scores[-1].view(-1) + max_k = min(path_scores.size()[0], top_k) + viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) + + viterbi_paths = [] + for i in range(max_k): + + viterbi_path = [best_paths[i].item()] + for backward_timestep in reversed(path_indices): + viterbi_path.append( + int(backward_timestep.view(-1)[viterbi_path[-1]])) + + # Reverse the backward path. + viterbi_path.reverse() + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + return viterbi_paths, viterbi_scores + + +def batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): + + outputs = [] + scores = [] + for i in range(tag_sequence.shape[1]): + paths, score = viterbi_decode(tag_sequence[:, i, :], transitions) + outputs.append(paths) + scores.append(score) + + return torch.tensor(outputs).transpose(0, -1), torch.cat(scores) + + +def top_batch_viterbi_decode(tag_sequence: torch.Tensor): + output, _ = batch_viterbi_decode(tag_sequence, transitions, top_k=1) + return output[:, 0, :] + + +# In[ ]: + + +def levenshtein_distance_array(r, h): + + # initialisation + dnew = array('d', [0] * (len(h)+1)) + dold = array('d', [0] * (len(h)+1)) + + # computation + for i in range(1, len(r)+1): + for j in range(1, len(h)+1): + + if r[i-1] == h[j-1]: + dnew[j] = dold[j-1] + else: + substitution = dold[j-1] + 1 + insertion = dnew[j-1] + 1 + deletion = dold[j] + 1 + dnew[j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + return dnew[-1] + + +# In[ ]: + + +def levenshtein_distance_list(r, h): + + # initialisation + d = [[0] * (len(h)+1)] * (len(r)+1) + + # computation + for i in range(1, len(r)+1): + for j in range(1, len(h)+1): + + if r[i-1] == h[j-1]: + d[i].append(d[i-1][j-1]) + else: + substitution = d[i-1][j-1] + 1 + insertion = d[i][j-1] + 1 + deletion = d[i-1][j] + 1 + d[i].append(min(substitution, insertion, deletion)) + + return d[len(r)][len(h)] + + +# In[ ]: + + +# https://martin-thoma.com/word-error-rate-calculation/ + + +def levenshtein_distance(r: str, h: str, device: Optional[str] = None): + + # initialisation + d = torch.zeros((2, len(h)+1), dtype=torch.long) # , device=device) + dold = 0 + dnew = 1 + + # computation + for i in range(1, len(r)+1): + d[dnew, 0] = 0 + for j in range(1, len(h)+1): + + if r[i-1] == h[j-1]: + d[dnew, j] = d[dnew-1, j-1] + else: + substitution = d[dnew-1, j-1] + 1 + insertion = d[dnew, j-1] + 1 + deletion = d[dnew-1, j] + 1 + d[dnew, j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + dist = d[dnew, -1].item() + + return dist + + +# In[ ]: + + +if False: + r = "abcdddee" + h = "abcddde" + + get_ipython().run_line_magic('timeit', 'levenshtein_distance(r, h)') + + jitted = torch.jit.script(levenshtein_distance) + get_ipython().run_line_magic('timeit', 'jitted(r, h)') + + get_ipython().run_line_magic('timeit', 'levenshtein_distance_list(r, h)') + + jitted = torch.jit.script(levenshtein_distance_list) + # %timeit jitted(r, h) + + get_ipython().run_line_magic('timeit', 'levenshtein_distance_array(r, h)') + + jitted = torch.jit.script(levenshtein_distance_array) + # %timeit jitted(r, h) + + +# # Train + +# In[ ]: + + +def collate_fn(batch): + + tensors = [b[0] for b in batch if b] + + tensors_lengths = torch.tensor( + [model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device + ) + + tensors = 
torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) + tensors = tensors.transpose(1, -1) + + targets = [b[1] for b in batch if b] + target_lengths = torch.tensor( + [target.shape[0] for target in targets], dtype=torch.long, device=tensors.device + ) + targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) + + return tensors, targets, tensors_lengths, target_lengths + + +# In[ ]: + + +if args.jit: + model = torch.jit.script(model) + +if not args.distributed: + model = torch.nn.DataParallel(model) +else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) + +model = model.to(device, non_blocking=non_blocking) +print('model cuda', flush=True) +# model.apply(weight_init) + + +# In[ ]: + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +if not args.distributed or os.environ['SLURM_PROCID'] == '0': + n = count_parameters(model) + print(f"Number of parameters: {n}", flush=True) + # Each float32 is 4 bytes. + print(f"Approximate space taken: {n * 4 / (10 ** 6):.1f} MB", flush=True) + +if False: + print("Total memory: ", torch.cuda.get_device_properties( + 0).total_memory / 10**6) # Convert to MB + + t = torch.cuda.get_device_properties(0).total_memory + c = torch.cuda.memory_cached(0) + a = torch.cuda.memory_allocated(0) + f = c-a # free inside cache + + print("Free memory inside cache: ", f) + + +# In[ ]: + + +print(torch.cuda.memory_summary(), flush=True) + + +# In[ ]: + + +optimizer = Optimizer(model.parameters(), **optimizer_params) +scheduler = ExponentialLR(optimizer, gamma=args.gamma) +# scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) + +criterion = torch.nn.CTCLoss( + blank=coder.mapping[char_blank], zero_infinity=zero_infinity) +# criterion = nn.MSELoss() +# criterion = torch.nn.NLLLoss() + +best_loss = 1. 
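+# A minimal, disabled sanity check of the collate/loss plumbing; a sketch
+# only, with illustrative names. The fake inputs use the (time, n_bins)
+# feature layout produced by process_datapoint above.
+if False:
+    fake_inputs = [torch.randn(100 + 10 * i, n_bins) for i in range(4)]
+    fake_targets = [torch.randint(1, vocab_size, (8,)) for _ in range(4)]
+    inputs, targets, input_lengths, target_lengths = collate_fn(
+        list(zip(fake_inputs, fake_targets)))
+    # inputs: (batch, n_bins, max_time); the two length tensors carry the
+    # unpadded sizes that CTCLoss consumes in forward_loss below.
+    print(inputs.shape, targets.shape, input_lengths, target_lengths)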
+ + +# In[ ]: + + +loader_training = DataLoader( + training, batch_size=batch_size, collate_fn=collate_fn, **data_loader_training_params +) + +loader_validation = DataLoader( + validation, batch_size=batch_size, collate_fn=collate_fn, **data_loader_validation_params +) + +print("Length of data loaders: ", len(loader_training), + len(loader_validation), flush=True) + +# num_features = next(iter(loader_training))[0].shape[1] +# print(num_features, flush=True) + + +# In[ ]: + + +def forward_loss(inputs, targets, tensors_lengths, target_lengths): + + inputs = inputs.to(device, non_blocking=non_blocking) + targets = targets.to(device, non_blocking=non_blocking) + + # keep batch first for data parallel + outputs = model(inputs).transpose(0, 1) + + # this_batch_size = outputs.shape[1] + # seq_len = outputs.shape[0] + # input_lengths = torch.full((this_batch_size,), seq_len, dtype=torch.long, device=outputs.device) + # input_lengths = tensors_lengths + + # CTC + # outputs: input length, batch size, number of classes (including blank) + # targets: batch size, max target length + # input_lengths: batch size + # target_lengths: batch size + + return criterion(outputs, targets, tensors_lengths, target_lengths) + + +inds = random.sample(range(args.batch_size), k=2) + + +def forward_decode(inputs, targets, decoder): + + inputs = inputs.to(device, non_blocking=True) + output = model(inputs).to("cpu") + output = decoder(output) + + output = decode(output.tolist()) + target = decode(targets.tolist()) + + print_length = 20 + for i in inds: + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + print( + f"Epoch: {epoch:4} Target: {target_print} Output: {output_print}", flush=True) + + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + cers_normalized = [d/len(a) for a, d in zip(target, cers)] + cers = statistics.mean(cers) + cers_normalized = statistics.mean(cers_normalized) + + output = [o.split(char_space) for o in output] + target = [o.split(char_space) for o in target] + + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + wers_normalized = [d/len(a) for a, d in zip(target, wers)] + wers = statistics.mean(wers) + wers_normalized = statistics.mean(wers_normalized) + + print(f"Epoch: {epoch:4} CER: {cers:1.5f} WER: {wers:1.5f}", flush=True) + + return cers, wers, cers_normalized, wers_normalized + + +# In[ ]: + + +history_loader = defaultdict(list) +history_training = defaultdict(list) +history_validation = defaultdict(list) + +if args.resume and os.path.isfile(CHECKPOINT_filename): + print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) + checkpoint = torch.load(CHECKPOINT_filename) + + args.start_epoch = checkpoint['epoch'] + best_loss = checkpoint['best_loss'] + history_training = checkpoint['history_training'] + history_validation = checkpoint['history_validation'] + + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + + print("Checkpoint: loaded '{}' at epoch {}".format( + CHECKPOINT_filename, checkpoint['epoch'])) + print(tabulate(history_training, headers="keys"), flush=True) + print(tabulate(history_validation, headers="keys"), flush=True) +else: + print("Checkpoint: not found") + + save_checkpoint({ + 'epoch': args.start_epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 
'history_training': history_training, + 'history_validation': history_validation, + }, False) + + +# In[ ]: + + +with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: + for epoch in range(args.start_epoch, args.epochs): + torch.cuda.reset_max_memory_allocated() + model.train() + + sum_loss = 0. + total_norm = 0. + for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_training, maxsize=2): + + loss = forward_loss( + inputs, targets, tensors_lengths, target_lengths) + sum_loss += loss.item() + + optimizer.zero_grad() + loss.backward() + + norm = 0. + if clip_norm > 0: + norm = torch.nn.utils.clip_grad_norm_( + model.parameters(), clip_norm) + total_norm += norm + elif args.gradient: + for p in list(filter(lambda p: p.grad is not None, model.parameters())): + norm += p.grad.data.norm(2).item() ** 2 + norm = norm ** .5 + total_norm += norm + + optimizer.step() + + memory = torch.cuda.max_memory_allocated() + # print(f"memory in training: {memory}", flush=True) + + history_loader["epoch"].append(epoch) + history_loader["n"].append(pbar.n) + history_loader["memory"].append(memory) + + if SIGNAL_RECEIVED: + save_checkpoint({ + 'epoch': epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, False) + trigger_job_requeue() + + pbar.update(1/len(loader_training)) + + total_norm = (total_norm ** .5) / len(loader_training) + if total_norm > 0: + print( + f"Epoch: {epoch:4} Gradient: {total_norm:4.5f}", flush=True) + + # Average loss + sum_loss = sum_loss / len(loader_training) + sum_loss_str = f"Epoch: {epoch:4} Train: {sum_loss:4.5f}" + + scheduler.step() + + memory = torch.cuda.max_memory_allocated() + print(f"memory after training: {memory}", flush=True) + + history_training["epoch"].append(epoch) + history_training["gradient_norm"].append(total_norm) + history_training["sum_loss"].append(sum_loss) + history_training["max_memory_allocated"].append(memory) + + if not epoch % args.print_freq or epoch == args.epochs - 1: + + with torch.no_grad(): + + # Switch to evaluation mode + model.eval() + + sum_loss = 0. 
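+                # Each accumulates (cer, wer, cer_normalized, wer_normalized)
+                # as returned by forward_decode, then averaged after the loop.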
+                sum_out_greedy = [0, 0, 0, 0]
+                sum_out_viterbi = [0, 0, 0, 0]
+
+                for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_validation, maxsize=2):
+                    sum_loss += forward_loss(inputs, targets,
+                                             tensors_lengths, target_lengths).item()
+
+                    if True:
+                        out_greedy = forward_decode(
+                            inputs, targets, greedy_decode)
+                        for i in range(len(out_greedy)):
+                            sum_out_greedy[i] += out_greedy[i]
+                    if args.viterbi_decoder:
+                        out_viterbi = forward_decode(
+                            inputs, targets, top_batch_viterbi_decode)
+                        for i in range(len(out_greedy)):
+                            sum_out_viterbi[i] += out_viterbi[i]
+
+                    if SIGNAL_RECEIVED:
+                        break
+
+                # Average loss
+                sum_loss = sum_loss / len(loader_validation)
+                sum_loss_str += f" Validation: {sum_loss:.5f}"
+                print(sum_loss_str, flush=True)
+
+                if True:
+                    for i in range(len(out_greedy)):
+                        sum_out_greedy[i] /= len(loader_validation)
+                    print(f"greedy decoder: {sum_out_greedy}", flush=True)
+                    cer1, wer1, cern1, wern1 = sum_out_greedy
+                if args.viterbi_decoder:
+                    for i in range(len(out_viterbi)):
+                        sum_out_viterbi[i] /= len(loader_validation)
+                    print(f"viterbi decoder: {sum_out_viterbi}", flush=True)
+                    cer2, wer2, cern2, wern2 = sum_out_viterbi
+
+                memory = torch.cuda.max_memory_allocated()
+                print(f"memory after validation: {memory}", flush=True)
+
+                history_validation["epoch"].append(epoch)
+                history_validation["max_memory_allocated"].append(memory)
+                history_validation["sum_loss"].append(sum_loss)
+
+                if True:
+                    history_validation["greedy_cer"].append(cer1)
+                    history_validation["greedy_cer_normalized"].append(cern1)
+                    history_validation["greedy_wer"].append(wer1)
+                    history_validation["greedy_wer_normalized"].append(wern1)
+                if args.viterbi_decoder:
+                    history_validation["viterbi_cer"].append(cer2)
+                    history_validation["viterbi_cer_normalized"].append(cern2)
+                    history_validation["viterbi_wer"].append(wer2)
+                    history_validation["viterbi_wer_normalized"].append(wern2)
+
+                is_best = sum_loss < best_loss
+                best_loss = min(sum_loss, best_loss)
+                save_checkpoint({
+                    'epoch': epoch + 1,
+                    'state_dict': model.state_dict(),
+                    'best_loss': best_loss,
+                    'optimizer': optimizer.state_dict(),
+                    'scheduler': scheduler.state_dict(),
+                    'history_training': history_training,
+                    'history_validation': history_validation,
+                }, is_best)
+
+                print(tabulate(history_training, headers="keys"), flush=True)
+                print(tabulate(history_validation, headers="keys"), flush=True)
+                print(torch.cuda.memory_summary(), flush=True)
+
+        # scheduler.step(sum_loss)
+
+        # Create an empty file HALT_filename, mark the job as finished
+        if epoch == args.epochs - 1:
+            open(HALT_filename, 'a').close()
+
+
+# In[ ]:
+
+
+print(tabulate(history_training, headers="keys"), flush=True)
+print(tabulate(history_validation, headers="keys"), flush=True)
+print(torch.cuda.memory_summary(), flush=True)
+
+
+# In[ ]:
+
+
+print(tabulate(history_loader, headers="keys"), flush=True)
+
+
+# In[ ]:
+
+
+plt.plot(history_loader["epoch"],
+         history_loader["memory"], label="memory")
+
+
+# In[ ]:
+
+
+history_validation["epoch"]
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    if "greedy_cer" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["greedy_cer"], label="greedy")
+    if "viterbi_cer" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["viterbi_cer"], label="viterbi")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "cer.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    if "greedy_wer" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["greedy_wer"], label="greedy")
+    if "viterbi_wer" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["viterbi_wer"], label="viterbi")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "wer.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    if "greedy_cer_normalized" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["greedy_cer_normalized"], label="greedy")
+    if "viterbi_cer_normalized" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["viterbi_cer_normalized"], label="viterbi")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "cer_normalized.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    if "greedy_wer_normalized" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["greedy_wer_normalized"], label="greedy")
+    if "viterbi_wer_normalized" in history_validation:
+        plt.plot(history_validation["epoch"],
+                 history_validation["viterbi_wer_normalized"], label="viterbi")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "wer_normalized.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    plt.plot(history_training["epoch"],
+             history_training["sum_loss"], label="training")
+    plt.plot(history_validation["epoch"],
+             history_validation["sum_loss"], label="validation")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "sum_loss.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+
+    plt.plot(history_training["epoch"],
+             history_training["sum_loss"], label="training")
+    plt.plot(history_validation["epoch"],
+             history_validation["sum_loss"], label="validation")
+    plt.yscale("log")
+    plt.legend()
+    plt.savefig(os.path.join(args.figures, "log_sum_loss.png"))
+
+
+# In[ ]:
+
+
+if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+    print(torch.cuda.memory_summary(), flush=True)
+
+
+# In[ ]:
+
+
+# Print performance
+pr.disable()
+s = StringIO()
+ps = (
+    pstats
+    .Stats(pr, stream=s)
+    .strip_dirs()
+    .sort_stats("cumtime")
+    .print_stats(20)
+)
+print(s.getvalue(), flush=True)
+print("stop time: {}".format(str(datetime.now())), flush=True)
+

From 3e2f24a43100e218b1d9688884e41c53216c68da Mon Sep 17 00:00:00 2001
From: Vincent Quenneville-Belair
Date: Tue, 12 May 2020 12:36:32 -0700
Subject: [PATCH 002/129] removing notebook conversion artifacts.
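Strip the "# In[ ]:" cell markers, get_ipython() calls, and the dead
exploratory blocks left over from the notebook export, and use the
Wav2Letter model now provided by torchaudio.models instead of the local
copy.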
--- examples/pipeline/wav2letter.py | 397 +------------------------------- 1 file changed, 12 insertions(+), 385 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index f2de936f81..50eb393eb2 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,19 +1,9 @@ -#!/usr/bin/env python -# coding: utf-8 - -# In[ ]: - - # https://github.com/pytorch/pytorch/issues/13883 import torch.multiprocessing as mp if __name__ == '__main__': mp.set_start_method('forkserver') - -# In[ ]: - - import argparse import collections import cProfile @@ -48,6 +38,7 @@ from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS from torchaudio.datasets.utils import bg_iterator, diskcache_iterator from torchaudio.transforms import MFCC, Resample +from torchaudio.models.wav2letter import Wav2Letter from tqdm.notebook import tqdm as tqdm print("start time: {}".format(str(datetime.now())), flush=True) @@ -67,9 +58,6 @@ pr.enable() -# In[ ]: - - # Create argument parser parser = argparse.ArgumentParser() @@ -124,9 +112,6 @@ args = parser.parse_args() -# In[ ]: - - if args.learning_rate < 0.: args.learning_rate = 10 ** random.uniform(-3, 1) @@ -137,18 +122,7 @@ args.gamma = random.uniform(.95, 1.) -# In[ ]: - - -args.batch_size = 32 -args.model = "wav2letter" -args.dataset = "speechcommand" -args.print_freq = 1 - - -# # Checkpoint - -# In[ ]: +# Checkpoint MAIN_PID = os.getpid() @@ -221,10 +195,7 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): print("Checkpoint: saved") -# # Distributed - -# In[ ]: - +# Distributed # Use #nodes as world_size if 'SLURM_NNODES' in os.environ: @@ -243,18 +214,12 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): print('init process', flush=True) -# # Parameters - -# In[ ]: +# Parameters if not args.distributed or os.environ['SLURM_PROCID'] == '0': print(pprint.pformat(vars(args)), flush=True) - -# In[ ]: - - audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) @@ -371,10 +336,7 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): zero_infinity = False -# # Text encoding - -# In[ ]: - +# Text encoding class Coder: def __init__(self, labels): @@ -412,151 +374,15 @@ def decode(self, tensor): print("vocab_size", vocab_size, flush=True) -# # Model -# -# [Wav2Letter](https://github.com/LearnedVector/Wav2Letter/blob/master/Google%20Speech%20Command%20Example.ipynb) - -# In[ ]: - - -def weight_init(m): - if isinstance(m, nn.Linear): - size = m.weight.size() - fan_out = size[0] # number of rows - fan_in = size[1] # number of columns - variance = math.sqrt(2.0/(fan_in + fan_out)) - m.weight.data.normal_(0.0, variance) - - -class PrintLayer(nn.Module): - def __init__(self): - super().__init__() +# Model - def forward(self, x): - print(x, flush=True) - return x +model = Wav2Letter(num_features, vocab_size) +def model_length_function(tensor): + return int(tensor.shape[0])//2 + 1 -class Wav2Letter(nn.Module): - """Wav2Letter Speech Recognition model - https://arxiv.org/pdf/1609.03193.pdf - This specific architecture accepts mfcc or power spectrums speech signals - Args: - num_features (int): number of mfcc features - num_classes (int): number of unique grapheme class labels - """ - - def __init__(self, num_features, num_classes): - super().__init__() - - # Conv1d(in_channels, out_channels, kernel_size, stride) - self.layers = nn.Sequential( - nn.Conv1d(in_channels=num_features, out_channels=250, - kernel_size=48, stride=2, padding=23), - 
nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=250, - kernel_size=7, stride=1, padding=3), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=250, out_channels=2000, - kernel_size=32, stride=1, padding=16), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=2000, out_channels=2000, - kernel_size=1, stride=1, padding=0), - nn.ReLU(inplace=True), - nn.Conv1d(in_channels=2000, out_channels=num_classes, - kernel_size=1, stride=1, padding=0), - nn.ReLU(inplace=True), - ) - - def forward(self, batch): - """Forward pass through Wav2Letter network than - takes log probability of output - Args: - batch (int): mini batch of data - shape (batch, num_features, frame_len) - Returns: - Tensor with shape (batch_size, num_classes, output_len) - """ - # batch: (batch_size, num_features, seq_len) - y_pred = self.layers(batch) - # y_pred: (batch_size, num_classes, output_len) - y_pred = y_pred.transpose(-1, -2) - # y_pred: (batch_size, output_len, num_classes) - return nn.functional.log_softmax(y_pred, dim=-1) - - -# In[ ]: - - -class LSTMModel(nn.Module): - - def __init__(self, num_features, num_classes, hidden_size, num_layers, bidirectional, dropout, batch_first): - super().__init__() - - directions = bidirectional + 1 - - self.layer = nn.LSTM( - num_features, hidden_size=hidden_size, - num_layers=num_layers, bidirectional=bidirectional, dropout=dropout, batch_first=batch_first - ) - # self.activation = nn.ReLU(inplace=True) - self.hidden2class = nn.Linear(directions*hidden_size, num_classes) - - def forward(self, batch): - self.layer.flatten_parameters() - # print("forward", flush=True) - # batch: batch, num_features, seq_len - # print(batch.shape, flush=True) - batch = batch.transpose(-1, -2).contiguous() - # batch: batch, seq_len, num_features - # print(batch.shape, flush=True) - outputs, _ = self.layer(batch) - # outputs = self.activation(outputs) - # outputs: batch, seq_len, directions*num_features - outputs = self.hidden2class(outputs) - # outputs: batch, seq_len, num_features - # print(outputs.shape, flush=True) - return nn.functional.log_softmax(outputs, dim=-1) - - -# In[ ]: - - -if args.arch == "wav2letter": - model = Wav2Letter(num_features, vocab_size) - - def model_length_function(tensor): - return int(tensor.shape[0])//2 + 1 - -elif args.arch == "lstm": - model = LSTMModel(num_features, vocab_size, **lstm_params) - - def model_length_function(tensor): - return int(tensor.shape[0]) - - -# # Dataset - -# In[ ]: +# Dataset class IterableMemoryCache: @@ -628,15 +454,6 @@ def __len__(self): return len(self.dataset) -# In[ ]: - - -# mfcc = mfcc.to(device) -# resample = resample.to(device) - -# @torch.jit.script - - def process_datapoint(item): transformed = item[0] # .to(device, non_blocking=non_blocking) target = item[2].lower() @@ -654,9 +471,6 @@ def process_datapoint(item): return transformed, target -# In[ ]: - - def 
datasets_librispeech(): def create(tag): @@ -677,9 +491,6 @@ def create(tag): # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None -# In[ ]: - - def which_set(filename, validation_percentage, testing_percentage): """Determines which data partition the file should belong to. @@ -760,43 +571,13 @@ def create(tag): return create("training"), create("validation"), create("testing") -# In[ ]: - - if args.dataset == "librispeech": training, validation, _ = datasets_librispeech() elif args.dataset == "speechcommand": training, validation, _ = datasets_speechcommands() -# In[ ]: - - -if False: - - from collections import Counter - from collections import OrderedDict - - training_unprocessed = SPEECHCOMMANDS("./", download=True) - training_unprocessed = filter_speechcommands( - training_percentage, training_unprocessed) - - counter = Counter([t[2] for t in training_unprocessed]) - counter = OrderedDict(counter.most_common()) - - plt.bar(counter.keys(), counter.values(), align='center') - - if resample is not None: - waveform, sample_rate = training_unprocessed[0][0], training_unprocessed[0][1] - - fn = "sound.wav" - torchaudio.save(fn, waveform, sample_rate_new) - ipd.Audio(fn) - - -# # Word Decoder - -# In[ ]: +# Word Decoder def greedy_decode(outputs): @@ -812,9 +593,6 @@ def greedy_decode(outputs): return indices[..., 0] -# In[ ]: - - def build_transitions(): from collections import Counter @@ -849,13 +627,6 @@ def build_transitions(): print("transitions: done", flush=True) -# In[ ]: - - -# https://gist.github.com/PetrochukM/afaa3613a99a8e7213d2efdd02ae4762 -# https://github.com/napsternxg/pytorch-practice/blob/master/Viterbi%20decoding%20and%20CRF.ipynb - - def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): """ Perform Viterbi decoding in log space over a sequence given a transition matrix @@ -947,35 +718,6 @@ def top_batch_viterbi_decode(tag_sequence: torch.Tensor): return output[:, 0, :] -# In[ ]: - - -def levenshtein_distance_array(r, h): - - # initialisation - dnew = array('d', [0] * (len(h)+1)) - dold = array('d', [0] * (len(h)+1)) - - # computation - for i in range(1, len(r)+1): - for j in range(1, len(h)+1): - - if r[i-1] == h[j-1]: - dnew[j] = dold[j-1] - else: - substitution = dold[j-1] + 1 - insertion = dnew[j-1] + 1 - deletion = dold[j] + 1 - dnew[j] = min(substitution, insertion, deletion) - - dnew, dold = dold, dnew - - return dnew[-1] - - -# In[ ]: - - def levenshtein_distance_list(r, h): # initialisation @@ -996,12 +738,6 @@ def levenshtein_distance_list(r, h): return d[len(r)][len(h)] -# In[ ]: - - -# https://martin-thoma.com/word-error-rate-calculation/ - - def levenshtein_distance(r: str, h: str, device: Optional[str] = None): # initialisation @@ -1029,32 +765,7 @@ def levenshtein_distance(r: str, h: str, device: Optional[str] = None): return dist -# In[ ]: - - -if False: - r = "abcdddee" - h = "abcddde" - - get_ipython().run_line_magic('timeit', 'levenshtein_distance(r, h)') - - jitted = torch.jit.script(levenshtein_distance) - get_ipython().run_line_magic('timeit', 'jitted(r, h)') - - get_ipython().run_line_magic('timeit', 'levenshtein_distance_list(r, h)') - - jitted = torch.jit.script(levenshtein_distance_list) - # %timeit jitted(r, h) - - get_ipython().run_line_magic('timeit', 'levenshtein_distance_array(r, h)') - - jitted = torch.jit.script(levenshtein_distance_array) - # %timeit jitted(r, h) - - -# # Train - -# In[ ]: +# Train def collate_fn(batch): @@ -1077,9 
+788,6 @@ def collate_fn(batch): return tensors, targets, tensors_lengths, target_lengths -# In[ ]: - - if args.jit: model = torch.jit.script(model) @@ -1092,10 +800,6 @@ def collate_fn(batch): model = model.to(device, non_blocking=non_blocking) print('model cuda', flush=True) -# model.apply(weight_init) - - -# In[ ]: def count_parameters(model): @@ -1105,30 +809,11 @@ def count_parameters(model): if not args.distributed or os.environ['SLURM_PROCID'] == '0': n = count_parameters(model) print(f"Number of parameters: {n}", flush=True) - # Each float32 is 4 bytes. - print(f"Approximate space taken: {n * 4 / (10 ** 6):.1f} MB", flush=True) - -if False: - print("Total memory: ", torch.cuda.get_device_properties( - 0).total_memory / 10**6) # Convert to MB - - t = torch.cuda.get_device_properties(0).total_memory - c = torch.cuda.memory_cached(0) - a = torch.cuda.memory_allocated(0) - f = c-a # free inside cache - - print("Free memory inside cache: ", f) - - -# In[ ]: print(torch.cuda.memory_summary(), flush=True) -# In[ ]: - - optimizer = Optimizer(model.parameters(), **optimizer_params) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) @@ -1140,10 +825,6 @@ def count_parameters(model): best_loss = 1. - -# In[ ]: - - loader_training = DataLoader( training, batch_size=batch_size, collate_fn=collate_fn, **data_loader_training_params ) @@ -1155,12 +836,6 @@ def count_parameters(model): print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) -# num_features = next(iter(loader_training))[0].shape[1] -# print(num_features, flush=True) - - -# In[ ]: - def forward_loss(inputs, targets, tensors_lengths, target_lengths): @@ -1220,10 +895,6 @@ def forward_decode(inputs, targets, decoder): return cers, wers, cers_normalized, wers_normalized - -# In[ ]: - - history_loader = defaultdict(list) history_training = defaultdict(list) history_validation = defaultdict(list) @@ -1259,9 +930,6 @@ def forward_decode(inputs, targets, decoder): }, False) -# In[ ]: - - with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: for epoch in range(args.start_epoch, args.epochs): torch.cuda.reset_max_memory_allocated() @@ -1417,36 +1085,16 @@ def forward_decode(inputs, targets, decoder): open(HALT_filename, 'a').close() -# In[ ]: - - print(tabulate(history_training, headers="keys"), flush=True) print(tabulate(history_validation, headers="keys"), flush=True) print(torch.cuda.memory_summary(), flush=True) - - -# In[ ]: - - print(tabulate(history_loader, headers="keys"), flush=True) -# In[ ]: - - plt.plot(history_loader["epoch"], history_loader["memory"], label="memory") -# In[ ]: - - -history_validation["epoch"] - - -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': if "greedy_cer" in history_validation: @@ -1459,9 +1107,6 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "cer.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': if "greedy_wer" in history_validation: @@ -1474,9 +1119,6 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "wer.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': if "greedy_cer_normalized" in history_validation: @@ -1489,9 +1131,6 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "cer_normalized.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': 
if "greedy_wer_normalized" in history_validation: @@ -1504,9 +1143,6 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "wer_normalized.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': plt.plot(history_training["epoch"], @@ -1517,9 +1153,6 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "sum_loss.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': plt.plot(history_training["epoch"], @@ -1531,16 +1164,10 @@ def forward_decode(inputs, targets, decoder): plt.savefig(os.path.join(args.figures, "log_sum_loss.png") -# In[ ]: - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': print(torch.cuda.memory_summary(), flush=True) -# In[ ]: - - # Print performance pr.disable() s = StringIO() From ff90ee90a985039a558df5e8d9a1aea3d61ff2ad Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 12 May 2020 12:39:11 -0700 Subject: [PATCH 003/129] remove extra comments. lint. --- examples/pipeline/wav2letter.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 50eb393eb2..ce3c4cfcbe 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,6 +1,6 @@ -# https://github.com/pytorch/pytorch/issues/13883 import torch.multiprocessing as mp +# https://github.com/pytorch/pytorch/issues/13883 if __name__ == '__main__': mp.set_start_method('forkserver') @@ -30,17 +30,19 @@ import torch.distributed as dist import torchaudio from matplotlib import pyplot as plt -from tabulate import tabulate from torch import nn, topk from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS from torchaudio.datasets.utils import bg_iterator, diskcache_iterator -from torchaudio.transforms import MFCC, Resample from torchaudio.models.wav2letter import Wav2Letter +from torchaudio.transforms import MFCC, Resample from tqdm.notebook import tqdm as tqdm +from tabulate import tabulate + + print("start time: {}".format(str(datetime.now())), flush=True) try: @@ -228,16 +230,10 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): device = "cuda" if torch.cuda.is_available() else "cpu" num_devices = torch.cuda.device_count() -# num_devices = 1 print(num_devices, "GPUs", flush=True) # max number of sentences per batch batch_size = args.batch_size -# batch_size = 2048 -# batch_size = 512 -# batch_size = 256 -# batch_size = 64 -# batch_size = 1 training_percentage = 90. validation_percentage = 5. @@ -254,7 +250,7 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): non_blocking = True -# text preprocessing +# Text preprocessing char_blank = "*" char_space = " " @@ -550,8 +546,9 @@ def filter_speechcommands(tag, training_percentage, data): testing_percentage = ( 100. 
- training_percentage - validation_percentage) - def which_set_filter(x): return which_set( - x, validation_percentage, testing_percentage) == tag + def which_set_filter(x): + return which_set(x, validation_percentage, testing_percentage) == tag + data._walker = list(filter(which_set_filter, data._walker)) return data @@ -1180,4 +1177,3 @@ def forward_decode(inputs, targets, decoder): ) print(s.getvalue(), flush=True) print("stop time: {}".format(str(datetime.now())), flush=True) - From 30efb4adb1732bd36e00d00ad79daa16dbd130e7 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 15:34:10 -0700 Subject: [PATCH 004/129] addressing some feedback. --- examples/pipeline/wav2letter.py | 44 +++++---------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index ce3c4cfcbe..5ec5a45af3 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -45,12 +45,7 @@ print("start time: {}".format(str(datetime.now())), flush=True) -try: - get_ipython().run_line_magic('matplotlib', 'inline') - in_notebook = True -except NameError: - matplotlib.use("Agg") - in_notebook = False +matplotlib.use("Agg") # Empty CUDA cache torch.cuda.empty_cache() @@ -433,18 +428,12 @@ def __init__(self, process_datapoint, dataset): self.dataset = dataset def __getitem__(self, n): - try: - item = self.dataset[n] - return self.process_datapoint(item) - except (FileNotFoundError, RuntimeError): - return None + item = self.dataset[n] + return self.process_datapoint(item) def __next__(self): - try: - item = next(self.dataset) - return self.process_datapoint(item) - except (FileNotFoundError, RuntimeError): - return self.__next__() + item = next(self.dataset) + return self.process_datapoint(item) def __len__(self): return len(self.dataset) @@ -475,8 +464,7 @@ def create(tag): data = LIBRISPEECH( root, tag, folder_in_archive=folder_in_archive, download=False) else: - data = torch.utils.data.ConcatDataset([LIBRISPEECH( - root, t, folder_in_archive=folder_in_archive, download=False) for t in tag]) + data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) data = Processed(process_datapoint, data) # data = diskcache_iterator(data) @@ -715,26 +703,6 @@ def top_batch_viterbi_decode(tag_sequence: torch.Tensor): return output[:, 0, :] -def levenshtein_distance_list(r, h): - - # initialisation - d = [[0] * (len(h)+1)] * (len(r)+1) - - # computation - for i in range(1, len(r)+1): - for j in range(1, len(h)+1): - - if r[i-1] == h[j-1]: - d[i].append(d[i-1][j-1]) - else: - substitution = d[i-1][j-1] + 1 - insertion = d[i][j-1] + 1 - deletion = d[i-1][j] + 1 - d[i].append(min(substitution, insertion, deletion)) - - return d[len(r)][len(h)] - - def levenshtein_distance(r: str, h: str, device: Optional[str] = None): # initialisation From a55b7cdc5415220fa8b886a33da57b7cdbab41cf Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 15:56:09 -0700 Subject: [PATCH 005/129] main function. 
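Parsing the arguments inside a function and guarding the call with __main__ keeps argparse from running on import, which also plays better with the forkserver start method set at the top of the script. A minimal sketch of the intended shape (main() is only split out fully by a later patch in this series):

    import argparse

    def parse_args():
        parser = argparse.ArgumentParser()
        # ... the add_argument calls below ...
        return parser.parse_args()

    def main(args):
        # the training pipeline, to be moved here
        pass

    if __name__ == "__main__":
        args = parse_args()
        main(args)
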
--- examples/pipeline/wav2letter.py | 113 +++++++++++++++----------------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 5ec5a45af3..be16581d49 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -55,68 +55,62 @@ pr.enable() -# Create argument parser -parser = argparse.ArgumentParser() - -parser.add_argument('--workers', default=0, type=int, - metavar='N', help='number of data loading workers') -parser.add_argument('--resume', default='', type=str, - metavar='PATH', help='path to latest checkpoint') -parser.add_argument('--figures', default='', type=str, - metavar='PATH', help='folder path to save figures') - -parser.add_argument('--epochs', default=200, type=int, - metavar='N', help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, - metavar='N', help='manual epoch number') -parser.add_argument('--print-freq', default=10, type=int, - metavar='N', help='print frequency in epochs') - -parser.add_argument('--arch', metavar='ARCH', default='wav2letter', - choices=["wav2letter", "lstm"], help='model architecture') -parser.add_argument('--batch-size', default=64, type=int, - metavar='N', help='mini-batch size') - -parser.add_argument('--learning-rate', default=1., type=float, - metavar='LR', help='initial learning rate') -parser.add_argument('--gamma', default=.96, type=float, - metavar='GAMMA', help='learning rate exponential decay constant') -# parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') -parser.add_argument('--weight-decay', default=1e-5, - type=float, metavar='W', help='weight decay') -parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) -parser.add_argument("--rho", metavar='RHO', type=float, default=.95) - -parser.add_argument('--n-bins', default=13, type=int, - metavar='N', help='number of bins in transforms') - -parser.add_argument('--world-size', default=1, type=int, - help='number of distributed processes') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', - type=str, help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', - type=str, help='distributed backend') -parser.add_argument('--distributed', action="store_true") - -parser.add_argument('--dataset', default='librispeech', type=str) -parser.add_argument('--gradient', action="store_true") -parser.add_argument('--jit', action="store_true") -parser.add_argument('--viterbi-decoder', action="store_true") - -if in_notebook: - args, _ = parser.parse_known_args() -else: +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('--workers', default=0, type=int, + metavar='N', help='number of data loading workers') + parser.add_argument('--resume', default='', type=str, + metavar='PATH', help='path to latest checkpoint') + parser.add_argument('--figures', default='', type=str, + metavar='PATH', help='folder path to save figures') + + parser.add_argument('--epochs', default=200, type=int, + metavar='N', help='number of total epochs to run') + parser.add_argument('--start-epoch', default=0, type=int, + metavar='N', help='manual epoch number') + parser.add_argument('--print-freq', default=10, type=int, + metavar='N', help='print frequency in epochs') + + parser.add_argument('--arch', metavar='ARCH', default='wav2letter', + choices=["wav2letter", "lstm"], help='model architecture') + parser.add_argument('--batch-size', default=64, 
type=int, + metavar='N', help='mini-batch size') + + parser.add_argument('--learning-rate', default=1., type=float, + metavar='LR', help='initial learning rate') + parser.add_argument('--gamma', default=.96, type=float, + metavar='GAMMA', help='learning rate exponential decay constant') + # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') + parser.add_argument('--weight-decay', default=1e-5, + type=float, metavar='W', help='weight decay') + parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) + parser.add_argument("--rho", metavar='RHO', type=float, default=.95) + + parser.add_argument('--n-bins', default=13, type=int, + metavar='N', help='number of bins in transforms') + + parser.add_argument('--world-size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', + type=str, help='url used to set up distributed training') + parser.add_argument('--dist-backend', default='nccl', + type=str, help='distributed backend') + parser.add_argument('--distributed', action="store_true") + + parser.add_argument('--dataset', default='librispeech', type=str) + parser.add_argument('--gradient', action="store_true") + parser.add_argument('--jit', action="store_true") + parser.add_argument('--viterbi-decoder', action="store_true") + args = parser.parse_args() + return args -if args.learning_rate < 0.: - args.learning_rate = 10 ** random.uniform(-3, 1) - -if args.weight_decay < 0.: - args.weight_decay = 10 ** random.uniform(-6, 0) -if args.gamma < 0.: - args.gamma = random.uniform(.95, 1.) +if __name__ == "__main__": + args = parse_args() + main(args) # Checkpoint @@ -160,7 +154,7 @@ def signal_handler(a, b): def trigger_job_requeue(): # Submit a new job to resume from checkpoint. - if os.path.isfile(CHECKPOINT_filename) and os.environ['SLURM_PROCID'] == '0' and os.getpid() == MAIN_PID: + if os.path.isfile(CHECKPOINT_filename) and os.environ['SLURM_PROCID'] == '0' and os.getpid() == MAIN_PID: print('pid: ', os.getpid(), ' ppid: ', os.getppid(), flush=True) print('time is up, back to slurm queue', flush=True) command = 'scontrol requeue ' + os.environ['SLURM_JOB_ID'] @@ -369,6 +363,7 @@ def decode(self, tensor): model = Wav2Letter(num_features, vocab_size) + def model_length_function(tensor): return int(tensor.shape[0])//2 + 1 From b95581525937917d36dfc765c17c307d597013a5 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 16:12:47 -0700 Subject: [PATCH 006/129] defining args in function. 
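Deriving world_size from SLURM_NNODES inside parse_args() keeps all argument post-processing in one place. On the data side, filter_speechcommands() now receives the dataset plus the split percentages explicitly; the underlying which_set() partition is deterministic because it hashes the base filename, so a given recording lands in the same split on every run. A condensed sketch of that convention (the function body is elided in the diffs here; this follows the original SpeechCommands recipe, so treat the constant and details as assumptions):

    import hashlib
    import os
    import re

    MAX_NUM_WAVS_PER_CLASS = 2 ** 27 - 1  # ~134M, from the original recipe

    def which_set(filename, validation_percentage, testing_percentage):
        base_name = os.path.basename(filename)
        # clips that differ only in their "_nohash_" suffix hash together,
        # so all recordings from one speaker stay in the same partition
        hash_name = re.sub(r"_nohash_.*$", "", base_name).encode("utf-8")
        bucket = int(hashlib.sha1(hash_name).hexdigest(), 16)
        percentage_hash = (bucket % (MAX_NUM_WAVS_PER_CLASS + 1)) * (
            100.0 / MAX_NUM_WAVS_PER_CLASS)
        if percentage_hash < validation_percentage:
            return "validation"
        elif percentage_hash < validation_percentage + testing_percentage:
            return "testing"
        return "training"
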
--- examples/pipeline/wav2letter.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index be16581d49..547bafca05 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -105,6 +105,16 @@ def parse_args(): args = parser.parse_args() + + # Use #nodes as world_size + if 'SLURM_NNODES' in os.environ: + args.world_size = int(os.environ['SLURM_NNODES']) + + args.distributed = args.distributed or args.world_size > 1 + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + print(pprint.pformat(vars(args)), flush=True) + return args @@ -188,12 +198,6 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): # Distributed -# Use #nodes as world_size -if 'SLURM_NNODES' in os.environ: - args.world_size = int(os.environ['SLURM_NNODES']) - -args.distributed = args.distributed or args.world_size > 1 - if args.distributed: os.environ['RANK'] = os.environ['SLURM_PROCID'] os.environ['WORLD_SIZE'] = str(args.world_size) @@ -524,7 +528,7 @@ def which_set(filename, validation_percentage, testing_percentage): return result -def filter_speechcommands(tag, training_percentage, data): +def filter_speechcommands(data, tag, training_percentage, validation_percentage): if training_percentage < 100.: testing_percentage = ( 100. - training_percentage - validation_percentage) @@ -542,7 +546,7 @@ def datasets_speechcommands(): def create(tag): data = SPEECHCOMMANDS(root, download=True) - data = filter_speechcommands(tag, training_percentage, data) + data = filter_speechcommands(data, tag, 90, 5) data = Processed(process_datapoint, data) # data = diskcache_iterator(data) data = MapMemoryCache(data) @@ -786,11 +790,11 @@ def count_parameters(model): best_loss = 1. loader_training = DataLoader( - training, batch_size=batch_size, collate_fn=collate_fn, **data_loader_training_params + training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params ) loader_validation = DataLoader( - validation, batch_size=batch_size, collate_fn=collate_fn, **data_loader_validation_params + validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params ) print("Length of data loaders: ", len(loader_training), From 7565d6123d63fe4d8e9a40b904778887e5f925ca Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 16:50:18 -0700 Subject: [PATCH 007/129] refactor. --- examples/pipeline/wav2letter.py | 793 ++++++++++++++------------------ 1 file changed, 345 insertions(+), 448 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 547bafca05..4ba1973783 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -43,17 +43,8 @@ from tabulate import tabulate -print("start time: {}".format(str(datetime.now())), flush=True) - matplotlib.use("Agg") -# Empty CUDA cache -torch.cuda.empty_cache() - -# Profiling performance -pr = cProfile.Profile() -pr.enable() - def parse_args(): parser = argparse.ArgumentParser() @@ -115,6 +106,8 @@ def parse_args(): if not args.distributed or os.environ['SLURM_PROCID'] == '0': print(pprint.pformat(vars(args)), flush=True) + args.clip_norm = 0. 
+ return args @@ -125,7 +118,6 @@ def parse_args(): # Checkpoint - MAIN_PID = os.getpid() CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' CHECKPOINT_tempfile = CHECKPOINT_filename + '.temp' @@ -209,125 +201,7 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): print('init process', flush=True) -# Parameters - - -if not args.distributed or os.environ['SLURM_PROCID'] == '0': - print(pprint.pformat(vars(args)), flush=True) - -audio_backend = "soundfile" -torchaudio.set_audio_backend(audio_backend) - -root = "/datasets01/" -folder_in_archive = "librispeech/062419/" - -device = "cuda" if torch.cuda.is_available() else "cpu" -num_devices = torch.cuda.device_count() -print(num_devices, "GPUs", flush=True) - -# max number of sentences per batch -batch_size = args.batch_size - -training_percentage = 90. -validation_percentage = 5. - -data_loader_training_params = { - "num_workers": args.workers, - "pin_memory": True, - "shuffle": True, - "drop_last": True, -} -data_loader_validation_params = data_loader_training_params.copy() -data_loader_validation_params["shuffle"] = False - -non_blocking = True - - -# Text preprocessing - -char_blank = "*" -char_space = " " -char_apostrophe = "'" - -labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase - -# excluded_dir = ["_background_noise_"] -# folder_speechcommands = './SpeechCommands/speech_commands_v0.02' -# labels = [char_blank, char_space] + [d for d in next(os.walk(folder_speechcommands))[1] if d not in excluded_dir] - - -# audio - -sample_rate_original = 16000 -sample_rate_new = 8000 - -n_bins = args.n_bins # 13, 128 -melkwargs = { - 'n_fft': 512, - 'n_mels': 20, - 'hop_length': 80, # (160, 80) -} - -transforms = nn.Sequential( - # torchaudio.transforms.Resample(sample_rate_original, sample_rate_new), - # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), - torchaudio.transforms.MelSpectrogram( - sample_rate=sample_rate_original, n_mels=n_bins), - # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), - # torchaudio.transforms.TimeMasking(time_mask_param=35) -) - - -# Optimizer - -optimizer_params_adadelta = { - "lr": args.learning_rate, - "eps": args.eps, - "rho": args.rho, - "weight_decay": args.weight_decay, -} - -optimizer_params_adam = { - "lr": args.learning_rate, - "eps": args.eps, - "weight_decay": args.weight_decay, -} - -optimizer_params_sgd = { - "lr": args.learning_rate, - "weight_decay": args.weight_decay, -} - -optimizer_params_adadelta = { - "lr": args.learning_rate, - "eps": args.eps, - "rho": args.rho, - "weight_decay": args.weight_decay, -} - -Optimizer = Adadelta -optimizer_params = optimizer_params_sgd - -# Model - -num_features = n_bins if n_bins else 1 - -lstm_params = { - "hidden_size": 800, - "num_layers": 5, - "batch_first": False, - "bidirectional": False, - "dropout": 0., -} - -clip_norm = 0. # 10. 
- -zero_infinity = False - - -# Text encoding - -class Coder: +class LanguageModel: def __init__(self, labels): labels = [l for l in labels] self.length = len(labels) @@ -356,20 +230,8 @@ def decode(self, tensor): return x -coder = Coder(labels) -encode = coder.encode -decode = coder.decode -vocab_size = coder.length -print("vocab_size", vocab_size, flush=True) - - -# Model - -model = Wav2Letter(num_features, vocab_size) - - def model_length_function(tensor): - return int(tensor.shape[0])//2 + 1 + return int(tensor.shape[0]) // 2 + 1 # Dataset @@ -555,12 +417,6 @@ def create(tag): return create("training"), create("validation"), create("testing") -if args.dataset == "librispeech": - training, validation, _ = datasets_librispeech() -elif args.dataset == "speechcommand": - training, validation, _ = datasets_speechcommands() - - # Word Decoder @@ -605,10 +461,6 @@ def build_transitions(): return transitions -if args.viterbi_decoder: - print("transitions: building", flush=True) - transitions = build_transitions() - print("transitions: done", flush=True) def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): @@ -705,21 +557,21 @@ def top_batch_viterbi_decode(tag_sequence: torch.Tensor): def levenshtein_distance(r: str, h: str, device: Optional[str] = None): # initialisation - d = torch.zeros((2, len(h)+1), dtype=torch.long) # , device=device) + d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) dold = 0 dnew = 1 # computation - for i in range(1, len(r)+1): + for i in range(1, len(r) + 1): d[dnew, 0] = 0 - for j in range(1, len(h)+1): + for j in range(1, len(h) + 1): - if r[i-1] == h[j-1]: - d[dnew, j] = d[dnew-1, j-1] + if r[i - 1] == h[j - 1]: + d[dnew, j] = d[dnew - 1, j - 1] else: - substitution = d[dnew-1, j-1] + 1 - insertion = d[dnew, j-1] + 1 - deletion = d[dnew-1, j] + 1 + substitution = d[dnew - 1, j - 1] + 1 + insertion = d[dnew, j - 1] + 1 + deletion = d[dnew - 1, j] + 1 d[dnew, j] = min(substitution, insertion, deletion) dnew, dold = dold, dnew @@ -729,9 +581,6 @@ def levenshtein_distance(r: str, h: str, device: Optional[str] = None): return dist -# Train - - def collate_fn(batch): tensors = [b[0] for b in batch if b] @@ -752,55 +601,10 @@ def collate_fn(batch): return tensors, targets, tensors_lengths, target_lengths -if args.jit: - model = torch.jit.script(model) - -if not args.distributed: - model = torch.nn.DataParallel(model) -else: - model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) - # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) - -model = model.to(device, non_blocking=non_blocking) -print('model cuda', flush=True) - - def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -if not args.distributed or os.environ['SLURM_PROCID'] == '0': - n = count_parameters(model) - print(f"Number of parameters: {n}", flush=True) - - -print(torch.cuda.memory_summary(), flush=True) - - -optimizer = Optimizer(model.parameters(), **optimizer_params) -scheduler = ExponentialLR(optimizer, gamma=args.gamma) -# scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) - -criterion = torch.nn.CTCLoss( - blank=coder.mapping[char_blank], zero_infinity=zero_infinity) -# criterion = nn.MSELoss() -# criterion = torch.nn.NLLLoss() - -best_loss = 1. 
- -loader_training = DataLoader( - training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params -) - -loader_validation = DataLoader( - validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params -) - -print("Length of data loaders: ", len(loader_training), - len(loader_validation), flush=True) - - def forward_loss(inputs, targets, tensors_lengths, target_lengths): inputs = inputs.to(device, non_blocking=non_blocking) @@ -823,9 +627,6 @@ def forward_loss(inputs, targets, tensors_lengths, target_lengths): return criterion(outputs, targets, tensors_lengths, target_lengths) -inds = random.sample(range(args.batch_size), k=2) - - def forward_decode(inputs, targets, decoder): inputs = inputs.to(device, non_blocking=True) @@ -836,14 +637,14 @@ def forward_decode(inputs, targets, decoder): target = decode(targets.tolist()) print_length = 20 - for i in inds: + for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] print( f"Epoch: {epoch:4} Target: {target_print} Output: {output_print}", flush=True) cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - cers_normalized = [d/len(a) for a, d in zip(target, cers)] + cers_normalized = [d / len(a) for a, d in zip(target, cers)] cers = statistics.mean(cers) cers_normalized = statistics.mean(cers_normalized) @@ -851,7 +652,7 @@ def forward_decode(inputs, targets, decoder): target = [o.split(char_space) for o in target] wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - wers_normalized = [d/len(a) for a, d in zip(target, wers)] + wers_normalized = [d / len(a) for a, d in zip(target, wers)] wers = statistics.mean(wers) wers_normalized = statistics.mean(wers_normalized) @@ -859,288 +660,384 @@ def forward_decode(inputs, targets, decoder): return cers, wers, cers_normalized, wers_normalized -history_loader = defaultdict(list) -history_training = defaultdict(list) -history_validation = defaultdict(list) -if args.resume and os.path.isfile(CHECKPOINT_filename): - print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) - checkpoint = torch.load(CHECKPOINT_filename) +def main(args): - args.start_epoch = checkpoint['epoch'] - best_loss = checkpoint['best_loss'] - history_training = checkpoint['history_training'] - history_validation = checkpoint['history_validation'] + print("start time: {}".format(str(datetime.now())), flush=True) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - scheduler.load_state_dict(checkpoint['scheduler']) + # Empty CUDA cache + torch.cuda.empty_cache() - print("Checkpoint: loaded '{}' at epoch {}".format( - CHECKPOINT_filename, checkpoint['epoch'])) - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) -else: - print("Checkpoint: not found") - - save_checkpoint({ - 'epoch': args.start_epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, False) - - -with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: - for epoch in range(args.start_epoch, args.epochs): - torch.cuda.reset_max_memory_allocated() - model.train() - - sum_loss = 0. - total_norm = 0. 
- for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_training, maxsize=2): - - loss = forward_loss( - inputs, targets, tensors_lengths, target_lengths) - sum_loss += loss.item() - - optimizer.zero_grad() - loss.backward() - - norm = 0. - if clip_norm > 0: - norm = torch.nn.utils.clip_grad_norm_( - model.parameters(), clip_norm) - total_norm += norm - elif args.gradient: - for p in list(filter(lambda p: p.grad is not None, model.parameters())): - norm += p.grad.data.norm(2).item() ** 2 - norm = norm ** .5 - total_norm += norm - - optimizer.step() + # Profiling performance + pr = cProfile.Profile() + pr.enable() - memory = torch.cuda.max_memory_allocated() - # print(f"memory in training: {memory}", flush=True) + audio_backend = "soundfile" + torchaudio.set_audio_backend(audio_backend) - history_loader["epoch"].append(epoch) - history_loader["n"].append(pbar.n) - history_loader["memory"].append(memory) + root = "/datasets01/" + folder_in_archive = "librispeech/062419/" - if SIGNAL_RECEIVED: - save_checkpoint({ - 'epoch': epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, False) - trigger_job_requeue() + device = "cuda" if torch.cuda.is_available() else "cpu" + num_devices = torch.cuda.device_count() + print(num_devices, "GPUs", flush=True) - pbar.update(1/len(loader_training)) + data_loader_training_params = { + "num_workers": args.workers, + "pin_memory": True, + "shuffle": True, + "drop_last": True, + } + data_loader_validation_params = data_loader_training_params.copy() + data_loader_validation_params["shuffle"] = False - total_norm = (total_norm ** .5) / len(loader_training) - if total_norm > 0: - print( - f"Epoch: {epoch:4} Gradient: {total_norm:4.5f}", flush=True) + non_blocking = True - # Average loss - sum_loss = sum_loss / len(loader_training) - sum_loss_str = f"Epoch: {epoch:4} Train: {sum_loss:4.5f}" + # audio - scheduler.step() + n_bins = args.n_bins # 13, 128 + melkwargs = { + 'n_fft': 512, + 'n_mels': 20, + 'hop_length': 80, # (160, 80) + } - memory = torch.cuda.max_memory_allocated() - print(f"memory after training: {memory}", flush=True) + sample_rate_original = 16000 + sample_rate_new = 8000 - history_training["epoch"].append(epoch) - history_training["gradient_norm"].append(total_norm) - history_training["sum_loss"].append(sum_loss) - history_training["max_memory_allocated"].append(memory) + transforms = nn.Sequential( + # torchaudio.transforms.Resample(sample_rate_original, sample_rate_new), + # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), + torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, n_mels=n_bins), + # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), + # torchaudio.transforms.TimeMasking(time_mask_param=35) + ) - if not epoch % args.print_freq or epoch == args.epochs - 1: + # Text preprocessing - with torch.no_grad(): + char_blank = "*" + char_space = " " + char_apostrophe = "'" - # Switch to evaluation mode - model.eval() + labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase + coder = LanguageModel(labels) + encode = coder.encode + decode = coder.decode + vocab_size = coder.length + print("vocab_size", vocab_size, flush=True) - sum_loss = 0. 
- sum_out_greedy = [0, 0, 0, 0] - sum_out_viterbi = [0, 0, 0, 0] + if args.dataset == "librispeech": + training, validation, _ = datasets_librispeech() + elif args.dataset == "speechcommand": + training, validation, _ = datasets_speechcommands() - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_validation, maxsize=2): - sum_loss += forward_loss(inputs, targets, - tensors_lengths, target_lengths).item() + if args.viterbi_decoder: + print("transitions: building", flush=True) + transitions = build_transitions() + print("transitions: done", flush=True) - if True: - out_greedy = forward_decode( - inputs, targets, greedy_decode) - for i in range(len(out_greedy)): - sum_out_greedy[i] += out_greedy[i] - if args.viterbi_decoder: - out_viterbi = forward_decode( - inputs, targets, top_batch_viterbi_decode) - for i in range(len(out_greedy)): - sum_out_viterbi[i] += out_viterbi[i] - - if SIGNAL_RECEIVED: - break - - # Average loss - sum_loss = sum_loss / len(loader_validation) - sum_loss_str += f" Validation: {sum_loss:.5f}" - print(sum_loss_str, flush=True) - - if True: - for i in range(len(out_greedy)): - sum_out_greedy[i] /= len(loader_validation) - print(f"greedy decoder: {sum_out_greedy}", flush=True) - cer1, wer1, cern1, wern1 = sum_out_greedy - if args.viterbi_decoder: - for i in range(len(out_viterbi)): - sum_out_viterbi[i] /= len(loader_validation) - print(f"viterbi decoder: {sum_out_viterbi}", flush=True) - cer2, wer2, cern2, wern2 = sum_out_viterbi + # Model - memory = torch.cuda.max_memory_allocated() - print(f"memory after validation: {memory}", flush=True) + num_features = n_bins if n_bins else 1 + model = Wav2Letter(num_features, vocab_size) + + if args.jit: + model = torch.jit.script(model) + + if not args.distributed: + model = torch.nn.DataParallel(model) + else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) + + model = model.to(device, non_blocking=non_blocking) + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + n = count_parameters(model) + print(f"Number of parameters: {n}", flush=True) + + print(torch.cuda.memory_summary(), flush=True) + + # Optimizer + + optimizer_params = { + "lr": args.learning_rate, + # "eps": args.eps, + # "rho": args.rho, + "weight_decay": args.weight_decay, + } + + Optimizer = Adadelta + optimizer_params = optimizer_params + + optimizer = Optimizer(model.parameters(), **optimizer_params) + scheduler = ExponentialLR(optimizer, gamma=args.gamma) + # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) + + criterion = torch.nn.CTCLoss(blank=coder.mapping[char_blank], zero_infinity=False) + # criterion = nn.MSELoss() + # criterion = torch.nn.NLLLoss() + + best_loss = 1. 
+ + loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params) + loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params) + + print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) - history_validation["epoch"].append(epoch) - history_validation["max_memory_allocated"].append(memory) - history_validation["sum_loss"].append(sum_loss) - - if True: - history_validation["greedy_cer"].append(cer1) - history_validation["greedy_cer_normalized"].append(cern1) - history_validation["greedy_wer"].append(wer1) - history_validation["greedy_wer_normalized"].append(wern1) - if args.viterbi_decoder: - history_validation["viterbi_cer"].append(cer2) - history_validation["viterbi_cer_normalized"].append(cern2) - history_validation["viterbi_wer"].append(wer2) - history_validation["viterbi_wer_normalized"].append(wern2) - - is_best = sum_loss < best_loss - best_loss = min(sum_loss, best_loss) - save_checkpoint({ - 'epoch': epoch + 1, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, is_best) + history_loader = defaultdict(list) + history_training = defaultdict(list) + history_validation = defaultdict(list) - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) - print(torch.cuda.memory_summary(), flush=True) + if args.resume and os.path.isfile(CHECKPOINT_filename): + print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) + checkpoint = torch.load(CHECKPOINT_filename) - # scheduler.step(sum_loss) + args.start_epoch = checkpoint['epoch'] + best_loss = checkpoint['best_loss'] + history_training = checkpoint['history_training'] + history_validation = checkpoint['history_validation'] - # Create an empty file HALT_filename, mark the job as finished - if epoch == args.epochs - 1: - open(HALT_filename, 'a').close() + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) + print(tabulate(history_training, headers="keys"), flush=True) + print(tabulate(history_validation, headers="keys"), flush=True) + else: + print("Checkpoint: not found") + + save_checkpoint({ + 'epoch': args.start_epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, False) + + with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: + for epoch in range(args.start_epoch, args.epochs): + torch.cuda.reset_max_memory_allocated() + model.train() + + sum_loss = 0. + total_norm = 0. + for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_training, maxsize=2): + + loss = forward_loss(inputs, targets, tensors_lengths, target_lengths) + sum_loss += loss.item() + + optimizer.zero_grad() + loss.backward() + + norm = 0. 
+ if args.clip_norm > 0: + norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm) + total_norm += norm + elif args.gradient: + for p in list(filter(lambda p: p.grad is not None, model.parameters())): + norm += p.grad.data.norm(2).item() ** 2 + norm = norm ** .5 + total_norm += norm + + optimizer.step() -print(tabulate(history_training, headers="keys"), flush=True) -print(tabulate(history_validation, headers="keys"), flush=True) -print(torch.cuda.memory_summary(), flush=True) -print(tabulate(history_loader, headers="keys"), flush=True) + memory = torch.cuda.max_memory_allocated() + # print(f"memory in training: {memory}", flush=True) + history_loader["epoch"].append(epoch) + history_loader["n"].append(pbar.n) + history_loader["memory"].append(memory) -plt.plot(history_loader["epoch"], - history_loader["memory"], label="memory") + if SIGNAL_RECEIVED: + save_checkpoint({ + 'epoch': epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, False) + trigger_job_requeue() + pbar.update(1/len(loader_training)) -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + total_norm = (total_norm ** .5) / len(loader_training) + if total_norm > 0: + print(f"Epoch: {epoch:4} Gradient: {total_norm:4.5f}", flush=True) - if "greedy_cer" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["greedy_cer"], label="greedy") - if "viterbi_cer" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["viterbi_cer"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "cer.png") + # Average loss + sum_loss = sum_loss / len(loader_training) + sum_loss_str = f"Epoch: {epoch:4} Train: {sum_loss:4.5f}" + scheduler.step() -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + memory = torch.cuda.max_memory_allocated() + print(f"memory after training: {memory}", flush=True) - if "greedy_wer" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["greedy_wer"], label="greedy") - if "viterbi_wer" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["viterbi_wer"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "wer.png") + history_training["epoch"].append(epoch) + history_training["gradient_norm"].append(total_norm) + history_training["sum_loss"].append(sum_loss) + history_training["max_memory_allocated"].append(memory) + if not epoch % args.print_freq or epoch == args.epochs - 1: -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + with torch.no_grad(): - if "greedy_cer_normalized" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["greedy_cer_normalized"], label="greedy") - if "viterbi_cer_normalized" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["viterbi_cer_normalized"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "cer_normalized.png") + # Switch to evaluation mode + model.eval() + sum_loss = 0. 
+ sum_out_greedy = [0, 0, 0, 0] + sum_out_viterbi = [0, 0, 0, 0] -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_validation, maxsize=2): + sum_loss += forward_loss(inputs, targets, tensors_lengths, target_lengths).item() - if "greedy_wer_normalized" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["greedy_wer_normalized"], label="greedy") - if "viterbi_wer_normalized" in history_validation: - plt.plot(history_validation["epoch"], - history_validation["viterbi_wer_normalized"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "wer_normalized.png") + if True: + out_greedy = forward_decode(inputs, targets, greedy_decode) + for i in range(len(out_greedy)): + sum_out_greedy[i] += out_greedy[i] + if args.viterbi_decoder: + out_viterbi = forward_decode(inputs, targets, top_batch_viterbi_decode) + for i in range(len(out_greedy)): + sum_out_viterbi[i] += out_viterbi[i] + if SIGNAL_RECEIVED: + break -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + # Average loss + sum_loss = sum_loss / len(loader_validation) + sum_loss_str += f" Validation: {sum_loss:.5f}" + print(sum_loss_str, flush=True) - plt.plot(history_training["epoch"], - history_training["sum_loss"], label="training") - plt.plot(history_validation["epoch"], - history_validation["sum_loss"], label="validation") - plt.legend() - plt.savefig(os.path.join(args.figures, "sum_loss.png") + if True: + for i in range(len(out_greedy)): + sum_out_greedy[i] /= len(loader_validation) + print(f"greedy decoder: {sum_out_greedy}", flush=True) + cer1, wer1, cern1, wern1 = sum_out_greedy + if args.viterbi_decoder: + for i in range(len(out_viterbi)): + sum_out_viterbi[i] /= len(loader_validation) + print(f"viterbi decoder: {sum_out_viterbi}", flush=True) + cer2, wer2, cern2, wern2 = sum_out_viterbi + memory = torch.cuda.max_memory_allocated() + print(f"memory after validation: {memory}", flush=True) -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + history_validation["epoch"].append(epoch) + history_validation["max_memory_allocated"].append(memory) + history_validation["sum_loss"].append(sum_loss) - plt.plot(history_training["epoch"], - history_training["sum_loss"], label="training") - plt.plot(history_validation["epoch"], - history_validation["sum_loss"], label="validation") - plt.yscale("log") - plt.legend() - plt.savefig(os.path.join(args.figures, "log_sum_loss.png") + if True: + history_validation["greedy_cer"].append(cer1) + history_validation["greedy_cer_normalized"].append(cern1) + history_validation["greedy_wer"].append(wer1) + history_validation["greedy_wer_normalized"].append(wern1) + if args.viterbi_decoder: + history_validation["viterbi_cer"].append(cer2) + history_validation["viterbi_cer_normalized"].append(cern2) + history_validation["viterbi_wer"].append(wer2) + history_validation["viterbi_wer_normalized"].append(wern2) + + is_best = sum_loss < best_loss + best_loss = min(sum_loss, best_loss) + save_checkpoint({ + 'epoch': epoch + 1, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, is_best) + + print(tabulate(history_training, headers="keys"), flush=True) + print(tabulate(history_validation, headers="keys"), flush=True) + print(torch.cuda.memory_summary(), flush=True) + + # 
scheduler.step(sum_loss) + + # Create an empty file HALT_filename, mark the job as finished + if epoch == args.epochs - 1: + open(HALT_filename, 'a').close() -if not args.distributed or os.environ['SLURM_PROCID'] == '0': + print(tabulate(history_training, headers="keys"), flush=True) + print(tabulate(history_validation, headers="keys"), flush=True) print(torch.cuda.memory_summary(), flush=True) + print(tabulate(history_loader, headers="keys"), flush=True) + + plt.plot(history_loader["epoch"], history_loader["memory"], label="memory") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + + if "greedy_cer" in history_validation: + plt.plot(history_validation["epoch"], history_validation["greedy_cer"], label="greedy") + if "viterbi_cer" in history_validation: + plt.plot(history_validation["epoch"], history_validation["viterbi_cer"], label="viterbi") + plt.legend() + plt.savefig(os.path.join(args.figures, "cer.png") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + + if "greedy_wer" in history_validation: + plt.plot(history_validation["epoch"], history_validation["greedy_wer"], label="greedy") + if "viterbi_wer" in history_validation: + plt.plot(history_validation["epoch"], history_validation["viterbi_wer"], label="viterbi") + plt.legend() + plt.savefig(os.path.join(args.figures, "wer.png") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + if "greedy_cer_normalized" in history_validation: + plt.plot(history_validation["epoch"], history_validation["greedy_cer_normalized"], label="greedy") + if "viterbi_cer_normalized" in history_validation: + plt.plot(history_validation["epoch"], history_validation["viterbi_cer_normalized"], label="viterbi") + plt.legend() + plt.savefig(os.path.join(args.figures, "cer_normalized.png") -# Print performance -pr.disable() -s = StringIO() -ps = ( - pstats - .Stats(pr, stream=s) - .strip_dirs() - .sort_stats("cumtime") - .print_stats(20) -) -print(s.getvalue(), flush=True) -print("stop time: {}".format(str(datetime.now())), flush=True) + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + + if "greedy_wer_normalized" in history_validation: + plt.plot(history_validation["epoch"], history_validation["greedy_wer_normalized"], label="greedy") + if "viterbi_wer_normalized" in history_validation: + plt.plot(history_validation["epoch"], history_validation["viterbi_wer_normalized"], label="viterbi") + plt.legend() + plt.savefig(os.path.join(args.figures, "wer_normalized.png") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + + plt.plot(history_training["epoch"], history_training["sum_loss"], label="training") + plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") + plt.legend() + plt.savefig(os.path.join(args.figures, "sum_loss.png") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + + plt.plot(history_training["epoch"], history_training["sum_loss"], label="training") + plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") + plt.yscale("log") + plt.legend() + plt.savefig(os.path.join(args.figures, "log_sum_loss.png") + + if not args.distributed or os.environ['SLURM_PROCID'] == '0': + print(torch.cuda.memory_summary(), flush=True) + + # Print performance + pr.disable() + s = StringIO() + ps = ( + pstats + .Stats(pr, stream=s) + .strip_dirs() + .sort_stats("cumtime") + .print_stats(20) + ) + print(s.getvalue(), flush=True) + print("stop time: {}".format(str(datetime.now())), flush=True) From 
8fef1714699505dcf0b672320b3c1312ab3370f3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 16:51:06 -0700 Subject: [PATCH 008/129] lint. --- examples/pipeline/wav2letter.py | 78 ++++++++++++++------------------- 1 file changed, 33 insertions(+), 45 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 4ba1973783..ce416d0524 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -9,17 +9,14 @@ import cProfile import hashlib import itertools -import math import os import pprint import pstats -import random import re import shutil import signal import statistics import string -from array import array from collections import defaultdict from datetime import datetime from io import StringIO @@ -48,26 +45,26 @@ def parse_args(): parser = argparse.ArgumentParser() - + parser.add_argument('--workers', default=0, type=int, metavar='N', help='number of data loading workers') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint') parser.add_argument('--figures', default='', type=str, metavar='PATH', help='folder path to save figures') - + parser.add_argument('--epochs', default=200, type=int, metavar='N', help='number of total epochs to run') parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number') parser.add_argument('--print-freq', default=10, type=int, metavar='N', help='print frequency in epochs') - + parser.add_argument('--arch', metavar='ARCH', default='wav2letter', choices=["wav2letter", "lstm"], help='model architecture') parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size') - + parser.add_argument('--learning-rate', default=1., type=float, metavar='LR', help='initial learning rate') parser.add_argument('--gamma', default=.96, type=float, @@ -77,10 +74,10 @@ def parse_args(): type=float, metavar='W', help='weight decay') parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) parser.add_argument("--rho", metavar='RHO', type=float, default=.95) - + parser.add_argument('--n-bins', default=13, type=int, metavar='N', help='number of bins in transforms') - + parser.add_argument('--world-size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', @@ -88,12 +85,12 @@ def parse_args(): parser.add_argument('--dist-backend', default='nccl', type=str, help='distributed backend') parser.add_argument('--distributed', action="store_true") - + parser.add_argument('--dataset', default='librispeech', type=str) parser.add_argument('--gradient', action="store_true") parser.add_argument('--jit', action="store_true") parser.add_argument('--viterbi-decoder', action="store_true") - + args = parser.parse_args() @@ -202,7 +199,7 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): class LanguageModel: - def __init__(self, labels): + def __init__(self, labels, char_blank): labels = [l for l in labels] self.length = len(labels) enumerated = list(enumerate(labels)) @@ -216,7 +213,7 @@ def encode(self, iterable): if isinstance(iterable, list): return [self.encode(i) for i in iterable] else: - return [self.mapping[i] + self.mapping[char_blank] for i in iterable] + return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] def decode(self, tensor): if isinstance(tensor[0], list): @@ -225,7 +222,7 @@ def decode(self, tensor): # not idempotent, since clean string x = 
(self.mapping[i] for i in tensor) x = ''.join(i for i, _ in itertools.groupby(x)) - x = x.replace(char_blank, "") + x = x.replace(self.char_blank, "") # x = x.strip() return x @@ -317,13 +314,12 @@ def process_datapoint(item): return transformed, target -def datasets_librispeech(): +def datasets_librispeech(root="/datasets01/", folder_in_archive="librispeech/062419/"): def create(tag): if isinstance(tag, str): - data = LIBRISPEECH( - root, tag, folder_in_archive=folder_in_archive, download=False) + data = LIBRISPEECH(root, tag, folder_in_archive=folder_in_archive, download=False) else: data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) @@ -392,8 +388,7 @@ def which_set(filename, validation_percentage, testing_percentage): def filter_speechcommands(data, tag, training_percentage, validation_percentage): if training_percentage < 100.: - testing_percentage = ( - 100. - training_percentage - validation_percentage) + testing_percentage = (100. - training_percentage - validation_percentage) def which_set_filter(x): return which_set(x, validation_percentage, testing_percentage) == tag @@ -402,9 +397,7 @@ def which_set_filter(x): return data -def datasets_speechcommands(): - - root = "./" +def datasets_speechcommands(root="./"): def create(tag): data = SPEECHCOMMANDS(root, download=True) @@ -417,9 +410,6 @@ def create(tag): return create("training"), create("validation"), create("testing") -# Word Decoder - - def greedy_decode(outputs): """Greedy Decoder. Returns highest probability of class labels for each timestep @@ -433,7 +423,7 @@ def greedy_decode(outputs): return indices[..., 0] -def build_transitions(): +def build_transitions(training, vocab_size): from collections import Counter @@ -455,14 +445,11 @@ def build_transitions(): transitions = torch.sparse_coo_tensor(indices=ind, values=val, size=[ vocab_size, vocab_size]).coalesce().to_dense() - transitions = (transitions/torch.max(torch.tensor(1.), - transitions.max(dim=1)[0]).unsqueeze(1)) + transitions = (transitions / torch.max(torch.tensor(1.), transitions.max(dim=1)[0]).unsqueeze(1)) return transitions - - def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): """ Perform Viterbi decoding in log space over a sequence given a transition matrix @@ -542,7 +529,7 @@ def batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Te outputs = [] scores = [] for i in range(tag_sequence.shape[1]): - paths, score = viterbi_decode(tag_sequence[:, i, :], transitions) + paths, score = viterbi_decode(tag_sequence[:, i, :], transition_matrix) outputs.append(paths) scores.append(score) @@ -675,9 +662,6 @@ def main(args): audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) - root = "/datasets01/" - folder_in_archive = "librispeech/062419/" - device = "cuda" if torch.cuda.is_available() else "cpu" num_devices = torch.cuda.device_count() print(num_devices, "GPUs", flush=True) @@ -720,7 +704,7 @@ def main(args): char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase - coder = LanguageModel(labels) + coder = LanguageModel(labels, char_blank) encode = coder.encode decode = coder.decode vocab_size = coder.length @@ -733,7 +717,7 @@ def main(args): if args.viterbi_decoder: print("transitions: building", flush=True) - transitions = build_transitions() + transitions = build_transitions(training, vocab_size) print("transitions: done", flush=True) # Model @@ -865,7 +849,7 @@ def main(args): }, 
False) trigger_job_requeue() - pbar.update(1/len(loader_training)) + pbar.update(1 / len(loader_training)) total_norm = (total_norm ** .5) / len(loader_training) if total_norm > 0: @@ -967,7 +951,6 @@ def main(args): if epoch == args.epochs - 1: open(HALT_filename, 'a').close() - print(tabulate(history_training, headers="keys"), flush=True) print(tabulate(history_validation, headers="keys"), flush=True) print(torch.cuda.memory_summary(), flush=True) @@ -982,7 +965,7 @@ def main(args): if "viterbi_cer" in history_validation: plt.plot(history_validation["epoch"], history_validation["viterbi_cer"], label="viterbi") plt.legend() - plt.savefig(os.path.join(args.figures, "cer.png") + plt.savefig(os.path.join(args.figures, "cer.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': @@ -991,7 +974,7 @@ def main(args): if "viterbi_wer" in history_validation: plt.plot(history_validation["epoch"], history_validation["viterbi_wer"], label="viterbi") plt.legend() - plt.savefig(os.path.join(args.figures, "wer.png") + plt.savefig(os.path.join(args.figures, "wer.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': @@ -1000,7 +983,7 @@ def main(args): if "viterbi_cer_normalized" in history_validation: plt.plot(history_validation["epoch"], history_validation["viterbi_cer_normalized"], label="viterbi") plt.legend() - plt.savefig(os.path.join(args.figures, "cer_normalized.png") + plt.savefig(os.path.join(args.figures, "cer_normalized.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': @@ -1009,14 +992,14 @@ def main(args): if "viterbi_wer_normalized" in history_validation: plt.plot(history_validation["epoch"], history_validation["viterbi_wer_normalized"], label="viterbi") plt.legend() - plt.savefig(os.path.join(args.figures, "wer_normalized.png") + plt.savefig(os.path.join(args.figures, "wer_normalized.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': plt.plot(history_training["epoch"], history_training["sum_loss"], label="training") plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") plt.legend() - plt.savefig(os.path.join(args.figures, "sum_loss.png") + plt.savefig(os.path.join(args.figures, "sum_loss.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': @@ -1024,7 +1007,7 @@ def main(args): plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") plt.yscale("log") plt.legend() - plt.savefig(os.path.join(args.figures, "log_sum_loss.png") + plt.savefig(os.path.join(args.figures, "log_sum_loss.png")) if not args.distributed or os.environ['SLURM_PROCID'] == '0': print(torch.cuda.memory_summary(), flush=True) @@ -1032,7 +1015,7 @@ def main(args): # Print performance pr.disable() s = StringIO() - ps = ( + ( pstats .Stats(pr, stream=s) .strip_dirs() @@ -1041,3 +1024,8 @@ def main(args): ) print(s.getvalue(), flush=True) print("stop time: {}".format(str(datetime.now())), flush=True) + + +if __name__ == "__main__": + args = parse_args() + main(args) From b27ec81bf760e8e9797e4c973ebc6f8816154395 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 13 May 2020 17:16:04 -0700 Subject: [PATCH 009/129] checkpoint. 
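Passing the checkpoint filename through explicitly makes the save path easier to follow: write to a temp file, then os.rename() over the real name. On a POSIX filesystem the rename is atomic, so a SIGUSR1 arriving mid-save can at worst lose the temp file, never leave a truncated checkpoint behind. The pattern in isolation (a sketch; the model_best name is illustrative, not taken from this file):

    import os
    import shutil
    import torch

    def save_atomically(state, is_best, filename):
        tmp = filename + ".temp"
        torch.save(state, tmp)     # interruptible: only the temp file is at risk
        os.rename(tmp, filename)   # atomic replace: readers never see a partial file
        if is_best:
            shutil.copyfile(filename, "model_best.pth.tar")
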
--- examples/pipeline/wav2letter.py | 93 +++++++++++++++------------------ 1 file changed, 41 insertions(+), 52 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index ce416d0524..167821f173 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -41,6 +41,8 @@ matplotlib.use("Agg") +MAIN_PID = os.getpid() +SIGNAL_RECEIVED = False def parse_args(): @@ -108,39 +110,14 @@ def parse_args(): return args -if __name__ == "__main__": - args = parse_args() - main(args) - - -# Checkpoint - -MAIN_PID = os.getpid() -CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' -CHECKPOINT_tempfile = CHECKPOINT_filename + '.temp' -HALT_filename = CHECKPOINT_filename + '.HALT' -SIGNAL_RECEIVED = False - -# HALT file is used as a sign of job completion. -# Make sure no HALT file left from previous runs. -if os.path.isfile(HALT_filename): - os.remove(HALT_filename) - -# Remove CHECKPOINT_tempfile, in case the signal arrives in the -# middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename -if os.path.isfile(CHECKPOINT_tempfile): - os.remove(CHECKPOINT_tempfile) - - def SIGTERM_handler(a, b): print('received sigterm') pass -def signal_handler(a, b): +def signal_handler(a, b, HALT_filename): global SIGNAL_RECEIVED - print('Signal received', a, datetime.now().strftime( - "%y%m%d.%H%M%S"), flush=True) + print('Signal received', a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) SIGNAL_RECEIVED = True # If HALT file exists, which means the job is done, exit peacefully. @@ -151,7 +128,7 @@ def signal_handler(a, b): return -def trigger_job_requeue(): +def trigger_job_requeue(CHECKPOINT_filename): # Submit a new job to resume from checkpoint. if os.path.isfile(CHECKPOINT_filename) and os.environ['SLURM_PROCID'] == '0' and os.getpid() == MAIN_PID: print('pid: ', os.getpid(), ' ppid: ', os.getppid(), flush=True) @@ -164,19 +141,20 @@ def trigger_job_requeue(): exit(0) -# Install signal handler -signal.signal(signal.SIGUSR1, signal_handler) -signal.signal(signal.SIGTERM, SIGTERM_handler) -print('Signal handler installed', flush=True) - - -def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): +def save_checkpoint(state, is_best, filename): """ Save the model to a temporary file first, then copy it to filename, in case the signal interrupts the torch.save() process. 
""" if not args.distributed or os.environ['SLURM_PROCID'] == '0': + CHECKPOINT_tempfile = filename + '.temp' + + # Remove CHECKPOINT_tempfile, in case the signal arrives in the + # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename + if os.path.isfile(CHECKPOINT_tempfile): + os.remove(CHECKPOINT_tempfile) + torch.save(state, CHECKPOINT_tempfile) if os.path.isfile(CHECKPOINT_tempfile): os.rename(CHECKPOINT_tempfile, filename) @@ -185,19 +163,6 @@ def save_checkpoint(state, is_best, filename=CHECKPOINT_filename): print("Checkpoint: saved") -# Distributed - -if args.distributed: - os.environ['RANK'] = os.environ['SLURM_PROCID'] - os.environ['WORLD_SIZE'] = str(args.world_size) - print('in distributed', os.environ['RANK'], - os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'], flush=True) - dist.init_process_group(backend=args.dist_backend, - init_method=args.dist_url, world_size=args.world_size) - - print('init process', flush=True) - - class LanguageModel: def __init__(self, labels, char_blank): labels = [l for l in labels] @@ -659,6 +624,30 @@ def main(args): pr = cProfile.Profile() pr.enable() + # Checkpoint + + CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' + HALT_filename = CHECKPOINT_filename + '.HALT' + + # HALT file is used as a sign of job completion. + # Make sure no HALT file left from previous runs. + if os.path.isfile(HALT_filename): + os.remove(HALT_filename) + + # Install signal handler + signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b, HALT_filename)) + signal.signal(signal.SIGTERM, SIGTERM_handler) + print('Signal handler installed', flush=True) + + # Distributed + + if args.distributed: + os.environ['RANK'] = os.environ['SLURM_PROCID'] + os.environ['WORLD_SIZE'] = str(args.world_size) + print('in distributed', os.environ['RANK'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'], flush=True) + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size) + print('init process', flush=True) + audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) @@ -801,7 +790,7 @@ def main(args): 'scheduler': scheduler.state_dict(), 'history_training': history_training, 'history_validation': history_validation, - }, False) + }, False, CHECKPOINT_filename) with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: for epoch in range(args.start_epoch, args.epochs): @@ -846,8 +835,8 @@ def main(args): 'scheduler': scheduler.state_dict(), 'history_training': history_training, 'history_validation': history_validation, - }, False) - trigger_job_requeue() + }, False, CHECKPOINT_filename) + trigger_job_requeue(CHECKPOINT_filename) pbar.update(1 / len(loader_training)) @@ -939,7 +928,7 @@ def main(args): 'scheduler': scheduler.state_dict(), 'history_training': history_training, 'history_validation': history_validation, - }, is_best) + }, is_best, CHECKPOINT_filename) print(tabulate(history_training, headers="keys"), flush=True) print(tabulate(history_validation, headers="keys"), flush=True) From 4c5e4de291f90e15adff34907bd49fae65f5ff11 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 19 May 2020 15:27:40 -0700 Subject: [PATCH 010/129] clean version to start with. 
--- examples/pipeline/wav2letterclean.py | 629 +++++++++++++++++++++++++++ 1 file changed, 629 insertions(+) create mode 100644 examples/pipeline/wav2letterclean.py diff --git a/examples/pipeline/wav2letterclean.py b/examples/pipeline/wav2letterclean.py new file mode 100644 index 0000000000..acee79286c --- /dev/null +++ b/examples/pipeline/wav2letterclean.py @@ -0,0 +1,629 @@ +import argparse +import collections +import cProfile +import hashlib +import itertools +import os +import pprint +import pstats +import re +import shutil +import signal +import statistics +import string +from collections import defaultdict +from datetime import datetime +from io import StringIO +from typing import Optional + +import matplotlib +import torch +import torch.distributed as dist +import torchaudio +from matplotlib import pyplot as plt +from torch import nn, topk +from torch.optim import SGD, Adadelta, Adam +from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau +from torch.utils.data import DataLoader +from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS +from torchaudio.datasets.utils import bg_iterator, diskcache_iterator +from torchaudio.models.wav2letter import Wav2Letter +from torchaudio.transforms import MFCC, Resample +from tqdm.notebook import tqdm as tqdm + +from tabulate import tabulate + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument('--workers', default=0, type=int, + metavar='N', help='number of data loading workers') + parser.add_argument('--resume', default='', type=str, + metavar='PATH', help='path to latest checkpoint') + parser.add_argument('--figures', default='', type=str, + metavar='PATH', help='folder path to save figures') + + parser.add_argument('--epochs', default=200, type=int, + metavar='N', help='number of total epochs to run') + parser.add_argument('--start-epoch', default=0, type=int, + metavar='N', help='manual epoch number') + parser.add_argument('--print-freq', default=10, type=int, + metavar='N', help='print frequency in epochs') + + parser.add_argument('--arch', metavar='ARCH', default='wav2letter', + choices=["wav2letter", "lstm"], help='model architecture') + parser.add_argument('--batch-size', default=64, type=int, + metavar='N', help='mini-batch size') + + parser.add_argument('--learning-rate', default=1., type=float, + metavar='LR', help='initial learning rate') + parser.add_argument('--gamma', default=.96, type=float, + metavar='GAMMA', help='learning rate exponential decay constant') + # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') + parser.add_argument('--weight-decay', default=1e-5, + type=float, metavar='W', help='weight decay') + parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) + parser.add_argument("--rho", metavar='RHO', type=float, default=.95) + + parser.add_argument('--n-bins', default=13, type=int, + metavar='N', help='number of bins in transforms') + + parser.add_argument('--world-size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', + type=str, help='url used to set up distributed training') + parser.add_argument('--dist-backend', default='nccl', + type=str, help='distributed backend') + parser.add_argument('--distributed', action="store_true") + + parser.add_argument('--dataset', default='librispeech', type=str) + parser.add_argument('--gradient', action="store_true") + parser.add_argument('--jit', action="store_true") + 
parser.add_argument('--viterbi-decoder', action="store_true")
+
+    args = parser.parse_args()
+
+
+    # Use #nodes as world_size
+    if 'SLURM_NNODES' in os.environ:
+        args.world_size = int(os.environ['SLURM_NNODES'])
+
+    args.distributed = args.distributed or args.world_size > 1
+
+    if not args.distributed or os.environ['SLURM_PROCID'] == '0':
+        print(pprint.pformat(vars(args)), flush=True)
+
+    args.clip_norm = 0.
+
+    return args
+
+
+def SIGTERM_handler(a, b):
+    print('received sigterm')
+    pass
+
+
+def signal_handler(a, b):
+    global SIGNAL_RECEIVED
+    print('Signal received', a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True)
+    SIGNAL_RECEIVED = True
+
+
+def save_checkpoint(state, is_best, filename):
+    """
+    Save the model to a temporary file first,
+    then copy it to filename, in case the signal interrupts
+    the torch.save() process.
+    """
+    CHECKPOINT_tempfile = filename + '.temp'
+
+    # Remove CHECKPOINT_tempfile, in case the signal arrives in the
+    # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename
+    if os.path.isfile(CHECKPOINT_tempfile):
+        os.remove(CHECKPOINT_tempfile)
+
+    torch.save(state, CHECKPOINT_tempfile)
+    if os.path.isfile(CHECKPOINT_tempfile):
+        os.rename(CHECKPOINT_tempfile, filename)
+    if is_best:
+        shutil.copyfile(filename, 'model_best.pth.tar')
+    print("Checkpoint: saved")
+
+
+class LanguageModel:
+    def __init__(self, labels, char_blank):
+        # store the blank symbol, since encode refers to it below
+        self.char_blank = char_blank
+        labels = [l for l in labels]
+        self.length = len(labels)
+        enumerated = list(enumerate(labels))
+        flipped = [(sub[1], sub[0]) for sub in enumerated]
+
+        d1 = collections.OrderedDict(enumerated)
+        d2 = collections.OrderedDict(flipped)
+        self.mapping = {**d1, **d2}
+
+    def encode(self, iterable):
+        if isinstance(iterable, list):
+            return [self.encode(i) for i in iterable]
+        else:
+            return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable]
+
+    def decode(self, tensor):
+        if isinstance(tensor[0], list):
+            return [self.decode(t) for t in tensor]
+        else:
+            # not idempotent, since the string is cleaned up here
+            x = (self.mapping[i] for i in tensor)
+            x = ''.join(i for i, _ in itertools.groupby(x))
+            x = x.replace(self.char_blank, "")
+            # x = x.strip()
+            return x
+
+
+def model_length_function(tensor):
+    return int(tensor.shape[0]) // 2 + 1
+
+
+class IterableMemoryCache:
+
+    def __init__(self, iterable):
+        self.iterable = iterable
+        self._iter = iter(iterable)
+        self._done = False
+        self._values = []
+
+    def __iter__(self):
+        if self._done:
+            return iter(self._values)
+        return itertools.chain(self._values, self._gen_iter())
+
+    def _gen_iter(self):
+        for new_value in self._iter:
+            self._values.append(new_value)
+            yield new_value
+        self._done = True
+
+    def __len__(self):
+        return len(self.iterable)
+
+
+class MapMemoryCache(torch.utils.data.Dataset):
+    """
+    Wrap a dataset so that, whenever a new item is returned, it is saved to memory.
+ """ + + def __init__(self, dataset): + self.dataset = dataset + self._cache = [None] * len(dataset) + + def __getitem__(self, n): + if self._cache[n]: + return self._cache[n] + + item = self.dataset[n] + self._cache[n] = item + + return item + + def __len__(self): + return len(self.dataset) + + +class Processed(torch.utils.data.Dataset): + + def __init__(self, process_datapoint, dataset): + self.process_datapoint = process_datapoint + self.dataset = dataset + + def __getitem__(self, n): + item = self.dataset[n] + return self.process_datapoint(item) + + def __next__(self): + item = next(self.dataset) + return self.process_datapoint(item) + + def __len__(self): + return len(self.dataset) + + +def process_datapoint(item, transforms, encode): + transformed = item[0] # .to(device, non_blocking=non_blocking) + target = item[2].lower() + + transformed = transforms(transformed) + + transformed = transformed[0, ...].transpose(0, -1) + + target = " " + target + " " + target = encode(target) + target = torch.tensor(target, dtype=torch.long, device=transformed.device) + + transformed = transformed # .to("cpu") + target = target # .to("cpu") + return transformed, target + + +def datasets_librispeech(transforms, encode, root="/datasets01/", folder_in_archive="librispeech/062419/"): + + def create(tag): + + if isinstance(tag, str): + data = LIBRISPEECH(root, tag, folder_in_archive=folder_in_archive, download=False) + else: + data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) + + data = Processed(lambda x: process_datapoint(x, transforms, encode), data) + # data = diskcache_iterator(data) + data = MapMemoryCache(data) + return data + + return create("train-clean-100"), create("dev-clean"), None + # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None + + +def greedy_decode(outputs): + """Greedy Decoder. Returns highest probability of class labels for each timestep + + Args: + outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) + + Returns: + torch.Tensor: class labels per time step. 
+ """ + _, indices = topk(outputs, k=1, dim=-1) + return indices[..., 0] + + +def levenshtein_distance(r: str, h: str, device: Optional[str] = None): + + # initialisation + d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) + dold = 0 + dnew = 1 + + # computation + for i in range(1, len(r) + 1): + d[dnew, 0] = 0 + for j in range(1, len(h) + 1): + + if r[i - 1] == h[j - 1]: + d[dnew, j] = d[dnew - 1, j - 1] + else: + substitution = d[dnew - 1, j - 1] + 1 + insertion = d[dnew, j - 1] + 1 + deletion = d[dnew - 1, j] + 1 + d[dnew, j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + dist = d[dnew, -1].item() + + return dist + + +def collate_fn(batch): + + tensors = [b[0] for b in batch if b] + + tensors_lengths = torch.tensor( + [model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device + ) + + tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) + tensors = tensors.transpose(1, -1) + + targets = [b[1] for b in batch if b] + target_lengths = torch.tensor( + [target.shape[0] for target in targets], dtype=torch.long, device=tensors.device + ) + targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) + + return tensors, targets, tensors_lengths, target_lengths + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def forward_decode(inputs, targets, decoder): + + inputs = inputs.to(device, non_blocking=True) + output = model(inputs).to("cpu") + output = decoder(output) + + output = decode(output.tolist()) + target = decode(targets.tolist()) + + print_length = 20 + for i in range(2): + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + print( + f"Epoch: {epoch:4} Target: {target_print} Output: {output_print}", flush=True) + + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = statistics.mean(cers) + cers_normalized = statistics.mean(cers_normalized) + + output = [o.split(char_space) for o in output] + target = [o.split(char_space) for o in target] + + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = statistics.mean(wers) + wers_normalized = statistics.mean(wers_normalized) + + print(f"Epoch: {epoch:4} CER: {cers:1.5f} WER: {wers:1.5f}", flush=True) + + return cers, wers, cers_normalized, wers_normalized + + +def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, print_freq, pbar=None): + + model.train() + + sum_loss = 0. 
+ for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): + + inputs = inputs.to(device, non_blocking=non_blocking) + targets = targets.to(device, non_blocking=non_blocking) + + # keep batch first for data parallel + outputs = model(inputs).transpose(0, 1) + + # CTC + # outputs: input length, batch size, number of classes (including blank) + # targets: batch size, max target length + # input_lengths: batch size + # target_lengths: batch size + + sum_loss += criterion(outputs, targets, tensors_lengths, target_lengths).item() + + optimizer.zero_grad() + loss.backward() + + optimizer.step() + + if SIGNAL_RECEIVED: + return + + if pbar is not None: + pbar.update(1 / len(loader_training)) + + # Average loss + sum_loss = sum_loss / len(loader_training) + print(f"Training loss: {sum_loss:4.5f}", flush=True) + + scheduler.step() + + +def evaluate(model, criterion, data_loader, device, print_freq=100): + + with torch.no_grad(): + + model.eval() + + sum_loss = 0. + sum_error_rates = None + + for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): + + inputs = inputs.to(device, non_blocking=non_blocking) + targets = targets.to(device, non_blocking=non_blocking) + + # keep batch first for data parallel + outputs = model(inputs).transpose(0, 1) + + # CTC + # outputs: input length, batch size, number of classes (including blank) + # targets: batch size, max target length + # input_lengths: batch size + # target_lengths: batch size + + sum_loss += criterion(outputs, targets, tensors_lengths, target_lengths).item() + + error_rates = forward_decode(inputs, targets, greedy_decode) + if sum_error_rates is None: + sum_error_rates = [0 for _ in error_rates] + for i in range(len(error_rates)): + sum_error_rates[i] += error_rates[i] + + if SIGNAL_RECEIVED: + break + + # Average loss + sum_loss = sum_loss / len(loader_validation) + print(f"Validation loss: {sum_loss:.5f}", flush=True) + + for i in range(len(error_rates)): + sum_error_rates[i] /= len(data_loader) + print(f"Decoder: {sum_error_rates}", flush=True) + cer1, wer1, cern1, wern1 = sum_error_rates + print(f"CER: {cer} WER: {wer} CERN: {cern} WERN: {wern}", flush=True) + + return sum_loss + + +def main(args): + + print("start time: {}".format(str(datetime.now())), flush=True) + + # Empty CUDA cache + torch.cuda.empty_cache() + + CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' + + # Install signal handler + signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b, HALT_filename)) + signal.signal(signal.SIGTERM, SIGTERM_handler) + print('Signal handler installed', flush=True) + + + audio_backend = "soundfile" + torchaudio.set_audio_backend(audio_backend) + + device = "cuda" if torch.cuda.is_available() else "cpu" + num_devices = torch.cuda.device_count() + + data_loader_training_params = { + "num_workers": args.workers, + "pin_memory": True, + "shuffle": True, + "drop_last": True, + } + data_loader_validation_params = data_loader_training_params.copy() + data_loader_validation_params["shuffle"] = False + + non_blocking = True + + # audio + + n_bins = args.n_bins # 13, 128 + melkwargs = { + 'n_fft': 512, + 'n_mels': 20, + 'hop_length': 80, # (160, 80) + } + + sample_rate_original = 16000 + + transforms = nn.Sequential( + # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), + # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), + 
torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, n_mels=n_bins), + # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), + # torchaudio.transforms.TimeMasking(time_mask_param=35) + ) + + # Text preprocessing + + char_blank = "*" + char_space = " " + char_apostrophe = "'" + + labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase + coder = LanguageModel(labels, char_blank) + encode = coder.encode + decode = coder.decode + vocab_size = coder.length + print("vocab_size", vocab_size, flush=True) + + training, validation, _ = datasets_librispeech(transforms, encode) + + num_features = n_bins if n_bins else 1 + model = Wav2Letter(num_features, vocab_size) + + if args.jit: + model = torch.jit.script(model) + + if not args.distributed: + model = torch.nn.DataParallel(model) + else: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) + + model = model.to(device, non_blocking=non_blocking) + + n = count_parameters(model) + print(f"Number of parameters: {n}", flush=True) + + print(torch.cuda.memory_summary(), flush=True) + + # Optimizer + + optimizer_params = { + "lr": args.learning_rate, + # "eps": args.eps, + # "rho": args.rho, + "weight_decay": args.weight_decay, + } + + Optimizer = SGD + optimizer_params = optimizer_params + + optimizer = Optimizer(model.parameters(), **optimizer_params) + scheduler = ExponentialLR(optimizer, gamma=args.gamma) + # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) + + criterion = torch.nn.CTCLoss(blank=coder.mapping[char_blank], zero_infinity=False) + # criterion = nn.MSELoss() + # criterion = torch.nn.NLLLoss() + + best_loss = 1. + + loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params) + loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params) + + print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) + + history_loader = defaultdict(list) + history_training = defaultdict(list) + history_validation = defaultdict(list) + + if args.resume and os.path.isfile(CHECKPOINT_filename): + print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) + checkpoint = torch.load(CHECKPOINT_filename) + + args.start_epoch = checkpoint['epoch'] + best_loss = checkpoint['best_loss'] + history_training = checkpoint['history_training'] + history_validation = checkpoint['history_validation'] + + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + + print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) + print(tabulate(history_training, headers="keys"), flush=True) + print(tabulate(history_validation, headers="keys"), flush=True) + else: + print("Checkpoint: not found") + + save_checkpoint({ + 'epoch': args.start_epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, False, CHECKPOINT_filename) + + with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: + + for epoch in range(args.start_epoch, args.epochs): + + train_one_epoch( + if SIGNAL_RECEIVED: + save_checkpoint({ + 'epoch': epoch, + 
'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, False, CHECKPOINT_filename) + if not epoch % args.print_freq or epoch == args.epochs - 1: + + sum_loss = evaluate_one_epoch + + is_best = sum_loss < best_loss + best_loss = min(sum_loss, best_loss) + save_checkpoint({ + 'epoch': epoch + 1, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + 'history_training': history_training, + 'history_validation': history_validation, + }, is_best, CHECKPOINT_filename) + + +if __name__ == "__main__": + args = parse_args() + main(args) From f24674701e6a5be1bdfd2599b41c28ce22f1efea Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 20 May 2020 14:39:57 -0700 Subject: [PATCH 011/129] adding more parameters. --- examples/pipeline/wav2letter.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 167821f173..338226faa9 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -196,9 +196,6 @@ def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 -# Dataset - - class IterableMemoryCache: def __init__(self, iterable): @@ -262,7 +259,7 @@ def __len__(self): return len(self.dataset) -def process_datapoint(item): +def process_datapoint(item, transforms, encode): transformed = item[0] # .to(device, non_blocking=non_blocking) target = item[2].lower() @@ -279,7 +276,7 @@ def process_datapoint(item): return transformed, target -def datasets_librispeech(root="/datasets01/", folder_in_archive="librispeech/062419/"): +def datasets_librispeech(transforms, encode, root="/datasets01/", folder_in_archive="librispeech/062419/"): def create(tag): @@ -288,7 +285,7 @@ def create(tag): else: data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) - data = Processed(process_datapoint, data) + data = Processed(lambda x: process_datapoint(x, transforms, encode), data) # data = diskcache_iterator(data) data = MapMemoryCache(data) return data @@ -362,12 +359,12 @@ def which_set_filter(x): return data -def datasets_speechcommands(root="./"): +def datasets_speechcommands(transforms, encode, root="./"): def create(tag): data = SPEECHCOMMANDS(root, download=True) data = filter_speechcommands(data, tag, 90, 5) - data = Processed(process_datapoint, data) + data = Processed(lambda x: process_datapoint(x, transforms, encode), data) # data = diskcache_iterator(data) data = MapMemoryCache(data) return data @@ -501,8 +498,8 @@ def batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Te return torch.tensor(outputs).transpose(0, -1), torch.cat(scores) -def top_batch_viterbi_decode(tag_sequence: torch.Tensor): - output, _ = batch_viterbi_decode(tag_sequence, transitions, top_k=1) +def top_batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor): + output, _ = batch_viterbi_decode(tag_sequence, transition_matrix, top_k=1) return output[:, 0, :] @@ -700,9 +697,9 @@ def main(args): print("vocab_size", vocab_size, flush=True) if args.dataset == "librispeech": - training, validation, _ = datasets_librispeech() + training, validation, _ = datasets_librispeech(transforms, encode) elif args.dataset == "speechcommand": - training, 
validation, _ = datasets_speechcommands() + training, validation, _ = datasets_speechcommands(transforms, encode) if args.viterbi_decoder: print("transitions: building", flush=True) From 26b327a98a61da3be7f72381c4915ed1cb5a6149 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 20 May 2020 14:40:26 -0700 Subject: [PATCH 012/129] lint. --- examples/pipeline/wav2letterclean.py | 176 ++++++++++----------------- 1 file changed, 62 insertions(+), 114 deletions(-) diff --git a/examples/pipeline/wav2letterclean.py b/examples/pipeline/wav2letterclean.py index acee79286c..139ad53c30 100644 --- a/examples/pipeline/wav2letterclean.py +++ b/examples/pipeline/wav2letterclean.py @@ -1,26 +1,18 @@ import argparse import collections -import cProfile -import hashlib import itertools import os import pprint -import pstats -import re import shutil import signal import statistics import string from collections import defaultdict from datetime import datetime -from io import StringIO from typing import Optional -import matplotlib import torch -import torch.distributed as dist import torchaudio -from matplotlib import pyplot as plt from torch import nn, topk from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau @@ -31,8 +23,6 @@ from torchaudio.transforms import MFCC, Resample from tqdm.notebook import tqdm as tqdm -from tabulate import tabulate - def parse_args(): parser = argparse.ArgumentParser() @@ -84,18 +74,10 @@ def parse_args(): args = parser.parse_args() - - # Use #nodes as world_size - if 'SLURM_NNODES' in os.environ: - args.world_size = int(os.environ['SLURM_NNODES']) - - args.distributed = args.distributed or args.world_size > 1 - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - print(pprint.pformat(vars(args)), flush=True) - args.clip_norm = 0. 
+ print(pprint.pformat(vars(args)), flush=True) + return args @@ -132,7 +114,10 @@ def save_checkpoint(state, is_best, filename): class LanguageModel: - def __init__(self, labels, char_blank): + def __init__(self, labels, char_blank, char_space): + + self.char_space = char_space + labels = [l for l in labels] self.length = len(labels) enumerated = list(enumerate(labels)) @@ -244,7 +229,7 @@ def process_datapoint(item, transforms, encode): return transformed, target -def datasets_librispeech(transforms, encode, root="/datasets01/", folder_in_archive="librispeech/062419/"): +def datasets_librispeech(transforms, language_model, root="/datasets01/", folder_in_archive="librispeech/062419/"): def create(tag): @@ -253,7 +238,7 @@ def create(tag): else: data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) - data = Processed(lambda x: process_datapoint(x, transforms, encode), data) + data = Processed(lambda x: process_datapoint(x, transforms, language_model.encode), data) # data = diskcache_iterator(data) data = MapMemoryCache(data) return data @@ -297,9 +282,7 @@ def levenshtein_distance(r: str, h: str, device: Optional[str] = None): dnew, dold = dold, dnew - dist = d[dnew, -1].item() - - return dist + return d[dnew, -1].item() def collate_fn(batch): @@ -326,41 +309,7 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -def forward_decode(inputs, targets, decoder): - - inputs = inputs.to(device, non_blocking=True) - output = model(inputs).to("cpu") - output = decoder(output) - - output = decode(output.tolist()) - target = decode(targets.tolist()) - - print_length = 20 - for i in range(2): - output_print = output[i].ljust(print_length)[:print_length] - target_print = target[i].ljust(print_length)[:print_length] - print( - f"Epoch: {epoch:4} Target: {target_print} Output: {output_print}", flush=True) - - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - cers_normalized = [d / len(a) for a, d in zip(target, cers)] - cers = statistics.mean(cers) - cers_normalized = statistics.mean(cers_normalized) - - output = [o.split(char_space) for o in output] - target = [o.split(char_space) for o in target] - - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - wers_normalized = [d / len(a) for a, d in zip(target, wers)] - wers = statistics.mean(wers) - wers_normalized = statistics.mean(wers_normalized) - - print(f"Epoch: {epoch:4} CER: {cers:1.5f} WER: {wers:1.5f}", flush=True) - - return cers, wers, cers_normalized, wers_normalized - - -def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, print_freq, pbar=None): +def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, epoch, pbar=None, non_blocking=False): model.train() @@ -379,7 +328,8 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri # input_lengths: batch size # target_lengths: batch size - sum_loss += criterion(outputs, targets, tensors_lengths, target_lengths).item() + loss = criterion(outputs, targets, tensors_lengths, target_lengths) + sum_loss += loss.item() optimizer.zero_grad() loss.backward() @@ -390,23 +340,22 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, pri return if pbar is not None: - pbar.update(1 / len(loader_training)) + pbar.update(1 / len(data_loader)) # Average loss - sum_loss = sum_loss / len(loader_training) + sum_loss = sum_loss / len(data_loader) print(f"Training loss: {sum_loss:4.5f}", 
flush=True) scheduler.step() -def evaluate(model, criterion, data_loader, device, print_freq=100): +def evaluate(model, criterion, data_loader, decoder, language_model, device, non_blocking=False): with torch.no_grad(): model.eval() - sum_loss = 0. - sum_error_rates = None + sums = defaultdict(lambda: 0.) for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): @@ -422,28 +371,44 @@ def evaluate(model, criterion, data_loader, device, print_freq=100): # input_lengths: batch size # target_lengths: batch size - sum_loss += criterion(outputs, targets, tensors_lengths, target_lengths).item() + sums["loss"] += criterion(outputs, targets, tensors_lengths, target_lengths).item() + + output = outputs.transpose(0, 1).to("cpu") + output = decoder(output) + + output = language_model.decode(output.tolist()) + target = language_model.decode(targets.tolist()) + + print_length = 20 + for i in range(2): + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + print(f"Target: {target_print} Output: {output_print}", flush=True) + + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = statistics.mean(cers) + sums["cer"] += cers + + output = [o.split(language_model.char_space) for o in output] + target = [o.split(language_model.char_space) for o in target] - error_rates = forward_decode(inputs, targets, greedy_decode) - if sum_error_rates is None: - sum_error_rates = [0 for _ in error_rates] - for i in range(len(error_rates)): - sum_error_rates[i] += error_rates[i] + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = statistics.mean(wers) + sums["wer"] += wers if SIGNAL_RECEIVED: break - # Average loss - sum_loss = sum_loss / len(loader_validation) - print(f"Validation loss: {sum_loss:.5f}", flush=True) + # Average loss + for k in sums.keys(): + sums[k] /= len(data_loader) - for i in range(len(error_rates)): - sum_error_rates[i] /= len(data_loader) - print(f"Decoder: {sum_error_rates}", flush=True) - cer1, wer1, cern1, wern1 = sum_error_rates - print(f"CER: {cer} WER: {wer} CERN: {cern} WERN: {wern}", flush=True) + print(f"Validation loss: {sums['loss']:.5f}", flush=True) + print(f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", flush=True) - return sum_loss + return sums['loss'] def main(args): @@ -456,16 +421,15 @@ def main(args): CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' # Install signal handler - signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b, HALT_filename)) + signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) signal.signal(signal.SIGTERM, SIGTERM_handler) print('Signal handler installed', flush=True) - audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) device = "cuda" if torch.cuda.is_available() else "cpu" - num_devices = torch.cuda.device_count() + # num_devices = torch.cuda.device_count() data_loader_training_params = { "num_workers": args.workers, @@ -504,13 +468,11 @@ def main(args): char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase - coder = LanguageModel(labels, char_blank) - encode = coder.encode - decode = coder.decode - vocab_size = coder.length + language_model = LanguageModel(labels, char_blank, char_space) + vocab_size = language_model.length 
print("vocab_size", vocab_size, flush=True) - training, validation, _ = datasets_librispeech(transforms, encode) + training, validation, _ = datasets_librispeech(transforms, language_model) num_features = n_bins if n_bins else 1 model = Wav2Letter(num_features, vocab_size) @@ -548,7 +510,7 @@ def main(args): scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) - criterion = torch.nn.CTCLoss(blank=coder.mapping[char_blank], zero_infinity=False) + criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() @@ -559,26 +521,18 @@ def main(args): print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) - history_loader = defaultdict(list) - history_training = defaultdict(list) - history_validation = defaultdict(list) - if args.resume and os.path.isfile(CHECKPOINT_filename): print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) checkpoint = torch.load(CHECKPOINT_filename) args.start_epoch = checkpoint['epoch'] best_loss = checkpoint['best_loss'] - history_training = checkpoint['history_training'] - history_validation = checkpoint['history_validation'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) else: print("Checkpoint: not found") @@ -588,28 +542,24 @@ def main(args): 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, }, False, CHECKPOINT_filename) with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: for epoch in range(args.start_epoch, args.epochs): - train_one_epoch( + train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) if SIGNAL_RECEIVED: - save_checkpoint({ - 'epoch': epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, False, CHECKPOINT_filename) + save_checkpoint({ + 'epoch': epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + }, False, CHECKPOINT_filename) if not epoch % args.print_freq or epoch == args.epochs - 1: - sum_loss = evaluate_one_epoch + sum_loss = evaluate(model, criterion, loader_validation, greedy_decode, language_model, device, non_blocking=non_blocking) is_best = sum_loss < best_loss best_loss = min(sum_loss, best_loss) @@ -619,8 +569,6 @@ def main(args): 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, }, is_best, CHECKPOINT_filename) From 8a768d2947f4e8eb662c3ff88efe683f779026bf Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 20 May 2020 14:41:17 -0700 Subject: [PATCH 013/129] cleaning full version. 
--- examples/pipeline/wav2letter.py | 714 +++++---------------------- examples/pipeline/wav2letterclean.py | 577 ---------------------- 2 files changed, 137 insertions(+), 1154 deletions(-) delete mode 100644 examples/pipeline/wav2letterclean.py diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 338226faa9..139ad53c30 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,32 +1,18 @@ -import torch.multiprocessing as mp - -# https://github.com/pytorch/pytorch/issues/13883 -if __name__ == '__main__': - mp.set_start_method('forkserver') - import argparse import collections -import cProfile -import hashlib import itertools import os import pprint -import pstats -import re import shutil import signal import statistics import string from collections import defaultdict from datetime import datetime -from io import StringIO from typing import Optional -import matplotlib import torch -import torch.distributed as dist import torchaudio -from matplotlib import pyplot as plt from torch import nn, topk from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau @@ -37,13 +23,6 @@ from torchaudio.transforms import MFCC, Resample from tqdm.notebook import tqdm as tqdm -from tabulate import tabulate - - -matplotlib.use("Agg") -MAIN_PID = os.getpid() -SIGNAL_RECEIVED = False - def parse_args(): parser = argparse.ArgumentParser() @@ -95,18 +74,10 @@ def parse_args(): args = parser.parse_args() - - # Use #nodes as world_size - if 'SLURM_NNODES' in os.environ: - args.world_size = int(os.environ['SLURM_NNODES']) - - args.distributed = args.distributed or args.world_size > 1 - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - print(pprint.pformat(vars(args)), flush=True) - args.clip_norm = 0. + print(pprint.pformat(vars(args)), flush=True) + return args @@ -115,31 +86,11 @@ def SIGTERM_handler(a, b): pass -def signal_handler(a, b, HALT_filename): +def signal_handler(a, b): global SIGNAL_RECEIVED print('Signal received', a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) SIGNAL_RECEIVED = True - # If HALT file exists, which means the job is done, exit peacefully. - if os.path.isfile(HALT_filename): - print('Job is done, exiting') - exit(0) - - return - - -def trigger_job_requeue(CHECKPOINT_filename): - # Submit a new job to resume from checkpoint. - if os.path.isfile(CHECKPOINT_filename) and os.environ['SLURM_PROCID'] == '0' and os.getpid() == MAIN_PID: - print('pid: ', os.getpid(), ' ppid: ', os.getppid(), flush=True) - print('time is up, back to slurm queue', flush=True) - command = 'scontrol requeue ' + os.environ['SLURM_JOB_ID'] - print(command) - if os.system(command): - raise RuntimeError('requeue failed') - print('New job submitted to the queue', flush=True) - exit(0) - def save_checkpoint(state, is_best, filename): """ @@ -147,24 +98,26 @@ def save_checkpoint(state, is_best, filename): then copy it to filename, in case the signal interrupts the torch.save() process. 
""" - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - CHECKPOINT_tempfile = filename + '.temp' + CHECKPOINT_tempfile = filename + '.temp' - # Remove CHECKPOINT_tempfile, in case the signal arrives in the - # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename - if os.path.isfile(CHECKPOINT_tempfile): - os.remove(CHECKPOINT_tempfile) + # Remove CHECKPOINT_tempfile, in case the signal arrives in the + # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename + if os.path.isfile(CHECKPOINT_tempfile): + os.remove(CHECKPOINT_tempfile) - torch.save(state, CHECKPOINT_tempfile) - if os.path.isfile(CHECKPOINT_tempfile): - os.rename(CHECKPOINT_tempfile, filename) - if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') - print("Checkpoint: saved") + torch.save(state, CHECKPOINT_tempfile) + if os.path.isfile(CHECKPOINT_tempfile): + os.rename(CHECKPOINT_tempfile, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + print("Checkpoint: saved") class LanguageModel: - def __init__(self, labels, char_blank): + def __init__(self, labels, char_blank, char_space): + + self.char_space = char_space + labels = [l for l in labels] self.length = len(labels) enumerated = list(enumerate(labels)) @@ -276,7 +229,7 @@ def process_datapoint(item, transforms, encode): return transformed, target -def datasets_librispeech(transforms, encode, root="/datasets01/", folder_in_archive="librispeech/062419/"): +def datasets_librispeech(transforms, language_model, root="/datasets01/", folder_in_archive="librispeech/062419/"): def create(tag): @@ -285,7 +238,7 @@ def create(tag): else: data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) - data = Processed(lambda x: process_datapoint(x, transforms, encode), data) + data = Processed(lambda x: process_datapoint(x, transforms, language_model.encode), data) # data = diskcache_iterator(data) data = MapMemoryCache(data) return data @@ -294,84 +247,6 @@ def create(tag): # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None -def which_set(filename, validation_percentage, testing_percentage): - """Determines which data partition the file should belong to. - - We want to keep files in the same training, validation, or testing sets even - if new ones are added over time. This makes it less likely that testing - samples will accidentally be reused in training when long runs are restarted - for example. To keep this stability, a hash of the filename is taken and used - to determine which set it should belong to. This determination only depends on - the name and the set proportions, so it won't change as other files are added. - - It's also useful to associate particular files as related (for example words - spoken by the same person), so anything after '_nohash_' in a filename is - ignored for set determination. This ensures that 'bobby_nohash_0.wav' and - 'bobby_nohash_1.wav' are always in the same set, for example. - - Args: - filename: File path of the data sample. - validation_percentage: How much of the data set to use for validation. - testing_percentage: How much of the data set to use for testing. - - Returns: - String, one of 'training', 'validation', or 'testing'. 
- """ - - MAX_NUM_WAVS_PER_CLASS = 2**27 - 1 # ~134M - - base_name = os.path.basename(filename) - - # We want to ignore anything after '_nohash_' in the file name when - # deciding which set to put a wav in, so the data set creator has a way of - # grouping wavs that are close variations of each other. - hash_name = re.sub(r'_nohash_.*$', '', base_name).encode("utf-8") - - # This looks a bit magical, but we need to decide whether this file should - # go into the training, testing, or validation sets, and we want to keep - # existing files in the same set even if more files are subsequently - # added. - # To do that, we need a stable way of deciding based on just the file name - # itself, so we do a hash of that and then use that to generate a - # probability value that we use to assign it. - hash_name_hashed = hashlib.sha1(hash_name).hexdigest() - percentage_hash = ((int(hash_name_hashed, 16) % ( - MAX_NUM_WAVS_PER_CLASS + 1)) * (100.0 / MAX_NUM_WAVS_PER_CLASS)) - - if percentage_hash < validation_percentage: - result = 'validation' - elif percentage_hash < (testing_percentage + validation_percentage): - result = 'testing' - else: - result = 'training' - - return result - - -def filter_speechcommands(data, tag, training_percentage, validation_percentage): - if training_percentage < 100.: - testing_percentage = (100. - training_percentage - validation_percentage) - - def which_set_filter(x): - return which_set(x, validation_percentage, testing_percentage) == tag - - data._walker = list(filter(which_set_filter, data._walker)) - return data - - -def datasets_speechcommands(transforms, encode, root="./"): - - def create(tag): - data = SPEECHCOMMANDS(root, download=True) - data = filter_speechcommands(data, tag, 90, 5) - data = Processed(lambda x: process_datapoint(x, transforms, encode), data) - # data = diskcache_iterator(data) - data = MapMemoryCache(data) - return data - - return create("training"), create("validation"), create("testing") - - def greedy_decode(outputs): """Greedy Decoder. Returns highest probability of class labels for each timestep @@ -385,124 +260,6 @@ def greedy_decode(outputs): return indices[..., 0] -def build_transitions(training, vocab_size): - - from collections import Counter - - c = None - - for _, label in training: - # Count bigrams - count = [((a.item(), b.item())) for (a, b) in zip(label, label[1:])] - count = Counter(count) - if c is None: - c = count - else: - c = c + count - - # Encode as transition matrix - - ind = torch.tensor(list(zip(*[a for (a, b) in c.items()]))) - val = torch.tensor([b for (a, b) in c.items()], dtype=torch.float) - - transitions = torch.sparse_coo_tensor(indices=ind, values=val, size=[ - vocab_size, vocab_size]).coalesce().to_dense() - transitions = (transitions / torch.max(torch.tensor(1.), transitions.max(dim=1)[0]).unsqueeze(1)) - - return transitions - - -def viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): - """ - Perform Viterbi decoding in log space over a sequence given a transition matrix - specifying pairwise (transition) potentials between tags and a matrix of shape - (sequence_length, num_tags) specifying unary potentials for possible tags per - timestep. - Parameters - ---------- - tag_sequence : torch.Tensor, required. - A tensor of shape (sequence_length, num_tags) representing scores for - a set of tags over a given sequence. - transition_matrix : torch.Tensor, required. 
- A tensor of shape (num_tags, num_tags) representing the binary potentials - for transitioning between a given pair of tags. - top_k : int, required. - Integer defining the top number of paths to decode. - Returns - ------- - viterbi_path : List[int] - The tag indices of the maximum likelihood tag sequence. - viterbi_score : float - The score of the viterbi path. - """ - sequence_length, num_tags = tag_sequence.size() - - path_scores = [] - path_indices = [] - # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) - # to allow for 1 permutation. - path_scores.append(tag_sequence[0, :].unsqueeze(0)) - # assert path_scores[0].size() == (n_permutations, num_tags) - - # Evaluate the scores for all possible paths. - for timestep in range(1, sequence_length): - # Add pairwise potentials to current scores. - # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) - summed_potentials = path_scores[timestep - - 1].unsqueeze(2) + transition_matrix - summed_potentials = summed_potentials.view(-1, num_tags) - - # Best pairwise potential path score from the previous timestep. - max_k = min(summed_potentials.size()[0], top_k) - scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) - # assert scores.size() == (n_permutations, num_tags) - # assert paths.size() == (n_permutations, num_tags) - - scores = tag_sequence[timestep, :] + scores - # assert scores.size() == (n_permutations, num_tags) - path_scores.append(scores) - path_indices.append(paths.squeeze()) - - # Construct the most likely sequence backwards. - path_scores = path_scores[-1].view(-1) - max_k = min(path_scores.size()[0], top_k) - viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) - - viterbi_paths = [] - for i in range(max_k): - - viterbi_path = [best_paths[i].item()] - for backward_timestep in reversed(path_indices): - viterbi_path.append( - int(backward_timestep.view(-1)[viterbi_path[-1]])) - - # Reverse the backward path. - viterbi_path.reverse() - - # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. - viterbi_path = [j % num_tags for j in viterbi_path] - viterbi_paths.append(viterbi_path) - - return viterbi_paths, viterbi_scores - - -def batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor, top_k: int = 5): - - outputs = [] - scores = [] - for i in range(tag_sequence.shape[1]): - paths, score = viterbi_decode(tag_sequence[:, i, :], transition_matrix) - outputs.append(paths) - scores.append(score) - - return torch.tensor(outputs).transpose(0, -1), torch.cat(scores) - - -def top_batch_viterbi_decode(tag_sequence: torch.Tensor, transition_matrix: torch.Tensor): - output, _ = batch_viterbi_decode(tag_sequence, transition_matrix, top_k=1) - return output[:, 0, :] - - def levenshtein_distance(r: str, h: str, device: Optional[str] = None): # initialisation @@ -525,9 +282,7 @@ def levenshtein_distance(r: str, h: str, device: Optional[str] = None): dnew, dold = dold, dnew - dist = d[dnew, -1].item() - - return dist + return d[dnew, -1].item() def collate_fn(batch): @@ -554,60 +309,106 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -def forward_loss(inputs, targets, tensors_lengths, target_lengths): +def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, epoch, pbar=None, non_blocking=False): + + model.train() + + sum_loss = 0. 
+ for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): + + inputs = inputs.to(device, non_blocking=non_blocking) + targets = targets.to(device, non_blocking=non_blocking) + + # keep batch first for data parallel + outputs = model(inputs).transpose(0, 1) - inputs = inputs.to(device, non_blocking=non_blocking) - targets = targets.to(device, non_blocking=non_blocking) + # CTC + # outputs: input length, batch size, number of classes (including blank) + # targets: batch size, max target length + # input_lengths: batch size + # target_lengths: batch size - # keep batch first for data parallel - outputs = model(inputs).transpose(0, 1) + loss = criterion(outputs, targets, tensors_lengths, target_lengths) + sum_loss += loss.item() - # this_batch_size = outputs.shape[1] - # seq_len = outputs.shape[0] - # input_lengths = torch.full((this_batch_size,), seq_len, dtype=torch.long, device=outputs.device) - # input_lengths = tensors_lengths + optimizer.zero_grad() + loss.backward() - # CTC - # outputs: input length, batch size, number of classes (including blank) - # targets: batch size, max target length - # input_lengths: batch size - # target_lengths: batch size + optimizer.step() - return criterion(outputs, targets, tensors_lengths, target_lengths) + if SIGNAL_RECEIVED: + return + if pbar is not None: + pbar.update(1 / len(data_loader)) -def forward_decode(inputs, targets, decoder): + # Average loss + sum_loss = sum_loss / len(data_loader) + print(f"Training loss: {sum_loss:4.5f}", flush=True) - inputs = inputs.to(device, non_blocking=True) - output = model(inputs).to("cpu") - output = decoder(output) + scheduler.step() - output = decode(output.tolist()) - target = decode(targets.tolist()) - print_length = 20 - for i in range(2): - output_print = output[i].ljust(print_length)[:print_length] - target_print = target[i].ljust(print_length)[:print_length] - print( - f"Epoch: {epoch:4} Target: {target_print} Output: {output_print}", flush=True) +def evaluate(model, criterion, data_loader, decoder, language_model, device, non_blocking=False): - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - cers_normalized = [d / len(a) for a, d in zip(target, cers)] - cers = statistics.mean(cers) - cers_normalized = statistics.mean(cers_normalized) + with torch.no_grad(): - output = [o.split(char_space) for o in output] - target = [o.split(char_space) for o in target] + model.eval() - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - wers_normalized = [d / len(a) for a, d in zip(target, wers)] - wers = statistics.mean(wers) - wers_normalized = statistics.mean(wers_normalized) + sums = defaultdict(lambda: 0.) 
- print(f"Epoch: {epoch:4} CER: {cers:1.5f} WER: {wers:1.5f}", flush=True) + for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): - return cers, wers, cers_normalized, wers_normalized + inputs = inputs.to(device, non_blocking=non_blocking) + targets = targets.to(device, non_blocking=non_blocking) + + # keep batch first for data parallel + outputs = model(inputs).transpose(0, 1) + + # CTC + # outputs: input length, batch size, number of classes (including blank) + # targets: batch size, max target length + # input_lengths: batch size + # target_lengths: batch size + + sums["loss"] += criterion(outputs, targets, tensors_lengths, target_lengths).item() + + output = outputs.transpose(0, 1).to("cpu") + output = decoder(output) + + output = language_model.decode(output.tolist()) + target = language_model.decode(targets.tolist()) + + print_length = 20 + for i in range(2): + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + print(f"Target: {target_print} Output: {output_print}", flush=True) + + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = statistics.mean(cers) + sums["cer"] += cers + + output = [o.split(language_model.char_space) for o in output] + target = [o.split(language_model.char_space) for o in target] + + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = statistics.mean(wers) + sums["wer"] += wers + + if SIGNAL_RECEIVED: + break + + # Average loss + for k in sums.keys(): + sums[k] /= len(data_loader) + + print(f"Validation loss: {sums['loss']:.5f}", flush=True) + print(f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", flush=True) + + return sums['loss'] def main(args): @@ -617,40 +418,18 @@ def main(args): # Empty CUDA cache torch.cuda.empty_cache() - # Profiling performance - pr = cProfile.Profile() - pr.enable() - - # Checkpoint - CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' - HALT_filename = CHECKPOINT_filename + '.HALT' - - # HALT file is used as a sign of job completion. - # Make sure no HALT file left from previous runs. 
- if os.path.isfile(HALT_filename): - os.remove(HALT_filename) # Install signal handler - signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b, HALT_filename)) + signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) signal.signal(signal.SIGTERM, SIGTERM_handler) print('Signal handler installed', flush=True) - # Distributed - - if args.distributed: - os.environ['RANK'] = os.environ['SLURM_PROCID'] - os.environ['WORLD_SIZE'] = str(args.world_size) - print('in distributed', os.environ['RANK'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'], flush=True) - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size) - print('init process', flush=True) - audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) device = "cuda" if torch.cuda.is_available() else "cpu" - num_devices = torch.cuda.device_count() - print(num_devices, "GPUs", flush=True) + # num_devices = torch.cuda.device_count() data_loader_training_params = { "num_workers": args.workers, @@ -673,10 +452,9 @@ def main(args): } sample_rate_original = 16000 - sample_rate_new = 8000 transforms = nn.Sequential( - # torchaudio.transforms.Resample(sample_rate_original, sample_rate_new), + # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, n_mels=n_bins), # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), @@ -690,23 +468,11 @@ def main(args): char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase - coder = LanguageModel(labels, char_blank) - encode = coder.encode - decode = coder.decode - vocab_size = coder.length + language_model = LanguageModel(labels, char_blank, char_space) + vocab_size = language_model.length print("vocab_size", vocab_size, flush=True) - if args.dataset == "librispeech": - training, validation, _ = datasets_librispeech(transforms, encode) - elif args.dataset == "speechcommand": - training, validation, _ = datasets_speechcommands(transforms, encode) - - if args.viterbi_decoder: - print("transitions: building", flush=True) - transitions = build_transitions(training, vocab_size) - print("transitions: done", flush=True) - - # Model + training, validation, _ = datasets_librispeech(transforms, language_model) num_features = n_bins if n_bins else 1 model = Wav2Letter(num_features, vocab_size) @@ -723,9 +489,8 @@ def main(args): model = model.to(device, non_blocking=non_blocking) - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - n = count_parameters(model) - print(f"Number of parameters: {n}", flush=True) + n = count_parameters(model) + print(f"Number of parameters: {n}", flush=True) print(torch.cuda.memory_summary(), flush=True) @@ -738,14 +503,14 @@ def main(args): "weight_decay": args.weight_decay, } - Optimizer = Adadelta + Optimizer = SGD optimizer_params = optimizer_params optimizer = Optimizer(model.parameters(), **optimizer_params) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) - criterion = torch.nn.CTCLoss(blank=coder.mapping[char_blank], zero_infinity=False) + criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() @@ -756,26 +521,18 @@ def main(args): print("Length of data loaders: ", 
len(loader_training), len(loader_validation), flush=True) - history_loader = defaultdict(list) - history_training = defaultdict(list) - history_validation = defaultdict(list) - if args.resume and os.path.isfile(CHECKPOINT_filename): print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) checkpoint = torch.load(CHECKPOINT_filename) args.start_epoch = checkpoint['epoch'] best_loss = checkpoint['best_loss'] - history_training = checkpoint['history_training'] - history_validation = checkpoint['history_validation'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) else: print("Checkpoint: not found") @@ -785,231 +542,34 @@ def main(args): 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, }, False, CHECKPOINT_filename) with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: + for epoch in range(args.start_epoch, args.epochs): - torch.cuda.reset_max_memory_allocated() - model.train() - - sum_loss = 0. - total_norm = 0. - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_training, maxsize=2): - - loss = forward_loss(inputs, targets, tensors_lengths, target_lengths) - sum_loss += loss.item() - - optimizer.zero_grad() - loss.backward() - - norm = 0. - if args.clip_norm > 0: - norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip_norm) - total_norm += norm - elif args.gradient: - for p in list(filter(lambda p: p.grad is not None, model.parameters())): - norm += p.grad.data.norm(2).item() ** 2 - norm = norm ** .5 - total_norm += norm - - optimizer.step() - - memory = torch.cuda.max_memory_allocated() - # print(f"memory in training: {memory}", flush=True) - - history_loader["epoch"].append(epoch) - history_loader["n"].append(pbar.n) - history_loader["memory"].append(memory) - - if SIGNAL_RECEIVED: - save_checkpoint({ - 'epoch': epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, False, CHECKPOINT_filename) - trigger_job_requeue(CHECKPOINT_filename) - - pbar.update(1 / len(loader_training)) - - total_norm = (total_norm ** .5) / len(loader_training) - if total_norm > 0: - print(f"Epoch: {epoch:4} Gradient: {total_norm:4.5f}", flush=True) - - # Average loss - sum_loss = sum_loss / len(loader_training) - sum_loss_str = f"Epoch: {epoch:4} Train: {sum_loss:4.5f}" - - scheduler.step() - - memory = torch.cuda.max_memory_allocated() - print(f"memory after training: {memory}", flush=True) - - history_training["epoch"].append(epoch) - history_training["gradient_norm"].append(total_norm) - history_training["sum_loss"].append(sum_loss) - history_training["max_memory_allocated"].append(memory) + train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) + if SIGNAL_RECEIVED: + save_checkpoint({ + 'epoch': epoch, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + }, False, 
CHECKPOINT_filename) if not epoch % args.print_freq or epoch == args.epochs - 1: - with torch.no_grad(): - - # Switch to evaluation mode - model.eval() - - sum_loss = 0. - sum_out_greedy = [0, 0, 0, 0] - sum_out_viterbi = [0, 0, 0, 0] - - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(loader_validation, maxsize=2): - sum_loss += forward_loss(inputs, targets, tensors_lengths, target_lengths).item() - - if True: - out_greedy = forward_decode(inputs, targets, greedy_decode) - for i in range(len(out_greedy)): - sum_out_greedy[i] += out_greedy[i] - if args.viterbi_decoder: - out_viterbi = forward_decode(inputs, targets, top_batch_viterbi_decode) - for i in range(len(out_greedy)): - sum_out_viterbi[i] += out_viterbi[i] - - if SIGNAL_RECEIVED: - break - - # Average loss - sum_loss = sum_loss / len(loader_validation) - sum_loss_str += f" Validation: {sum_loss:.5f}" - print(sum_loss_str, flush=True) - - if True: - for i in range(len(out_greedy)): - sum_out_greedy[i] /= len(loader_validation) - print(f"greedy decoder: {sum_out_greedy}", flush=True) - cer1, wer1, cern1, wern1 = sum_out_greedy - if args.viterbi_decoder: - for i in range(len(out_viterbi)): - sum_out_viterbi[i] /= len(loader_validation) - print(f"viterbi decoder: {sum_out_viterbi}", flush=True) - cer2, wer2, cern2, wern2 = sum_out_viterbi - - memory = torch.cuda.max_memory_allocated() - print(f"memory after validation: {memory}", flush=True) - - history_validation["epoch"].append(epoch) - history_validation["max_memory_allocated"].append(memory) - history_validation["sum_loss"].append(sum_loss) - - if True: - history_validation["greedy_cer"].append(cer1) - history_validation["greedy_cer_normalized"].append(cern1) - history_validation["greedy_wer"].append(wer1) - history_validation["greedy_wer_normalized"].append(wern1) - if args.viterbi_decoder: - history_validation["viterbi_cer"].append(cer2) - history_validation["viterbi_cer_normalized"].append(cern2) - history_validation["viterbi_wer"].append(wer2) - history_validation["viterbi_wer_normalized"].append(wern2) - - is_best = sum_loss < best_loss - best_loss = min(sum_loss, best_loss) - save_checkpoint({ - 'epoch': epoch + 1, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - 'history_training': history_training, - 'history_validation': history_validation, - }, is_best, CHECKPOINT_filename) - - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) - print(torch.cuda.memory_summary(), flush=True) - - # scheduler.step(sum_loss) - - # Create an empty file HALT_filename, mark the job as finished - if epoch == args.epochs - 1: - open(HALT_filename, 'a').close() - - print(tabulate(history_training, headers="keys"), flush=True) - print(tabulate(history_validation, headers="keys"), flush=True) - print(torch.cuda.memory_summary(), flush=True) - print(tabulate(history_loader, headers="keys"), flush=True) - - plt.plot(history_loader["epoch"], history_loader["memory"], label="memory") - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - - if "greedy_cer" in history_validation: - plt.plot(history_validation["epoch"], history_validation["greedy_cer"], label="greedy") - if "viterbi_cer" in history_validation: - plt.plot(history_validation["epoch"], history_validation["viterbi_cer"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "cer.png")) - - if not args.distributed or 
os.environ['SLURM_PROCID'] == '0': - - if "greedy_wer" in history_validation: - plt.plot(history_validation["epoch"], history_validation["greedy_wer"], label="greedy") - if "viterbi_wer" in history_validation: - plt.plot(history_validation["epoch"], history_validation["viterbi_wer"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "wer.png")) - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - - if "greedy_cer_normalized" in history_validation: - plt.plot(history_validation["epoch"], history_validation["greedy_cer_normalized"], label="greedy") - if "viterbi_cer_normalized" in history_validation: - plt.plot(history_validation["epoch"], history_validation["viterbi_cer_normalized"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "cer_normalized.png")) - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - - if "greedy_wer_normalized" in history_validation: - plt.plot(history_validation["epoch"], history_validation["greedy_wer_normalized"], label="greedy") - if "viterbi_wer_normalized" in history_validation: - plt.plot(history_validation["epoch"], history_validation["viterbi_wer_normalized"], label="viterbi") - plt.legend() - plt.savefig(os.path.join(args.figures, "wer_normalized.png")) - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - - plt.plot(history_training["epoch"], history_training["sum_loss"], label="training") - plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") - plt.legend() - plt.savefig(os.path.join(args.figures, "sum_loss.png")) - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - - plt.plot(history_training["epoch"], history_training["sum_loss"], label="training") - plt.plot(history_validation["epoch"], history_validation["sum_loss"], label="validation") - plt.yscale("log") - plt.legend() - plt.savefig(os.path.join(args.figures, "log_sum_loss.png")) - - if not args.distributed or os.environ['SLURM_PROCID'] == '0': - print(torch.cuda.memory_summary(), flush=True) - - # Print performance - pr.disable() - s = StringIO() - ( - pstats - .Stats(pr, stream=s) - .strip_dirs() - .sort_stats("cumtime") - .print_stats(20) - ) - print(s.getvalue(), flush=True) - print("stop time: {}".format(str(datetime.now())), flush=True) + sum_loss = evaluate(model, criterion, loader_validation, greedy_decode, language_model, device, non_blocking=non_blocking) + + is_best = sum_loss < best_loss + best_loss = min(sum_loss, best_loss) + save_checkpoint({ + 'epoch': epoch + 1, + 'state_dict': model.state_dict(), + 'best_loss': best_loss, + 'optimizer': optimizer.state_dict(), + 'scheduler': scheduler.state_dict(), + }, is_best, CHECKPOINT_filename) if __name__ == "__main__": diff --git a/examples/pipeline/wav2letterclean.py b/examples/pipeline/wav2letterclean.py deleted file mode 100644 index 139ad53c30..0000000000 --- a/examples/pipeline/wav2letterclean.py +++ /dev/null @@ -1,577 +0,0 @@ -import argparse -import collections -import itertools -import os -import pprint -import shutil -import signal -import statistics -import string -from collections import defaultdict -from datetime import datetime -from typing import Optional - -import torch -import torchaudio -from torch import nn, topk -from torch.optim import SGD, Adadelta, Adam -from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau -from torch.utils.data import DataLoader -from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS -from torchaudio.datasets.utils import bg_iterator, 
diskcache_iterator -from torchaudio.models.wav2letter import Wav2Letter -from torchaudio.transforms import MFCC, Resample -from tqdm.notebook import tqdm as tqdm - - -def parse_args(): - parser = argparse.ArgumentParser() - - parser.add_argument('--workers', default=0, type=int, - metavar='N', help='number of data loading workers') - parser.add_argument('--resume', default='', type=str, - metavar='PATH', help='path to latest checkpoint') - parser.add_argument('--figures', default='', type=str, - metavar='PATH', help='folder path to save figures') - - parser.add_argument('--epochs', default=200, type=int, - metavar='N', help='number of total epochs to run') - parser.add_argument('--start-epoch', default=0, type=int, - metavar='N', help='manual epoch number') - parser.add_argument('--print-freq', default=10, type=int, - metavar='N', help='print frequency in epochs') - - parser.add_argument('--arch', metavar='ARCH', default='wav2letter', - choices=["wav2letter", "lstm"], help='model architecture') - parser.add_argument('--batch-size', default=64, type=int, - metavar='N', help='mini-batch size') - - parser.add_argument('--learning-rate', default=1., type=float, - metavar='LR', help='initial learning rate') - parser.add_argument('--gamma', default=.96, type=float, - metavar='GAMMA', help='learning rate exponential decay constant') - # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') - parser.add_argument('--weight-decay', default=1e-5, - type=float, metavar='W', help='weight decay') - parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) - parser.add_argument("--rho", metavar='RHO', type=float, default=.95) - - parser.add_argument('--n-bins', default=13, type=int, - metavar='N', help='number of bins in transforms') - - parser.add_argument('--world-size', default=1, type=int, - help='number of distributed processes') - parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', - type=str, help='url used to set up distributed training') - parser.add_argument('--dist-backend', default='nccl', - type=str, help='distributed backend') - parser.add_argument('--distributed', action="store_true") - - parser.add_argument('--dataset', default='librispeech', type=str) - parser.add_argument('--gradient', action="store_true") - parser.add_argument('--jit', action="store_true") - parser.add_argument('--viterbi-decoder', action="store_true") - - args = parser.parse_args() - - args.clip_norm = 0. - - print(pprint.pformat(vars(args)), flush=True) - - return args - - -def SIGTERM_handler(a, b): - print('received sigterm') - pass - - -def signal_handler(a, b): - global SIGNAL_RECEIVED - print('Signal received', a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) - SIGNAL_RECEIVED = True - - -def save_checkpoint(state, is_best, filename): - """ - Save the model to a temporary file first, - then copy it to filename, in case the signal interrupts - the torch.save() process. 
- """ - CHECKPOINT_tempfile = filename + '.temp' - - # Remove CHECKPOINT_tempfile, in case the signal arrives in the - # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename - if os.path.isfile(CHECKPOINT_tempfile): - os.remove(CHECKPOINT_tempfile) - - torch.save(state, CHECKPOINT_tempfile) - if os.path.isfile(CHECKPOINT_tempfile): - os.rename(CHECKPOINT_tempfile, filename) - if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') - print("Checkpoint: saved") - - -class LanguageModel: - def __init__(self, labels, char_blank, char_space): - - self.char_space = char_space - - labels = [l for l in labels] - self.length = len(labels) - enumerated = list(enumerate(labels)) - flipped = [(sub[1], sub[0]) for sub in enumerated] - - d1 = collections.OrderedDict(enumerated) - d2 = collections.OrderedDict(flipped) - self.mapping = {**d1, **d2} - - def encode(self, iterable): - if isinstance(iterable, list): - return [self.encode(i) for i in iterable] - else: - return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] - - def decode(self, tensor): - if isinstance(tensor[0], list): - return [self.decode(t) for t in tensor] - else: - # not idempotent, since clean string - x = (self.mapping[i] for i in tensor) - x = ''.join(i for i, _ in itertools.groupby(x)) - x = x.replace(self.char_blank, "") - # x = x.strip() - return x - - -def model_length_function(tensor): - return int(tensor.shape[0]) // 2 + 1 - - -class IterableMemoryCache: - - def __init__(self, iterable): - self.iterable = iterable - self._iter = iter(iterable) - self._done = False - self._values = [] - - def __iter__(self): - if self._done: - return iter(self._values) - return itertools.chain(self._values, self._gen_iter()) - - def _gen_iter(self): - for new_value in self._iter: - self._values.append(new_value) - yield new_value - self._done = True - - def __len__(self): - return len(self._iterable) - - -class MapMemoryCache(torch.utils.data.Dataset): - """ - Wrap a dataset so that, whenever a new item is returned, it is saved to memory. 
- """ - - def __init__(self, dataset): - self.dataset = dataset - self._cache = [None] * len(dataset) - - def __getitem__(self, n): - if self._cache[n]: - return self._cache[n] - - item = self.dataset[n] - self._cache[n] = item - - return item - - def __len__(self): - return len(self.dataset) - - -class Processed(torch.utils.data.Dataset): - - def __init__(self, process_datapoint, dataset): - self.process_datapoint = process_datapoint - self.dataset = dataset - - def __getitem__(self, n): - item = self.dataset[n] - return self.process_datapoint(item) - - def __next__(self): - item = next(self.dataset) - return self.process_datapoint(item) - - def __len__(self): - return len(self.dataset) - - -def process_datapoint(item, transforms, encode): - transformed = item[0] # .to(device, non_blocking=non_blocking) - target = item[2].lower() - - transformed = transforms(transformed) - - transformed = transformed[0, ...].transpose(0, -1) - - target = " " + target + " " - target = encode(target) - target = torch.tensor(target, dtype=torch.long, device=transformed.device) - - transformed = transformed # .to("cpu") - target = target # .to("cpu") - return transformed, target - - -def datasets_librispeech(transforms, language_model, root="/datasets01/", folder_in_archive="librispeech/062419/"): - - def create(tag): - - if isinstance(tag, str): - data = LIBRISPEECH(root, tag, folder_in_archive=folder_in_archive, download=False) - else: - data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) - - data = Processed(lambda x: process_datapoint(x, transforms, language_model.encode), data) - # data = diskcache_iterator(data) - data = MapMemoryCache(data) - return data - - return create("train-clean-100"), create("dev-clean"), None - # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None - - -def greedy_decode(outputs): - """Greedy Decoder. Returns highest probability of class labels for each timestep - - Args: - outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) - - Returns: - torch.Tensor: class labels per time step. 
- """ - _, indices = topk(outputs, k=1, dim=-1) - return indices[..., 0] - - -def levenshtein_distance(r: str, h: str, device: Optional[str] = None): - - # initialisation - d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) - dold = 0 - dnew = 1 - - # computation - for i in range(1, len(r) + 1): - d[dnew, 0] = 0 - for j in range(1, len(h) + 1): - - if r[i - 1] == h[j - 1]: - d[dnew, j] = d[dnew - 1, j - 1] - else: - substitution = d[dnew - 1, j - 1] + 1 - insertion = d[dnew, j - 1] + 1 - deletion = d[dnew - 1, j] + 1 - d[dnew, j] = min(substitution, insertion, deletion) - - dnew, dold = dold, dnew - - return d[dnew, -1].item() - - -def collate_fn(batch): - - tensors = [b[0] for b in batch if b] - - tensors_lengths = torch.tensor( - [model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device - ) - - tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) - tensors = tensors.transpose(1, -1) - - targets = [b[1] for b in batch if b] - target_lengths = torch.tensor( - [target.shape[0] for target in targets], dtype=torch.long, device=tensors.device - ) - targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) - - return tensors, targets, tensors_lengths, target_lengths - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, epoch, pbar=None, non_blocking=False): - - model.train() - - sum_loss = 0. - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): - - inputs = inputs.to(device, non_blocking=non_blocking) - targets = targets.to(device, non_blocking=non_blocking) - - # keep batch first for data parallel - outputs = model(inputs).transpose(0, 1) - - # CTC - # outputs: input length, batch size, number of classes (including blank) - # targets: batch size, max target length - # input_lengths: batch size - # target_lengths: batch size - - loss = criterion(outputs, targets, tensors_lengths, target_lengths) - sum_loss += loss.item() - - optimizer.zero_grad() - loss.backward() - - optimizer.step() - - if SIGNAL_RECEIVED: - return - - if pbar is not None: - pbar.update(1 / len(data_loader)) - - # Average loss - sum_loss = sum_loss / len(data_loader) - print(f"Training loss: {sum_loss:4.5f}", flush=True) - - scheduler.step() - - -def evaluate(model, criterion, data_loader, decoder, language_model, device, non_blocking=False): - - with torch.no_grad(): - - model.eval() - - sums = defaultdict(lambda: 0.) 
- - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): - - inputs = inputs.to(device, non_blocking=non_blocking) - targets = targets.to(device, non_blocking=non_blocking) - - # keep batch first for data parallel - outputs = model(inputs).transpose(0, 1) - - # CTC - # outputs: input length, batch size, number of classes (including blank) - # targets: batch size, max target length - # input_lengths: batch size - # target_lengths: batch size - - sums["loss"] += criterion(outputs, targets, tensors_lengths, target_lengths).item() - - output = outputs.transpose(0, 1).to("cpu") - output = decoder(output) - - output = language_model.decode(output.tolist()) - target = language_model.decode(targets.tolist()) - - print_length = 20 - for i in range(2): - output_print = output[i].ljust(print_length)[:print_length] - target_print = target[i].ljust(print_length)[:print_length] - print(f"Target: {target_print} Output: {output_print}", flush=True) - - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # cers_normalized = [d / len(a) for a, d in zip(target, cers)] - cers = statistics.mean(cers) - sums["cer"] += cers - - output = [o.split(language_model.char_space) for o in output] - target = [o.split(language_model.char_space) for o in target] - - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # wers_normalized = [d / len(a) for a, d in zip(target, wers)] - wers = statistics.mean(wers) - sums["wer"] += wers - - if SIGNAL_RECEIVED: - break - - # Average loss - for k in sums.keys(): - sums[k] /= len(data_loader) - - print(f"Validation loss: {sums['loss']:.5f}", flush=True) - print(f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", flush=True) - - return sums['loss'] - - -def main(args): - - print("start time: {}".format(str(datetime.now())), flush=True) - - # Empty CUDA cache - torch.cuda.empty_cache() - - CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' - - # Install signal handler - signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) - signal.signal(signal.SIGTERM, SIGTERM_handler) - print('Signal handler installed', flush=True) - - audio_backend = "soundfile" - torchaudio.set_audio_backend(audio_backend) - - device = "cuda" if torch.cuda.is_available() else "cpu" - # num_devices = torch.cuda.device_count() - - data_loader_training_params = { - "num_workers": args.workers, - "pin_memory": True, - "shuffle": True, - "drop_last": True, - } - data_loader_validation_params = data_loader_training_params.copy() - data_loader_validation_params["shuffle"] = False - - non_blocking = True - - # audio - - n_bins = args.n_bins # 13, 128 - melkwargs = { - 'n_fft': 512, - 'n_mels': 20, - 'hop_length': 80, # (160, 80) - } - - sample_rate_original = 16000 - - transforms = nn.Sequential( - # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), - # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), - torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, n_mels=n_bins), - # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), - # torchaudio.transforms.TimeMasking(time_mask_param=35) - ) - - # Text preprocessing - - char_blank = "*" - char_space = " " - char_apostrophe = "'" - - labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase - language_model = LanguageModel(labels, char_blank, char_space) - vocab_size = language_model.length - print("vocab_size", 
vocab_size, flush=True) - - training, validation, _ = datasets_librispeech(transforms, language_model) - - num_features = n_bins if n_bins else 1 - model = Wav2Letter(num_features, vocab_size) - - if args.jit: - model = torch.jit.script(model) - - if not args.distributed: - model = torch.nn.DataParallel(model) - else: - model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) - # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) - - model = model.to(device, non_blocking=non_blocking) - - n = count_parameters(model) - print(f"Number of parameters: {n}", flush=True) - - print(torch.cuda.memory_summary(), flush=True) - - # Optimizer - - optimizer_params = { - "lr": args.learning_rate, - # "eps": args.eps, - # "rho": args.rho, - "weight_decay": args.weight_decay, - } - - Optimizer = SGD - optimizer_params = optimizer_params - - optimizer = Optimizer(model.parameters(), **optimizer_params) - scheduler = ExponentialLR(optimizer, gamma=args.gamma) - # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) - - criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) - # criterion = nn.MSELoss() - # criterion = torch.nn.NLLLoss() - - best_loss = 1. - - loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params) - loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params) - - print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) - - if args.resume and os.path.isfile(CHECKPOINT_filename): - print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) - checkpoint = torch.load(CHECKPOINT_filename) - - args.start_epoch = checkpoint['epoch'] - best_loss = checkpoint['best_loss'] - - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - scheduler.load_state_dict(checkpoint['scheduler']) - - print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) - else: - print("Checkpoint: not found") - - save_checkpoint({ - 'epoch': args.start_epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, False, CHECKPOINT_filename) - - with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: - - for epoch in range(args.start_epoch, args.epochs): - - train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) - if SIGNAL_RECEIVED: - save_checkpoint({ - 'epoch': epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, False, CHECKPOINT_filename) - if not epoch % args.print_freq or epoch == args.epochs - 1: - - sum_loss = evaluate(model, criterion, loader_validation, greedy_decode, language_model, device, non_blocking=non_blocking) - - is_best = sum_loss < best_loss - best_loss = min(sum_loss, best_loss) - save_checkpoint({ - 'epoch': epoch + 1, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, is_best, CHECKPOINT_filename) - - -if __name__ == "__main__": - args = parse_args() - main(args) From e0b1359b5f71cc19854f11fee5e33b78a4bf4639 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 20 May 2020 
15:30:11 -0700 Subject: [PATCH 014/129] check for not None. --- examples/pipeline/wav2letter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 139ad53c30..7ab596fdd3 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -182,7 +182,7 @@ def __init__(self, dataset): self._cache = [None] * len(dataset) def __getitem__(self, n): - if self._cache[n]: + if self._cache[n] is not None: return self._cache[n] item = self.dataset[n] From be34e16a721ade8480a498de5d64c0088042f847 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 08:48:01 -0700 Subject: [PATCH 015/129] cleaning. --- examples/pipeline/wav2letter.py | 68 +++++++++++---------------------- 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 7ab596fdd3..ebbd04a5e6 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -17,7 +17,7 @@ from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader -from torchaudio.datasets import LIBRISPEECH, SPEECHCOMMANDS +from torchaudio.datasets import LIBRISPEECH from torchaudio.datasets.utils import bg_iterator, diskcache_iterator from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample @@ -27,50 +27,27 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--workers', default=0, type=int, - metavar='N', help='number of data loading workers') - parser.add_argument('--resume', default='', type=str, - metavar='PATH', help='path to latest checkpoint') - parser.add_argument('--figures', default='', type=str, - metavar='PATH', help='folder path to save figures') - - parser.add_argument('--epochs', default=200, type=int, - metavar='N', help='number of total epochs to run') - parser.add_argument('--start-epoch', default=0, type=int, - metavar='N', help='manual epoch number') - parser.add_argument('--print-freq', default=10, type=int, - metavar='N', help='print frequency in epochs') - - parser.add_argument('--arch', metavar='ARCH', default='wav2letter', - choices=["wav2letter", "lstm"], help='model architecture') - parser.add_argument('--batch-size', default=64, type=int, - metavar='N', help='mini-batch size') - - parser.add_argument('--learning-rate', default=1., type=float, - metavar='LR', help='initial learning rate') - parser.add_argument('--gamma', default=.96, type=float, - metavar='GAMMA', help='learning rate exponential decay constant') + parser.add_argument('--workers', default=0, type=int, metavar='N', help='number of data loading workers') + parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint') + + parser.add_argument('--epochs', default=200, type=int, metavar='N', help='number of total epochs to run') + parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number') + parser.add_argument('--print-freq', default=10, type=int, metavar='N', help='print frequency in epochs') + + parser.add_argument('--arch', metavar='ARCH', default='wav2letter', choices=["wav2letter"], help='model architecture') + parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size') + + parser.add_argument('--n-bins', default=13, type=int, metavar='N', help='number of bins in transforms') + 
parser.add_argument('--learning-rate', default=1., type=float, metavar='LR', help='initial learning rate') + parser.add_argument('--gamma', default=.96, type=float, metavar='GAMMA', help='learning rate exponential decay constant') # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') - parser.add_argument('--weight-decay', default=1e-5, - type=float, metavar='W', help='weight decay') + parser.add_argument('--weight-decay', default=1e-5, type=float, metavar='W', help='weight decay') parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) parser.add_argument("--rho", metavar='RHO', type=float, default=.95) - parser.add_argument('--n-bins', default=13, type=int, - metavar='N', help='number of bins in transforms') - - parser.add_argument('--world-size', default=1, type=int, - help='number of distributed processes') - parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', - type=str, help='url used to set up distributed training') - parser.add_argument('--dist-backend', default='nccl', - type=str, help='distributed backend') - parser.add_argument('--distributed', action="store_true") - parser.add_argument('--dataset', default='librispeech', type=str) - parser.add_argument('--gradient', action="store_true") + parser.add_argument('--distributed', action="store_true") parser.add_argument('--jit', action="store_true") - parser.add_argument('--viterbi-decoder', action="store_true") args = parser.parse_args() @@ -431,14 +408,14 @@ def main(args): device = "cuda" if torch.cuda.is_available() else "cpu" # num_devices = torch.cuda.device_count() - data_loader_training_params = { + loader_training_params = { "num_workers": args.workers, "pin_memory": True, "shuffle": True, "drop_last": True, } - data_loader_validation_params = data_loader_training_params.copy() - data_loader_validation_params["shuffle"] = False + loader_validation_params = loader_training_params.copy() + loader_validation_params["shuffle"] = False non_blocking = True @@ -456,7 +433,7 @@ def main(args): transforms = nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), - torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, n_mels=n_bins), + torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, **melkwargs), # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), # torchaudio.transforms.TimeMasking(time_mask_param=35) ) @@ -516,8 +493,8 @@ def main(args): best_loss = 1. 
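Downstream of the CTCLoss above (whose blank index, language_model.mapping[char_blank], is 0 because the blank is the first label), evaluation decodes with greedy_decode plus LanguageModel.decode: take the argmax class per frame, collapse runs of repeats with itertools.groupby, then drop blanks. A toy round trip with a hypothetical 3-symbol vocabulary:

import itertools
import torch

labels = "*ab"                       # index 0 is the blank, as in this script
frames = torch.tensor([              # toy per-frame probabilities, shape (time, classes)
    [0.1, 0.8, 0.1],
    [0.1, 0.8, 0.1],
    [0.8, 0.1, 0.1],
    [0.1, 0.1, 0.8],
])

best_path = [labels[i] for i in frames.argmax(dim=-1).tolist()]     # ['a', 'a', '*', 'b']
collapsed = "".join(ch for ch, _ in itertools.groupby(best_path))   # "a*b"
print(collapsed.replace("*", ""))                                   # "ab"

A blank separating two identical argmax runs ('a*a') is what lets genuinely doubled letters survive the collapse.
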
- loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_training_params) - loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **data_loader_validation_params) + loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **loader_training_params) + loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **loader_validation_params) print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) @@ -549,6 +526,7 @@ def main(args): for epoch in range(args.start_epoch, args.epochs): train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) + if SIGNAL_RECEIVED: save_checkpoint({ 'epoch': epoch, From bd7c9c91534213a64eef1d41776f3b342339cb70 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 08:56:50 -0700 Subject: [PATCH 016/129] back -l 160 --- examples/pipeline/wav2letter.py | 139 +++++++++++++++++--------------- 1 file changed, 72 insertions(+), 67 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index ebbd04a5e6..f1424b150c 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -27,31 +27,31 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--workers', default=0, type=int, metavar='N', help='number of data loading workers') - parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint') + parser.add_argument("--workers", default=0, type=int, metavar="N", help="number of data loading workers") + parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint") - parser.add_argument('--epochs', default=200, type=int, metavar='N', help='number of total epochs to run') - parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number') - parser.add_argument('--print-freq', default=10, type=int, metavar='N', help='print frequency in epochs') + parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") + parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number") + parser.add_argument("--print-freq", default=10, type=int, metavar="N", help="print frequency in epochs") - parser.add_argument('--arch', metavar='ARCH', default='wav2letter', choices=["wav2letter"], help='model architecture') - parser.add_argument('--batch-size', default=64, type=int, metavar='N', help='mini-batch size') + parser.add_argument("--arch", metavar="ARCH", default="wav2letter", choices=["wav2letter"], help="model architecture") + parser.add_argument("--batch-size", default=64, type=int, metavar="N", help="mini-batch size") - parser.add_argument('--n-bins', default=13, type=int, metavar='N', help='number of bins in transforms') - parser.add_argument('--learning-rate', default=1., type=float, metavar='LR', help='initial learning rate') - parser.add_argument('--gamma', default=.96, type=float, metavar='GAMMA', help='learning rate exponential decay constant') + parser.add_argument("--n-bins", default=13, type=int, metavar="N", help="number of bins in transforms") + parser.add_argument("--learning-rate", default=1.0, type=float, metavar="LR", help="initial learning rate") + parser.add_argument("--gamma", default=0.96, type=float, metavar="GAMMA", 
help="learning rate exponential decay constant") # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') - parser.add_argument('--weight-decay', default=1e-5, type=float, metavar='W', help='weight decay') - parser.add_argument("--eps", metavar='EPS', type=float, default=1e-8) - parser.add_argument("--rho", metavar='RHO', type=float, default=.95) + parser.add_argument("--weight-decay", default=1e-5, type=float, metavar="W", help="weight decay") + parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) + parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) - parser.add_argument('--dataset', default='librispeech', type=str) - parser.add_argument('--distributed', action="store_true") - parser.add_argument('--jit', action="store_true") + parser.add_argument("--dataset", default="librispeech", type=str) + parser.add_argument("--distributed", action="store_true") + parser.add_argument("--jit", action="store_true") args = parser.parse_args() - args.clip_norm = 0. + args.clip_norm = 0.0 print(pprint.pformat(vars(args)), flush=True) @@ -59,13 +59,13 @@ def parse_args(): def SIGTERM_handler(a, b): - print('received sigterm') + print("received sigterm") pass def signal_handler(a, b): global SIGNAL_RECEIVED - print('Signal received', a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) + print("Signal received", a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) SIGNAL_RECEIVED = True @@ -75,7 +75,7 @@ def save_checkpoint(state, is_best, filename): then copy it to filename, in case the signal interrupts the torch.save() process. """ - CHECKPOINT_tempfile = filename + '.temp' + CHECKPOINT_tempfile = filename + ".temp" # Remove CHECKPOINT_tempfile, in case the signal arrives in the # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename @@ -86,7 +86,7 @@ def save_checkpoint(state, is_best, filename): if os.path.isfile(CHECKPOINT_tempfile): os.rename(CHECKPOINT_tempfile, filename) if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') + shutil.copyfile(filename, "model_best.pth.tar") print("Checkpoint: saved") @@ -116,7 +116,7 @@ def decode(self, tensor): else: # not idempotent, since clean string x = (self.mapping[i] for i in tensor) - x = ''.join(i for i, _ in itertools.groupby(x)) + x = "".join(i for i, _ in itertools.groupby(x)) x = x.replace(self.char_blank, "") # x = x.strip() return x @@ -127,7 +127,6 @@ def model_length_function(tensor): class IterableMemoryCache: - def __init__(self, iterable): self.iterable = iterable self._iter = iter(iterable) @@ -172,7 +171,6 @@ def __len__(self): class Processed(torch.utils.data.Dataset): - def __init__(self, process_datapoint, dataset): self.process_datapoint = process_datapoint self.dataset = dataset @@ -207,7 +205,6 @@ def process_datapoint(item, transforms, encode): def datasets_librispeech(transforms, language_model, root="/datasets01/", folder_in_archive="librispeech/062419/"): - def create(tag): if isinstance(tag, str): @@ -266,17 +263,13 @@ def collate_fn(batch): tensors = [b[0] for b in batch if b] - tensors_lengths = torch.tensor( - [model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device - ) + tensors_lengths = torch.tensor([model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device) tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) tensors = tensors.transpose(1, -1) targets = [b[1] for b in batch if b] - target_lengths = torch.tensor( - [target.shape[0] for target in targets], 
dtype=torch.long, device=tensors.device - ) + target_lengths = torch.tensor([target.shape[0] for target in targets], dtype=torch.long, device=tensors.device) targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) return tensors, targets, tensors_lengths, target_lengths @@ -290,7 +283,7 @@ def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, model.train() - sum_loss = 0. + sum_loss = 0.0 for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): inputs = inputs.to(device, non_blocking=non_blocking) @@ -332,7 +325,7 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device, non model.eval() - sums = defaultdict(lambda: 0.) + sums = defaultdict(lambda: 0.0) for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): @@ -385,7 +378,7 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device, non print(f"Validation loss: {sums['loss']:.5f}", flush=True) print(f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", flush=True) - return sums['loss'] + return sums["loss"] def main(args): @@ -395,12 +388,12 @@ def main(args): # Empty CUDA cache torch.cuda.empty_cache() - CHECKPOINT_filename = args.resume if args.resume else 'checkpoint.pth.tar' + CHECKPOINT_filename = args.resume if args.resume else "checkpoint.pth.tar" # Install signal handler signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) signal.signal(signal.SIGTERM, SIGTERM_handler) - print('Signal handler installed', flush=True) + print("Signal handler installed", flush=True) audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) @@ -423,9 +416,9 @@ def main(args): n_bins = args.n_bins # 13, 128 melkwargs = { - 'n_fft': 512, - 'n_mels': 20, - 'hop_length': 80, # (160, 80) + "n_fft": 512, + "n_mels": 20, + "hop_length": 80, # (160, 80) } sample_rate_original = 16000 @@ -491,7 +484,7 @@ def main(args): # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() - best_loss = 1. 
+ best_loss = 1.0 loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **loader_training_params) loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **loader_validation_params) @@ -502,24 +495,28 @@ def main(args): print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) checkpoint = torch.load(CHECKPOINT_filename) - args.start_epoch = checkpoint['epoch'] - best_loss = checkpoint['best_loss'] + args.start_epoch = checkpoint["epoch"] + best_loss = checkpoint["best_loss"] - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - scheduler.load_state_dict(checkpoint['scheduler']) + model.load_state_dict(checkpoint["state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) - print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint['epoch'])) + print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint["epoch"])) else: print("Checkpoint: not found") - save_checkpoint({ - 'epoch': args.start_epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, False, CHECKPOINT_filename) + save_checkpoint( + { + "epoch": args.start_epoch, + "state_dict": model.state_dict(), + "best_loss": best_loss, + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + }, + False, + CHECKPOINT_filename, + ) with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: @@ -528,26 +525,34 @@ def main(args): train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) if SIGNAL_RECEIVED: - save_checkpoint({ - 'epoch': epoch, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, False, CHECKPOINT_filename) + save_checkpoint( + { + "epoch": epoch, + "state_dict": model.state_dict(), + "best_loss": best_loss, + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + }, + False, + CHECKPOINT_filename, + ) if not epoch % args.print_freq or epoch == args.epochs - 1: sum_loss = evaluate(model, criterion, loader_validation, greedy_decode, language_model, device, non_blocking=non_blocking) is_best = sum_loss < best_loss best_loss = min(sum_loss, best_loss) - save_checkpoint({ - 'epoch': epoch + 1, - 'state_dict': model.state_dict(), - 'best_loss': best_loss, - 'optimizer': optimizer.state_dict(), - 'scheduler': scheduler.state_dict(), - }, is_best, CHECKPOINT_filename) + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_loss": best_loss, + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + }, + is_best, + CHECKPOINT_filename, + ) if __name__ == "__main__": From 91528c914528e32b69e3fedcc713aee37f99fb2a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 09:05:45 -0700 Subject: [PATCH 017/129] black. 
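The temp-file dance in save_checkpoint above exists so that a signal arriving in the middle of torch.save never clobbers the last good checkpoint: write to filename + ".temp", then publish with a rename. A self-contained sketch of that pattern and the matching resume path, using a stand-in model but the same dict keys as this script:

import os
import torch
from torch import nn
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR

model = nn.Linear(4, 2)                       # stand-in for Wav2Letter
optimizer = SGD(model.parameters(), lr=1.0)
scheduler = ExponentialLR(optimizer, gamma=0.96)
filename = "checkpoint.pth.tar"

state = {
    "epoch": 1,
    "state_dict": model.state_dict(),
    "best_loss": 1.0,
    "optimizer": optimizer.state_dict(),
    "scheduler": scheduler.state_dict(),
}
torch.save(state, filename + ".temp")    # an interrupt here leaves any old file intact
os.rename(filename + ".temp", filename)  # atomic publish on POSIX filesystems

checkpoint = torch.load(filename)        # resuming reads back the same keys
model.load_state_dict(checkpoint["state_dict"])
optimizer.load_state_dict(checkpoint["optimizer"])
scheduler.load_state_dict(checkpoint["scheduler"])
start_epoch = checkpoint["epoch"]
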
--- examples/pipeline/wav2letter.py | 197 +++++++++++++++++++++++++++----- 1 file changed, 166 insertions(+), 31 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index f1424b150c..efe202e579 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -27,21 +27,75 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--workers", default=0, type=int, metavar="N", help="number of data loading workers") - parser.add_argument("--resume", default="", type=str, metavar="PATH", help="path to latest checkpoint") + parser.add_argument( + "--workers", + default=0, + type=int, + metavar="N", + help="number of data loading workers", + ) + parser.add_argument( + "--resume", + default="", + type=str, + metavar="PATH", + help="path to latest checkpoint", + ) - parser.add_argument("--epochs", default=200, type=int, metavar="N", help="number of total epochs to run") - parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="manual epoch number") - parser.add_argument("--print-freq", default=10, type=int, metavar="N", help="print frequency in epochs") + parser.add_argument( + "--epochs", + default=200, + type=int, + metavar="N", + help="number of total epochs to run", + ) + parser.add_argument( + "--start-epoch", default=0, type=int, metavar="N", help="manual epoch number" + ) + parser.add_argument( + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency in epochs", + ) - parser.add_argument("--arch", metavar="ARCH", default="wav2letter", choices=["wav2letter"], help="model architecture") - parser.add_argument("--batch-size", default=64, type=int, metavar="N", help="mini-batch size") + parser.add_argument( + "--arch", + metavar="ARCH", + default="wav2letter", + choices=["wav2letter"], + help="model architecture", + ) + parser.add_argument( + "--batch-size", default=64, type=int, metavar="N", help="mini-batch size" + ) - parser.add_argument("--n-bins", default=13, type=int, metavar="N", help="number of bins in transforms") - parser.add_argument("--learning-rate", default=1.0, type=float, metavar="LR", help="initial learning rate") - parser.add_argument("--gamma", default=0.96, type=float, metavar="GAMMA", help="learning rate exponential decay constant") + parser.add_argument( + "--n-bins", + default=13, + type=int, + metavar="N", + help="number of bins in transforms", + ) + parser.add_argument( + "--learning-rate", + default=1.0, + type=float, + metavar="LR", + help="initial learning rate", + ) + parser.add_argument( + "--gamma", + default=0.96, + type=float, + metavar="GAMMA", + help="learning rate exponential decay constant", + ) # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') - parser.add_argument("--weight-decay", default=1e-5, type=float, metavar="W", help="weight decay") + parser.add_argument( + "--weight-decay", default=1e-5, type=float, metavar="W", help="weight decay" + ) parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) @@ -204,15 +258,29 @@ def process_datapoint(item, transforms, encode): return transformed, target -def datasets_librispeech(transforms, language_model, root="/datasets01/", folder_in_archive="librispeech/062419/"): +def datasets_librispeech( + transforms, + language_model, + root="/datasets01/", + folder_in_archive="librispeech/062419/", +): def create(tag): if isinstance(tag, str): - data = LIBRISPEECH(root, tag, 
folder_in_archive=folder_in_archive, download=False) + data = LIBRISPEECH( + root, tag, folder_in_archive=folder_in_archive, download=False + ) else: - data = sum(LIBRISPEECH(root, t, folder_in_archive=folder_in_archive, download=False) for t in tag) + data = sum( + LIBRISPEECH( + root, t, folder_in_archive=folder_in_archive, download=False + ) + for t in tag + ) - data = Processed(lambda x: process_datapoint(x, transforms, language_model.encode), data) + data = Processed( + lambda x: process_datapoint(x, transforms, language_model.encode), data + ) # data = diskcache_iterator(data) data = MapMemoryCache(data) return data @@ -263,13 +331,19 @@ def collate_fn(batch): tensors = [b[0] for b in batch if b] - tensors_lengths = torch.tensor([model_length_function(t) for t in tensors], dtype=torch.long, device=tensors[0].device) + tensors_lengths = torch.tensor( + [model_length_function(t) for t in tensors], + dtype=torch.long, + device=tensors[0].device, + ) tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) tensors = tensors.transpose(1, -1) targets = [b[1] for b in batch if b] - target_lengths = torch.tensor([target.shape[0] for target in targets], dtype=torch.long, device=tensors.device) + target_lengths = torch.tensor( + [target.shape[0] for target in targets], dtype=torch.long, device=tensors.device + ) targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) return tensors, targets, tensors_lengths, target_lengths @@ -279,12 +353,24 @@ def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) -def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, epoch, pbar=None, non_blocking=False): +def train_one_epoch( + model, + criterion, + optimizer, + scheduler, + data_loader, + device, + epoch, + pbar=None, + non_blocking=False, +): model.train() sum_loss = 0.0 - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): + for inputs, targets, tensors_lengths, target_lengths in bg_iterator( + data_loader, maxsize=2 + ): inputs = inputs.to(device, non_blocking=non_blocking) targets = targets.to(device, non_blocking=non_blocking) @@ -319,7 +405,9 @@ def train_one_epoch(model, criterion, optimizer, scheduler, data_loader, device, scheduler.step() -def evaluate(model, criterion, data_loader, decoder, language_model, device, non_blocking=False): +def evaluate( + model, criterion, data_loader, decoder, language_model, device, non_blocking=False +): with torch.no_grad(): @@ -327,7 +415,9 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device, non sums = defaultdict(lambda: 0.0) - for inputs, targets, tensors_lengths, target_lengths in bg_iterator(data_loader, maxsize=2): + for inputs, targets, tensors_lengths, target_lengths in bg_iterator( + data_loader, maxsize=2 + ): inputs = inputs.to(device, non_blocking=non_blocking) targets = targets.to(device, non_blocking=non_blocking) @@ -341,7 +431,9 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device, non # input_lengths: batch size # target_lengths: batch size - sums["loss"] += criterion(outputs, targets, tensors_lengths, target_lengths).item() + sums["loss"] += criterion( + outputs, targets, tensors_lengths, target_lengths + ).item() output = outputs.transpose(0, 1).to("cpu") output = decoder(output) @@ -376,7 +468,10 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device, non sums[k] /= len(data_loader) print(f"Validation loss: {sums['loss']:.5f}", 
flush=True) - print(f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", flush=True) + print( + f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", + flush=True, + ) return sums["loss"] @@ -426,7 +521,9 @@ def main(args): transforms = nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), - torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate_original, **melkwargs), + torchaudio.transforms.MelSpectrogram( + sample_rate=sample_rate_original, **melkwargs + ), # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), # torchaudio.transforms.TimeMasking(time_mask_param=35) ) @@ -480,16 +577,33 @@ def main(args): scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) - criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) + criterion = torch.nn.CTCLoss( + blank=language_model.mapping[char_blank], zero_infinity=False + ) # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() best_loss = 1.0 - loader_training = DataLoader(training, batch_size=args.batch_size, collate_fn=collate_fn, **loader_training_params) - loader_validation = DataLoader(validation, batch_size=args.batch_size, collate_fn=collate_fn, **loader_validation_params) + loader_training = DataLoader( + training, + batch_size=args.batch_size, + collate_fn=collate_fn, + **loader_training_params, + ) + loader_validation = DataLoader( + validation, + batch_size=args.batch_size, + collate_fn=collate_fn, + **loader_validation_params, + ) - print("Length of data loaders: ", len(loader_training), len(loader_validation), flush=True) + print( + "Length of data loaders: ", + len(loader_training), + len(loader_validation), + flush=True, + ) if args.resume and os.path.isfile(CHECKPOINT_filename): print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) @@ -502,7 +616,11 @@ def main(args): optimizer.load_state_dict(checkpoint["optimizer"]) scheduler.load_state_dict(checkpoint["scheduler"]) - print("Checkpoint: loaded '{}' at epoch {}".format(CHECKPOINT_filename, checkpoint["epoch"])) + print( + "Checkpoint: loaded '{}' at epoch {}".format( + CHECKPOINT_filename, checkpoint["epoch"] + ) + ) else: print("Checkpoint: not found") @@ -522,7 +640,16 @@ def main(args): for epoch in range(args.start_epoch, args.epochs): - train_one_epoch(model, criterion, optimizer, scheduler, loader_training, device, pbar=pbar, non_blocking=non_blocking) + train_one_epoch( + model, + criterion, + optimizer, + scheduler, + loader_training, + device, + pbar=pbar, + non_blocking=non_blocking, + ) if SIGNAL_RECEIVED: save_checkpoint( @@ -538,7 +665,15 @@ def main(args): ) if not epoch % args.print_freq or epoch == args.epochs - 1: - sum_loss = evaluate(model, criterion, loader_validation, greedy_decode, language_model, device, non_blocking=non_blocking) + sum_loss = evaluate( + model, + criterion, + loader_validation, + greedy_decode, + language_model, + device, + non_blocking=non_blocking, + ) is_best = sum_loss < best_loss best_loss = min(sum_loss, best_loss) From 3f28b75ed505d828ff9495a1c20c9649c20ee8c4 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 13:47:55 -0700 Subject: [PATCH 018/129] fix runtime error. 
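For context on the runtime-error fix below: torch.nn.CTCLoss expects log-probabilities shaped (input length, batch, classes) together with per-example input and target lengths, which is why the model output is transposed before reaching the criterion. A shape-only sketch with made-up sizes (29 classes matches this script's blank + space + apostrophe + 26 letters):

import torch
from torch import nn

T, N, C, S = 50, 4, 29, 12                        # time, batch, classes, target length
log_probs = torch.randn(T, N, C).log_softmax(-1)  # stand-in emissions, shape (T, N, C)
targets = torch.randint(1, C, (N, S))             # class 0 reserved for the blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), S, dtype=torch.long)

loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)
print(loss.item())
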
--- examples/pipeline/wav2letter.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index efe202e579..3e5dc5b5c9 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -24,6 +24,9 @@ from tqdm.notebook import tqdm as tqdm +SIGNAL_RECEIVED = False + + def parse_args(): parser = argparse.ArgumentParser() @@ -113,7 +116,7 @@ def parse_args(): def SIGTERM_handler(a, b): - print("received sigterm") + print("Received sigterm") pass @@ -148,6 +151,7 @@ class LanguageModel: def __init__(self, labels, char_blank, char_space): self.char_space = char_space + self.char_blank = char_blank labels = [l for l in labels] self.length = len(labels) @@ -360,7 +364,6 @@ def train_one_epoch( scheduler, data_loader, device, - epoch, pbar=None, non_blocking=False, ): @@ -376,7 +379,7 @@ def train_one_epoch( targets = targets.to(device, non_blocking=non_blocking) # keep batch first for data parallel - outputs = model(inputs).transpose(0, 1) + outputs = model(inputs).transpose(-1, -2).transpose(0, 1) # CTC # outputs: input length, batch size, number of classes (including blank) @@ -509,10 +512,9 @@ def main(args): # audio - n_bins = args.n_bins # 13, 128 melkwargs = { "n_fft": 512, - "n_mels": 20, + "n_mels": args.n_bins, # 13, 20, 128 "hop_length": 80, # (160, 80) } @@ -520,11 +522,11 @@ def main(args): transforms = nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), - # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs), + # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs), torchaudio.transforms.MelSpectrogram( sample_rate=sample_rate_original, **melkwargs ), - # torchaudio.transforms.FrequencyMasking(freq_mask_param=n_bins), + # torchaudio.transforms.FrequencyMasking(freq_mask_param=args.n_bins), # torchaudio.transforms.TimeMasking(time_mask_param=35) ) @@ -541,8 +543,8 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) - num_features = n_bins if n_bins else 1 - model = Wav2Letter(num_features, vocab_size) + num_features = args.n_bins + model = Wav2Letter(num_classes=vocab_size, input_type="mfcc", num_features=num_features) if args.jit: model = torch.jit.script(model) From 38d0cae589b6e84242b64470a37a3496545e6300 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 14:32:48 -0700 Subject: [PATCH 019/129] removing some print statements. 
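A side note on patch 018 above: model_length_function returns length // 2 + 1, which tracks the stride-2 first convolution of torchaudio's Wav2Letter so that the input_lengths handed to CTCLoss match what the model actually emits. A quick check with arbitrary sizes; the exact output length assumes the current torchaudio implementation:

import torch
from torchaudio.models.wav2letter import Wav2Letter

model = Wav2Letter(num_classes=29, input_type="mfcc", num_features=13)
x = torch.randn(1, 13, 400)  # (batch, num_features, time)
y = model(x)
print(y.shape)               # expected: torch.Size([1, 29, 201]), i.e. 400 // 2 + 1
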
--- examples/pipeline/wav2letter.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 3e5dc5b5c9..95bbdf2a35 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -110,8 +110,6 @@ def parse_args(): args.clip_norm = 0.0 - print(pprint.pformat(vars(args)), flush=True) - return args @@ -481,7 +479,7 @@ def evaluate( def main(args): - print("start time: {}".format(str(datetime.now())), flush=True) + print("Start time: {}".format(str(datetime.now())), flush=True) # Empty CUDA cache torch.cuda.empty_cache() @@ -491,8 +489,6 @@ def main(args): # Install signal handler signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) signal.signal(signal.SIGTERM, SIGTERM_handler) - print("Signal handler installed", flush=True) - audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) @@ -539,8 +535,6 @@ def main(args): labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) vocab_size = language_model.length - print("vocab_size", vocab_size, flush=True) - training, validation, _ = datasets_librispeech(transforms, language_model) num_features = args.n_bins @@ -561,8 +555,6 @@ def main(args): n = count_parameters(model) print(f"Number of parameters: {n}", flush=True) - print(torch.cuda.memory_summary(), flush=True) - # Optimizer optimizer_params = { @@ -600,13 +592,6 @@ def main(args): **loader_validation_params, ) - print( - "Length of data loaders: ", - len(loader_training), - len(loader_validation), - flush=True, - ) - if args.resume and os.path.isfile(CHECKPOINT_filename): print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) checkpoint = torch.load(CHECKPOINT_filename) From 797f0f90da2b81e702a74c0fd726d072f0d5fac3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 14:39:16 -0700 Subject: [PATCH 020/129] add help to command line. add progress bar option. 
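The new --progress-bar switch follows the usual argparse store_true pattern; a self-contained sketch (flag name as in this patch):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--progress-bar", action="store_true", help="use progress bar while training"
    )
    assert parser.parse_args([]).progress_bar is False                 # off by default
    assert parser.parse_args(["--progress-bar"]).progress_bar is True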
--- examples/pipeline/wav2letter.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 95bbdf2a35..1081afc8be 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -62,6 +62,9 @@ def parse_args(): metavar="N", help="print frequency in epochs", ) + parser.add_argument( + "--progress-bar", action="store_true", help="use progress bar while training" + ) parser.add_argument( "--arch", @@ -102,9 +105,16 @@ def parse_args(): parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) - parser.add_argument("--dataset", default="librispeech", type=str) - parser.add_argument("--distributed", action="store_true") - parser.add_argument("--jit", action="store_true") + parser.add_argument( + "--dataset", + default="librispeech", + type=str, + help="select dataset to train with", + ) + parser.add_argument( + "--distributed", action="store_true", help="enable DistributedDataParallel" + ) + parser.add_argument("--jit", action="store_true", help="if used, model is jitted") args = parser.parse_args() @@ -538,7 +548,9 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) num_features = args.n_bins - model = Wav2Letter(num_classes=vocab_size, input_type="mfcc", num_features=num_features) + model = Wav2Letter( + num_classes=vocab_size, input_type="mfcc", num_features=num_features + ) if args.jit: model = torch.jit.script(model) @@ -623,7 +635,7 @@ def main(args): CHECKPOINT_filename, ) - with tqdm(total=args.epochs, unit_scale=1, disable=args.distributed) as pbar: + with tqdm(total=args.epochs, unit_scale=1, disable=args.progress_bar) as pbar: for epoch in range(args.start_epoch, args.epochs): From 79b5dafeac468ed38bac8ffad9f27a8bc21e4087 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 15:25:12 -0700 Subject: [PATCH 021/129] grouping librispeech-specific transform in subclass. 
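The transform logic moves from a generic wrapper into a subclass of the dataset itself, so __getitem__ returns fully processed pairs. A stripped-down sketch of the pattern, assuming any map-style base dataset (class names here are illustrative, not from the script):

    import torch

    class Squares(torch.utils.data.Dataset):  # stand-in base dataset
        def __getitem__(self, n):
            return n

        def __len__(self):
            return 10

    class ProcessedSquares(Squares):  # the subclass owns its processing
        def __init__(self, transform):
            self.transform = transform
            super().__init__()

        def __getitem__(self, n):
            return self.transform(super().__getitem__(n))

    data = ProcessedSquares(lambda x: x * x)
    assert data[3] == 9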
--- examples/pipeline/wav2letter.py | 71 +++++++++++++++------------------ 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 1081afc8be..e33a6fce09 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -23,7 +23,6 @@ from torchaudio.transforms import MFCC, Resample from tqdm.notebook import tqdm as tqdm - SIGNAL_RECEIVED = False @@ -236,38 +235,34 @@ def __len__(self): return len(self.dataset) -class Processed(torch.utils.data.Dataset): - def __init__(self, process_datapoint, dataset): - self.process_datapoint = process_datapoint - self.dataset = dataset +class Processed(LIBRISPEECH): + def __init__(self, transforms, encode, *args, **kwargs): + self.transforms = transforms + self.encode = encode + super().__init__(*args, **kwargs) - def __getitem__(self, n): - item = self.dataset[n] - return self.process_datapoint(item) + def __getitem__(self, key): + item = super().__getitem__(key) + return self.process_datapoint(item, self.transforms, self.encode) def __next__(self): - item = next(self.dataset) - return self.process_datapoint(item) - - def __len__(self): - return len(self.dataset) + item = super().__next__() + return self.process_datapoint(item, self.transforms, self.encode) + def process_datapoint(self, item): + transformed = item[0] # .to(device, non_blocking=non_blocking) + target = item[2].lower() -def process_datapoint(item, transforms, encode): - transformed = item[0] # .to(device, non_blocking=non_blocking) - target = item[2].lower() + transformed = self.transforms(transformed) + transformed = transformed[0, ...].transpose(0, -1) - transformed = transforms(transformed) + target = " " + target + " " + target = self.encode(target) + target = torch.tensor(target, dtype=torch.long, device=transformed.device) - transformed = transformed[0, ...].transpose(0, -1) - - target = " " + target + " " - target = encode(target) - target = torch.tensor(target, dtype=torch.long, device=transformed.device) - - transformed = transformed # .to("cpu") - target = target # .to("cpu") - return transformed, target + transformed = transformed # .to("cpu") + target = target # .to("cpu") + return transformed, target def datasets_librispeech( @@ -279,20 +274,20 @@ def datasets_librispeech( def create(tag): if isinstance(tag, str): - data = LIBRISPEECH( - root, tag, folder_in_archive=folder_in_archive, download=False + tag = [tag] + + data = sum( + Processed( + transforms, + language_model.encode, + root, + t, + folder_in_archive=folder_in_archive, + download=False, ) - else: - data = sum( - LIBRISPEECH( - root, t, folder_in_archive=folder_in_archive, download=False - ) - for t in tag - ) - - data = Processed( - lambda x: process_datapoint(x, transforms, language_model.encode), data + for t in tag ) + # data = diskcache_iterator(data) data = MapMemoryCache(data) return data From 401df9f585c42c47669a43a06a033f348dde9626 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 15:28:39 -0700 Subject: [PATCH 022/129] typo. 
--- examples/pipeline/wav2letter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index e33a6fce09..f4758955d9 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -630,7 +630,7 @@ def main(args): CHECKPOINT_filename, ) - with tqdm(total=args.epochs, unit_scale=1, disable=args.progress_bar) as pbar: + with tqdm(total=args.epochs, unit_scale=1, disable=not args.progress_bar) as pbar: for epoch in range(args.start_epoch, args.epochs): From 78fd8f7ba5d89444c5d13f976d9277c083b54a51 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 15:43:22 -0700 Subject: [PATCH 023/129] fix concatenation. --- examples/pipeline/wav2letter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index f4758955d9..3bae74b10b 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -235,7 +235,7 @@ def __len__(self): return len(self.dataset) -class Processed(LIBRISPEECH): +class ProcessedLIBRISPEECH(LIBRISPEECH): def __init__(self, transforms, encode, *args, **kwargs): self.transforms = transforms self.encode = encode @@ -276,8 +276,8 @@ def create(tag): if isinstance(tag, str): tag = [tag] - data = sum( - Processed( + data = torch.utils.data.ConcatDataset([ + ProcessedLIBRISPEECH( transforms, language_model.encode, root, @@ -286,7 +286,7 @@ def create(tag): download=False, ) for t in tag - ) + ]) # data = diskcache_iterator(data) data = MapMemoryCache(data) From 516bdc848d688d2a53edcfed1a225b4b48cd0272 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 15:43:57 -0700 Subject: [PATCH 024/129] typo. --- examples/pipeline/wav2letter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 3bae74b10b..778a813da4 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -243,11 +243,11 @@ def __init__(self, transforms, encode, *args, **kwargs): def __getitem__(self, key): item = super().__getitem__(key) - return self.process_datapoint(item, self.transforms, self.encode) + return self.process_datapoint(item) def __next__(self): item = super().__next__() - return self.process_datapoint(item, self.transforms, self.encode) + return self.process_datapoint(item) def process_datapoint(self, item): transformed = item[0] # .to(device, non_blocking=non_blocking) From 0b65e43ffce7031d9b4a928755c0ee382f9d2cce Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 15:45:36 -0700 Subject: [PATCH 025/129] black. tqdm. 
--- examples/pipeline/wav2letter.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 778a813da4..feca14bb0f 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -21,7 +21,7 @@ from torchaudio.datasets.utils import bg_iterator, diskcache_iterator from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from tqdm.notebook import tqdm as tqdm +from tqdm import tqdm SIGNAL_RECEIVED = False @@ -276,17 +276,19 @@ def create(tag): if isinstance(tag, str): tag = [tag] - data = torch.utils.data.ConcatDataset([ - ProcessedLIBRISPEECH( - transforms, - language_model.encode, - root, - t, - folder_in_archive=folder_in_archive, - download=False, - ) - for t in tag - ]) + data = torch.utils.data.ConcatDataset( + [ + ProcessedLIBRISPEECH( + transforms, + language_model.encode, + root, + t, + folder_in_archive=folder_in_archive, + download=False, + ) + for t in tag + ] + ) # data = diskcache_iterator(data) data = MapMemoryCache(data) From c335fe46351fdbc7984fcbb8e2407b000eaf2328 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 17:15:15 -0700 Subject: [PATCH 026/129] missing transpose. --- examples/pipeline/wav2letter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index feca14bb0f..8f4ada424e 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -431,7 +431,7 @@ def evaluate( targets = targets.to(device, non_blocking=non_blocking) # keep batch first for data parallel - outputs = model(inputs).transpose(0, 1) + outputs = model(inputs).transpose(-1, -2).transpose(0, 1) # CTC # outputs: input length, batch size, number of classes (including blank) From bbc2a03459595fff70154c66fbff13ffd68321a2 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 21 May 2020 17:25:56 -0700 Subject: [PATCH 027/129] renaming variables. --- examples/pipeline/wav2letter.py | 52 ++++++++++++++------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 8f4ada424e..b1029dca06 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -2,7 +2,6 @@ import collections import itertools import os -import pprint import shutil import signal import statistics @@ -37,8 +36,8 @@ def parse_args(): help="number of data loading workers", ) parser.add_argument( - "--resume", - default="", + "--checkpoint", + default="checkpoint.pth.tar", type=str, metavar="PATH", help="path to latest checkpoint", @@ -139,16 +138,16 @@ def save_checkpoint(state, is_best, filename): then copy it to filename, in case the signal interrupts the torch.save() process. 
""" - CHECKPOINT_tempfile = filename + ".temp" + tempfile = filename + ".temp" - # Remove CHECKPOINT_tempfile, in case the signal arrives in the - # middle of copying from CHECKPOINT_tempfile to CHECKPOINT_filename - if os.path.isfile(CHECKPOINT_tempfile): - os.remove(CHECKPOINT_tempfile) + # Remove tempfile, in case the signal arrives in the + # middle of copying from tempfile to filename + if os.path.isfile(tempfile): + os.remove(tempfile) - torch.save(state, CHECKPOINT_tempfile) - if os.path.isfile(CHECKPOINT_tempfile): - os.rename(CHECKPOINT_tempfile, filename) + torch.save(state, tempfile) + if os.path.isfile(tempfile): + os.rename(tempfile, filename) if is_best: shutil.copyfile(filename, "model_best.pth.tar") print("Checkpoint: saved") @@ -491,8 +490,6 @@ def main(args): # Empty CUDA cache torch.cuda.empty_cache() - CHECKPOINT_filename = args.resume if args.resume else "checkpoint.pth.tar" - # Install signal handler signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) signal.signal(signal.SIGTERM, SIGTERM_handler) @@ -538,15 +535,13 @@ def main(args): char_blank = "*" char_space = " " char_apostrophe = "'" - labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) - vocab_size = language_model.length + training, validation, _ = datasets_librispeech(transforms, language_model) - num_features = args.n_bins model = Wav2Letter( - num_classes=vocab_size, input_type="mfcc", num_features=num_features + num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins ) if args.jit: @@ -573,10 +568,7 @@ def main(args): "weight_decay": args.weight_decay, } - Optimizer = SGD - optimizer_params = optimizer_params - - optimizer = Optimizer(model.parameters(), **optimizer_params) + optimizer = SGD(model.parameters(), **optimizer_params) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) @@ -586,8 +578,6 @@ def main(args): # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() - best_loss = 1.0 - loader_training = DataLoader( training, batch_size=args.batch_size, @@ -601,9 +591,11 @@ def main(args): **loader_validation_params, ) - if args.resume and os.path.isfile(CHECKPOINT_filename): - print("Checkpoint: loading '{}'".format(CHECKPOINT_filename)) - checkpoint = torch.load(CHECKPOINT_filename) + best_loss = 1.0 + + if args.checkpoint and os.path.isfile(args.checkpoint): + print("Checkpoint: loading '{}'".format(args.checkpoint)) + checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] best_loss = checkpoint["best_loss"] @@ -614,7 +606,7 @@ def main(args): print( "Checkpoint: loaded '{}' at epoch {}".format( - CHECKPOINT_filename, checkpoint["epoch"] + args.checkpoint, checkpoint["epoch"] ) ) else: @@ -629,7 +621,7 @@ def main(args): "scheduler": scheduler.state_dict(), }, False, - CHECKPOINT_filename, + args.checkpoint, ) with tqdm(total=args.epochs, unit_scale=1, disable=not args.progress_bar) as pbar: @@ -657,7 +649,7 @@ def main(args): "scheduler": scheduler.state_dict(), }, False, - CHECKPOINT_filename, + args.checkpoint, ) if not epoch % args.print_freq or epoch == args.epochs - 1: @@ -682,7 +674,7 @@ def main(args): "scheduler": scheduler.state_dict(), }, is_best, - CHECKPOINT_filename, + args.checkpoint, ) From 9ed1ccebeeab32c678a6bff0234a575bb808e860 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 08:43:20 -0700 Subject: [PATCH 028/129] sum cer and 
wer --- examples/pipeline/wav2letter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index b1029dca06..b30124f301 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -456,7 +456,7 @@ def evaluate( cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] - cers = statistics.mean(cers) + cers = sum(cers) sums["cer"] += cers output = [o.split(language_model.char_space) for o in output] @@ -464,7 +464,7 @@ def evaluate( wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # wers_normalized = [d / len(a) for a, d in zip(target, wers)] - wers = statistics.mean(wers) + wers = sum(wers) sums["wer"] += wers if SIGNAL_RECEIVED: @@ -475,10 +475,7 @@ def evaluate( sums[k] /= len(data_loader) print(f"Validation loss: {sums['loss']:.5f}", flush=True) - print( - f"CER: {sums['cer']} WER: {sums['wer']} CERN: {sums['cern']} WERN: {sums['wern']}", - flush=True, - ) + print(f"CER: {sums['cer']} WER: {sums['wer']}", flush=True) return sums["loss"] From 552a1a92be8c2b786d4af151d2cba89f668a1ba2 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 08:51:24 -0700 Subject: [PATCH 029/129] clip norm. --- examples/pipeline/wav2letter.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index b30124f301..ec3090cb40 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -102,6 +102,7 @@ def parse_args(): ) parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) + parser.add_argument("--clip-norm", metavar="NORM", type=float, default=0.0) parser.add_argument( "--dataset", @@ -116,8 +117,6 @@ def parse_args(): args = parser.parse_args() - args.clip_norm = 0.0 - return args @@ -374,7 +373,7 @@ def train_one_epoch( model.train() - sum_loss = 0.0 + sums = defaultdict(lambda: 0.0) for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 ): @@ -392,11 +391,16 @@ def train_one_epoch( # target_lengths: batch size loss = criterion(outputs, targets, tensors_lengths, target_lengths) - sum_loss += loss.item() + sums["loss"] += loss.item() optimizer.zero_grad() loss.backward() + if args.clip_norm > 0: + sums["gradient"] += torch.nn.utils.clip_grad_norm_( + model.parameters(), args.clip_norm + ) + optimizer.step() if SIGNAL_RECEIVED: @@ -405,9 +409,13 @@ def train_one_epoch( if pbar is not None: pbar.update(1 / len(data_loader)) - # Average loss - sum_loss = sum_loss / len(data_loader) - print(f"Training loss: {sum_loss:4.5f}", flush=True) + # Average + for k in sums.keys(): + sums[k] /= len(data_loader) + + print(f"Training loss: {sums['loss']:4.5f}", flush=True) + if "gradient" in sums: + print(f"Average gradient norm: {sums['gradient']:4.5f}", flush=True) scheduler.step() @@ -470,7 +478,7 @@ def evaluate( if SIGNAL_RECEIVED: break - # Average loss + # Average for k in sums.keys(): sums[k] /= len(data_loader) From c8cc7d7ec317161b96da8f696ec5d52d0a53502b Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 08:53:10 -0700 Subject: [PATCH 030/129] second signal handler removed. 
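With the SIGTERM handler gone, only SIGUSR1 (the scheduler's preemption warning) is caught. A minimal sketch of the remaining single-handler setup, assuming a Unix host:

    import signal

    SIGNAL_RECEIVED = False

    def signal_handler(signum, frame):
        global SIGNAL_RECEIVED
        SIGNAL_RECEIVED = True  # polled by the training loop to checkpoint early

    signal.signal(signal.SIGUSR1, signal_handler)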
--- examples/pipeline/wav2letter.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index ec3090cb40..a79e28a923 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -120,11 +120,6 @@ def parse_args(): return args -def SIGTERM_handler(a, b): - print("Received sigterm") - pass - - def signal_handler(a, b): global SIGNAL_RECEIVED print("Signal received", a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) @@ -497,7 +492,6 @@ def main(args): # Install signal handler signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) - signal.signal(signal.SIGTERM, SIGTERM_handler) audio_backend = "soundfile" torchaudio.set_audio_backend(audio_backend) From cdb8f8e9f65857145fd51109715d43068672a957 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 08:53:26 -0700 Subject: [PATCH 031/129] cosmetic. --- examples/pipeline/wav2letter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index a79e28a923..34f22a52ca 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -492,8 +492,9 @@ def main(args): # Install signal handler signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) - audio_backend = "soundfile" - torchaudio.set_audio_backend(audio_backend) + + # Change backend + torchaudio.set_audio_backend("soundfile") device = "cuda" if torch.cuda.is_available() else "cpu" # num_devices = torch.cuda.device_count() From 2cd49b196991bc0abc32386f846799798a472904 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 15:08:35 -0700 Subject: [PATCH 032/129] default to no checkpoint. --- examples/pipeline/wav2letter.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 34f22a52ca..8239d1d87d 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -37,7 +37,7 @@ def parse_args(): ) parser.add_argument( "--checkpoint", - default="checkpoint.pth.tar", + default="", type=str, metavar="PATH", help="path to latest checkpoint", @@ -132,6 +132,10 @@ def save_checkpoint(state, is_best, filename): then copy it to filename, in case the signal interrupts the torch.save() process. """ + + if filename == "": + return + tempfile = filename + ".temp" # Remove tempfile, in case the signal arrives in the From db943fc0ae17068b3907c6d774254d3880b36afa Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 15:09:17 -0700 Subject: [PATCH 033/129] remove non_blocking. 
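The non_blocking flag is no longer threaded through as a parameter; device copies now hard-code non_blocking=True. Such copies only overlap with host work when the source lives in page-locked memory; a minimal sketch, assuming a CUDA device is available:

    import torch

    x = torch.randn(32, 13, 100)
    if torch.cuda.is_available():
        x = x.pin_memory()                   # page-locked memory enables async copies
        y = x.to("cuda", non_blocking=True)  # returns before the copy finishes

In a DataLoader, pin_memory=True arranges the same thing for each batch.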
--- examples/pipeline/wav2letter.py | 35 +++++++++++---------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 8239d1d87d..fe92059439 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -247,18 +247,18 @@ def __next__(self): return self.process_datapoint(item) def process_datapoint(self, item): - transformed = item[0] # .to(device, non_blocking=non_blocking) + transformed = item[0] # .to(device) target = item[2].lower() transformed = self.transforms(transformed) transformed = transformed[0, ...].transpose(0, -1) - target = " " + target + " " + # target = " " + target + " " target = self.encode(target) target = torch.tensor(target, dtype=torch.long, device=transformed.device) - transformed = transformed # .to("cpu") - target = target # .to("cpu") + # transformed = transformed.to("cpu") + # target = target.to("cpu") return transformed, target @@ -360,14 +360,7 @@ def count_parameters(model): def train_one_epoch( - model, - criterion, - optimizer, - scheduler, - data_loader, - device, - pbar=None, - non_blocking=False, + model, criterion, optimizer, scheduler, data_loader, device, pbar=None, ): model.train() @@ -377,8 +370,8 @@ def train_one_epoch( data_loader, maxsize=2 ): - inputs = inputs.to(device, non_blocking=non_blocking) - targets = targets.to(device, non_blocking=non_blocking) + inputs = inputs.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) # keep batch first for data parallel outputs = model(inputs).transpose(-1, -2).transpose(0, 1) @@ -419,9 +412,7 @@ def train_one_epoch( scheduler.step() -def evaluate( - model, criterion, data_loader, decoder, language_model, device, non_blocking=False -): +def evaluate(model, criterion, data_loader, decoder, language_model, device): with torch.no_grad(): @@ -433,8 +424,8 @@ def evaluate( data_loader, maxsize=2 ): - inputs = inputs.to(device, non_blocking=non_blocking) - targets = targets.to(device, non_blocking=non_blocking) + inputs = inputs.to(device, non_blocking=True) + targets = targets.to(device, non_blocking=True) # keep batch first for data parallel outputs = model(inputs).transpose(-1, -2).transpose(0, 1) @@ -512,8 +503,6 @@ def main(args): loader_validation_params = loader_training_params.copy() loader_validation_params["shuffle"] = False - non_blocking = True - # audio melkwargs = { @@ -558,7 +547,7 @@ def main(args): model = torch.nn.parallel.DistributedDataParallel(model) # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) - model = model.to(device, non_blocking=non_blocking) + model = model.to(device, non_blocking=True) n = count_parameters(model) print(f"Number of parameters: {n}", flush=True) @@ -640,7 +629,6 @@ def main(args): loader_training, device, pbar=pbar, - non_blocking=non_blocking, ) if SIGNAL_RECEIVED: @@ -664,7 +652,6 @@ def main(args): greedy_decode, language_model, device, - non_blocking=non_blocking, ) is_best = sum_loss < best_loss From 8ecdef12e855bf406142c3d11c6fa8769ed844cf Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 15:15:00 -0700 Subject: [PATCH 034/129] adadelta works better than sgd. 
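A minimal sketch of the optimizer swap with the script's flag defaults (lr 1.0, eps 1e-8, rho 0.95, weight decay 1e-5, gamma 0.96):

    import torch

    model = torch.nn.Linear(13, 29)  # stand-in for the acoustic model
    optimizer = torch.optim.Adadelta(
        model.parameters(), lr=1.0, eps=1e-8, rho=0.95, weight_decay=1e-5
    )
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.96)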
--- examples/pipeline/wav2letter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index fe92059439..686b1cefe2 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -556,12 +556,12 @@ def main(args): optimizer_params = { "lr": args.learning_rate, - # "eps": args.eps, - # "rho": args.rho, + "eps": args.eps, + "rho": args.rho, "weight_decay": args.weight_decay, } - optimizer = SGD(model.parameters(), **optimizer_params) + optimizer = Adadelta(model.parameters(), **optimizer_params) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) From 6b6cccbfed2e7c329088c58bef5f47edad61c6cd Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 28 May 2020 15:16:29 -0700 Subject: [PATCH 035/129] anomaly detection. --- examples/pipeline/wav2letter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 686b1cefe2..fcbe348429 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -366,6 +366,7 @@ def train_one_epoch( model.train() sums = defaultdict(lambda: 0.0) + for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 ): @@ -571,6 +572,8 @@ def main(args): # criterion = nn.MSELoss() # criterion = torch.nn.NLLLoss() + torch.autograd.set_detect_anomaly(False) + loader_training = DataLoader( training, batch_size=args.batch_size, From 0e250b366293842e58b1abc2213b139480356878 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 07:44:22 -0700 Subject: [PATCH 036/129] moving dataset to separate file. --- examples/pipeline/datasets.py | 111 ++++++++++++++++++++++++++++++++ examples/pipeline/wav2letter.py | 110 +------------------------------ 2 files changed, 113 insertions(+), 108 deletions(-) create mode 100644 examples/pipeline/datasets.py diff --git a/examples/pipeline/datasets.py b/examples/pipeline/datasets.py new file mode 100644 index 0000000000..3bb60096c1 --- /dev/null +++ b/examples/pipeline/datasets.py @@ -0,0 +1,111 @@ +import itertools + +import torch +from torchaudio.datasets import LIBRISPEECH + + +class IterableMemoryCache: + def __init__(self, iterable): + self.iterable = iterable + self._iter = iter(iterable) + self._done = False + self._values = [] + + def __iter__(self): + if self._done: + return iter(self._values) + return itertools.chain(self._values, self._gen_iter()) + + def _gen_iter(self): + for new_value in self._iter: + self._values.append(new_value) + yield new_value + self._done = True + + def __len__(self): + return len(self._iterable) + + +class MapMemoryCache(torch.utils.data.Dataset): + """ + Wrap a dataset so that, whenever a new item is returned, it is saved to memory. 
+ """ + + def __init__(self, dataset): + self.dataset = dataset + self._cache = [None] * len(dataset) + + def __getitem__(self, n): + if self._cache[n] is not None: + return self._cache[n] + + item = self.dataset[n] + self._cache[n] = item + + return item + + def __len__(self): + return len(self.dataset) + + +class ProcessedLIBRISPEECH(LIBRISPEECH): + def __init__(self, transforms, encode, *args, **kwargs): + self.transforms = transforms + self.encode = encode + super().__init__(*args, **kwargs) + + def __getitem__(self, key): + item = super().__getitem__(key) + return self.process_datapoint(item) + + def __next__(self): + item = super().__next__() + return self.process_datapoint(item) + + def process_datapoint(self, item): + transformed = item[0] # .to(device) + target = item[2].lower() + + transformed = self.transforms(transformed) + transformed = transformed[0, ...].transpose(0, -1) + + # target = " " + target + " " + target = self.encode(target) + target = torch.tensor(target, dtype=torch.long, device=transformed.device) + + # transformed = transformed.to("cpu") + # target = target.to("cpu") + return transformed, target + + +def datasets_librispeech( + transforms, + language_model, + root="/datasets01/", + folder_in_archive="librispeech/062419/", +): + def create(tag): + + if isinstance(tag, str): + tag = [tag] + + data = torch.utils.data.ConcatDataset( + [ + ProcessedLIBRISPEECH( + transforms, + language_model.encode, + root, + t, + folder_in_archive=folder_in_archive, + download=False, + ) + for t in tag + ] + ) + + # data = diskcache_iterator(data) + data = MapMemoryCache(data) + return data + + return create("train-clean-100"), create("dev-clean"), None + # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index fcbe348429..bd7689d0ea 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -16,12 +16,13 @@ from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader -from torchaudio.datasets import LIBRISPEECH from torchaudio.datasets.utils import bg_iterator, diskcache_iterator from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample from tqdm import tqdm +from .datasets import datasets_librispeech + SIGNAL_RECEIVED = False @@ -188,113 +189,6 @@ def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 -class IterableMemoryCache: - def __init__(self, iterable): - self.iterable = iterable - self._iter = iter(iterable) - self._done = False - self._values = [] - - def __iter__(self): - if self._done: - return iter(self._values) - return itertools.chain(self._values, self._gen_iter()) - - def _gen_iter(self): - for new_value in self._iter: - self._values.append(new_value) - yield new_value - self._done = True - - def __len__(self): - return len(self._iterable) - - -class MapMemoryCache(torch.utils.data.Dataset): - """ - Wrap a dataset so that, whenever a new item is returned, it is saved to memory. 
- """ - - def __init__(self, dataset): - self.dataset = dataset - self._cache = [None] * len(dataset) - - def __getitem__(self, n): - if self._cache[n] is not None: - return self._cache[n] - - item = self.dataset[n] - self._cache[n] = item - - return item - - def __len__(self): - return len(self.dataset) - - -class ProcessedLIBRISPEECH(LIBRISPEECH): - def __init__(self, transforms, encode, *args, **kwargs): - self.transforms = transforms - self.encode = encode - super().__init__(*args, **kwargs) - - def __getitem__(self, key): - item = super().__getitem__(key) - return self.process_datapoint(item) - - def __next__(self): - item = super().__next__() - return self.process_datapoint(item) - - def process_datapoint(self, item): - transformed = item[0] # .to(device) - target = item[2].lower() - - transformed = self.transforms(transformed) - transformed = transformed[0, ...].transpose(0, -1) - - # target = " " + target + " " - target = self.encode(target) - target = torch.tensor(target, dtype=torch.long, device=transformed.device) - - # transformed = transformed.to("cpu") - # target = target.to("cpu") - return transformed, target - - -def datasets_librispeech( - transforms, - language_model, - root="/datasets01/", - folder_in_archive="librispeech/062419/", -): - def create(tag): - - if isinstance(tag, str): - tag = [tag] - - data = torch.utils.data.ConcatDataset( - [ - ProcessedLIBRISPEECH( - transforms, - language_model.encode, - root, - t, - folder_in_archive=folder_in_archive, - download=False, - ) - for t in tag - ] - ) - - # data = diskcache_iterator(data) - data = MapMemoryCache(data) - return data - - return create("train-clean-100"), create("dev-clean"), None - # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None - - def greedy_decode(outputs): """Greedy Decoder. Returns highest probability of class labels for each timestep From f791ec5cb18fd19ba521979c5145f88d35ef5056 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 07:44:49 -0700 Subject: [PATCH 037/129] lint. --- examples/pipeline/wav2letter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index bd7689d0ea..4a2ab37ac1 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -4,7 +4,6 @@ import os import shutil import signal -import statistics import string from collections import defaultdict from datetime import datetime @@ -567,5 +566,6 @@ def main(args): if __name__ == "__main__": + args = parse_args() main(args) From 2fb5097b320623a487cfb603fb9ee0e8b2696e39 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 08:27:00 -0700 Subject: [PATCH 038/129] move to separate module: languagemodel, decoder, metric. --- examples/pipeline/decoders.py | 14 +++++ examples/pipeline/languagemodels.py | 35 ++++++++++++ examples/pipeline/metrics.py | 28 ++++++++++ examples/pipeline/wav2letter.py | 82 ++--------------------------- 4 files changed, 82 insertions(+), 77 deletions(-) create mode 100644 examples/pipeline/decoders.py create mode 100644 examples/pipeline/languagemodels.py create mode 100644 examples/pipeline/metrics.py diff --git a/examples/pipeline/decoders.py b/examples/pipeline/decoders.py new file mode 100644 index 0000000000..6f9d4a22d2 --- /dev/null +++ b/examples/pipeline/decoders.py @@ -0,0 +1,14 @@ +from torch import topk + + +def greedy_decode(outputs): + """Greedy Decoder. 
Returns highest probability of class labels for each timestep + + Args: + outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) + + Returns: + torch.Tensor: class labels per time step. + """ + _, indices = topk(outputs, k=1, dim=-1) + return indices[..., 0] diff --git a/examples/pipeline/languagemodels.py b/examples/pipeline/languagemodels.py new file mode 100644 index 0000000000..fefff55839 --- /dev/null +++ b/examples/pipeline/languagemodels.py @@ -0,0 +1,35 @@ +import collections +import itertools + + +class LanguageModel: + def __init__(self, labels, char_blank, char_space): + + self.char_space = char_space + self.char_blank = char_blank + + labels = [l for l in labels] + self.length = len(labels) + enumerated = list(enumerate(labels)) + flipped = [(sub[1], sub[0]) for sub in enumerated] + + d1 = collections.OrderedDict(enumerated) + d2 = collections.OrderedDict(flipped) + self.mapping = {**d1, **d2} + + def encode(self, iterable): + if isinstance(iterable, list): + return [self.encode(i) for i in iterable] + else: + return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] + + def decode(self, tensor): + if isinstance(tensor[0], list): + return [self.decode(t) for t in tensor] + else: + # not idempotent, since clean string + x = (self.mapping[i] for i in tensor) + x = "".join(i for i, _ in itertools.groupby(x)) + x = x.replace(self.char_blank, "") + # x = x.strip() + return x diff --git a/examples/pipeline/metrics.py b/examples/pipeline/metrics.py new file mode 100644 index 0000000000..6faede3969 --- /dev/null +++ b/examples/pipeline/metrics.py @@ -0,0 +1,28 @@ +from typing import Optional + +import torch + + +def levenshtein_distance(r: str, h: str, device: Optional[str] = None): + + # initialisation + d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) + dold = 0 + dnew = 1 + + # computation + for i in range(1, len(r) + 1): + d[dnew, 0] = 0 + for j in range(1, len(h) + 1): + + if r[i - 1] == h[j - 1]: + d[dnew, j] = d[dnew - 1, j - 1] + else: + substitution = d[dnew - 1, j - 1] + 1 + insertion = d[dnew, j - 1] + 1 + deletion = d[dnew - 1, j] + 1 + d[dnew, j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + return d[dnew, -1].item() diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 4a2ab37ac1..ac9c4948f3 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,17 +1,13 @@ import argparse -import collections -import itertools import os import shutil import signal import string from collections import defaultdict from datetime import datetime -from typing import Optional import torch import torchaudio -from torch import nn, topk from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader @@ -21,6 +17,9 @@ from tqdm import tqdm from .datasets import datasets_librispeech +from .decoders import greedy_decode +from .languagemodels import LanguageModel +from .metrics import levenshtein_distance SIGNAL_RECEIVED = False @@ -151,81 +150,10 @@ def save_checkpoint(state, is_best, filename): print("Checkpoint: saved") -class LanguageModel: - def __init__(self, labels, char_blank, char_space): - - self.char_space = char_space - self.char_blank = char_blank - - labels = [l for l in labels] - self.length = len(labels) - enumerated = list(enumerate(labels)) - flipped = [(sub[1], sub[0]) for sub in enumerated] - - d1 = 
collections.OrderedDict(enumerated) - d2 = collections.OrderedDict(flipped) - self.mapping = {**d1, **d2} - - def encode(self, iterable): - if isinstance(iterable, list): - return [self.encode(i) for i in iterable] - else: - return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] - - def decode(self, tensor): - if isinstance(tensor[0], list): - return [self.decode(t) for t in tensor] - else: - # not idempotent, since clean string - x = (self.mapping[i] for i in tensor) - x = "".join(i for i, _ in itertools.groupby(x)) - x = x.replace(self.char_blank, "") - # x = x.strip() - return x - - def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 -def greedy_decode(outputs): - """Greedy Decoder. Returns highest probability of class labels for each timestep - - Args: - outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) - - Returns: - torch.Tensor: class labels per time step. - """ - _, indices = topk(outputs, k=1, dim=-1) - return indices[..., 0] - - -def levenshtein_distance(r: str, h: str, device: Optional[str] = None): - - # initialisation - d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) - dold = 0 - dnew = 1 - - # computation - for i in range(1, len(r) + 1): - d[dnew, 0] = 0 - for j in range(1, len(h) + 1): - - if r[i - 1] == h[j - 1]: - d[dnew, j] = d[dnew - 1, j - 1] - else: - substitution = d[dnew - 1, j - 1] + 1 - insertion = d[dnew, j - 1] + 1 - deletion = d[dnew - 1, j] + 1 - d[dnew, j] = min(substitution, insertion, deletion) - - dnew, dold = dold, dnew - - return d[dnew, -1].item() - - def collate_fn(batch): tensors = [b[0] for b in batch if b] @@ -407,7 +335,7 @@ def main(args): sample_rate_original = 16000 - transforms = nn.Sequential( + transforms = torch.nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs), torchaudio.transforms.MelSpectrogram( @@ -462,7 +390,7 @@ def main(args): criterion = torch.nn.CTCLoss( blank=language_model.mapping[char_blank], zero_infinity=False ) - # criterion = nn.MSELoss() + # criterion = torch.nn.MSELoss() # criterion = torch.nn.NLLLoss() torch.autograd.set_detect_anomaly(False) From 9ca6f1d2c9abf7e8cfa95b6d721219429a636a6c Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 08:50:29 -0700 Subject: [PATCH 039/129] flush=True. --- examples/pipeline/wav2letter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index ac9c4948f3..8374365bda 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -411,7 +411,7 @@ def main(args): best_loss = 1.0 if args.checkpoint and os.path.isfile(args.checkpoint): - print("Checkpoint: loading '{}'".format(args.checkpoint)) + print("Checkpoint: loading '{}'".format(args.checkpoint), flush=True) checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] @@ -424,10 +424,11 @@ def main(args): print( "Checkpoint: loaded '{}' at epoch {}".format( args.checkpoint, checkpoint["epoch"] - ) + ), + flush=True, ) else: - print("Checkpoint: not found") + print("Checkpoint: not found", flush=True) save_checkpoint( { From f91f77faf5218b8ec7c8482725d7a6be6687a0ca Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 13:30:19 -0700 Subject: [PATCH 040/129] renaming decoder. 
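Both decoders now live behind callable classes. Greedy decoding just takes the arg-max class per frame; a usage sketch assuming (input length, batch, classes) CTC posteriors:

    import torch

    outputs = torch.randn(50, 2, 29).log_softmax(-1)            # (time, batch, classes)
    indices = torch.topk(outputs, k=1, dim=-1).indices[..., 0]  # best class per frame
    assert indices.shape == (50, 2)

Collapsing repeats and stripping blanks is left to LanguageModel.decode.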
--- examples/pipeline/decoders.py | 143 ++++++++++++++++++++++++++++++-- examples/pipeline/wav2letter.py | 6 +- 2 files changed, 138 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/decoders.py b/examples/pipeline/decoders.py index 6f9d4a22d2..4c3865f2f6 100644 --- a/examples/pipeline/decoders.py +++ b/examples/pipeline/decoders.py @@ -1,14 +1,139 @@ +from collections import Counter + +import torch from torch import topk -def greedy_decode(outputs): - """Greedy Decoder. Returns highest probability of class labels for each timestep +class GreedyDecoder: + def __call__(self, outputs): + """Greedy Decoder. Returns highest probability of class labels for each timestep + + Args: + outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) + + Returns: + torch.Tensor: class labels per time step. + """ + _, indices = topk(outputs, k=1, dim=-1) + return indices[..., 0] + + +class ViterbiDecoder: + def __init__(self, data_loader, vocab_size, n=2): + self.vocab_size = vocab_size + self.n = n + self._build_transitions(data_loader) + self.top_k = 1 + + def _build_transitions(self, data_loader): + + # Count n-grams + + c = Counter() + + for _, label in data_loader: + count = zip([label[i:].item() for i in range(self.n)]) + count = Counter(*count) + c += count + + # Encode as transition matrix + + ind = torch.tensor(list(zip(*(a for (a, _) in c.items())))) + val = torch.tensor((b for (_, b) in c.items()), dtype=torch.float) + + transitions = ( + torch.sparse_coo_tensor( + indices=ind, values=val, size=[self.vocab_size, self.vocab_size] + ) + .coalesce() + .to_dense() + ) + transitions = transitions / torch.max( + torch.tensor(1.0), transitions.max(dim=1)[0] + ).unsqueeze(1) + + self.transitions = transitions + return transitions + + def _viterbi_decode(self, tag_sequence: torch.Tensor): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + + Parameters + ---------- + tag_sequence : torch.Tensor, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + + Returns + ------- + viterbi_path : List[int] + The tag indices of the maximum likelihood tag sequence. + viterbi_score : float + The score of the viterbi path. + """ + sequence_length, num_tags = tag_sequence.size() + + path_scores = [] + path_indices = [] + # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) + # to allow for 1 permutation. + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + # assert path_scores[0].size() == (n_permutations, num_tags) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) + summed_potentials = ( + path_scores[timestep - 1].unsqueeze(2) + self.transition_matrix + ) + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. 
+ max_k = min(summed_potentials.size()[0], self.top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + # assert scores.size() == (n_permutations, num_tags) + # assert paths.size() == (n_permutations, num_tags) + + scores = tag_sequence[timestep, :] + scores + # assert scores.size() == (n_permutations, num_tags) + path_scores.append(scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores = path_scores[-1].view(-1) + max_k = min(path_scores.size()[0], self.top_k) + viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) + + viterbi_paths = [] + for i in range(max_k): + + viterbi_path = [best_paths[i].item()] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + + # Reverse the backward path. + viterbi_path.reverse() + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. + viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + return viterbi_paths, viterbi_scores + + def __call__(self, tag_sequence: torch.Tensor): - Args: - outputs (torch.Tensor): shape (input length, batch size, number of classes (including blank)) + outputs = [] + scores = [] + for i in range(tag_sequence.shape[1]): + paths, score = self._viterbi_decode( + tag_sequence[:, i, :], self._transitions + ) + outputs.append(paths) + scores.append(score) - Returns: - torch.Tensor: class labels per time step. - """ - _, indices = topk(outputs, k=1, dim=-1) - return indices[..., 0] + return torch.tensor(outputs).transpose(0, -1), torch.cat(scores)[:, 0, :] diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 8374365bda..f6dd3c0507 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -17,7 +17,7 @@ from tqdm import tqdm from .datasets import datasets_librispeech -from .decoders import greedy_decode +from .decoders import GreedyDecoder from .languagemodels import LanguageModel from .metrics import levenshtein_distance @@ -355,6 +355,8 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) + decoder = GreedyDecoder() + model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins ) @@ -474,7 +476,7 @@ def main(args): model, criterion, loader_validation, - greedy_decode, + decoder, language_model, device, ) From 5b3ef99a647b7e1ba27537f4cd83220062229130 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 13:44:55 -0700 Subject: [PATCH 041/129] CTC Decoders. 
--- examples/pipeline/{decoders.py => ctc_decoders.py} | 0 examples/pipeline/wav2letter.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename examples/pipeline/{decoders.py => ctc_decoders.py} (100%) diff --git a/examples/pipeline/decoders.py b/examples/pipeline/ctc_decoders.py similarity index 100% rename from examples/pipeline/decoders.py rename to examples/pipeline/ctc_decoders.py diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index f6dd3c0507..56cb40163a 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -16,8 +16,8 @@ from torchaudio.transforms import MFCC, Resample from tqdm import tqdm +from .ctc_decoders import GreedyDecoder from .datasets import datasets_librispeech -from .decoders import GreedyDecoder from .languagemodels import LanguageModel from .metrics import levenshtein_distance From 620e65daba647940b67393ad3392739f5af779d3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 13:45:12 -0700 Subject: [PATCH 042/129] flush=True. --- examples/pipeline/wav2letter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 56cb40163a..9ff9b64969 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -147,7 +147,7 @@ def save_checkpoint(state, is_best, filename): os.rename(tempfile, filename) if is_best: shutil.copyfile(filename, "model_best.pth.tar") - print("Checkpoint: saved") + print("Checkpoint: saved", flush=True) def model_length_function(tensor): From 8887c865bd460660ac39ea9c069448cbeb56d29d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 13:56:18 -0700 Subject: [PATCH 043/129] pass length for viterbi decoder. 
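LanguageModel gains __len__, so callers can pass len(language_model) as the vocabulary size. A usage sketch, assuming languagemodels.py is on the import path and the script's label set:

    import string

    from languagemodels import LanguageModel

    labels = "*" + " " + "'" + string.ascii_lowercase  # blank, space, apostrophe, a-z
    lm = LanguageModel(labels, char_blank="*", char_space=" ")
    assert len(lm) == 29
    assert lm.decode(lm.encode("hi")) == "hi"  # decode collapses repeats, strips blanks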
--- examples/pipeline/ctc_decoders.py | 1 - examples/pipeline/languagemodels.py | 3 +++ examples/pipeline/wav2letter.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index 4c3865f2f6..3c173644d5 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -53,7 +53,6 @@ def _build_transitions(self, data_loader): ).unsqueeze(1) self.transitions = transitions - return transitions def _viterbi_decode(self, tag_sequence: torch.Tensor): """ diff --git a/examples/pipeline/languagemodels.py b/examples/pipeline/languagemodels.py index fefff55839..7011b2230e 100644 --- a/examples/pipeline/languagemodels.py +++ b/examples/pipeline/languagemodels.py @@ -33,3 +33,6 @@ def decode(self, tensor): x = x.replace(self.char_blank, "") # x = x.strip() return x + + def __len__(self): + return self.length diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 9ff9b64969..2b54e7865f 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -16,7 +16,7 @@ from torchaudio.transforms import MFCC, Resample from tqdm import tqdm -from .ctc_decoders import GreedyDecoder +from .ctc_decoders import GreedyDecoder, ViterbiDecoder from .datasets import datasets_librispeech from .languagemodels import LanguageModel from .metrics import levenshtein_distance @@ -356,6 +356,7 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) decoder = GreedyDecoder() + # decoder = ViterbiDecoder(training, len(language_model)) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From c53301da41c735c517df07343dae6250d8ec60c8 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 5 Jun 2020 14:30:06 -0700 Subject: [PATCH 044/129] progress bar. relative path. 
--- examples/pipeline/ctc_decoders.py | 9 ++++++--- examples/pipeline/wav2letter.py | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index 3c173644d5..e38b59789b 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -1,3 +1,4 @@ +from tqdm import tqdm from collections import Counter import torch @@ -19,11 +20,13 @@ def __call__(self, outputs): class ViterbiDecoder: - def __init__(self, data_loader, vocab_size, n=2): + def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): self.vocab_size = vocab_size self.n = n - self._build_transitions(data_loader) self.top_k = 1 + self.progress_bar = progress_bar + + self._build_transitions(data_loader) def _build_transitions(self, data_loader): @@ -31,7 +34,7 @@ def _build_transitions(self, data_loader): c = Counter() - for _, label in data_loader: + for _, label in tqdm(data_loader, disable=not self.progress_bar): count = zip([label[i:].item() for i in range(self.n)]) count = Counter(*count) c += count diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 2b54e7865f..a0dd30fa29 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -16,10 +16,10 @@ from torchaudio.transforms import MFCC, Resample from tqdm import tqdm -from .ctc_decoders import GreedyDecoder, ViterbiDecoder -from .datasets import datasets_librispeech -from .languagemodels import LanguageModel -from .metrics import levenshtein_distance +from ctc_decoders import GreedyDecoder, ViterbiDecoder +from datasets import datasets_librispeech +from languagemodels import LanguageModel +from metrics import levenshtein_distance SIGNAL_RECEIVED = False From a0c144ebe047ae21fcae081cbc4012fac49a64a8 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 8 Jun 2020 10:21:37 -0700 Subject: [PATCH 045/129] generalize transition matrix to n-gram. progress bar. 
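One zip over shifted label slices now counts arbitrary n-grams. A worked bigram sketch, assuming integer label tensors like those the loader yields, followed by the same sparse-to-dense transition build:

    from collections import Counter

    import torch

    label = torch.tensor([3, 4, 4, 5])
    n = 2
    c = Counter(
        tuple(b.item() for b in a)
        for a in zip(*(label[i:] for i in range(n)))
    )
    # c == Counter({(3, 4): 1, (4, 4): 1, (4, 5): 1})

    ind = torch.tensor([a for (a, _) in c.items()]).t()  # (n, distinct n-grams)
    val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float)
    T = torch.sparse_coo_tensor(ind, val, size=[29, 29]).coalesce().to_dense()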
--- examples/pipeline/ctc_decoders.py | 13 +++++++------ examples/pipeline/wav2letter.py | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index e38b59789b..bdfcacbdb5 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -1,8 +1,8 @@ -from tqdm import tqdm from collections import Counter import torch from torch import topk +from tqdm import tqdm class GreedyDecoder: @@ -33,16 +33,17 @@ def _build_transitions(self, data_loader): # Count n-grams c = Counter() - for _, label in tqdm(data_loader, disable=not self.progress_bar): - count = zip([label[i:].item() for i in range(self.n)]) - count = Counter(*count) + count = Counter( + tuple(b.item() for b in a) + for a in zip(*(label[i:] for i in range(self.n))) + ) c += count # Encode as transition matrix - ind = torch.tensor(list(zip(*(a for (a, _) in c.items())))) - val = torch.tensor((b for (_, b) in c.items()), dtype=torch.float) + ind = torch.tensor(list(a for (a, _) in c.items())).t() + val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float) transitions = ( torch.sparse_coo_tensor( diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index a0dd30fa29..a4d7cafb19 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -356,7 +356,7 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) decoder = GreedyDecoder() - # decoder = ViterbiDecoder(training, len(language_model)) + # decoder = ViterbiDecoder(training, len(language_model), progress_bar=args.progress_bar) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From 50fc186a7d36b0f9419d8e1711437169d030f042 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 8 Jun 2020 15:29:35 -0700 Subject: [PATCH 046/129] choice of decoder. 
--- examples/pipeline/ctc_decoders.py | 2 +- examples/pipeline/wav2letter.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index bdfcacbdb5..b24a09b3ff 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -42,7 +42,7 @@ def _build_transitions(self, data_loader): # Encode as transition matrix - ind = torch.tensor(list(a for (a, _) in c.items())).t() + ind = torch.tensor([a for (a, _) in c.items()]).t() val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float) transitions = ( diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index a4d7cafb19..e72a7e4d23 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -64,12 +64,13 @@ def parse_args(): ) parser.add_argument( - "--arch", - metavar="ARCH", - default="wav2letter", - choices=["wav2letter"], - help="model architecture", + "--decoder", + metavar="D", + default="greedy", + choices=["greedy", "viterbi"], + help="decoder to use", ) + parser.add_argument( "--batch-size", default=64, type=int, metavar="N", help="mini-batch size" ) @@ -355,8 +356,12 @@ def main(args): training, validation, _ = datasets_librispeech(transforms, language_model) - decoder = GreedyDecoder() - # decoder = ViterbiDecoder(training, len(language_model), progress_bar=args.progress_bar) + if args.decoder == "greedy": + decoder = GreedyDecoder() + elif args.decoder == "viterbi": + decoder = ViterbiDecoder( + training, len(language_model), progress_bar=args.progress_bar + ) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From 5e6a44a1ec84baf4b3b24db3894ee2c25dd7c0b3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 10 Jun 2020 11:37:13 -0700 Subject: [PATCH 047/129] collate func. 
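collate_factory closes over the model's length function and pads both features and targets into rectangular batches. A worked sketch, assuming each dataset item is a ((time, features) tensor, target-index tensor) pair:

    import torch

    batch = [
        (torch.randn(50, 13), torch.tensor([5, 6])),
        (torch.randn(30, 13), torch.tensor([7])),
    ]

    tensors = torch.nn.utils.rnn.pad_sequence([b[0] for b in batch], batch_first=True)
    tensors = tensors.transpose(1, -1)  # -> (batch, features, time) for the conv stack
    targets = torch.nn.utils.rnn.pad_sequence([b[1] for b in batch], batch_first=True)
    assert tensors.shape == (2, 13, 50) and targets.shape == (2, 2)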
--- examples/pipeline/datasets.py | 27 +++++++++++++++++++++++++++ examples/pipeline/wav2letter.py | 26 +++----------------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/examples/pipeline/datasets.py b/examples/pipeline/datasets.py index 3bb60096c1..fbca2559af 100644 --- a/examples/pipeline/datasets.py +++ b/examples/pipeline/datasets.py @@ -109,3 +109,30 @@ def create(tag): return create("train-clean-100"), create("dev-clean"), None # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None + + +def collate_factory(model_length_function): + def collate_fn(batch): + + tensors = [b[0] for b in batch if b] + + tensors_lengths = torch.tensor( + [model_length_function(t) for t in tensors], + dtype=torch.long, + device=tensors[0].device, + ) + + tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) + tensors = tensors.transpose(1, -1) + + targets = [b[1] for b in batch if b] + target_lengths = torch.tensor( + [target.shape[0] for target in targets], + dtype=torch.long, + device=tensors.device, + ) + targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) + + return tensors, targets, tensors_lengths, target_lengths + + return collate_fn diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index e72a7e4d23..35b1ee7da3 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -17,7 +17,7 @@ from tqdm import tqdm from ctc_decoders import GreedyDecoder, ViterbiDecoder -from datasets import datasets_librispeech +from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance @@ -155,28 +155,6 @@ def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 -def collate_fn(batch): - - tensors = [b[0] for b in batch if b] - - tensors_lengths = torch.tensor( - [model_length_function(t) for t in tensors], - dtype=torch.long, - device=tensors[0].device, - ) - - tensors = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True) - tensors = tensors.transpose(1, -1) - - targets = [b[1] for b in batch if b] - target_lengths = torch.tensor( - [target.shape[0] for target in targets], dtype=torch.long, device=tensors.device - ) - targets = torch.nn.utils.rnn.pad_sequence(targets, batch_first=True) - - return tensors, targets, tensors_lengths, target_lengths - - def count_parameters(model): return sum(p.numel() for p in model.parameters() if p.requires_grad) @@ -403,6 +381,8 @@ def main(args): torch.autograd.set_detect_anomaly(False) + collate_fn = collate_factory(model_length_function) + loader_training = DataLoader( training, batch_size=args.batch_size, From 4c6d87bce7f694c9049c764e677a9df9ac5148f2 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 10 Jun 2020 11:40:00 -0700 Subject: [PATCH 048/129] remove signal handling. 
--- examples/pipeline/wav2letter.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 35b1ee7da3..61522c042f 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -21,8 +21,6 @@ from languagemodels import LanguageModel from metrics import levenshtein_distance -SIGNAL_RECEIVED = False - def parse_args(): parser = argparse.ArgumentParser() @@ -120,12 +118,6 @@ def parse_args(): return args -def signal_handler(a, b): - global SIGNAL_RECEIVED - print("Signal received", a, datetime.now().strftime("%y%m%d.%H%M%S"), flush=True) - SIGNAL_RECEIVED = True - - def save_checkpoint(state, is_best, filename): """ Save the model to a temporary file first, @@ -196,9 +188,6 @@ def train_one_epoch( optimizer.step() - if SIGNAL_RECEIVED: - return - if pbar is not None: pbar.update(1 / len(data_loader)) @@ -266,9 +255,6 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): wers = sum(wers) sums["wer"] += wers - if SIGNAL_RECEIVED: - break - # Average for k in sums.keys(): sums[k] /= len(data_loader) @@ -286,9 +272,6 @@ def main(args): # Empty CUDA cache torch.cuda.empty_cache() - # Install signal handler - signal.signal(signal.SIGUSR1, lambda a, b: signal_handler(a, b)) - # Change backend torchaudio.set_audio_backend("soundfile") @@ -444,18 +427,6 @@ def main(args): pbar=pbar, ) - if SIGNAL_RECEIVED: - save_checkpoint( - { - "epoch": epoch, - "state_dict": model.state_dict(), - "best_loss": best_loss, - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - }, - False, - args.checkpoint, - ) if not epoch % args.print_freq or epoch == args.epochs - 1: sum_loss = evaluate( From bbede944647319b075825d286b2618b414941cde Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 10 Jun 2020 15:14:13 -0700 Subject: [PATCH 049/129] adding distributed. --- examples/pipeline/wav2letter.py | 64 ++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 61522c042f..5e6162b738 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -111,6 +111,11 @@ def parse_args(): parser.add_argument( "--distributed", action="store_true", help="enable DistributedDataParallel" ) + parser.add_argument("--seed", type=int, default=0, help="random seed") + parser.add_argument( + "--world-size", type=int, default=8, help="the world size to initiate DPP" + ) + parser.add_argument("--jit", action="store_true", help="if used, model is jitted") args = parser.parse_args() @@ -118,20 +123,30 @@ def parse_args(): return args -def save_checkpoint(state, is_best, filename): +def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # initialize the process group + torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) + + +def save_checkpoint(state, is_best, filename, rank): """ Save the model to a temporary file first, then copy it to filename, in case the signal interrupts the torch.save() process. 
""" + if rank != 0: + return + if filename == "": return tempfile = filename + ".temp" - # Remove tempfile, in case the signal arrives in the - # middle of copying from tempfile to filename + # Remove tempfile in case interuption during the copying from tempfile to filename if os.path.isfile(tempfile): os.remove(tempfile) @@ -265,9 +280,15 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): return sums["loss"] -def main(args): +def main(args, rank=0): + + if args.distributed: + setup(rank, args.world_size) print("Start time: {}".format(str(datetime.now())), flush=True) + # Explicitly setting seed to make sure that models created in two processes + # start from same random weights and biases. + torch.manual_seed(args.seed) # Empty CUDA cache torch.cuda.empty_cache() @@ -275,8 +296,11 @@ def main(args): # Change backend torchaudio.set_audio_backend("soundfile") - device = "cuda" if torch.cuda.is_available() else "cpu" - # num_devices = torch.cuda.device_count() + if args.distributed: + n = torch.cuda.device_count() // args.world_size + devices = list(range(rank * n, (rank + 1) * n)) + else: + devices = ["cuda" if torch.cuda.is_available() else "cpu"] loader_training_params = { "num_workers": args.workers, @@ -335,10 +359,10 @@ def main(args): model = torch.nn.DataParallel(model) else: model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) - model = model.to(device, non_blocking=True) + model = model.to(devices[0], non_blocking=True) n = count_parameters(model) print(f"Number of parameters: {n}", flush=True) @@ -381,7 +405,12 @@ def main(args): best_loss = 1.0 - if args.checkpoint and os.path.isfile(args.checkpoint): + load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) + + if args.distributed: + torch.distributed.barrier() + + if load_checkpoint: print("Checkpoint: loading '{}'".format(args.checkpoint), flush=True) checkpoint = torch.load(args.checkpoint) @@ -411,8 +440,12 @@ def main(args): }, False, args.checkpoint, + rank, ) + if args.distributed: + torch.distributed.barrier() + with tqdm(total=args.epochs, unit_scale=1, disable=not args.progress_bar) as pbar: for epoch in range(args.start_epoch, args.epochs): @@ -423,7 +456,7 @@ def main(args): optimizer, scheduler, loader_training, - device, + devices[0], pbar=pbar, ) @@ -435,7 +468,7 @@ def main(args): loader_validation, decoder, language_model, - device, + devices[0], ) is_best = sum_loss < best_loss @@ -450,10 +483,17 @@ def main(args): }, is_best, args.checkpoint, + rank, ) + if args.distributed: + torch.distributed.destroy_process_group() + if __name__ == "__main__": args = parse_args() - main(args) + if args.distributed: + torch.multiprocessing.spawn(lambda x: main(args, x), nprocs=args.world_size, join=True) + else: + main(args) From 6a0f12f3741353e8dfdadeea64563aff33fc3fae Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 11 Jun 2020 10:48:22 -0700 Subject: [PATCH 050/129] lint. 
--- examples/pipeline/wav2letter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 5e6162b738..dba9db0dc4 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,7 +1,6 @@ import argparse import os import shutil -import signal import string from collections import defaultdict from datetime import datetime @@ -494,6 +493,8 @@ def main(args, rank=0): args = parse_args() if args.distributed: - torch.multiprocessing.spawn(lambda x: main(args, x), nprocs=args.world_size, join=True) + torch.multiprocessing.spawn( + lambda x: main(args, x), nprocs=args.world_size, join=True + ) else: main(args) From afc9d32e9c3517f36e3b3ea13eb7ba9c9ceecbd1 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 11 Jun 2020 14:48:13 -0700 Subject: [PATCH 051/129] normalize w/r to length of dataset, and w/r to total number characters. --- examples/pipeline/wav2letter.py | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index dba9db0dc4..2f4da7b629 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -205,13 +205,12 @@ def train_one_epoch( if pbar is not None: pbar.update(1 / len(data_loader)) - # Average - for k in sums.keys(): - sums[k] /= len(data_loader) + avg_loss = sums["loss"] / len(data_loader) + print(f"Training loss: {avg_loss:4.5f}", flush=True) - print(f"Training loss: {sums['loss']:4.5f}", flush=True) if "gradient" in sums: - print(f"Average gradient norm: {sums['gradient']:4.5f}", flush=True) + avg_gradient = sums["gradient"] / len(data_loader) + print(f"Average gradient norm: {avg_gradient:4.5f}", flush=True) scheduler.step() @@ -234,6 +233,8 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): # keep batch first for data parallel outputs = model(inputs).transpose(-1, -2).transpose(0, 1) + sums["length_dataset"] += len(inputs) + # CTC # outputs: input length, batch size, number of classes (including blank) # targets: batch size, max target length @@ -260,6 +261,7 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): # cers_normalized = [d / len(a) for a, d in zip(target, cers)] cers = sum(cers) sums["cer"] += cers + sums["total_chars"] += sum(len(t) for t in target) output = [o.split(language_model.char_space) for o in output] target = [o.split(language_model.char_space) for o in target] @@ -268,15 +270,26 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): # wers_normalized = [d / len(a) for a, d in zip(target, wers)] wers = sum(wers) sums["wer"] += wers + sums["total_words"] += len(target) - # Average - for k in sums.keys(): - sums[k] /= len(data_loader) + avg_loss = sums["loss"] / len(data_loader) + print(f"Validation loss: {avg_loss:.5f}", flush=True) - print(f"Validation loss: {sums['loss']:.5f}", flush=True) print(f"CER: {sums['cer']} WER: {sums['wer']}", flush=True) + print( + f"CER: {sums['cer']/sums['length_dataset']} " + f"WER: {sums['wer']/sums['length_dataset']} " + f"(over dataset length)", + flush=True, + ) + print( + f"CER: {sums['cer']/sums['total_chars']} " + f"WER: {sums['wer']/sums['total_words']} " + f"(over total target length)", + flush=True, + ) - return sums["loss"] + return avg_loss def main(args, rank=0): From 9dc45ca42794073b2b7af45c06fdc87dba9ceaf5 Mon Sep 17 00:00:00 2001 From: Vincent 
Quenneville-Belair Date: Fri, 12 Jun 2020 13:09:13 -0700 Subject: [PATCH 052/129] relative cer/wer. --- examples/pipeline/wav2letter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 2f4da7b629..7147b24958 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -260,8 +260,10 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] cers = sum(cers) + n = sum(len(t) for t in target) sums["cer"] += cers - sums["total_chars"] += sum(len(t) for t in target) + sums["cer_relative"] += cers / n + sums["total_chars"] += n output = [o.split(language_model.char_space) for o in output] target = [o.split(language_model.char_space) for o in target] @@ -269,8 +271,10 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # wers_normalized = [d / len(a) for a, d in zip(target, wers)] wers = sum(wers) + n = len(target) sums["wer"] += wers - sums["total_words"] += len(target) + sums["wer_relative"] += wers / n + sums["total_words"] += n avg_loss = sums["loss"] / len(data_loader) print(f"Validation loss: {avg_loss:.5f}", flush=True) From 0bfb559ed9593e8e9feca0fea530e44e62bddaa0 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 24 Jun 2020 11:33:41 -0700 Subject: [PATCH 053/129] clip grad parameter. momentum back but not yet used. --- examples/pipeline/wav2letter.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 7147b24958..03e429ed96 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -93,13 +93,15 @@ def parse_args(): metavar="GAMMA", help="learning rate exponential decay constant", ) - # parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum') + parser.add_argument( + "--momentum", default=0.0, type=float, metavar="M", help="momentum" + ) parser.add_argument( "--weight-decay", default=1e-5, type=float, metavar="W", help="weight decay" ) parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) - parser.add_argument("--clip-norm", metavar="NORM", type=float, default=0.0) + parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) parser.add_argument( "--dataset", @@ -195,9 +197,9 @@ def train_one_epoch( optimizer.zero_grad() loss.backward() - if args.clip_norm > 0: + if args.clip_grad > 0: sums["gradient"] += torch.nn.utils.clip_grad_norm_( - model.parameters(), args.clip_norm + model.parameters(), args.clip_grad ) optimizer.step() From 28c905a5f641b2228c371cfe44abb03d562651d5 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 24 Jun 2020 11:34:16 -0700 Subject: [PATCH 054/129] Switch to SGD. 
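With plain SGD, the exponential scheduler already in place gives an effective learning rate of lr * gamma^epoch. A toy sketch of the pairing, with illustrative hyperparameters and a single dummy parameter in place of the model:

```python
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=1.0, momentum=0.8, weight_decay=1e-5)
scheduler = ExponentialLR(optimizer, gamma=0.96)

for epoch in range(3):
    optimizer.step()   # the training steps for the epoch would go here
    scheduler.step()   # decay once per epoch, after the optimizer
    print(scheduler.get_last_lr())  # [0.96], then [0.9216], then [0.884736]
```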
--- examples/pipeline/wav2letter.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 03e429ed96..9ad54c2a06 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -387,14 +387,20 @@ def main(args, rank=0): # Optimizer - optimizer_params = { - "lr": args.learning_rate, - "eps": args.eps, - "rho": args.rho, - "weight_decay": args.weight_decay, - } - - optimizer = Adadelta(model.parameters(), **optimizer_params) + # optimizer = Adadelta( + # model.parameters(), + # lr=args.learning_rate, + # weight_decay=args.weight_decay, + # momentum=args.momentum, + # eps=args.eps, + # rho=args.rho, + # ) + optimizer = SGD( + model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + momentum=args.momentum, + ) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) From f99eef9da45bf5bf14969b37522c54525c02a1ca Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 25 Jun 2020 15:59:37 -0700 Subject: [PATCH 055/129] choice of optimizer. --- examples/pipeline/wav2letter.py | 49 +++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 9ad54c2a06..199d641325 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -38,7 +38,6 @@ def parse_args(): metavar="PATH", help="path to latest checkpoint", ) - parser.add_argument( "--epochs", default=200, @@ -59,7 +58,6 @@ def parse_args(): parser.add_argument( "--progress-bar", action="store_true", help="use progress bar while training" ) - parser.add_argument( "--decoder", metavar="D", @@ -67,11 +65,9 @@ def parse_args(): choices=["greedy", "viterbi"], help="decoder to use", ) - parser.add_argument( "--batch-size", default=64, type=int, metavar="N", help="mini-batch size" ) - parser.add_argument( "--n-bins", default=13, @@ -79,6 +75,13 @@ def parse_args(): metavar="N", help="number of bins in transforms", ) + parser.add_argument( + "--optimizer", + metavar="OPT", + default="sgd", + choices=["sgd", "adadelta", "adam"], + help="optimizer to use", + ) parser.add_argument( "--learning-rate", default=1.0, @@ -102,7 +105,6 @@ def parse_args(): parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) - parser.add_argument( "--dataset", default="librispeech", @@ -116,7 +118,6 @@ def parse_args(): parser.add_argument( "--world-size", type=int, default=8, help="the world size to initiate DPP" ) - parser.add_argument("--jit", action="store_true", help="if used, model is jitted") args = parser.parse_args() @@ -387,20 +388,28 @@ def main(args, rank=0): # Optimizer - # optimizer = Adadelta( - # model.parameters(), - # lr=args.learning_rate, - # weight_decay=args.weight_decay, - # momentum=args.momentum, - # eps=args.eps, - # rho=args.rho, - # ) - optimizer = SGD( - model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - momentum=args.momentum, - ) + if args.optimizer == "adadelta": + optimizer = Adadelta( + model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + eps=args.eps, + rho=args.rho, + ) + elif args.optimizer == "sgd": + optimizer = SGD( + model.parameters(), + lr=args.learning_rate, + 
momentum=args.momentum, + weight_decay=args.weight_decay, + ) + elif args.optimizer == "adam": + optimizer = Adam( + model.parameters(), + lr=args.learning_rate, + eps=args.eps, + weight_decay=args.weight_decay, + ) scheduler = ExponentialLR(optimizer, gamma=args.gamma) # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) From 9431d55fd80e91c16ca4533454f59fd504d62a11 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 26 Jun 2020 10:33:42 -0700 Subject: [PATCH 056/129] scheduler. --- examples/pipeline/wav2letter.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 199d641325..de39c13789 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -82,6 +82,13 @@ def parse_args(): choices=["sgd", "adadelta", "adam"], help="optimizer to use", ) + parser.add_argument( + "--scheduler", + metavar="S", + default="exponential", + choices=["exponential", "reduceonplateau"], + help="scheduler to use", + ) parser.add_argument( "--learning-rate", default=1.0, @@ -410,8 +417,11 @@ def main(args, rank=0): momentum=args.momentum, weight_decay=args.weight_decay, ) - scheduler = ExponentialLR(optimizer, gamma=args.gamma) - # scheduler = ReduceLROnPlateau(optimizer, patience=2, threshold=1e-3) + + if args.scheduler == "exponential": + scheduler = ExponentialLR(optimizer, gamma=args.gamma) + elif args.scheduler == "reduceonplateau": + scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3) criterion = torch.nn.CTCLoss( blank=language_model.mapping[char_blank], zero_infinity=False From 91e71c1c8366a231a22b5df70a22057978cbc0d8 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 29 Jun 2020 14:36:48 -0700 Subject: [PATCH 057/129] move to utils file. --- examples/pipeline/wav2letter.py | 35 +++------------------------------ 1 file changed, 3 insertions(+), 32 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index de39c13789..bc4d79f391 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,6 +1,5 @@ import argparse import os -import shutil import string from collections import defaultdict from datetime import datetime @@ -19,6 +18,7 @@ from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance +from utils import count_parameters, save_checkpoint def parse_args(): @@ -140,41 +140,10 @@ def setup(rank, world_size): torch.distributed.init_process_group("nccl", rank=rank, world_size=world_size) -def save_checkpoint(state, is_best, filename, rank): - """ - Save the model to a temporary file first, - then copy it to filename, in case the signal interrupts - the torch.save() process. 
- """ - - if rank != 0: - return - - if filename == "": - return - - tempfile = filename + ".temp" - - # Remove tempfile in case interuption during the copying from tempfile to filename - if os.path.isfile(tempfile): - os.remove(tempfile) - - torch.save(state, tempfile) - if os.path.isfile(tempfile): - os.rename(tempfile, filename) - if is_best: - shutil.copyfile(filename, "model_best.pth.tar") - print("Checkpoint: saved", flush=True) - - def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - def train_one_epoch( model, criterion, optimizer, scheduler, data_loader, device, pbar=None, ): @@ -529,6 +498,8 @@ def main(args, rank=0): rank, ) + print("End time: {}".format(str(datetime.now())), flush=True) + if args.distributed: torch.distributed.destroy_process_group() From 9472c22f744eb2816ce4bad4360a4a9f2d4d696a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 29 Jun 2020 15:12:14 -0700 Subject: [PATCH 058/129] metric log, and utils file. --- examples/pipeline/utils.py | 80 +++++++++++++++++++++++++++++++++ examples/pipeline/wav2letter.py | 57 ++++++++++++++++++++--- 2 files changed, 131 insertions(+), 6 deletions(-) create mode 100644 examples/pipeline/utils.py diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py new file mode 100644 index 0000000000..6d1924ac03 --- /dev/null +++ b/examples/pipeline/utils.py @@ -0,0 +1,80 @@ +import csv +import json +import os +import shutil +from collections import defaultdict + +import torch +from tabulate import tabulate + + +class MetricLog: + def __init__(self, d=None, disable=False): + self.disable = disable + self.d = defaultdict(lambda: defaultdict(list)) if d is None else d + + def record(self, group, metric, value, msg=None): + if not self.disable: + + self.d[group][metric].append(value) + if msg is not None: + print(msg, "{: >10}".format(round(value, 5)), flush=True) + + return value + + def print_last_row(self, group=None): + if self.disable: + return + for group in group or self.log: + # print({k: v[-1] for k, v in group.items()}) + print( + tabulate({k: v[-1] for k, v in group.items()}, headers="keys"), + flush=True, + ) + + def write_csv(self, prefix=""): + if self.disable: + return + for group in self.d: + filename = prefix + group + ".csv" + content = tabulate(self.d[group]) + with open(filename, "w") as csvfile: + writer = csv.writer(csvfile) + writer.writerow(content) + + def write_json(self, filename): + if self.disable: + return + with open(filename, "w") as outfile: + json.dump(self.d, outfile) + + +def save_checkpoint(state, is_best, filename, rank): + """ + Save the model to a temporary file first, + then copy it to filename, in case the signal interrupts + the torch.save() process. 
+ """ + + if rank != 0: + return + + if filename == "": + return + + tempfile = filename + ".temp" + + # Remove tempfile in case interuption during the copying from tempfile to filename + if os.path.isfile(tempfile): + os.remove(tempfile) + + torch.save(state, tempfile) + if os.path.isfile(tempfile): + os.rename(tempfile, filename) + if is_best: + shutil.copyfile(filename, "model_best.pth.tar") + print("Checkpoint: saved", flush=True) + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index bc4d79f391..715ffc44d2 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -18,7 +18,7 @@ from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance -from utils import count_parameters, save_checkpoint +from utils import MetricLog, count_parameters, save_checkpoint def parse_args(): @@ -145,7 +145,7 @@ def model_length_function(tensor): def train_one_epoch( - model, criterion, optimizer, scheduler, data_loader, device, pbar=None, + model, criterion, optimizer, scheduler, data_loader, device, metric_log, pbar=None ): model.train() @@ -169,32 +169,52 @@ def train_one_epoch( # target_lengths: batch size loss = criterion(outputs, targets, tensors_lengths, target_lengths) - sums["loss"] += loss.item() + loss_item = loss.item() + sums["loss"] += loss_item + metric_log.record("train_iteration", "loss", loss_item) optimizer.zero_grad() loss.backward() if args.clip_grad > 0: - sums["gradient"] += torch.nn.utils.clip_grad_norm_( + gradient = torch.nn.utils.clip_grad_norm_( model.parameters(), args.clip_grad ) + sums["gradient"] += gradient + metric_log.record("train_iteration", "gradient", gradient) optimizer.step() if pbar is not None: pbar.update(1 / len(data_loader)) + metric_log.record("train_iteration", "n", pbar.n) avg_loss = sums["loss"] / len(data_loader) print(f"Training loss: {avg_loss:4.5f}", flush=True) + metric_log.record("train_epoch", "loss", avg_loss, "Training loss: ") if "gradient" in sums: avg_gradient = sums["gradient"] / len(data_loader) print(f"Average gradient norm: {avg_gradient:4.5f}", flush=True) + metric_log.record( + "train_epoch", "gradient", avg_gradient, "Average gradient norm: " + ) scheduler.step() + metric_log.record("train_epoch", "n", pbar.n) + -def evaluate(model, criterion, data_loader, decoder, language_model, device): +def evaluate( + model, + criterion, + data_loader, + decoder, + language_model, + device, + metric_log, + pbar=None, +): with torch.no_grad(): @@ -220,9 +240,10 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): # input_lengths: batch size # target_lengths: batch size - sums["loss"] += criterion( + loss_item = criterion( outputs, targets, tensors_lengths, target_lengths ).item() + sums["loss"] += loss_item output = outputs.transpose(0, 1).to("cpu") output = decoder(output) @@ -272,6 +293,20 @@ def evaluate(model, criterion, data_loader, decoder, language_model, device): flush=True, ) + metric_log("validation", "loss", avg_loss) + metric_log("validation", "cer", sums["cer"]) + metric_log("validation", "wer", sums["wer"]) + metric_log( + "validation", "cer_over_dataset", sums["cer"] / sums["length_dataset"] + ) + metric_log( + "validation", "wer_over_dataset", sums["wer"] / sums["length_dataset"] + ) + metric_log("validation", "cer_over_length", sums["cer"] / sums["total_chars"]) + 
metric_log("validation", "wer_over_length", sums["wer"] / sums["total_words"]) + if pbar is not None: + metric_log("validation", "n", pbar.n) + return avg_loss @@ -416,6 +451,7 @@ def main(args, rank=0): ) best_loss = 1.0 + metric_log = MetricLog() load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) @@ -469,6 +505,7 @@ def main(args, rank=0): scheduler, loader_training, devices[0], + metric_log=metric_log, pbar=pbar, ) @@ -481,6 +518,8 @@ def main(args, rank=0): decoder, language_model, devices[0], + metric_log=metric_log, + pbar=pbar, ) is_best = sum_loss < best_loss @@ -498,6 +537,12 @@ def main(args, rank=0): rank, ) + metric_log.print_last_row() + + prefix = args.checkpoint or "metric_log" + metric_log.write_csv(prefix + ".csv") + metric_log.write_json(prefix + ".json") + print("End time: {}".format(str(datetime.now())), flush=True) if args.distributed: From 5d77b8870584d3cf271f0cdc131918b9853ecaea Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 30 Jun 2020 12:30:16 -0700 Subject: [PATCH 059/129] rename metric_logger. --- examples/pipeline/utils.py | 21 ++++++++---- examples/pipeline/wav2letter.py | 59 +++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 31 deletions(-) diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index 6d1924ac03..f5081e76b6 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -8,15 +8,15 @@ from tabulate import tabulate -class MetricLog: - def __init__(self, d=None, disable=False): +class MetricLogger: + def __init__(self, log=None, disable=False): self.disable = disable - self.d = defaultdict(lambda: defaultdict(list)) if d is None else d + self.log = defaultdict(lambda: defaultdict(list)) if log is None else log def record(self, group, metric, value, msg=None): if not self.disable: - self.d[group][metric].append(value) + self.log[group][metric].append(value) if msg is not None: print(msg, "{: >10}".format(round(value, 5)), flush=True) @@ -32,12 +32,19 @@ def print_last_row(self, group=None): flush=True, ) + def print_all_row(self, group=None): + if self.disable: + return + for group in group or self.log: + # print({k: v[-1] for k, v in group.items()}) + print(tabulate(self.log[group], flush=True)) + def write_csv(self, prefix=""): if self.disable: return - for group in self.d: + for group in self.log: filename = prefix + group + ".csv" - content = tabulate(self.d[group]) + content = tabulate(self.log[group]) with open(filename, "w") as csvfile: writer = csv.writer(csvfile) writer.writerow(content) @@ -46,7 +53,7 @@ def write_json(self, filename): if self.disable: return with open(filename, "w") as outfile: - json.dump(self.d, outfile) + json.dump(self.log, outfile) def save_checkpoint(state, is_best, filename, rank): diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 715ffc44d2..9bb239639e 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -18,7 +18,7 @@ from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance -from utils import MetricLog, count_parameters, save_checkpoint +from utils import MetricLogger, count_parameters, save_checkpoint def parse_args(): @@ -145,7 +145,14 @@ def model_length_function(tensor): def train_one_epoch( - model, criterion, optimizer, scheduler, data_loader, device, metric_log, pbar=None + model, + criterion, + optimizer, + scheduler, + data_loader, + device, + metric_logger, + 
pbar=None, ): model.train() @@ -171,7 +178,7 @@ def train_one_epoch( loss = criterion(outputs, targets, tensors_lengths, target_lengths) loss_item = loss.item() sums["loss"] += loss_item - metric_log.record("train_iteration", "loss", loss_item) + metric_logger.record("train_iteration", "loss", loss_item) optimizer.zero_grad() loss.backward() @@ -181,28 +188,28 @@ def train_one_epoch( model.parameters(), args.clip_grad ) sums["gradient"] += gradient - metric_log.record("train_iteration", "gradient", gradient) + metric_logger.record("train_iteration", "gradient", gradient) optimizer.step() if pbar is not None: pbar.update(1 / len(data_loader)) - metric_log.record("train_iteration", "n", pbar.n) + metric_logger.record("train_iteration", "n", pbar.n) avg_loss = sums["loss"] / len(data_loader) print(f"Training loss: {avg_loss:4.5f}", flush=True) - metric_log.record("train_epoch", "loss", avg_loss, "Training loss: ") + metric_logger.record("train_epoch", "loss", avg_loss, "Training loss: ") if "gradient" in sums: avg_gradient = sums["gradient"] / len(data_loader) print(f"Average gradient norm: {avg_gradient:4.5f}", flush=True) - metric_log.record( + metric_logger.record( "train_epoch", "gradient", avg_gradient, "Average gradient norm: " ) scheduler.step() - metric_log.record("train_epoch", "n", pbar.n) + metric_logger.record("train_epoch", "n", pbar.n) def evaluate( @@ -212,7 +219,7 @@ def evaluate( decoder, language_model, device, - metric_log, + metric_logger, pbar=None, ): @@ -293,19 +300,23 @@ def evaluate( flush=True, ) - metric_log("validation", "loss", avg_loss) - metric_log("validation", "cer", sums["cer"]) - metric_log("validation", "wer", sums["wer"]) - metric_log( + metric_logger("validation", "loss", avg_loss) + metric_logger("validation", "cer", sums["cer"]) + metric_logger("validation", "wer", sums["wer"]) + metric_logger( "validation", "cer_over_dataset", sums["cer"] / sums["length_dataset"] ) - metric_log( + metric_logger( "validation", "wer_over_dataset", sums["wer"] / sums["length_dataset"] ) - metric_log("validation", "cer_over_length", sums["cer"] / sums["total_chars"]) - metric_log("validation", "wer_over_length", sums["wer"] / sums["total_words"]) + metric_logger( + "validation", "cer_over_length", sums["cer"] / sums["total_chars"] + ) + metric_logger( + "validation", "wer_over_length", sums["wer"] / sums["total_words"] + ) if pbar is not None: - metric_log("validation", "n", pbar.n) + metric_logger("validation", "n", pbar.n) return avg_loss @@ -451,7 +462,7 @@ def main(args, rank=0): ) best_loss = 1.0 - metric_log = MetricLog() + metric_logger = MetricLogger() load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) @@ -505,7 +516,7 @@ def main(args, rank=0): scheduler, loader_training, devices[0], - metric_log=metric_log, + metric_logger=metric_logger, pbar=pbar, ) @@ -518,7 +529,7 @@ def main(args, rank=0): decoder, language_model, devices[0], - metric_log=metric_log, + metric_logger=metric_logger, pbar=pbar, ) @@ -537,11 +548,11 @@ def main(args, rank=0): rank, ) - metric_log.print_last_row() + metric_logger.print_last_row() - prefix = args.checkpoint or "metric_log" - metric_log.write_csv(prefix + ".csv") - metric_log.write_json(prefix + ".json") + prefix = args.checkpoint or "metric_logger" + metric_logger.write_csv(prefix + ".csv") + metric_logger.write_json(prefix + ".json") print("End time: {}".format(str(datetime.now())), flush=True) From 7529009628f293f1cbaa20cce65c3859116c221c Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 30 
Jun 2020 14:52:38 -0700 Subject: [PATCH 060/129] stderr and stdout. simpler metric logger. --- examples/pipeline/utils.py | 64 +++++------------ examples/pipeline/wav2letter.py | 119 ++++++++++++-------------------- 2 files changed, 62 insertions(+), 121 deletions(-) diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index f5081e76b6..1fe9b8abec 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -1,59 +1,27 @@ -import csv -import json import os import shutil -from collections import defaultdict +import sys +from collections import defaultdict, deque import torch -from tabulate import tabulate class MetricLogger: - def __init__(self, log=None, disable=False): - self.disable = disable - self.log = defaultdict(lambda: defaultdict(list)) if log is None else log + def __init__(self, group, print_freq=1): + self.print_freq = print_freq + self.data = defaultdict(lambda: deque(maxlen=self.print_freq)) + self.data["group"].append(group) + self._iter = 0 - def record(self, group, metric, value, msg=None): - if not self.disable: + def __call__(self, key, value): + self.data[key].append(value) - self.log[group][metric].append(value) - if msg is not None: - print(msg, "{: >10}".format(round(value, 5)), flush=True) - - return value - - def print_last_row(self, group=None): - if self.disable: - return - for group in group or self.log: - # print({k: v[-1] for k, v in group.items()}) - print( - tabulate({k: v[-1] for k, v in group.items()}, headers="keys"), - flush=True, - ) - - def print_all_row(self, group=None): - if self.disable: - return - for group in group or self.log: - # print({k: v[-1] for k, v in group.items()}) - print(tabulate(self.log[group], flush=True)) - - def write_csv(self, prefix=""): - if self.disable: - return - for group in self.log: - filename = prefix + group + ".csv" - content = tabulate(self.log[group]) - with open(filename, "w") as csvfile: - writer = csv.writer(csvfile) - writer.writerow(content) - - def write_json(self, filename): - if self.disable: - return - with open(filename, "w") as outfile: - json.dump(self.log, outfile) + def print(self): + self._iter += 1 + if self._iter % self.print_freq: + # d = {k: statistics.mean(v) for k, v in self.data.items()} + d = {k: v[-1] for k, v in self.data.items()} + print(d, flush=True) def save_checkpoint(state, is_best, filename, rank): @@ -80,7 +48,7 @@ def save_checkpoint(state, is_best, filename, rank): os.rename(tempfile, filename) if is_best: shutil.copyfile(filename, "model_best.pth.tar") - print("Checkpoint: saved", flush=True) + print("Checkpoint: saved", file=sys.stderr, flush=True) def count_parameters(model): diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 9bb239639e..345244e608 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,6 +1,7 @@ import argparse import os import string +import sys from collections import defaultdict from datetime import datetime @@ -145,20 +146,15 @@ def model_length_function(tensor): def train_one_epoch( - model, - criterion, - optimizer, - scheduler, - data_loader, - device, - metric_logger, - pbar=None, + model, criterion, optimizer, scheduler, data_loader, device, pbar=None, ): model.train() sums = defaultdict(lambda: 0.0) + metric = MetricLogger("train_iteration") + for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 ): @@ -178,7 +174,7 @@ def train_one_epoch( loss = criterion(outputs, targets, tensors_lengths, target_lengths) loss_item = 
loss.item() sums["loss"] += loss_item - metric_logger.record("train_iteration", "loss", loss_item) + metric("loss", loss_item) optimizer.zero_grad() loss.backward() @@ -188,39 +184,34 @@ def train_one_epoch( model.parameters(), args.clip_grad ) sums["gradient"] += gradient - metric_logger.record("train_iteration", "gradient", gradient) + metric("gradient", gradient) optimizer.step() if pbar is not None: pbar.update(1 / len(data_loader)) - metric_logger.record("train_iteration", "n", pbar.n) + metric("n", pbar.n) + + metric.print() + + metric = MetricLogger("train_epoch") + metric("n", pbar.n) avg_loss = sums["loss"] / len(data_loader) - print(f"Training loss: {avg_loss:4.5f}", flush=True) - metric_logger.record("train_epoch", "loss", avg_loss, "Training loss: ") + metric("loss", avg_loss) if "gradient" in sums: avg_gradient = sums["gradient"] / len(data_loader) - print(f"Average gradient norm: {avg_gradient:4.5f}", flush=True) - metric_logger.record( - "train_epoch", "gradient", avg_gradient, "Average gradient norm: " - ) + metric("gradient", avg_gradient) - scheduler.step() + metric("lr", scheduler.get_last_lr()) + metric.print() - metric_logger.record("train_epoch", "n", pbar.n) + scheduler.step() def evaluate( - model, - criterion, - data_loader, - decoder, - language_model, - device, - metric_logger, - pbar=None, + model, criterion, data_loader, decoder, language_model, device, pbar=None, ): with torch.no_grad(): @@ -262,7 +253,11 @@ def evaluate( for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - print(f"Target: {target_print} Output: {output_print}", flush=True) + print( + f"Target: {target_print} Output: {output_print}", + file=sys.stderr, + flush=True, + ) cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] @@ -284,39 +279,21 @@ def evaluate( sums["total_words"] += n avg_loss = sums["loss"] / len(data_loader) - print(f"Validation loss: {avg_loss:.5f}", flush=True) - print(f"CER: {sums['cer']} WER: {sums['wer']}", flush=True) - print( - f"CER: {sums['cer']/sums['length_dataset']} " - f"WER: {sums['wer']/sums['length_dataset']} " - f"(over dataset length)", - flush=True, - ) - print( - f"CER: {sums['cer']/sums['total_chars']} " - f"WER: {sums['wer']/sums['total_words']} " - f"(over total target length)", - flush=True, - ) - - metric_logger("validation", "loss", avg_loss) - metric_logger("validation", "cer", sums["cer"]) - metric_logger("validation", "wer", sums["wer"]) - metric_logger( - "validation", "cer_over_dataset", sums["cer"] / sums["length_dataset"] - ) - metric_logger( - "validation", "wer_over_dataset", sums["wer"] / sums["length_dataset"] - ) - metric_logger( - "validation", "cer_over_length", sums["cer"] / sums["total_chars"] - ) - metric_logger( - "validation", "wer_over_length", sums["wer"] / sums["total_words"] - ) + metric = MetricLogger("validation") + metric("loss", avg_loss) + metric("cer", sums["cer"]) + metric("wer", sums["wer"]) + metric("cer over dataset length", sums["cer"] / sums["length_dataset"]) + metric("wer over dataset length", sums["wer"] / sums["length_dataset"]) + metric("cer over target length", sums["cer"] / sums["total_chars"]) + metric("wer over target length", sums["wer"] / sums["total_words"]) + metric("target length", sums["total_chars"]) + metric("target length", sums["total_words"]) + metric("dataset length", sums["length_dataset"]) if pbar is not None: - 
metric_logger("validation", "n", pbar.n) + metric("n", pbar.n) + metric.print() return avg_loss @@ -326,7 +303,7 @@ def main(args, rank=0): if args.distributed: setup(rank, args.world_size) - print("Start time: {}".format(str(datetime.now())), flush=True) + print("Start time: {}".format(str(datetime.now())), file=sys.stderr, flush=True) # Explicitly setting seed to make sure that models created in two processes # start from same random weights and biases. torch.manual_seed(args.seed) @@ -406,7 +383,7 @@ def main(args, rank=0): model = model.to(devices[0], non_blocking=True) n = count_parameters(model) - print(f"Number of parameters: {n}", flush=True) + print(f"Number of parameters: {n}", file=sys.stderr, flush=True) # Optimizer @@ -462,7 +439,6 @@ def main(args, rank=0): ) best_loss = 1.0 - metric_logger = MetricLogger() load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) @@ -470,7 +446,11 @@ def main(args, rank=0): torch.distributed.barrier() if load_checkpoint: - print("Checkpoint: loading '{}'".format(args.checkpoint), flush=True) + print( + "Checkpoint: loading '{}'".format(args.checkpoint), + file=sys.stderr, + flush=True, + ) checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] @@ -484,10 +464,11 @@ def main(args, rank=0): "Checkpoint: loaded '{}' at epoch {}".format( args.checkpoint, checkpoint["epoch"] ), + file=sys.stderr, flush=True, ) else: - print("Checkpoint: not found", flush=True) + print("Checkpoint: not found", file=sys.stderr, flush=True) save_checkpoint( { @@ -516,7 +497,6 @@ def main(args, rank=0): scheduler, loader_training, devices[0], - metric_logger=metric_logger, pbar=pbar, ) @@ -529,7 +509,6 @@ def main(args, rank=0): decoder, language_model, devices[0], - metric_logger=metric_logger, pbar=pbar, ) @@ -548,13 +527,7 @@ def main(args, rank=0): rank, ) - metric_logger.print_last_row() - - prefix = args.checkpoint or "metric_logger" - metric_logger.write_csv(prefix + ".csv") - metric_logger.write_json(prefix + ".json") - - print("End time: {}".format(str(datetime.now())), flush=True) + print("End time: {}".format(str(datetime.now())), file=sys.stderr, flush=True) if args.distributed: torch.distributed.destroy_process_group() From 25cb8f3283db667fe83102badd1ac2b4bd2c666f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 30 Jun 2020 14:58:15 -0700 Subject: [PATCH 061/129] replace by logging. 
--- examples/pipeline/utils.py | 13 +++++------ examples/pipeline/wav2letter.py | 38 +++++++++++++-------------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index 1fe9b8abec..574d476e7d 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -1,6 +1,6 @@ +import logging import os import shutil -import sys from collections import defaultdict, deque import torch @@ -16,12 +16,13 @@ def __init__(self, group, print_freq=1): def __call__(self, key, value): self.data[key].append(value) + def __str__(self): + return str({k: v[-1] for k, v in self.data.items()}) + def print(self): self._iter += 1 - if self._iter % self.print_freq: - # d = {k: statistics.mean(v) for k, v in self.data.items()} - d = {k: v[-1] for k, v in self.data.items()} - print(d, flush=True) + if not self._iter % self.print_freq: + print(self, flush=True) def save_checkpoint(state, is_best, filename, rank): @@ -48,7 +49,7 @@ def save_checkpoint(state, is_best, filename, rank): os.rename(tempfile, filename) if is_best: shutil.copyfile(filename, "model_best.pth.tar") - print("Checkpoint: saved", file=sys.stderr, flush=True) + logging.info("Checkpoint: saved") def count_parameters(model): diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 345244e608..4457bb62c0 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -1,7 +1,7 @@ import argparse +import logging import os import string -import sys from collections import defaultdict from datetime import datetime @@ -129,7 +129,7 @@ def parse_args(): parser.add_argument("--jit", action="store_true", help="if used, model is jitted") args = parser.parse_args() - + logging.info(args) return args @@ -195,7 +195,8 @@ def train_one_epoch( metric.print() metric = MetricLogger("train_epoch") - metric("n", pbar.n) + if pbar is not None: + metric("n", pbar.n) avg_loss = sums["loss"] / len(data_loader) metric("loss", avg_loss) @@ -253,11 +254,7 @@ def evaluate( for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - print( - f"Target: {target_print} Output: {output_print}", - file=sys.stderr, - flush=True, - ) + logging.info(f"Target: {target_print} Output: {output_print}") cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] @@ -303,7 +300,7 @@ def main(args, rank=0): if args.distributed: setup(rank, args.world_size) - print("Start time: {}".format(str(datetime.now())), file=sys.stderr, flush=True) + logging.info("Start time: {}".format(str(datetime.now()))) # Explicitly setting seed to make sure that models created in two processes # start from same random weights and biases. 
torch.manual_seed(args.seed) @@ -383,7 +380,7 @@ def main(args, rank=0): model = model.to(devices[0], non_blocking=True) n = count_parameters(model) - print(f"Number of parameters: {n}", file=sys.stderr, flush=True) + logging.info(f"Number of parameters: {n}") # Optimizer @@ -446,11 +443,7 @@ def main(args, rank=0): torch.distributed.barrier() if load_checkpoint: - print( - "Checkpoint: loading '{}'".format(args.checkpoint), - file=sys.stderr, - flush=True, - ) + logging.info(f"Checkpoint: loading '{args.checkpoint}'") checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] @@ -460,15 +453,11 @@ def main(args, rank=0): optimizer.load_state_dict(checkpoint["optimizer"]) scheduler.load_state_dict(checkpoint["scheduler"]) - print( - "Checkpoint: loaded '{}' at epoch {}".format( - args.checkpoint, checkpoint["epoch"] - ), - file=sys.stderr, - flush=True, + logging.info( + f"Checkpoint: loaded '{args.checkpoint}' at epoch {checkpoint['epoch']}" ) else: - print("Checkpoint: not found", file=sys.stderr, flush=True) + logging.info("Checkpoint: not found") save_checkpoint( { @@ -527,7 +516,7 @@ def main(args, rank=0): rank, ) - print("End time: {}".format(str(datetime.now())), file=sys.stderr, flush=True) + logging.info(f"End time: {datetime.now()}") if args.distributed: torch.distributed.destroy_process_group() @@ -535,7 +524,10 @@ def main(args, rank=0): if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + args = parse_args() + if args.distributed: torch.multiprocessing.spawn( lambda x: main(args, x), nprocs=args.world_size, join=True From 660082cf318e3dccf3556359bccac228a061cf23 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 1 Jul 2020 11:32:09 -0700 Subject: [PATCH 062/129] adding time measurement in metric logger. --- examples/pipeline/utils.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index 574d476e7d..6d64680715 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -1,27 +1,37 @@ import logging import os import shutil +import time from collections import defaultdict, deque import torch class MetricLogger: - def __init__(self, group, print_freq=1): + def __init__(self, group, print_freq=1, time_key="_time"): self.print_freq = print_freq + self.time_key = time_key self.data = defaultdict(lambda: deque(maxlen=self.print_freq)) self.data["group"].append(group) self._iter = 0 + self._start = time.time() def __call__(self, key, value): self.data[key].append(value) + def _get_last(self): + if self.time_key is not None: + stop = time.time() + self(self.time_key, stop - self._start) + self._start = stop + return {k: v[-1] for k, v in self.data.items()} + def __str__(self): - return str({k: v[-1] for k, v in self.data.items()}) + return str(self._get_last()) def print(self): - self._iter += 1 - if not self._iter % self.print_freq: + self._iter = (self._iter + 1) % self.print_freq + if not self._iter: print(self, flush=True) From dd03e37b94231e1b7de13df89c128ebabbc0bfed Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 1 Jul 2020 11:37:01 -0700 Subject: [PATCH 063/129] fix duplicate name. remove tqdm. keep track of epoch instead and iteration instead. 
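The duplicated `metric` name in train_one_epoch is split into `metric_iteration` and `metric_epoch`, tqdm goes away, and progress is reported through explicit epoch and iteration keys in the metric stream. A usage sketch of the MetricLogger defined in utils.py above, assuming the example's modules are importable:

```python
import random

from utils import MetricLogger

metric = MetricLogger("train_iteration", print_freq=10)
for iteration in range(100):
    metric("epoch", 0)
    metric("loss", random.random())
    metric("iteration", iteration)
    metric.print()  # emits one dict every tenth call, with elapsed time under "_time"
```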
--- examples/pipeline/wav2letter.py | 105 +++++++++++++------------------- 1 file changed, 44 insertions(+), 61 deletions(-) diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/wav2letter.py index 4457bb62c0..be002c6e5d 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/wav2letter.py @@ -13,7 +13,6 @@ from torchaudio.datasets.utils import bg_iterator, diskcache_iterator from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from tqdm import tqdm from ctc_decoders import GreedyDecoder, ViterbiDecoder from datasets import collate_factory, datasets_librispeech @@ -146,14 +145,16 @@ def model_length_function(tensor): def train_one_epoch( - model, criterion, optimizer, scheduler, data_loader, device, pbar=None, + model, criterion, optimizer, scheduler, data_loader, device, epoch, ): model.train() sums = defaultdict(lambda: 0.0) - metric = MetricLogger("train_iteration") + metric_iteration = MetricLogger("train_iteration") + metric_iteration["epoch"] = epoch + metric_epoch = MetricLogger("train_epoch") for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -174,7 +175,7 @@ def train_one_epoch( loss = criterion(outputs, targets, tensors_lengths, target_lengths) loss_item = loss.item() sums["loss"] += loss_item - metric("loss", loss_item) + metric_iteration("loss", loss_item) optimizer.zero_grad() loss.backward() @@ -184,42 +185,35 @@ def train_one_epoch( model.parameters(), args.clip_grad ) sums["gradient"] += gradient - metric("gradient", gradient) + metric_iteration("gradient", gradient) optimizer.step() - if pbar is not None: - pbar.update(1 / len(data_loader)) - metric("n", pbar.n) - - metric.print() - - metric = MetricLogger("train_epoch") - if pbar is not None: - metric("n", pbar.n) + metric_iteration("iteration", sums["iteration"]) + metric_iteration.print() + sums["iteration"] += 1 avg_loss = sums["loss"] / len(data_loader) - metric("loss", avg_loss) + metric_epoch("epoch", epoch) + metric_epoch("loss", avg_loss) if "gradient" in sums: - avg_gradient = sums["gradient"] / len(data_loader) - metric("gradient", avg_gradient) - - metric("lr", scheduler.get_last_lr()) - metric.print() + metric_epoch("gradient", sums["gradient"] / len(data_loader)) + metric_epoch("lr", scheduler.get_last_lr()[0]) + metric_epoch.print() scheduler.step() def evaluate( - model, criterion, data_loader, decoder, language_model, device, pbar=None, + model, criterion, data_loader, decoder, language_model, device, epoch, ): with torch.no_grad(): model.eval() - sums = defaultdict(lambda: 0.0) + metric = MetricLogger("validation") for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -239,10 +233,9 @@ def evaluate( # input_lengths: batch size # target_lengths: batch size - loss_item = criterion( + sums["loss"] += criterion( outputs, targets, tensors_lengths, target_lengths ).item() - sums["loss"] += loss_item output = outputs.transpose(0, 1).to("cpu") output = decoder(output) @@ -254,7 +247,7 @@ def evaluate( for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - logging.info(f"Target: {target_print} Output: {output_print}") + logging.info(f"Epoch: {epoch} Target: {target_print} Output: {output_print}") cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] @@ -277,7 +270,7 @@ def evaluate( avg_loss = 
sums["loss"] / len(data_loader) - metric = MetricLogger("validation") + metric("epoch", epoch) metric("loss", avg_loss) metric("cer", sums["cer"]) metric("wer", sums["wer"]) @@ -288,8 +281,6 @@ def evaluate( metric("target length", sums["total_chars"]) metric("target length", sums["total_words"]) metric("dataset length", sums["length_dataset"]) - if pbar is not None: - metric("n", pbar.n) metric.print() return avg_loss @@ -475,46 +466,38 @@ def main(args, rank=0): if args.distributed: torch.distributed.barrier() - with tqdm(total=args.epochs, unit_scale=1, disable=not args.progress_bar) as pbar: + for epoch in range(args.start_epoch, args.epochs): - for epoch in range(args.start_epoch, args.epochs): + train_one_epoch( + model, criterion, optimizer, scheduler, loader_training, devices[0], epoch, + ) - train_one_epoch( + if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: + + sum_loss = evaluate( model, criterion, - optimizer, - scheduler, - loader_training, + loader_validation, + decoder, + language_model, devices[0], - pbar=pbar, + epoch, ) - if not epoch % args.print_freq or epoch == args.epochs - 1: - - sum_loss = evaluate( - model, - criterion, - loader_validation, - decoder, - language_model, - devices[0], - pbar=pbar, - ) - - is_best = sum_loss < best_loss - best_loss = min(sum_loss, best_loss) - save_checkpoint( - { - "epoch": epoch + 1, - "state_dict": model.state_dict(), - "best_loss": best_loss, - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - }, - is_best, - args.checkpoint, - rank, - ) + is_best = sum_loss < best_loss + best_loss = min(sum_loss, best_loss) + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_loss": best_loss, + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + }, + is_best, + args.checkpoint, + rank, + ) logging.info(f"End time: {datetime.now()}") From 358236a1a994e465fb3a19ec97fe8082b88372df Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 1 Jul 2020 15:03:39 -0700 Subject: [PATCH 064/129] rename main file. and add readme. --- examples/pipeline/README.md | 38 ++++++++++++++++++++ examples/pipeline/{wav2letter.py => main.py} | 4 ++- 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 examples/pipeline/README.md rename examples/pipeline/{wav2letter.py => main.py} (99%) diff --git a/examples/pipeline/README.md b/examples/pipeline/README.md new file mode 100644 index 0000000000..ec3639c9e5 --- /dev/null +++ b/examples/pipeline/README.md @@ -0,0 +1,38 @@ +This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, and the Wav2Letter model trained with LibriSpeech. Wav2Letter and LibriSpeech are available in torchaudio. + +### Output + +The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line. Further information is reported to standard error. Here is an example python function to parse the standard output. +```python +def read_json(filename): + """ + Convert the standard output saved to filename into a pandas dataframe for analysis. 
+ """ + + import pandas + import json + + with open(filename, "r") as f: + data = f.read() + + # pandas doesn't read single quotes for json + data = data.replace("'", '"') + + data = [json.loads(l) for l in data.splitlines()] + return pandas.DataFrame(data) +``` + +### Usage + +More information about each command line parameters is available with the `--help` option. An example can be invoked as follows. +``` +python main.py \ + --batch-size 128 \ + --learning-rate .6 \ + --gamma .99 \ + --n-bins 13 \ + --momentum .8 \ + --clip-grad 0. \ + --optimizer "adadelta" \ + --scheduler "exponential" +``` diff --git a/examples/pipeline/wav2letter.py b/examples/pipeline/main.py similarity index 99% rename from examples/pipeline/wav2letter.py rename to examples/pipeline/main.py index be002c6e5d..66552d3c64 100644 --- a/examples/pipeline/wav2letter.py +++ b/examples/pipeline/main.py @@ -247,7 +247,9 @@ def evaluate( for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - logging.info(f"Epoch: {epoch} Target: {target_print} Output: {output_print}") + logging.info( + f"Epoch: {epoch} Target: {target_print} Output: {output_print}" + ) cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] From 490c22221180dc3377a061458b9f304cdeb5b86f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 1 Jul 2020 15:56:47 -0700 Subject: [PATCH 065/129] refactor distributed. --- examples/pipeline/main.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 66552d3c64..9c7df2eba5 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -132,7 +132,7 @@ def parse_args(): return args -def setup(rank, world_size): +def setup_distributed(rank, world_size): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" @@ -291,7 +291,7 @@ def evaluate( def main(args, rank=0): if args.distributed: - setup(rank, args.world_size) + setup_distributed(rank, args.world_size) logging.info("Start time: {}".format(str(datetime.now()))) # Explicitly setting seed to make sure that models created in two processes @@ -363,12 +363,12 @@ def main(args, rank=0): if args.jit: model = torch.jit.script(model) - if not args.distributed: - model = torch.nn.DataParallel(model) - else: + if args.distributed: model.cuda() model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) + else: + model = torch.nn.DataParallel(model) model = model.to(devices[0], non_blocking=True) @@ -507,15 +507,17 @@ def main(args, rank=0): torch.distributed.destroy_process_group() -if __name__ == "__main__": - - logging.basicConfig(level=logging.INFO) - - args = parse_args() - +def spawn_main(args, main): if args.distributed: torch.multiprocessing.spawn( lambda x: main(args, x), nprocs=args.world_size, join=True ) else: main(args) + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + args = parse_args() + spawn_main(args, main) From 17a5999102f629a720cbef13218bff93cbf4d9c6 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 1 Jul 2020 16:03:13 -0700 Subject: [PATCH 066/129] swap example and output in readme. 
--- examples/pipeline/README.md | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/README.md b/examples/pipeline/README.md index ec3639c9e5..92dd660ec1 100644 --- a/examples/pipeline/README.md +++ b/examples/pipeline/README.md @@ -1,4 +1,19 @@ -This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, and the Wav2Letter model trained with LibriSpeech. Wav2Letter and LibriSpeech are available in torchaudio. +This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, along with the Wav2Letter model trained on LibriSpeech. Wav2Letter and LibriSpeech are available in torchaudio. + +### Usage + +More information about each command line parameters is available with the `--help` option. An example can be invoked as follows. +``` +python main.py \ + --batch-size 128 \ + --learning-rate .6 \ + --gamma .99 \ + --n-bins 13 \ + --momentum .8 \ + --clip-grad 0. \ + --optimizer "adadelta" \ + --scheduler "exponential" +``` ### Output @@ -21,18 +36,3 @@ def read_json(filename): data = [json.loads(l) for l in data.splitlines()] return pandas.DataFrame(data) ``` - -### Usage - -More information about each command line parameters is available with the `--help` option. An example can be invoked as follows. -``` -python main.py \ - --batch-size 128 \ - --learning-rate .6 \ - --gamma .99 \ - --n-bins 13 \ - --momentum .8 \ - --clip-grad 0. \ - --optimizer "adadelta" \ - --scheduler "exponential" -``` From a188200d4596c67959d7cd40a896697f3e49f665 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 2 Jul 2020 07:13:09 -0700 Subject: [PATCH 067/129] remove time from logger. --- examples/pipeline/main.py | 33 ++++++++++++++++++++------------- examples/pipeline/utils.py | 11 ++--------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 9c7df2eba5..5a8784c441 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -4,6 +4,7 @@ import string from collections import defaultdict from datetime import datetime +from time import time import torch import torchaudio @@ -151,15 +152,16 @@ def train_one_epoch( model.train() sums = defaultdict(lambda: 0.0) + start1 = time() - metric_iteration = MetricLogger("train_iteration") - metric_iteration["epoch"] = epoch - metric_epoch = MetricLogger("train_epoch") + metric = MetricLogger("train_iteration") + metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 ): + start2 = time() inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -175,7 +177,7 @@ def train_one_epoch( loss = criterion(outputs, targets, tensors_lengths, target_lengths) loss_item = loss.item() sums["loss"] += loss_item - metric_iteration("loss", loss_item) + metric("loss", loss_item) optimizer.zero_grad() loss.backward() @@ -185,22 +187,25 @@ def train_one_epoch( model.parameters(), args.clip_grad ) sums["gradient"] += gradient - metric_iteration("gradient", gradient) + metric("gradient", gradient) optimizer.step() - metric_iteration("iteration", sums["iteration"]) - metric_iteration.print() + metric("iteration", sums["iteration"]) + metric("time", time() - start2) + metric.print() sums["iteration"] += 1 avg_loss = sums["loss"] / len(data_loader) - metric_epoch("epoch", epoch) - metric_epoch("loss", avg_loss) + metric = MetricLogger("train_epoch") + metric("epoch", epoch) + 
metric("loss", avg_loss) if "gradient" in sums: - metric_epoch("gradient", sums["gradient"] / len(data_loader)) - metric_epoch("lr", scheduler.get_last_lr()[0]) - metric_epoch.print() + metric("gradient", sums["gradient"] / len(data_loader)) + metric("lr", scheduler.get_last_lr()[0]) + metric("time", time() - start1) + metric.print() scheduler.step() @@ -213,7 +218,7 @@ def evaluate( model.eval() sums = defaultdict(lambda: 0.0) - metric = MetricLogger("validation") + start = time() for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -272,6 +277,7 @@ def evaluate( avg_loss = sums["loss"] / len(data_loader) + metric = MetricLogger("validation") metric("epoch", epoch) metric("loss", avg_loss) metric("cer", sums["cer"]) @@ -283,6 +289,7 @@ def evaluate( metric("target length", sums["total_chars"]) metric("target length", sums["total_words"]) metric("dataset length", sums["length_dataset"]) + metric("time", time() - start) metric.print() return avg_loss diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index 6d64680715..ca507e4101 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -1,29 +1,22 @@ import logging import os import shutil -import time from collections import defaultdict, deque import torch class MetricLogger: - def __init__(self, group, print_freq=1, time_key="_time"): + def __init__(self, group, print_freq=1): self.print_freq = print_freq - self.time_key = time_key + self._iter = 0 self.data = defaultdict(lambda: deque(maxlen=self.print_freq)) self.data["group"].append(group) - self._iter = 0 - self._start = time.time() def __call__(self, key, value): self.data[key].append(value) def _get_last(self): - if self.time_key is not None: - stop = time.time() - self(self.time_key, stop - self._start) - self._start = stop return {k: v[-1] for k, v in self.data.items()} def __str__(self): From bd5d4d96c4fecc180c2b56b9aa2d7e12d8c130cd Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 2 Jul 2020 12:23:47 -0700 Subject: [PATCH 068/129] check non-empty tensor input. --- examples/pipeline/languagemodels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/languagemodels.py b/examples/pipeline/languagemodels.py index 7011b2230e..eee018ae69 100644 --- a/examples/pipeline/languagemodels.py +++ b/examples/pipeline/languagemodels.py @@ -24,7 +24,7 @@ def encode(self, iterable): return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] def decode(self, tensor): - if isinstance(tensor[0], list): + if len(tensor) > 0 and isinstance(tensor[0], list): return [self.decode(t) for t in tensor] else: # not idempotent, since clean string From d1183dc609a4ca0f182a2153307c497a5ca5b2db Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 2 Jul 2020 14:06:50 -0700 Subject: [PATCH 069/129] typo in variable name and log update. 
--- examples/pipeline/ctc_decoders.py | 2 +- examples/pipeline/main.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index b24a09b3ff..d54980e309 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -134,7 +134,7 @@ def __call__(self, tag_sequence: torch.Tensor): scores = [] for i in range(tag_sequence.shape[1]): paths, score = self._viterbi_decode( - tag_sequence[:, i, :], self._transitions + tag_sequence[:, i, :], self.transitions ) outputs.append(paths) scores.append(score) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 5a8784c441..7da96749bb 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -155,7 +155,7 @@ def train_one_epoch( start1 = time() metric = MetricLogger("train_iteration") - metric["epoch"] = epoch + metric("epoch", epoch) for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 From 26de948512eddc2cdd750ef7e00d3cc357725306 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 7 Jul 2020 15:17:52 -0700 Subject: [PATCH 070/129] typo. --- examples/pipeline/ctc_decoders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index d54980e309..c891fa5aac 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -133,9 +133,7 @@ def __call__(self, tag_sequence: torch.Tensor): outputs = [] scores = [] for i in range(tag_sequence.shape[1]): - paths, score = self._viterbi_decode( - tag_sequence[:, i, :], self.transitions - ) + paths, score = self._viterbi_decode(tag_sequence[:, i, :]) outputs.append(paths) scores.append(score) From 7d40304dc719525e30413573d0fe607a53850a5a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 7 Jul 2020 15:19:03 -0700 Subject: [PATCH 071/129] compute cer/wer in training too. 
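For reference on the normalized metrics reported below: "cer over target length" is the summed edit distance divided by the total number of target characters, and the word-level variant is analogous. A toy computation with made-up values:

```python
# Two utterances with edit distances 2 and 1 against targets of 10 and 5
# characters give (2 + 1) / (10 + 5) = 0.2 errors per target character.
cers = [2, 1]
target_lengths = [10, 5]
cer_over_target_length = sum(cers) / sum(target_lengths)
assert cer_over_target_length == 0.2
```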
--- examples/pipeline/main.py | 112 +++++++++++++++++++++++++------------- 1 file changed, 74 insertions(+), 38 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 7da96749bb..ece1c823f0 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -146,7 +146,15 @@ def model_length_function(tensor): def train_one_epoch( - model, criterion, optimizer, scheduler, data_loader, device, epoch, + model, + criterion, + optimizer, + scheduler, + data_loader, + decoder, + language_model, + device, + epoch, ): model.train() @@ -191,11 +199,14 @@ def train_one_epoch( optimizer.step() + sums["length_dataset"] += len(inputs) metric("iteration", sums["iteration"]) metric("time", time() - start2) metric.print() sums["iteration"] += 1 + compute_error_rates(outputs, targets, decoder, language_model, sums, epoch) + avg_loss = sums["loss"] / len(data_loader) metric = MetricLogger("train_epoch") @@ -203,11 +214,59 @@ def train_one_epoch( metric("loss", avg_loss) if "gradient" in sums: metric("gradient", sums["gradient"] / len(data_loader)) - metric("lr", scheduler.get_last_lr()[0]) + try: + metric("lr", scheduler.get_last_lr()[0]) + except AttributeError: + pass + metric("cer", sums["cer"]) + metric("wer", sums["wer"]) + metric("cer over dataset length", sums["cer"] / sums["length_dataset"]) + metric("wer over dataset length", sums["wer"] / sums["length_dataset"]) + metric("cer over target length", sums["cer"] / sums["total_chars"]) + metric("wer over target length", sums["wer"] / sums["total_words"]) + metric("target length", sums["total_chars"]) + metric("target length", sums["total_words"]) + metric("dataset length", sums["length_dataset"]) metric("time", time() - start1) metric.print() - scheduler.step() + if isinstance(scheduler, ReduceLROnPlateau): + scheduler.step(avg_loss) + else: + scheduler.step() + + +def compute_error_rates(outputs, targets, decoder, language_model, sums, epoch): + output = outputs.transpose(0, 1).to("cpu") + output = decoder(output) + + output = language_model.decode(output.tolist()) + target = language_model.decode(targets.tolist()) + + print_length = 20 + for i in range(2): + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + logging.info(f"Epoch: {epoch} Target: {target_print} Output: {output_print}") + + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = sum(cers) + n = sum(len(t) for t in target) + sums["cer"] += cers + # sums["cer_relative"] += cers / n + sums["total_chars"] += n + + output = [o.split(language_model.char_space) for o in output] + target = [o.split(language_model.char_space) for o in target] + + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = sum(wers) + n = len(target) + sums["wer"] += wers + # sums["wer_relative"] += wers / n + sums["total_words"] += n def evaluate( @@ -230,8 +289,6 @@ def evaluate( # keep batch first for data parallel outputs = model(inputs).transpose(-1, -2).transpose(0, 1) - sums["length_dataset"] += len(inputs) - # CTC # outputs: input length, batch size, number of classes (including blank) # targets: batch size, max target length @@ -242,38 +299,9 @@ def evaluate( outputs, targets, tensors_lengths, target_lengths ).item() - output = outputs.transpose(0, 1).to("cpu") - output = decoder(output) - - output = 
language_model.decode(output.tolist()) - target = language_model.decode(targets.tolist()) - - print_length = 20 - for i in range(2): - output_print = output[i].ljust(print_length)[:print_length] - target_print = target[i].ljust(print_length)[:print_length] - logging.info( - f"Epoch: {epoch} Target: {target_print} Output: {output_print}" - ) - - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # cers_normalized = [d / len(a) for a, d in zip(target, cers)] - cers = sum(cers) - n = sum(len(t) for t in target) - sums["cer"] += cers - sums["cer_relative"] += cers / n - sums["total_chars"] += n - - output = [o.split(language_model.char_space) for o in output] - target = [o.split(language_model.char_space) for o in target] - - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # wers_normalized = [d / len(a) for a, d in zip(target, wers)] - wers = sum(wers) - n = len(target) - sums["wer"] += wers - sums["wer_relative"] += wers / n - sums["total_words"] += n + sums["length_dataset"] += len(inputs) + + compute_error_rates(outputs, targets, decoder, language_model, sums, epoch) avg_loss = sums["loss"] / len(data_loader) @@ -478,7 +506,15 @@ def main(args, rank=0): for epoch in range(args.start_epoch, args.epochs): train_one_epoch( - model, criterion, optimizer, scheduler, loader_training, devices[0], epoch, + model, + criterion, + optimizer, + scheduler, + loader_training, + decoder, + language_model, + devices[0], + epoch, ) if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: From 214ed96506792a319d027750bca3394419557fa7 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 13 Jul 2020 14:09:46 -0700 Subject: [PATCH 072/129] typo. --- examples/pipeline/ctc_decoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index c891fa5aac..b64adb84d1 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -92,7 +92,7 @@ def _viterbi_decode(self, tag_sequence: torch.Tensor): # Add pairwise potentials to current scores. # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) summed_potentials = ( - path_scores[timestep - 1].unsqueeze(2) + self.transition_matrix + path_scores[timestep - 1].unsqueeze(2) + self.transitions ) summed_potentials = summed_potentials.view(-1, num_tags) From 26fc3919cedd2eabfcfcedad3a464751b12c02b8 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 13 Jul 2020 15:03:28 -0700 Subject: [PATCH 073/129] add back slurm signal capture to resubmit job. 
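The SIGUSR1 handling installed below can be exercised locally without a real Slurm preemption (Slurm can deliver the signal ahead of the time limit, for example through sbatch's `--signal` option) by having the process signal itself; this self-signal is purely a testing device and is not part of the pipeline:

```python
import os
import signal

SIGNAL_RECEIVED = False

def signal_handler(a, b):
    # Same idea as in main.py below: only flip a flag, so the training loop
    # can checkpoint and requeue at the next safe point instead of dying
    # mid-batch.
    global SIGNAL_RECEIVED
    SIGNAL_RECEIVED = True

signal.signal(signal.SIGUSR1, signal_handler)
os.kill(os.getpid(), signal.SIGUSR1)  # deliver the signal, as Slurm would
assert SIGNAL_RECEIVED
```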
--- examples/pipeline/main.py | 45 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index ece1c823f0..ea401c4ceb 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -1,6 +1,7 @@ import argparse import logging import os +import signal import string from collections import defaultdict from datetime import datetime @@ -21,6 +22,10 @@ from metrics import levenshtein_distance from utils import MetricLogger, count_parameters, save_checkpoint +# TODO Remove before merge pull request +MAIN_PID = os.getpid() +SIGNAL_RECEIVED = False + def parse_args(): parser = argparse.ArgumentParser() @@ -133,6 +138,27 @@ def parse_args(): return args +# TODO Remove before merge pull request +def signal_handler(a, b): + global SIGNAL_RECEIVED + logging.info(f"Signal received on {datetime.now()}") + SIGNAL_RECEIVED = True + + +# TODO Remove before merge pull request +def trigger_job_requeue(): + # Submit a new job to resume from checkpoint. + if os.environ["SLURM_PROCID"] == "0" and os.getpid() == MAIN_PID: + logging.info(f"PID: {os.getpid()}. PPID: {os.getppid()}.") + logging.info("Resubmitting job") + command = "scontrol requeue " + os.environ["SLURM_JOB_ID"] + logging.info(command) + if os.system(command): + raise RuntimeError("Fail to resubmit") + logging.info("New job submitted to the queue") + exit(0) + + def setup_distributed(rank, world_size): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" @@ -169,6 +195,10 @@ def train_one_epoch( data_loader, maxsize=2 ): + # TODO Remove before merge pull request + if SIGNAL_RECEIVED: + return + start2 = time() inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -283,6 +313,10 @@ def evaluate( data_loader, maxsize=2 ): + # TODO Remove before merge pull request + if SIGNAL_RECEIVED: + return + inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -517,7 +551,11 @@ def main(args, rank=0): epoch, ) - if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: + if ( + SIGNAL_RECEIVED # TODO Remove before merge pull request + or not (epoch + 1) % args.print_freq + or epoch == args.epochs - 1 + ): sum_loss = evaluate( model, criterion, @@ -544,6 +582,10 @@ def main(args, rank=0): rank, ) + # TODO Remove before merge pull request + if SIGNAL_RECEIVED: + trigger_job_requeue() + logging.info(f"End time: {datetime.now()}") if args.distributed: @@ -562,5 +604,6 @@ def spawn_main(args, main): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) + signal.signal(signal.SIGUSR1, signal_handler) args = parse_args() spawn_main(args, main) From 8b3e156daae5f362ac60e6b272bd28bab58c8a9e Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 13 Jul 2020 19:57:36 -0700 Subject: [PATCH 074/129] update levenshtein distance. --- examples/pipeline/metrics.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/examples/pipeline/metrics.py b/examples/pipeline/metrics.py index 6faede3969..16f114ac1f 100644 --- a/examples/pipeline/metrics.py +++ b/examples/pipeline/metrics.py @@ -1,28 +1,27 @@ -from typing import Optional +from typing import List, Union -import torch +def levenshtein_distance(r: Union[str, List[str]], h: Union[str, List[str]]): + """ + Calculate the Levenshtein distance between two lists or strings. 
+ """ -def levenshtein_distance(r: str, h: str, device: Optional[str] = None): + # Initialisation + dold = list(range(len(h) + 1)) + dnew = list(0 for _ in range(len(h) + 1)) - # initialisation - d = torch.zeros((2, len(h) + 1), dtype=torch.long) # , device=device) - dold = 0 - dnew = 1 - - # computation + # Computation for i in range(1, len(r) + 1): - d[dnew, 0] = 0 + dnew[0] = i for j in range(1, len(h) + 1): - if r[i - 1] == h[j - 1]: - d[dnew, j] = d[dnew - 1, j - 1] + dnew[j] = dold[j - 1] else: - substitution = d[dnew - 1, j - 1] + 1 - insertion = d[dnew, j - 1] + 1 - deletion = d[dnew - 1, j] + 1 - d[dnew, j] = min(substitution, insertion, deletion) + substitution = dold[j - 1] + 1 + insertion = dnew[j - 1] + 1 + deletion = dold[j] + 1 + dnew[j] = min(substitution, insertion, deletion) dnew, dold = dold, dnew - return d[dnew, -1].item() + return dold[-1] From 16765beb9c3c5c7cd79a4708969f29099ed44d4a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 14 Jul 2020 06:24:21 -0700 Subject: [PATCH 075/129] adding tests for levenstein distance. --- examples/pipeline/metrics.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/pipeline/metrics.py b/examples/pipeline/metrics.py index 16f114ac1f..cba6595016 100644 --- a/examples/pipeline/metrics.py +++ b/examples/pipeline/metrics.py @@ -25,3 +25,14 @@ def levenshtein_distance(r: Union[str, List[str]], h: Union[str, List[str]]): dnew, dold = dold, dnew return dold[-1] + + +if __name__ == "__main__": + assert levenshtein_distance("abc", "abc") == 0 + assert levenshtein_distance("aaa", "aba") == 1 + assert levenshtein_distance("aba", "aaa") == 1 + assert levenshtein_distance("aa", "aaa") == 1 + assert levenshtein_distance("aaa", "aa") == 1 + assert levenshtein_distance("abc", "bcd") == 2 + assert levenshtein_distance(["hello", "world"], ["hello", "world", "!"]) == 1 + assert levenshtein_distance(["hello", "world"], ["world", "hello", "!"]) == 2 From 61b61d801c70e1d3a81a8cce95bb165704e637dd Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 14 Jul 2020 07:08:29 -0700 Subject: [PATCH 076/129] record error rate during iteration. 
--- examples/pipeline/main.py | 50 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index ea401c4ceb..8a5c247fef 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -229,25 +229,40 @@ def train_one_epoch( optimizer.step() + compute_error_rates(outputs, targets, decoder, language_model, sums) + record_error_rates(sums, metric) + sums["length_dataset"] += len(inputs) + metric("iteration", sums["iteration"]) metric("time", time() - start2) + metric("epoch", epoch) metric.print() - sums["iteration"] += 1 - compute_error_rates(outputs, targets, decoder, language_model, sums, epoch) + sums["iteration"] += 1 avg_loss = sums["loss"] / len(data_loader) metric = MetricLogger("train_epoch") + record_error_rates(sums, metric) metric("epoch", epoch) metric("loss", avg_loss) - if "gradient" in sums: - metric("gradient", sums["gradient"] / len(data_loader)) + metric("gradient", sums["gradient"] / len(data_loader)) try: metric("lr", scheduler.get_last_lr()[0]) except AttributeError: pass + metric("time", time() - start1) + metric.print() + + if isinstance(scheduler, ReduceLROnPlateau): + scheduler.step(avg_loss) + else: + scheduler.step() + + +def record_error_rates(sums, metric): + metric("cer", sums["cer"]) metric("wer", sums["wer"]) metric("cer over dataset length", sums["cer"] / sums["length_dataset"]) @@ -257,16 +272,9 @@ def train_one_epoch( metric("target length", sums["total_chars"]) metric("target length", sums["total_words"]) metric("dataset length", sums["length_dataset"]) - metric("time", time() - start1) - metric.print() - - if isinstance(scheduler, ReduceLROnPlateau): - scheduler.step(avg_loss) - else: - scheduler.step() -def compute_error_rates(outputs, targets, decoder, language_model, sums, epoch): +def compute_error_rates(outputs, targets, decoder, language_model, sums): output = outputs.transpose(0, 1).to("cpu") output = decoder(output) @@ -277,7 +285,7 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums, epoch): for i in range(2): output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - logging.info(f"Epoch: {epoch} Target: {target_print} Output: {output_print}") + logging.info(f"Target: {target_print} Output: {output_print}") cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] # cers_normalized = [d / len(a) for a, d in zip(target, cers)] @@ -308,6 +316,7 @@ def evaluate( model.eval() sums = defaultdict(lambda: 0.0) start = time() + metric = MetricLogger("validation") for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -335,23 +344,14 @@ def evaluate( sums["length_dataset"] += len(inputs) - compute_error_rates(outputs, targets, decoder, language_model, sums, epoch) + compute_error_rates(outputs, targets, decoder, language_model, sums) avg_loss = sums["loss"] / len(data_loader) - metric = MetricLogger("validation") metric("epoch", epoch) metric("loss", avg_loss) - metric("cer", sums["cer"]) - metric("wer", sums["wer"]) - metric("cer over dataset length", sums["cer"] / sums["length_dataset"]) - metric("wer over dataset length", sums["wer"] / sums["length_dataset"]) - metric("cer over target length", sums["cer"] / sums["total_chars"]) - metric("wer over target length", sums["wer"] / sums["total_words"]) - metric("target length", sums["total_chars"]) - metric("target length", sums["total_words"]) - metric("dataset 
length", sums["length_dataset"]) metric("time", time() - start) + record_error_rates(sums, metric) metric.print() return avg_loss @@ -539,6 +539,8 @@ def main(args, rank=0): for epoch in range(args.start_epoch, args.epochs): + logging.info(f"Epoch: {epoch}") + train_one_epoch( model, criterion, From 243f9c2169cc5f1080b4133d8418934d4dda8a6f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 15 Jul 2020 10:24:53 -0700 Subject: [PATCH 077/129] metric logger using setitem. --- examples/pipeline/main.py | 52 +++++++++++++++++++------------------- examples/pipeline/utils.py | 4 +-- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 8a5c247fef..ad7b17615c 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -189,7 +189,7 @@ def train_one_epoch( start1 = time() metric = MetricLogger("train_iteration") - metric("epoch", epoch) + metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -215,7 +215,7 @@ def train_one_epoch( loss = criterion(outputs, targets, tensors_lengths, target_lengths) loss_item = loss.item() sums["loss"] += loss_item - metric("loss", loss_item) + metric["loss"] = loss_item optimizer.zero_grad() loss.backward() @@ -225,7 +225,7 @@ def train_one_epoch( model.parameters(), args.clip_grad ) sums["gradient"] += gradient - metric("gradient", gradient) + metric["gradient"] = gradient optimizer.step() @@ -234,10 +234,10 @@ def train_one_epoch( sums["length_dataset"] += len(inputs) - metric("iteration", sums["iteration"]) - metric("time", time() - start2) - metric("epoch", epoch) - metric.print() + metric["iteration"] = sums["iteration"] + metric["time"] = time() - start2 + metric["epoch"] = epoch + metric() sums["iteration"] += 1 @@ -245,15 +245,15 @@ def train_one_epoch( metric = MetricLogger("train_epoch") record_error_rates(sums, metric) - metric("epoch", epoch) - metric("loss", avg_loss) - metric("gradient", sums["gradient"] / len(data_loader)) + metric["epoch"] = epoch + metric["loss"] = avg_loss + metric["gradient"] = sums["gradient"] / len(data_loader) try: - metric("lr", scheduler.get_last_lr()[0]) + metric["lr"] = scheduler.get_last_lr()[0] except AttributeError: pass - metric("time", time() - start1) - metric.print() + metric["time"] = time() - start1 + metric() if isinstance(scheduler, ReduceLROnPlateau): scheduler.step(avg_loss) @@ -263,15 +263,15 @@ def train_one_epoch( def record_error_rates(sums, metric): - metric("cer", sums["cer"]) - metric("wer", sums["wer"]) - metric("cer over dataset length", sums["cer"] / sums["length_dataset"]) - metric("wer over dataset length", sums["wer"] / sums["length_dataset"]) - metric("cer over target length", sums["cer"] / sums["total_chars"]) - metric("wer over target length", sums["wer"] / sums["total_words"]) - metric("target length", sums["total_chars"]) - metric("target length", sums["total_words"]) - metric("dataset length", sums["length_dataset"]) + metric["cer"] = sums["cer"] + metric["wer"] = sums["wer"] + metric["cer over dataset length"] = sums["cer"] / sums["length_dataset"] + metric["wer over dataset length"] = sums["wer"] / sums["length_dataset"] + metric["cer over target length"] = sums["cer"] / sums["total_chars"] + metric["wer over target length"] = sums["wer"] / sums["total_words"] + metric["target length"] = sums["total_chars"] + metric["target length"] = sums["total_words"] + metric["dataset length"] = sums["length_dataset"] def 
compute_error_rates(outputs, targets, decoder, language_model, sums): @@ -348,11 +348,11 @@ def evaluate( avg_loss = sums["loss"] / len(data_loader) - metric("epoch", epoch) - metric("loss", avg_loss) - metric("time", time() - start) + metric["epoch"] = epoch + metric["loss"] = avg_loss + metric["time"] = time() - start record_error_rates(sums, metric) - metric.print() + metric() return avg_loss diff --git a/examples/pipeline/utils.py b/examples/pipeline/utils.py index ca507e4101..a5b118f322 100644 --- a/examples/pipeline/utils.py +++ b/examples/pipeline/utils.py @@ -13,7 +13,7 @@ def __init__(self, group, print_freq=1): self.data = defaultdict(lambda: deque(maxlen=self.print_freq)) self.data["group"].append(group) - def __call__(self, key, value): + def __setitem__(self, key, value): self.data[key].append(value) def _get_last(self): @@ -22,7 +22,7 @@ def _get_last(self): def __str__(self): return str(self._get_last()) - def print(self): + def __call__(self): self._iter = (self._iter + 1) % self.print_freq if not self._iter: print(self, flush=True) From 4e34958f4b207ebd0cd2b011fc5a290cbce76832 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 15 Jul 2020 10:55:57 -0700 Subject: [PATCH 078/129] moving signal break to end of loop and return loss so far. --- examples/pipeline/main.py | 45 ++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index ad7b17615c..6325024947 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -195,10 +195,6 @@ def train_one_epoch( data_loader, maxsize=2 ): - # TODO Remove before merge pull request - if SIGNAL_RECEIVED: - return - start2 = time() inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -241,6 +237,10 @@ def train_one_epoch( sums["iteration"] += 1 + # TODO Remove before merge pull request + if SIGNAL_RECEIVED: + return loss_item + avg_loss = sums["loss"] / len(data_loader) metric = MetricLogger("train_epoch") @@ -322,10 +322,6 @@ def evaluate( data_loader, maxsize=2 ): - # TODO Remove before merge pull request - if SIGNAL_RECEIVED: - return - inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -346,6 +342,10 @@ def evaluate( compute_error_rates(outputs, targets, decoder, language_model, sums) + # TODO Remove before merge pull request + if SIGNAL_RECEIVED: + return sums["loss"] / len(data_loader) + avg_loss = sums["loss"] / len(data_loader) metric["epoch"] = epoch @@ -363,6 +363,10 @@ def main(args, rank=0): setup_distributed(rank, args.world_size) logging.info("Start time: {}".format(str(datetime.now()))) + + # Install signal handler + signal.signal(signal.SIGUSR1, signal_handler) + # Explicitly setting seed to make sure that models created in two processes # start from same random weights and biases. 
torch.manual_seed(args.seed) @@ -553,13 +557,9 @@ def main(args, rank=0): epoch, ) - if ( - SIGNAL_RECEIVED # TODO Remove before merge pull request - or not (epoch + 1) % args.print_freq - or epoch == args.epochs - 1 - ): + if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: - sum_loss = evaluate( + loss = evaluate( model, criterion, loader_validation, @@ -569,8 +569,8 @@ def main(args, rank=0): epoch, ) - is_best = sum_loss < best_loss - best_loss = min(sum_loss, best_loss) + is_best = loss < best_loss + best_loss = min(loss, best_loss) save_checkpoint( { "epoch": epoch + 1, @@ -586,6 +586,18 @@ def main(args, rank=0): # TODO Remove before merge pull request if SIGNAL_RECEIVED: + save_checkpoint( + { + "epoch": epoch + 1, + "state_dict": model.state_dict(), + "best_loss": best_loss, + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + }, + False, + args.checkpoint, + rank, + ) trigger_job_requeue() logging.info(f"End time: {datetime.now()}") @@ -606,6 +618,5 @@ def spawn_main(args, main): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - signal.signal(signal.SIGUSR1, signal_handler) args = parse_args() spawn_main(args, main) From 84a15a3a53ea913b2f41b28fa14ce0368788fd11 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 15 Jul 2020 14:46:47 -0700 Subject: [PATCH 079/129] typo. --- examples/pipeline/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 6325024947..5333e71fe3 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -225,11 +225,11 @@ def train_one_epoch( optimizer.step() + sums["length_dataset"] += len(inputs) + compute_error_rates(outputs, targets, decoder, language_model, sums) record_error_rates(sums, metric) - sums["length_dataset"] += len(inputs) - metric["iteration"] = sums["iteration"] metric["time"] = time() - start2 metric["epoch"] = epoch From efb74f1a96e5768b30a645d59f1cb106a5f00144 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 17 Jul 2020 09:01:41 -0700 Subject: [PATCH 080/129] add citation. --- examples/pipeline/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline/README.md b/examples/pipeline/README.md index 92dd660ec1..4fce6364c6 100644 --- a/examples/pipeline/README.md +++ b/examples/pipeline/README.md @@ -1,4 +1,4 @@ -This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, along with the Wav2Letter model trained on LibriSpeech. Wav2Letter and LibriSpeech are available in torchaudio. +This is an example pipeline for speech recognition using a greedy or Viterbi CTC decoder, along with the Wav2Letter model trained on LibriSpeech, see [Wav2Letter: an End-to-End ConvNet-based Speech Recognition System](https://arxiv.org/pdf/1609.03193.pdf). Wav2Letter and LibriSpeech are available in torchaudio. ### Usage From dbded0dae855d2d9f258414c0709b82de715c772 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:36:14 -0700 Subject: [PATCH 081/129] change default to best run. 
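The new defaults below amount to pairing Adadelta with a reduce-on-plateau schedule. A minimal sketch with torch's stock classes, using a stand-in model; the `rho` and `eps` values mirror the script's argparse defaults and are assumptions here:

```python
import torch
from torch.optim import Adadelta
from torch.optim.lr_scheduler import ReduceLROnPlateau

model = torch.nn.Linear(13, 29)  # stand-in; the pipeline builds Wav2Letter

optimizer = Adadelta(
    model.parameters(), lr=0.6, rho=0.95, eps=1e-8, weight_decay=1e-5
)
scheduler = ReduceLROnPlateau(optimizer)

# Unlike ExponentialLR, ReduceLROnPlateau is stepped with a metric, which is
# why train_one_epoch passes the epoch's average loss to scheduler.step.
scheduler.step(0.5)
```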
--- examples/pipeline/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 5333e71fe3..3c769d6e04 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -84,33 +84,33 @@ def parse_args(): parser.add_argument( "--optimizer", metavar="OPT", - default="sgd", + default="adadelta", choices=["sgd", "adadelta", "adam"], help="optimizer to use", ) parser.add_argument( "--scheduler", metavar="S", - default="exponential", + default="reduceonplateau", choices=["exponential", "reduceonplateau"], help="optimizer to use", ) parser.add_argument( "--learning-rate", - default=1.0, + default=0.6, type=float, metavar="LR", help="initial learning rate", ) parser.add_argument( "--gamma", - default=0.96, + default=0.99, type=float, metavar="GAMMA", help="learning rate exponential decay constant", ) parser.add_argument( - "--momentum", default=0.0, type=float, metavar="M", help="momentum" + "--momentum", default=0.8, type=float, metavar="M", help="momentum" ) parser.add_argument( "--weight-decay", default=1e-5, type=float, metavar="W", help="weight decay" From fb8324dc85b81b2bdd2f410ef25cdd83479dc86a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:38:29 -0700 Subject: [PATCH 082/129] adding other experiment with decoders. --- examples/pipeline/ctc_decoders.py | 93 +++++++++++++++++++++++++++++++ examples/pipeline/main.py | 8 ++- 2 files changed, 98 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index b64adb84d1..9a3018be93 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -5,6 +5,28 @@ from tqdm import tqdm +class GreedyIterableDecoder: + def __init__(self, blank_label=0, collapse_repeated=True): + self.blank_label = blank_label + self.collapse_repeated = collapse_repeated + + def __call__(self, output): + arg_maxes = torch.argmax(output, dim=-1) + decodes = [] + for args in arg_maxes: + decode = [] + for j, index in enumerate(args): + if index != self.blank_label: + if self.collapse_repeated and j != 0 and index == args[j - 1]: + continue + decode.append(index.item()) + decode = torch.tensor(decode) + decodes.append(decode) + # decodes = torch.tensor(decodes) + decodes = torch.nn.utils.rnn.pad_sequence(decodes, batch_first=True) + return decodes + + class GreedyDecoder: def __call__(self, outputs): """Greedy Decoder. 
Returns highest probability of class labels for each timestep @@ -19,6 +41,77 @@ def __call__(self, outputs): return indices[..., 0] +def zeros_like(m): + return zeros(len(m), len(m[0])) + + +def zeros(d1, d2): + return list(list(0 for _ in range(d2)) for _ in range(d1)) + + +def apply_transpose(f, m): + return list(map(f, zip(*m))) + + +def argmax(l): + return max(range(len(l)), key=lambda i: l[i]) + + +def add1d2d(m1, m2): + return [[v2 + v1 for v2 in m2_row] for m2_row, v1 in zip(m2, m1)] + + +def add1d1d(v1, v2): + return [e + s for e, s in zip(v1, v2)] + + +class ListViterbiDecoder: + def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): + self._transitions = self._build_transitions( + data_loader, vocab_size, n, progress_bar + ) + + def __call__(self, emissions): + return torch.tensor([self._decode(emissions[i].tolist(), self._transitions)[0] for i in range(len(emissions))]) + + @staticmethod + def _build_transitions(data_loader, vocab_size, n=2, progress_bar=False): + + # Count n-grams + count = Counter() + for _, label in tqdm(data_loader, disable=not progress_bar): + count += Counter(a for a in zip(*(label[i:] for i in range(n)))) + + # Write as matrix + transitions = zeros(vocab_size, vocab_size) + for (k1, k2), v in count.items(): + transitions[k1][k2] = v + + return transitions + + @staticmethod + def _decode(emissions, transitions): + scores = zeros_like(emissions) + back_pointers = zeros_like(emissions) + scores = emissions[0] + + # Generate most likely scores and paths for each step in sequence + for i in range(1, len(emissions)): + score_with_transition = add1d2d(scores, transitions) + max_score_with_transition = apply_transpose(max, score_with_transition) + scores = add1d1d(emissions[i], max_score_with_transition) + back_pointers[i] = apply_transpose(argmax, score_with_transition) + + # Generate the most likely path + viterbi = [argmax(scores)] + for bp in reversed(back_pointers[1:]): + viterbi.append(bp[viterbi[-1]]) + viterbi.reverse() + viterbi_score = max(scores) + + return viterbi, viterbi_score + + class ViterbiDecoder: def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): self.vocab_size = vocab_size diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 3c769d6e04..83cd77d13c 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -16,7 +16,7 @@ from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from ctc_decoders import GreedyDecoder, ViterbiDecoder +from ctc_decoders import GreedyDecoder, GreedyIterableDecoder, ViterbiDecoder, ListViterbiDecoder from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance @@ -68,7 +68,7 @@ def parse_args(): "--decoder", metavar="D", default="greedy", - choices=["greedy", "viterbi"], + choices=["greedy", "greedyiter", "viterbi"], help="decoder to use", ) parser.add_argument( @@ -424,8 +424,10 @@ def main(args, rank=0): if args.decoder == "greedy": decoder = GreedyDecoder() + elif args.decoder == "greedyiter": + decoder = GreedyIterableDecoder() elif args.decoder == "viterbi": - decoder = ViterbiDecoder( + decoder = ListViterbiDecoder( training, len(language_model), progress_bar=args.progress_bar ) From 5063d689c2915dd7daa7f2c5764d6a3051b6f45b Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:39:53 -0700 Subject: [PATCH 083/129] remove other decoders than greedy. 
--- examples/pipeline/ctc_decoders.py | 218 ------------------------------ examples/pipeline/main.py | 10 +- 2 files changed, 2 insertions(+), 226 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index 9a3018be93..b4f155d6fa 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -1,30 +1,4 @@ -from collections import Counter - -import torch from torch import topk -from tqdm import tqdm - - -class GreedyIterableDecoder: - def __init__(self, blank_label=0, collapse_repeated=True): - self.blank_label = blank_label - self.collapse_repeated = collapse_repeated - - def __call__(self, output): - arg_maxes = torch.argmax(output, dim=-1) - decodes = [] - for args in arg_maxes: - decode = [] - for j, index in enumerate(args): - if index != self.blank_label: - if self.collapse_repeated and j != 0 and index == args[j - 1]: - continue - decode.append(index.item()) - decode = torch.tensor(decode) - decodes.append(decode) - # decodes = torch.tensor(decodes) - decodes = torch.nn.utils.rnn.pad_sequence(decodes, batch_first=True) - return decodes class GreedyDecoder: @@ -39,195 +13,3 @@ def __call__(self, outputs): """ _, indices = topk(outputs, k=1, dim=-1) return indices[..., 0] - - -def zeros_like(m): - return zeros(len(m), len(m[0])) - - -def zeros(d1, d2): - return list(list(0 for _ in range(d2)) for _ in range(d1)) - - -def apply_transpose(f, m): - return list(map(f, zip(*m))) - - -def argmax(l): - return max(range(len(l)), key=lambda i: l[i]) - - -def add1d2d(m1, m2): - return [[v2 + v1 for v2 in m2_row] for m2_row, v1 in zip(m2, m1)] - - -def add1d1d(v1, v2): - return [e + s for e, s in zip(v1, v2)] - - -class ListViterbiDecoder: - def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): - self._transitions = self._build_transitions( - data_loader, vocab_size, n, progress_bar - ) - - def __call__(self, emissions): - return torch.tensor([self._decode(emissions[i].tolist(), self._transitions)[0] for i in range(len(emissions))]) - - @staticmethod - def _build_transitions(data_loader, vocab_size, n=2, progress_bar=False): - - # Count n-grams - count = Counter() - for _, label in tqdm(data_loader, disable=not progress_bar): - count += Counter(a for a in zip(*(label[i:] for i in range(n)))) - - # Write as matrix - transitions = zeros(vocab_size, vocab_size) - for (k1, k2), v in count.items(): - transitions[k1][k2] = v - - return transitions - - @staticmethod - def _decode(emissions, transitions): - scores = zeros_like(emissions) - back_pointers = zeros_like(emissions) - scores = emissions[0] - - # Generate most likely scores and paths for each step in sequence - for i in range(1, len(emissions)): - score_with_transition = add1d2d(scores, transitions) - max_score_with_transition = apply_transpose(max, score_with_transition) - scores = add1d1d(emissions[i], max_score_with_transition) - back_pointers[i] = apply_transpose(argmax, score_with_transition) - - # Generate the most likely path - viterbi = [argmax(scores)] - for bp in reversed(back_pointers[1:]): - viterbi.append(bp[viterbi[-1]]) - viterbi.reverse() - viterbi_score = max(scores) - - return viterbi, viterbi_score - - -class ViterbiDecoder: - def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): - self.vocab_size = vocab_size - self.n = n - self.top_k = 1 - self.progress_bar = progress_bar - - self._build_transitions(data_loader) - - def _build_transitions(self, data_loader): - - # Count n-grams - - c = Counter() - for _, label in 
tqdm(data_loader, disable=not self.progress_bar): - count = Counter( - tuple(b.item() for b in a) - for a in zip(*(label[i:] for i in range(self.n))) - ) - c += count - - # Encode as transition matrix - - ind = torch.tensor([a for (a, _) in c.items()]).t() - val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float) - - transitions = ( - torch.sparse_coo_tensor( - indices=ind, values=val, size=[self.vocab_size, self.vocab_size] - ) - .coalesce() - .to_dense() - ) - transitions = transitions / torch.max( - torch.tensor(1.0), transitions.max(dim=1)[0] - ).unsqueeze(1) - - self.transitions = transitions - - def _viterbi_decode(self, tag_sequence: torch.Tensor): - """ - Perform Viterbi decoding in log space over a sequence given a transition matrix - specifying pairwise (transition) potentials between tags and a matrix of shape - (sequence_length, num_tags) specifying unary potentials for possible tags per - timestep. - - Parameters - ---------- - tag_sequence : torch.Tensor, required. - A tensor of shape (sequence_length, num_tags) representing scores for - a set of tags over a given sequence. - - Returns - ------- - viterbi_path : List[int] - The tag indices of the maximum likelihood tag sequence. - viterbi_score : float - The score of the viterbi path. - """ - sequence_length, num_tags = tag_sequence.size() - - path_scores = [] - path_indices = [] - # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) - # to allow for 1 permutation. - path_scores.append(tag_sequence[0, :].unsqueeze(0)) - # assert path_scores[0].size() == (n_permutations, num_tags) - - # Evaluate the scores for all possible paths. - for timestep in range(1, sequence_length): - # Add pairwise potentials to current scores. - # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) - summed_potentials = ( - path_scores[timestep - 1].unsqueeze(2) + self.transitions - ) - summed_potentials = summed_potentials.view(-1, num_tags) - - # Best pairwise potential path score from the previous timestep. - max_k = min(summed_potentials.size()[0], self.top_k) - scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) - # assert scores.size() == (n_permutations, num_tags) - # assert paths.size() == (n_permutations, num_tags) - - scores = tag_sequence[timestep, :] + scores - # assert scores.size() == (n_permutations, num_tags) - path_scores.append(scores) - path_indices.append(paths.squeeze()) - - # Construct the most likely sequence backwards. - path_scores = path_scores[-1].view(-1) - max_k = min(path_scores.size()[0], self.top_k) - viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) - - viterbi_paths = [] - for i in range(max_k): - - viterbi_path = [best_paths[i].item()] - for backward_timestep in reversed(path_indices): - viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) - - # Reverse the backward path. - viterbi_path.reverse() - - # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. 
- viterbi_path = [j % num_tags for j in viterbi_path] - viterbi_paths.append(viterbi_path) - - return viterbi_paths, viterbi_scores - - def __call__(self, tag_sequence: torch.Tensor): - - outputs = [] - scores = [] - for i in range(tag_sequence.shape[1]): - paths, score = self._viterbi_decode(tag_sequence[:, i, :]) - outputs.append(paths) - scores.append(score) - - return torch.tensor(outputs).transpose(0, -1), torch.cat(scores)[:, 0, :] diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 83cd77d13c..32651f12f6 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -16,7 +16,7 @@ from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from ctc_decoders import GreedyDecoder, GreedyIterableDecoder, ViterbiDecoder, ListViterbiDecoder +from ctc_decoders import GreedyDecoder from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance @@ -68,7 +68,7 @@ def parse_args(): "--decoder", metavar="D", default="greedy", - choices=["greedy", "greedyiter", "viterbi"], + choices=["greedy"], help="decoder to use", ) parser.add_argument( @@ -424,12 +424,6 @@ def main(args, rank=0): if args.decoder == "greedy": decoder = GreedyDecoder() - elif args.decoder == "greedyiter": - decoder = GreedyIterableDecoder() - elif args.decoder == "viterbi": - decoder = ListViterbiDecoder( - training, len(language_model), progress_bar=args.progress_bar - ) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From 61b7afc32b2bd7232fd6468ba26724872f91d202 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:40:44 -0700 Subject: [PATCH 084/129] Revert "remove other decoders than greedy." This reverts commit fb114372e89e317bf48d0b1f846c60bca8efe1ac. 
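Since the revert below restores `ListViterbiDecoder`, here is the recurrence its `_decode` helper implements, traced on a toy two-tag problem with made-up emission and transition values: the score of tag j at step t is `emission[t][j] + max_i(score[t-1][i] + transition[i][j])`, with back-pointers recording the maximizing i.

```python
# Toy trace of ListViterbiDecoder._decode from the restored ctc_decoders.py.
emissions = [[0.1, 0.9], [0.8, 0.2]]    # (time, tags) scores
transitions = [[0.0, 0.0], [0.0, 0.0]]  # uniform transitions for simplicity

path, score = ListViterbiDecoder._decode(emissions, transitions)
assert path == [1, 0]  # best tag is 1 at t=0, then 0 at t=1
assert abs(score - (0.9 + 0.8)) < 1e-9
```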
--- examples/pipeline/ctc_decoders.py | 218 ++++++++++++++++++++++++++++++ examples/pipeline/main.py | 10 +- 2 files changed, 226 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline/ctc_decoders.py index b4f155d6fa..9a3018be93 100644 --- a/examples/pipeline/ctc_decoders.py +++ b/examples/pipeline/ctc_decoders.py @@ -1,4 +1,30 @@ +from collections import Counter + +import torch from torch import topk +from tqdm import tqdm + + +class GreedyIterableDecoder: + def __init__(self, blank_label=0, collapse_repeated=True): + self.blank_label = blank_label + self.collapse_repeated = collapse_repeated + + def __call__(self, output): + arg_maxes = torch.argmax(output, dim=-1) + decodes = [] + for args in arg_maxes: + decode = [] + for j, index in enumerate(args): + if index != self.blank_label: + if self.collapse_repeated and j != 0 and index == args[j - 1]: + continue + decode.append(index.item()) + decode = torch.tensor(decode) + decodes.append(decode) + # decodes = torch.tensor(decodes) + decodes = torch.nn.utils.rnn.pad_sequence(decodes, batch_first=True) + return decodes class GreedyDecoder: @@ -13,3 +39,195 @@ def __call__(self, outputs): """ _, indices = topk(outputs, k=1, dim=-1) return indices[..., 0] + + +def zeros_like(m): + return zeros(len(m), len(m[0])) + + +def zeros(d1, d2): + return list(list(0 for _ in range(d2)) for _ in range(d1)) + + +def apply_transpose(f, m): + return list(map(f, zip(*m))) + + +def argmax(l): + return max(range(len(l)), key=lambda i: l[i]) + + +def add1d2d(m1, m2): + return [[v2 + v1 for v2 in m2_row] for m2_row, v1 in zip(m2, m1)] + + +def add1d1d(v1, v2): + return [e + s for e, s in zip(v1, v2)] + + +class ListViterbiDecoder: + def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): + self._transitions = self._build_transitions( + data_loader, vocab_size, n, progress_bar + ) + + def __call__(self, emissions): + return torch.tensor([self._decode(emissions[i].tolist(), self._transitions)[0] for i in range(len(emissions))]) + + @staticmethod + def _build_transitions(data_loader, vocab_size, n=2, progress_bar=False): + + # Count n-grams + count = Counter() + for _, label in tqdm(data_loader, disable=not progress_bar): + count += Counter(a for a in zip(*(label[i:] for i in range(n)))) + + # Write as matrix + transitions = zeros(vocab_size, vocab_size) + for (k1, k2), v in count.items(): + transitions[k1][k2] = v + + return transitions + + @staticmethod + def _decode(emissions, transitions): + scores = zeros_like(emissions) + back_pointers = zeros_like(emissions) + scores = emissions[0] + + # Generate most likely scores and paths for each step in sequence + for i in range(1, len(emissions)): + score_with_transition = add1d2d(scores, transitions) + max_score_with_transition = apply_transpose(max, score_with_transition) + scores = add1d1d(emissions[i], max_score_with_transition) + back_pointers[i] = apply_transpose(argmax, score_with_transition) + + # Generate the most likely path + viterbi = [argmax(scores)] + for bp in reversed(back_pointers[1:]): + viterbi.append(bp[viterbi[-1]]) + viterbi.reverse() + viterbi_score = max(scores) + + return viterbi, viterbi_score + + +class ViterbiDecoder: + def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): + self.vocab_size = vocab_size + self.n = n + self.top_k = 1 + self.progress_bar = progress_bar + + self._build_transitions(data_loader) + + def _build_transitions(self, data_loader): + + # Count n-grams + + c = Counter() + for _, label in 
tqdm(data_loader, disable=not self.progress_bar): + count = Counter( + tuple(b.item() for b in a) + for a in zip(*(label[i:] for i in range(self.n))) + ) + c += count + + # Encode as transition matrix + + ind = torch.tensor([a for (a, _) in c.items()]).t() + val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float) + + transitions = ( + torch.sparse_coo_tensor( + indices=ind, values=val, size=[self.vocab_size, self.vocab_size] + ) + .coalesce() + .to_dense() + ) + transitions = transitions / torch.max( + torch.tensor(1.0), transitions.max(dim=1)[0] + ).unsqueeze(1) + + self.transitions = transitions + + def _viterbi_decode(self, tag_sequence: torch.Tensor): + """ + Perform Viterbi decoding in log space over a sequence given a transition matrix + specifying pairwise (transition) potentials between tags and a matrix of shape + (sequence_length, num_tags) specifying unary potentials for possible tags per + timestep. + + Parameters + ---------- + tag_sequence : torch.Tensor, required. + A tensor of shape (sequence_length, num_tags) representing scores for + a set of tags over a given sequence. + + Returns + ------- + viterbi_path : List[int] + The tag indices of the maximum likelihood tag sequence. + viterbi_score : float + The score of the viterbi path. + """ + sequence_length, num_tags = tag_sequence.size() + + path_scores = [] + path_indices = [] + # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) + # to allow for 1 permutation. + path_scores.append(tag_sequence[0, :].unsqueeze(0)) + # assert path_scores[0].size() == (n_permutations, num_tags) + + # Evaluate the scores for all possible paths. + for timestep in range(1, sequence_length): + # Add pairwise potentials to current scores. + # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) + summed_potentials = ( + path_scores[timestep - 1].unsqueeze(2) + self.transitions + ) + summed_potentials = summed_potentials.view(-1, num_tags) + + # Best pairwise potential path score from the previous timestep. + max_k = min(summed_potentials.size()[0], self.top_k) + scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) + # assert scores.size() == (n_permutations, num_tags) + # assert paths.size() == (n_permutations, num_tags) + + scores = tag_sequence[timestep, :] + scores + # assert scores.size() == (n_permutations, num_tags) + path_scores.append(scores) + path_indices.append(paths.squeeze()) + + # Construct the most likely sequence backwards. + path_scores = path_scores[-1].view(-1) + max_k = min(path_scores.size()[0], self.top_k) + viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) + + viterbi_paths = [] + for i in range(max_k): + + viterbi_path = [best_paths[i].item()] + for backward_timestep in reversed(path_indices): + viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) + + # Reverse the backward path. + viterbi_path.reverse() + + # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. 
+ viterbi_path = [j % num_tags for j in viterbi_path] + viterbi_paths.append(viterbi_path) + + return viterbi_paths, viterbi_scores + + def __call__(self, tag_sequence: torch.Tensor): + + outputs = [] + scores = [] + for i in range(tag_sequence.shape[1]): + paths, score = self._viterbi_decode(tag_sequence[:, i, :]) + outputs.append(paths) + scores.append(score) + + return torch.tensor(outputs).transpose(0, -1), torch.cat(scores)[:, 0, :] diff --git a/examples/pipeline/main.py b/examples/pipeline/main.py index 32651f12f6..83cd77d13c 100644 --- a/examples/pipeline/main.py +++ b/examples/pipeline/main.py @@ -16,7 +16,7 @@ from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from ctc_decoders import GreedyDecoder +from ctc_decoders import GreedyDecoder, GreedyIterableDecoder, ViterbiDecoder, ListViterbiDecoder from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance @@ -68,7 +68,7 @@ def parse_args(): "--decoder", metavar="D", default="greedy", - choices=["greedy"], + choices=["greedy", "greedyiter", "viterbi"], help="decoder to use", ) parser.add_argument( @@ -424,6 +424,12 @@ def main(args, rank=0): if args.decoder == "greedy": decoder = GreedyDecoder() + elif args.decoder == "greedyiter": + decoder = GreedyIterableDecoder() + elif args.decoder == "viterbi": + decoder = ListViterbiDecoder( + training, len(language_model), progress_bar=args.progress_bar + ) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From bc95fb5f2bb5d6fc7b10566396f7f6c469f83a9d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:42:20 -0700 Subject: [PATCH 085/129] changing name of folder.
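For reference, the GreedyIterableDecoder added in the previous patch implements greedy CTC decoding in three steps: frame-wise argmax, collapsing repeated labels, and dropping blanks. A minimal self-contained sketch of that logic (illustrative shapes; not part of the patch):

```python
import torch

# Greedy CTC decoding: per-frame argmax, collapse repeats, drop blanks.
def greedy_ctc(emissions: torch.Tensor, blank: int = 0):
    # emissions: (batch, time, num_labels) scores or log-probabilities
    best = emissions.argmax(dim=-1)
    decoded = []
    for seq in best:
        collapsed = torch.unique_consecutive(seq)  # collapse repeated labels
        decoded.append(collapsed[collapsed != blank].tolist())  # remove blanks
    return decoded

print(greedy_ctc(torch.randn(2, 50, 29)))  # e.g. 28 characters + blank
```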
--- examples/{pipeline => pipeline_wav2letter}/README.md | 0 examples/{pipeline => pipeline_wav2letter}/ctc_decoders.py | 0 examples/{pipeline => pipeline_wav2letter}/datasets.py | 0 examples/{pipeline => pipeline_wav2letter}/languagemodels.py | 0 examples/{pipeline => pipeline_wav2letter}/main.py | 0 examples/{pipeline => pipeline_wav2letter}/metrics.py | 0 examples/{pipeline => pipeline_wav2letter}/utils.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename examples/{pipeline => pipeline_wav2letter}/README.md (100%) rename examples/{pipeline => pipeline_wav2letter}/ctc_decoders.py (100%) rename examples/{pipeline => pipeline_wav2letter}/datasets.py (100%) rename examples/{pipeline => pipeline_wav2letter}/languagemodels.py (100%) rename examples/{pipeline => pipeline_wav2letter}/main.py (100%) rename examples/{pipeline => pipeline_wav2letter}/metrics.py (100%) rename examples/{pipeline => pipeline_wav2letter}/utils.py (100%) diff --git a/examples/pipeline/README.md b/examples/pipeline_wav2letter/README.md similarity index 100% rename from examples/pipeline/README.md rename to examples/pipeline_wav2letter/README.md diff --git a/examples/pipeline/ctc_decoders.py b/examples/pipeline_wav2letter/ctc_decoders.py similarity index 100% rename from examples/pipeline/ctc_decoders.py rename to examples/pipeline_wav2letter/ctc_decoders.py diff --git a/examples/pipeline/datasets.py b/examples/pipeline_wav2letter/datasets.py similarity index 100% rename from examples/pipeline/datasets.py rename to examples/pipeline_wav2letter/datasets.py diff --git a/examples/pipeline/languagemodels.py b/examples/pipeline_wav2letter/languagemodels.py similarity index 100% rename from examples/pipeline/languagemodels.py rename to examples/pipeline_wav2letter/languagemodels.py diff --git a/examples/pipeline/main.py b/examples/pipeline_wav2letter/main.py similarity index 100% rename from examples/pipeline/main.py rename to examples/pipeline_wav2letter/main.py diff --git a/examples/pipeline/metrics.py b/examples/pipeline_wav2letter/metrics.py similarity index 100% rename from examples/pipeline/metrics.py rename to examples/pipeline_wav2letter/metrics.py diff --git a/examples/pipeline/utils.py b/examples/pipeline_wav2letter/utils.py similarity index 100% rename from examples/pipeline/utils.py rename to examples/pipeline_wav2letter/utils.py From d8ee1e9f3d91cb01fae8a4d065d27a0d4dff23fe Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 12:43:14 -0700 Subject: [PATCH 086/129] remove other decoders, and unused dataset class. 
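The Viterbi decoders removed below both implement the standard dynamic-programming recurrence: the best score for tag j at step t is the emission at (t, j) plus the maximum over previous tags i of score(t-1, i) + transition(i, j). A compact dense top-1 sketch of that recurrence, with hypothetical random inputs:

```python
import torch

# Dense top-1 Viterbi over (sequence_length, num_tags) emissions with a
# (num_tags, num_tags) transition matrix.
def viterbi(emissions: torch.Tensor, transitions: torch.Tensor):
    scores = emissions[0]
    back_pointers = []
    for t in range(1, emissions.size(0)):
        # total[i, j]: best score ending in tag i at t-1, then moving to tag j
        total = scores.unsqueeze(1) + transitions
        best_scores, best_prev = total.max(dim=0)
        scores = emissions[t] + best_scores
        back_pointers.append(best_prev)
    path = [int(scores.argmax())]
    for bp in reversed(back_pointers):
        path.append(int(bp[path[-1]]))
    path.reverse()
    return path, float(scores.max())

print(viterbi(torch.randn(7, 5), torch.randn(5, 5)))
```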
--- examples/pipeline_wav2letter/ctc_decoders.py | 218 ------------------- examples/pipeline_wav2letter/datasets.py | 22 -- examples/pipeline_wav2letter/main.py | 10 +- 3 files changed, 2 insertions(+), 248 deletions(-) diff --git a/examples/pipeline_wav2letter/ctc_decoders.py b/examples/pipeline_wav2letter/ctc_decoders.py index 9a3018be93..b4f155d6fa 100644 --- a/examples/pipeline_wav2letter/ctc_decoders.py +++ b/examples/pipeline_wav2letter/ctc_decoders.py @@ -1,30 +1,4 @@ -from collections import Counter - -import torch from torch import topk -from tqdm import tqdm - - -class GreedyIterableDecoder: - def __init__(self, blank_label=0, collapse_repeated=True): - self.blank_label = blank_label - self.collapse_repeated = collapse_repeated - - def __call__(self, output): - arg_maxes = torch.argmax(output, dim=-1) - decodes = [] - for args in arg_maxes: - decode = [] - for j, index in enumerate(args): - if index != self.blank_label: - if self.collapse_repeated and j != 0 and index == args[j - 1]: - continue - decode.append(index.item()) - decode = torch.tensor(decode) - decodes.append(decode) - # decodes = torch.tensor(decodes) - decodes = torch.nn.utils.rnn.pad_sequence(decodes, batch_first=True) - return decodes class GreedyDecoder: @@ -39,195 +13,3 @@ def __call__(self, outputs): """ _, indices = topk(outputs, k=1, dim=-1) return indices[..., 0] - - -def zeros_like(m): - return zeros(len(m), len(m[0])) - - -def zeros(d1, d2): - return list(list(0 for _ in range(d2)) for _ in range(d1)) - - -def apply_transpose(f, m): - return list(map(f, zip(*m))) - - -def argmax(l): - return max(range(len(l)), key=lambda i: l[i]) - - -def add1d2d(m1, m2): - return [[v2 + v1 for v2 in m2_row] for m2_row, v1 in zip(m2, m1)] - - -def add1d1d(v1, v2): - return [e + s for e, s in zip(v1, v2)] - - -class ListViterbiDecoder: - def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): - self._transitions = self._build_transitions( - data_loader, vocab_size, n, progress_bar - ) - - def __call__(self, emissions): - return torch.tensor([self._decode(emissions[i].tolist(), self._transitions)[0] for i in range(len(emissions))]) - - @staticmethod - def _build_transitions(data_loader, vocab_size, n=2, progress_bar=False): - - # Count n-grams - count = Counter() - for _, label in tqdm(data_loader, disable=not progress_bar): - count += Counter(a for a in zip(*(label[i:] for i in range(n)))) - - # Write as matrix - transitions = zeros(vocab_size, vocab_size) - for (k1, k2), v in count.items(): - transitions[k1][k2] = v - - return transitions - - @staticmethod - def _decode(emissions, transitions): - scores = zeros_like(emissions) - back_pointers = zeros_like(emissions) - scores = emissions[0] - - # Generate most likely scores and paths for each step in sequence - for i in range(1, len(emissions)): - score_with_transition = add1d2d(scores, transitions) - max_score_with_transition = apply_transpose(max, score_with_transition) - scores = add1d1d(emissions[i], max_score_with_transition) - back_pointers[i] = apply_transpose(argmax, score_with_transition) - - # Generate the most likely path - viterbi = [argmax(scores)] - for bp in reversed(back_pointers[1:]): - viterbi.append(bp[viterbi[-1]]) - viterbi.reverse() - viterbi_score = max(scores) - - return viterbi, viterbi_score - - -class ViterbiDecoder: - def __init__(self, data_loader, vocab_size, n=2, progress_bar=False): - self.vocab_size = vocab_size - self.n = n - self.top_k = 1 - self.progress_bar = progress_bar - - self._build_transitions(data_loader) - - def 
_build_transitions(self, data_loader): - - # Count n-grams - - c = Counter() - for _, label in tqdm(data_loader, disable=not self.progress_bar): - count = Counter( - tuple(b.item() for b in a) - for a in zip(*(label[i:] for i in range(self.n))) - ) - c += count - - # Encode as transition matrix - - ind = torch.tensor([a for (a, _) in c.items()]).t() - val = torch.tensor([b for (_, b) in c.items()], dtype=torch.float) - - transitions = ( - torch.sparse_coo_tensor( - indices=ind, values=val, size=[self.vocab_size, self.vocab_size] - ) - .coalesce() - .to_dense() - ) - transitions = transitions / torch.max( - torch.tensor(1.0), transitions.max(dim=1)[0] - ).unsqueeze(1) - - self.transitions = transitions - - def _viterbi_decode(self, tag_sequence: torch.Tensor): - """ - Perform Viterbi decoding in log space over a sequence given a transition matrix - specifying pairwise (transition) potentials between tags and a matrix of shape - (sequence_length, num_tags) specifying unary potentials for possible tags per - timestep. - - Parameters - ---------- - tag_sequence : torch.Tensor, required. - A tensor of shape (sequence_length, num_tags) representing scores for - a set of tags over a given sequence. - - Returns - ------- - viterbi_path : List[int] - The tag indices of the maximum likelihood tag sequence. - viterbi_score : float - The score of the viterbi path. - """ - sequence_length, num_tags = tag_sequence.size() - - path_scores = [] - path_indices = [] - # At the beginning, the maximum number of permutations is 1; therefore, we unsqueeze(0) - # to allow for 1 permutation. - path_scores.append(tag_sequence[0, :].unsqueeze(0)) - # assert path_scores[0].size() == (n_permutations, num_tags) - - # Evaluate the scores for all possible paths. - for timestep in range(1, sequence_length): - # Add pairwise potentials to current scores. - # assert path_scores[timestep - 1].size() == (n_permutations, num_tags) - summed_potentials = ( - path_scores[timestep - 1].unsqueeze(2) + self.transitions - ) - summed_potentials = summed_potentials.view(-1, num_tags) - - # Best pairwise potential path score from the previous timestep. - max_k = min(summed_potentials.size()[0], self.top_k) - scores, paths = torch.topk(summed_potentials, k=max_k, dim=0) - # assert scores.size() == (n_permutations, num_tags) - # assert paths.size() == (n_permutations, num_tags) - - scores = tag_sequence[timestep, :] + scores - # assert scores.size() == (n_permutations, num_tags) - path_scores.append(scores) - path_indices.append(paths.squeeze()) - - # Construct the most likely sequence backwards. - path_scores = path_scores[-1].view(-1) - max_k = min(path_scores.size()[0], self.top_k) - viterbi_scores, best_paths = torch.topk(path_scores, k=max_k, dim=0) - - viterbi_paths = [] - for i in range(max_k): - - viterbi_path = [best_paths[i].item()] - for backward_timestep in reversed(path_indices): - viterbi_path.append(int(backward_timestep.view(-1)[viterbi_path[-1]])) - - # Reverse the backward path. - viterbi_path.reverse() - - # Viterbi paths uses (num_tags * n_permutations) nodes; therefore, we need to modulo. 
- viterbi_path = [j % num_tags for j in viterbi_path] - viterbi_paths.append(viterbi_path) - - return viterbi_paths, viterbi_scores - - def __call__(self, tag_sequence: torch.Tensor): - - outputs = [] - scores = [] - for i in range(tag_sequence.shape[1]): - paths, score = self._viterbi_decode(tag_sequence[:, i, :]) - outputs.append(paths) - scores.append(score) - - return torch.tensor(outputs).transpose(0, -1), torch.cat(scores)[:, 0, :] diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index fbca2559af..e36b1a85d9 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -4,28 +4,6 @@ from torchaudio.datasets import LIBRISPEECH -class IterableMemoryCache: - def __init__(self, iterable): - self.iterable = iterable - self._iter = iter(iterable) - self._done = False - self._values = [] - - def __iter__(self): - if self._done: - return iter(self._values) - return itertools.chain(self._values, self._gen_iter()) - - def _gen_iter(self): - for new_value in self._iter: - self._values.append(new_value) - yield new_value - self._done = True - - def __len__(self): - return len(self._iterable) - - class MapMemoryCache(torch.utils.data.Dataset): """ Wrap a dataset so that, whenever a new item is returned, it is saved to memory. diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 83cd77d13c..32651f12f6 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -16,7 +16,7 @@ from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample -from ctc_decoders import GreedyDecoder, GreedyIterableDecoder, ViterbiDecoder, ListViterbiDecoder +from ctc_decoders import GreedyDecoder from datasets import collate_factory, datasets_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance @@ -68,7 +68,7 @@ def parse_args(): "--decoder", metavar="D", default="greedy", - choices=["greedy", "greedyiter", "viterbi"], + choices=["greedy"], help="decoder to use", ) parser.add_argument( @@ -424,12 +424,6 @@ def main(args, rank=0): if args.decoder == "greedy": decoder = GreedyDecoder() - elif args.decoder == "greedyiter": - decoder = GreedyIterableDecoder() - elif args.decoder == "viterbi": - decoder = ListViterbiDecoder( - training, len(language_model), progress_bar=args.progress_bar - ) model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins From 0503f65aa849a7641f5e0216cd77474fd3713087 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 23 Jul 2020 13:37:25 -0700 Subject: [PATCH 087/129] rename functions to align with other pipeline. 
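The diff below wraps LIBRISPEECH in a generic Processed dataset and pairs it with MapMemoryCache. A self-contained sketch of that caching idea, with dummy list data standing in for the real dataset (the pipeline's own class lives in datasets.py):

```python
import torch

# Memoize items on first access so later epochs skip transform/decoding work.
class MapMemoryCache(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self._cache = [None] * len(dataset)

    def __getitem__(self, n):
        if self._cache[n] is None:
            self._cache[n] = self.dataset[n]
        return self._cache[n]

    def __len__(self):
        return len(self.dataset)

data = MapMemoryCache(["a", "b", "c"])
print(data[1], len(data))
```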
--- examples/pipeline_wav2letter/datasets.py | 24 +++++++++++------------- examples/pipeline_wav2letter/main.py | 6 +++--- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index e36b1a85d9..b3f6afdd7e 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -26,19 +26,18 @@ def __len__(self): return len(self.dataset) -class ProcessedLIBRISPEECH(LIBRISPEECH): - def __init__(self, transforms, encode, *args, **kwargs): +class Processed(torch.utils.data.Dataset): + def __init__(self, dataset, transforms, encode): + self.dataset = dataset self.transforms = transforms self.encode = encode - super().__init__(*args, **kwargs) def __getitem__(self, key): - item = super().__getitem__(key) + item = self.dataset[key] return self.process_datapoint(item) - def __next__(self): - item = super().__next__() - return self.process_datapoint(item) + def __len__(self): + return len(self.dataset) def process_datapoint(self, item): transformed = item[0] # .to(device) @@ -56,7 +55,7 @@ def process_datapoint(self, item): return transformed, target -def datasets_librispeech( +def split_process_librispeech( transforms, language_model, root="/datasets01/", @@ -69,13 +68,12 @@ def create(tag): data = torch.utils.data.ConcatDataset( [ - ProcessedLIBRISPEECH( + Processed( + LIBRISPEECH( + root, t, folder_in_archive=folder_in_archive, download=False, + ), transforms, language_model.encode, - root, - t, - folder_in_archive=folder_in_archive, - download=False, ) for t in tag ] diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 32651f12f6..fd59bce134 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -12,12 +12,12 @@ from torch.optim import SGD, Adadelta, Adam from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader -from torchaudio.datasets.utils import bg_iterator, diskcache_iterator +from torchaudio.datasets.utils import bg_iterator from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample from ctc_decoders import GreedyDecoder -from datasets import collate_factory, datasets_librispeech +from datasets import collate_factory, split_process_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance from utils import MetricLogger, count_parameters, save_checkpoint @@ -420,7 +420,7 @@ def main(args, rank=0): labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) - training, validation, _ = datasets_librispeech(transforms, language_model) + training, validation, _ = split_process_librispeech(transforms, language_model) if args.decoder == "greedy": decoder = GreedyDecoder() From cef6c5093e35a699a7524e755cdb1bbe847946f6 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 24 Jul 2020 15:07:24 -0700 Subject: [PATCH 088/129] pick which parts to train with. 
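The new --dataset-train and --dataset-valid flags below use nargs="+" so several LibriSpeech subsets can be concatenated per split. A quick sketch of how such flags parse:

```python
import argparse

# nargs="+" collects one or more subset names into a list per flag.
parser = argparse.ArgumentParser()
parser.add_argument("--dataset-train", default=["train-clean-100"], nargs="+", type=str)
parser.add_argument("--dataset-valid", default=["dev-clean"], nargs="+", type=str)

args = parser.parse_args(
    "--dataset-train train-clean-100 train-clean-360 --dataset-valid dev-clean".split()
)
print(args.dataset_train)  # ['train-clean-100', 'train-clean-360']
print(args.dataset_valid)  # ['dev-clean']
```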
--- examples/pipeline_wav2letter/datasets.py | 10 ++-------- examples/pipeline_wav2letter/main.py | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index b3f6afdd7e..5beb73a5cb 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -1,5 +1,3 @@ -import itertools - import torch from torchaudio.datasets import LIBRISPEECH @@ -56,10 +54,7 @@ def process_datapoint(self, item): def split_process_librispeech( - transforms, - language_model, - root="/datasets01/", - folder_in_archive="librispeech/062419/", + datasets, transforms, language_model, root, folder_in_archive, ): def create(tag): @@ -83,8 +78,7 @@ def create(tag): data = MapMemoryCache(data) return data - return create("train-clean-100"), create("dev-clean"), None - # return create(["train-clean-100", "train-clean-360", "train-other-500"]), create(["dev-clean", "dev-other"]), None + return tuple(create(dataset) for dataset in datasets) def collate_factory(model_length_function): diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index fd59bce134..842b99fb32 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -119,10 +119,18 @@ def parse_args(): parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) parser.add_argument( - "--dataset", - default="librispeech", + "--dataset-train", + default=["train-100"], + nargs="+", type=str, - help="select dataset to train with", + help="select which part of librispeech to train with", + ) + parser.add_argument( + "--dataset-valid", + default=["dev-clean"], + nargs="+", + type=str, + help="select which part of librispeech to validate with", ) parser.add_argument( "--distributed", action="store_true", help="enable DistributedDataParallel" @@ -420,7 +428,13 @@ def main(args, rank=0): labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) - training, validation, _ = split_process_librispeech(transforms, language_model) + training, validation = split_process_librispeech( + [args.datasets_train, args.datasets_valid], + transforms, + language_model, + root="/datasets01/", + folder_in_archive="librispeech/062419/", + ) if args.decoder == "greedy": decoder = GreedyDecoder() From 0a90df5fc1c37908ec636fff07a50df382808442 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 24 Jul 2020 15:42:38 -0700 Subject: [PATCH 089/129] adding specaugment to validation. note that caching prevents randomization from happening in validation. 
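The masking transforms added below implement SpecAugment-style augmentation on top of the MelSpectrogram features. A runnable sketch of the train-side chain, with illustrative mel settings and mask widths:

```python
import torch
import torchaudio

# MelSpectrogram followed by SpecAugment-style frequency and time masking.
transform = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, n_mels=13),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=10),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)
waveform = torch.randn(1, 16000)  # one second of synthetic audio
print(transform(waveform).shape)  # (1, n_mels, frames)
```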
--- examples/pipeline_wav2letter/datasets.py | 19 ++++++++----- examples/pipeline_wav2letter/main.py | 34 ++++++++++++++++++++---- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index 5beb73a5cb..9ebba13eab 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -56,21 +56,25 @@ def process_datapoint(self, item): def split_process_librispeech( datasets, transforms, language_model, root, folder_in_archive, ): - def create(tag): + def create(tags, cache=True): - if isinstance(tag, str): - tag = [tag] + if isinstance(tags, str): + tags = [tags] + if isinstance(transforms, list): + transform_list = transforms + else: + transform_list = [transforms] data = torch.utils.data.ConcatDataset( [ Processed( LIBRISPEECH( - root, t, folder_in_archive=folder_in_archive, download=False, + root, tag, folder_in_archive=folder_in_archive, download=False, ), - transforms, + transform, language_model.encode, ) - for t in tag + for tag, transform in zip(tags, transform_list) ] ) @@ -78,6 +82,9 @@ def create(tag): data = MapMemoryCache(data) return data + # FIXME For performance, we cache all datasets + # Do not cache first dataset + # return tuple(create(dataset, cache=i > 0) for i, dataset in enumerate(datasets)) return tuple(create(dataset) for dataset in datasets) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 842b99fb32..4a6bcf02bc 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -30,6 +30,20 @@ def parse_args(): parser = argparse.ArgumentParser() + parser.add_argument( + "--freq-mask", + default=0, + type=int, + metavar="N", + help="maximal width of frequency mask", + ) + parser.add_argument( + "--time-mask", + default=0, + type=int, + metavar="N", + help="maximal width of time mask", + ) parser.add_argument( "--workers", default=0, @@ -410,16 +424,26 @@ def main(args, rank=0): sample_rate_original = 16000 - transforms = torch.nn.Sequential( + transforms_valid = torch.nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs), torchaudio.transforms.MelSpectrogram( sample_rate=sample_rate_original, **melkwargs ), - # torchaudio.transforms.FrequencyMasking(freq_mask_param=args.n_bins), - # torchaudio.transforms.TimeMasking(time_mask_param=35) ) + transforms_train = transforms_valid + if args.freq_mask: + transforms_train = torch.nn.Sequential( + transforms_train, + torchaudio.transforms.FrequencyMasking(freq_mask_param=args.freq_mask), + ) + if args.time_mask: + transforms_train = torch.nn.Sequential( + transforms_train, + torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask), + ) + # Text preprocessing char_blank = "*" @@ -429,8 +453,8 @@ def main(args, rank=0): language_model = LanguageModel(labels, char_blank, char_space) training, validation = split_process_librispeech( - [args.datasets_train, args.datasets_valid], - transforms, + [args.dataset_train, args.dataset_valid], + [transforms_train, transforms_valid], language_model, root="/datasets01/", folder_in_archive="librispeech/062419/", From 1563288297f5d42bb12ad89e338b8b5a4044c43a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 08:05:57 -0700 Subject: [PATCH 090/129] updating readme. 
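The README's parsing recipe boils down to reading one JSON object per line from the saved log. An equivalent minimal helper, assuming pandas is available:

```python
import json

import pandas

# Parse a log file containing one JSON object per line into a DataFrame.
def read_json(filename):
    with open(filename) as f:
        return pandas.DataFrame(json.loads(line) for line in f if line.strip())
```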
--- examples/pipeline_wav2letter/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline_wav2letter/README.md b/examples/pipeline_wav2letter/README.md index 4fce6364c6..a9baf39095 100644 --- a/examples/pipeline_wav2letter/README.md +++ b/examples/pipeline_wav2letter/README.md @@ -12,12 +12,12 @@ python main.py \ --momentum .8 \ --clip-grad 0. \ --optimizer "adadelta" \ - --scheduler "exponential" + --scheduler "reduceonplateau" ``` ### Output -The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line. Further information is reported to standard error. Here is an example python function to parse the standard output. +The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line. Further information is reported to standard error. Here is an example python function to parse the standard output when saved to a file. ```python def read_json(filename): """ From 463a25cba66fcb4a905bb824853ddf420624cd8d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 14:36:19 -0700 Subject: [PATCH 091/129] typo in metric logging. --- examples/pipeline_wav2letter/main.py | 34 ++++++++++++---------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 4a6bcf02bc..c903daf5c4 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -247,7 +247,7 @@ def train_one_epoch( optimizer.step() - sums["length_dataset"] += len(inputs) + sums["dataset length"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) record_error_rates(sums, metric) @@ -287,13 +287,13 @@ def record_error_rates(sums, metric): metric["cer"] = sums["cer"] metric["wer"] = sums["wer"] - metric["cer over dataset length"] = sums["cer"] / sums["length_dataset"] - metric["wer over dataset length"] = sums["wer"] / sums["length_dataset"] - metric["cer over target length"] = sums["cer"] / sums["total_chars"] - metric["wer over target length"] = sums["wer"] / sums["total_words"] - metric["target length"] = sums["total_chars"] - metric["target length"] = sums["total_words"] - metric["dataset length"] = sums["length_dataset"] + metric["cer over dataset length"] = sums["cer"] / sums["dataset length"] + metric["wer over dataset length"] = sums["wer"] / sums["dataset length"] + metric["cer over target length"] = sums["cer"] / sums["total chars"] + metric["wer over target length"] = sums["wer"] / sums["total words"] + metric["target chars"] = sums["total chars"] + metric["target words"] = sums["total words"] + metric["dataset length"] = sums[""] def compute_error_rates(outputs, targets, decoder, language_model, sums): @@ -309,24 +309,20 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums): target_print = target[i].ljust(print_length)[:print_length] logging.info(f"Target: {target_print} Output: {output_print}") - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] cers = sum(cers) n = sum(len(t) for t in target) sums["cer"] += cers - # sums["cer_relative"] += cers / n - sums["total_chars"] += n + sums["total chars"] += n output = 
[o.split(language_model.char_space) for o in output] - target = [o.split(language_model.char_space) for o in target] + target = [t.split(language_model.char_space) for t in target] - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = [levenshtein_distance(t, o) for t, o in zip(target, output)] wers = sum(wers) - n = len(target) + n = sum(len(t) for t in target) sums["wer"] += wers - # sums["wer_relative"] += wers / n - sums["total_words"] += n + sums["total words"] += n def evaluate( @@ -360,7 +356,7 @@ def evaluate( outputs, targets, tensors_lengths, target_lengths ).item() - sums["length_dataset"] += len(inputs) + sums["dataset length"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) From 18a18e63f621e778f15fdffdd18f4b1491210a05 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 14:36:52 -0700 Subject: [PATCH 092/129] Revert "typo in metric logging." This reverts commit acac245eec250f61d2039a67933d3c01f1975ce9. --- examples/pipeline_wav2letter/main.py | 34 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index c903daf5c4..4a6bcf02bc 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -247,7 +247,7 @@ def train_one_epoch( optimizer.step() - sums["dataset length"] += len(inputs) + sums["length_dataset"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) record_error_rates(sums, metric) @@ -287,13 +287,13 @@ def record_error_rates(sums, metric): metric["cer"] = sums["cer"] metric["wer"] = sums["wer"] - metric["cer over dataset length"] = sums["cer"] / sums["dataset length"] - metric["wer over dataset length"] = sums["wer"] / sums["dataset length"] - metric["cer over target length"] = sums["cer"] / sums["total chars"] - metric["wer over target length"] = sums["wer"] / sums["total words"] - metric["target chars"] = sums["total chars"] - metric["target words"] = sums["total words"] - metric["dataset length"] = sums[""] + metric["cer over dataset length"] = sums["cer"] / sums["length_dataset"] + metric["wer over dataset length"] = sums["wer"] / sums["length_dataset"] + metric["cer over target length"] = sums["cer"] / sums["total_chars"] + metric["wer over target length"] = sums["wer"] / sums["total_words"] + metric["target length"] = sums["total_chars"] + metric["target length"] = sums["total_words"] + metric["dataset length"] = sums["length_dataset"] def compute_error_rates(outputs, targets, decoder, language_model, sums): @@ -309,20 +309,24 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums): target_print = target[i].ljust(print_length)[:print_length] logging.info(f"Target: {target_print} Output: {output_print}") - cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] + cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # cers_normalized = [d / len(a) for a, d in zip(target, cers)] cers = sum(cers) n = sum(len(t) for t in target) sums["cer"] += cers - sums["total chars"] += n + # sums["cer_relative"] += cers / n + sums["total_chars"] += n output = [o.split(language_model.char_space) for o in output] - target = [t.split(language_model.char_space) for t in target] + target = [o.split(language_model.char_space) for o in target] - wers = [levenshtein_distance(t, o) for t, o in zip(target, 
output)] + wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] + # wers_normalized = [d / len(a) for a, d in zip(target, wers)] wers = sum(wers) - n = sum(len(t) for t in target) + n = len(target) sums["wer"] += wers - sums["total words"] += n + # sums["wer_relative"] += wers / n + sums["total_words"] += n def evaluate( @@ -356,7 +360,7 @@ def evaluate( outputs, targets, tensors_lengths, target_lengths ).item() - sums["dataset length"] += len(inputs) + sums["length_dataset"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) From c4545d2e3b3059cc37aa6000f7145da1111b5788 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 14:37:37 -0700 Subject: [PATCH 093/129] Revert "Revert "typo in metric logging."" This reverts commit 2c80d9691ed401044da734c40df3715dba92d0db. --- examples/pipeline_wav2letter/main.py | 34 ++++++++++++---------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 4a6bcf02bc..c903daf5c4 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -247,7 +247,7 @@ def train_one_epoch( optimizer.step() - sums["length_dataset"] += len(inputs) + sums["dataset length"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) record_error_rates(sums, metric) @@ -287,13 +287,13 @@ def record_error_rates(sums, metric): metric["cer"] = sums["cer"] metric["wer"] = sums["wer"] - metric["cer over dataset length"] = sums["cer"] / sums["length_dataset"] - metric["wer over dataset length"] = sums["wer"] / sums["length_dataset"] - metric["cer over target length"] = sums["cer"] / sums["total_chars"] - metric["wer over target length"] = sums["wer"] / sums["total_words"] - metric["target length"] = sums["total_chars"] - metric["target length"] = sums["total_words"] - metric["dataset length"] = sums["length_dataset"] + metric["cer over dataset length"] = sums["cer"] / sums["dataset length"] + metric["wer over dataset length"] = sums["wer"] / sums["dataset length"] + metric["cer over target length"] = sums["cer"] / sums["total chars"] + metric["wer over target length"] = sums["wer"] / sums["total words"] + metric["target chars"] = sums["total chars"] + metric["target words"] = sums["total words"] + metric["dataset length"] = sums[""] def compute_error_rates(outputs, targets, decoder, language_model, sums): @@ -309,24 +309,20 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums): target_print = target[i].ljust(print_length)[:print_length] logging.info(f"Target: {target_print} Output: {output_print}") - cers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # cers_normalized = [d / len(a) for a, d in zip(target, cers)] + cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] cers = sum(cers) n = sum(len(t) for t in target) sums["cer"] += cers - # sums["cer_relative"] += cers / n - sums["total_chars"] += n + sums["total chars"] += n output = [o.split(language_model.char_space) for o in output] - target = [o.split(language_model.char_space) for o in target] + target = [t.split(language_model.char_space) for t in target] - wers = [levenshtein_distance(a, b) for a, b in zip(target, output)] - # wers_normalized = [d / len(a) for a, d in zip(target, wers)] + wers = [levenshtein_distance(t, o) for t, o in zip(target, output)] wers = sum(wers) - n = len(target) + n = sum(len(t) for t in target) sums["wer"] += wers - # 
sums["wer_relative"] += wers / n - sums["total_words"] += n + sums["total words"] += n def evaluate( @@ -360,7 +356,7 @@ def evaluate( outputs, targets, tensors_lengths, target_lengths ).item() - sums["length_dataset"] += len(inputs) + sums["dataset length"] += len(inputs) compute_error_rates(outputs, targets, decoder, language_model, sums) From 8e2d1f7bd4b1e4540878d4dd0fbffbd0f8e815c5 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 16:19:59 -0700 Subject: [PATCH 094/129] update metric logger. --- examples/pipeline_wav2letter/main.py | 104 ++++++++++---------------- examples/pipeline_wav2letter/utils.py | 6 +- 2 files changed, 42 insertions(+), 68 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index c903daf5c4..dab2c3eb8d 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -207,17 +207,14 @@ def train_one_epoch( model.train() - sums = defaultdict(lambda: 0.0) - start1 = time() - - metric = MetricLogger("train_iteration") + metric = MetricLogger("train") metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 ): - start2 = time() + start = time() inputs = inputs.to(device, non_blocking=True) targets = targets.to(device, non_blocking=True) @@ -231,80 +228,56 @@ def train_one_epoch( # target_lengths: batch size loss = criterion(outputs, targets, tensors_lengths, target_lengths) - loss_item = loss.item() - sums["loss"] += loss_item - metric["loss"] = loss_item + metric["loss"] = loss.item() + metric["cumulative loss"] += metric["loss"] optimizer.zero_grad() loss.backward() if args.clip_grad > 0: - gradient = torch.nn.utils.clip_grad_norm_( + metric["gradient"] = torch.nn.utils.clip_grad_norm_( model.parameters(), args.clip_grad ) - sums["gradient"] += gradient - metric["gradient"] = gradient optimizer.step() - sums["dataset length"] += len(inputs) + compute_error_rates(outputs, targets, decoder, language_model, metric) - compute_error_rates(outputs, targets, decoder, language_model, sums) - record_error_rates(sums, metric) + try: + metric["lr"] = scheduler.get_last_lr()[0] + except AttributeError: + pass - metric["iteration"] = sums["iteration"] - metric["time"] = time() - start2 - metric["epoch"] = epoch + metric["batch size"] = len(inputs) + metric["dataset length"] += metric["batch size"] + metric["iteration"] += 1 + metric["average loss"] = metric["cumulative loss"] / metric["iteration"] + metric["iteration time"] = time() - start + metric["epoch time"] += metric["time"] metric() - sums["iteration"] += 1 - # TODO Remove before merge pull request if SIGNAL_RECEIVED: - return loss_item - - avg_loss = sums["loss"] / len(data_loader) - - metric = MetricLogger("train_epoch") - record_error_rates(sums, metric) - metric["epoch"] = epoch - metric["loss"] = avg_loss - metric["gradient"] = sums["gradient"] / len(data_loader) - try: - metric["lr"] = scheduler.get_last_lr()[0] - except AttributeError: - pass - metric["time"] = time() - start1 - metric() + break if isinstance(scheduler, ReduceLROnPlateau): - scheduler.step(avg_loss) + scheduler.step(metric["average loss"]) else: scheduler.step() -def record_error_rates(sums, metric): - - metric["cer"] = sums["cer"] - metric["wer"] = sums["wer"] - metric["cer over dataset length"] = sums["cer"] / sums["dataset length"] - metric["wer over dataset length"] = sums["wer"] / sums["dataset length"] - metric["cer over target length"] = sums["cer"] / sums["total chars"] - 
metric["wer over target length"] = sums["wer"] / sums["total words"] - metric["target chars"] = sums["total chars"] - metric["target words"] = sums["total words"] - metric["dataset length"] = sums[""] - - -def compute_error_rates(outputs, targets, decoder, language_model, sums): +def compute_error_rates(outputs, targets, decoder, language_model, metric): output = outputs.transpose(0, 1).to("cpu") output = decoder(output) + # Compute CER + output = language_model.decode(output.tolist()) target = language_model.decode(targets.tolist()) print_length = 20 for i in range(2): + # Print a few examples output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] logging.info(f"Target: {target_print} Output: {output_print}") @@ -312,8 +285,11 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums): cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] cers = sum(cers) n = sum(len(t) for t in target) - sums["cer"] += cers - sums["total chars"] += n + metric["cer"] += cers + metric["total chars"] += n + metric["cer over target length"] = metric["cer"] / metric["total chars"] + + # Compute WER output = [o.split(language_model.char_space) for o in output] target = [t.split(language_model.char_space) for t in target] @@ -321,8 +297,9 @@ def compute_error_rates(outputs, targets, decoder, language_model, sums): wers = [levenshtein_distance(t, o) for t, o in zip(target, output)] wers = sum(wers) n = sum(len(t) for t in target) - sums["wer"] += wers - sums["total words"] += n + metric["wer"] += wers + metric["total words"] += n + metric["wer over target length"] = metric["wer"] / metric["total words"] def evaluate( @@ -332,9 +309,9 @@ def evaluate( with torch.no_grad(): model.eval() - sums = defaultdict(lambda: 0.0) start = time() metric = MetricLogger("validation") + metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( data_loader, maxsize=2 @@ -352,27 +329,24 @@ def evaluate( # input_lengths: batch size # target_lengths: batch size - sums["loss"] += criterion( + metric["cumulative loss"] += criterion( outputs, targets, tensors_lengths, target_lengths ).item() - sums["dataset length"] += len(inputs) + metric["dataset length"] += len(inputs) + metric["iteration"] += 1 - compute_error_rates(outputs, targets, decoder, language_model, sums) + compute_error_rates(outputs, targets, decoder, language_model, metric) # TODO Remove before merge pull request if SIGNAL_RECEIVED: - return sums["loss"] / len(data_loader) - - avg_loss = sums["loss"] / len(data_loader) + break - metric["epoch"] = epoch - metric["loss"] = avg_loss - metric["time"] = time() - start - record_error_rates(sums, metric) + metric["average loss"] = metric["cumulative loss"] / metric["iteration"] + metric["validation time"] = time() - start metric() - return avg_loss + return metric["average loss"] def main(args, rank=0): diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index a5b118f322..00ea509455 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -16,11 +16,11 @@ def __init__(self, group, print_freq=1): def __setitem__(self, key, value): self.data[key].append(value) - def _get_last(self): - return {k: v[-1] for k, v in self.data.items()} + def __getitem__(self, key): + return self.data[key][-1] def __str__(self): - return str(self._get_last()) + return str({k: self[k] for k in self.data}) def __call__(self): self._iter = 
(self._iter + 1) % self.print_freq From 523e0e12a62be62af30c02571351213572ba0326 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 16:49:27 -0700 Subject: [PATCH 095/129] simplify metric logger implementation. --- examples/pipeline_wav2letter/utils.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index 00ea509455..dbc08f8deb 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -6,21 +6,15 @@ import torch -class MetricLogger: - def __init__(self, group, print_freq=1): +class MetricLogger(defaultdict): + def __init__(self, name, print_freq=1): self.print_freq = print_freq self._iter = 0 - self.data = defaultdict(lambda: deque(maxlen=self.print_freq)) - self.data["group"].append(group) - - def __setitem__(self, key, value): - self.data[key].append(value) - - def __getitem__(self, key): - return self.data[key][-1] + super().__init__(lambda: 0.) + self["name"] = name def __str__(self): - return str({k: self[k] for k in self.data}) + return str(dict(self.data)) def __call__(self): self._iter = (self._iter + 1) % self.print_freq From 7efc0286cea7fbf6ea542512e804cd8fede3a34f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 16:57:25 -0700 Subject: [PATCH 096/129] use json dumps instead. --- examples/pipeline_wav2letter/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index dbc08f8deb..ffb6e6dc7e 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -1,20 +1,21 @@ +import json import logging import os import shutil -from collections import defaultdict, deque +from collections import defaultdict import torch class MetricLogger(defaultdict): def __init__(self, name, print_freq=1): + super().__init__(lambda: 0.0) self.print_freq = print_freq self._iter = 0 - super().__init__(lambda: 0.) self["name"] = name def __str__(self): - return str(dict(self.data)) + return json.dumps(self.data) def __call__(self): self._iter = (self._iter + 1) % self.print_freq From 7780b26ce71013b35c231226b6f18c02c6683ca2 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 17:03:36 -0700 Subject: [PATCH 097/129] group metric together. 
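Patches 095 through 097 converge on a compact logger: a defaultdict that serializes itself as JSON and prints every print_freq calls. A condensed, runnable sketch of the result:

```python
import json
from collections import defaultdict

# Metrics default to 0.0; __str__ is a JSON dump; printing happens every
# print_freq calls to __call__.
class MetricLogger(defaultdict):
    def __init__(self, name, print_freq=1):
        super().__init__(lambda: 0.0)
        self.print_freq = print_freq
        self._iter = 0
        self["name"] = name

    def __str__(self):
        return json.dumps(self)

    def __call__(self):
        self._iter = (self._iter + 1) % self.print_freq
        if not self._iter:
            print(self, flush=True)

metric = MetricLogger("train")
metric["loss"] += 1.23  # defaultdict: counters start at 0.0
metric()                # prints {"name": "train", "loss": 1.23}
```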
--- examples/pipeline_wav2letter/main.py | 4 ++-- examples/pipeline_wav2letter/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index dab2c3eb8d..9a5531f2c9 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -228,8 +228,6 @@ def train_one_epoch( # target_lengths: batch size loss = criterion(outputs, targets, tensors_lengths, target_lengths) - metric["loss"] = loss.item() - metric["cumulative loss"] += metric["loss"] optimizer.zero_grad() loss.backward() @@ -251,6 +249,8 @@ def train_one_epoch( metric["batch size"] = len(inputs) metric["dataset length"] += metric["batch size"] metric["iteration"] += 1 + metric["loss"] = loss.item() + metric["cumulative loss"] += metric["loss"] metric["average loss"] = metric["cumulative loss"] / metric["iteration"] metric["iteration time"] = time() - start metric["epoch time"] += metric["time"] diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index ffb6e6dc7e..0b7ef94bd5 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -15,7 +15,7 @@ def __init__(self, name, print_freq=1): self["name"] = name def __str__(self): - return json.dumps(self.data) + return json.dumps(self) def __call__(self): self._iter = (self._iter + 1) % self.print_freq From 0006d89d97b5269f1b8f6d70cf6db7a13d2c1d13 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Mon, 27 Jul 2020 17:27:23 -0700 Subject: [PATCH 098/129] move function. --- examples/pipeline_wav2letter/main.py | 72 ++++++++++++++-------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 9a5531f2c9..fda40477ba 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -193,6 +193,42 @@ def model_length_function(tensor): return int(tensor.shape[0]) // 2 + 1 +def compute_error_rates(outputs, targets, decoder, language_model, metric): + output = outputs.transpose(0, 1).to("cpu") + output = decoder(output) + + # Compute CER + + output = language_model.decode(output.tolist()) + target = language_model.decode(targets.tolist()) + + print_length = 20 + for i in range(2): + # Print a few examples + output_print = output[i].ljust(print_length)[:print_length] + target_print = target[i].ljust(print_length)[:print_length] + logging.info(f"Target: {target_print} Output: {output_print}") + + cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] + cers = sum(cers) + n = sum(len(t) for t in target) + metric["cer"] += cers + metric["total chars"] += n + metric["cer over target length"] = metric["cer"] / metric["total chars"] + + # Compute WER + + output = [o.split(language_model.char_space) for o in output] + target = [t.split(language_model.char_space) for t in target] + + wers = [levenshtein_distance(t, o) for t, o in zip(target, output)] + wers = sum(wers) + n = sum(len(t) for t in target) + metric["wer"] += wers + metric["total words"] += n + metric["wer over target length"] = metric["wer"] / metric["total words"] + + def train_one_epoch( model, criterion, @@ -266,42 +302,6 @@ def train_one_epoch( scheduler.step() -def compute_error_rates(outputs, targets, decoder, language_model, metric): - output = outputs.transpose(0, 1).to("cpu") - output = decoder(output) - - # Compute CER - - output = language_model.decode(output.tolist()) - target = 
language_model.decode(targets.tolist()) - - print_length = 20 - for i in range(2): - # Print a few examples - output_print = output[i].ljust(print_length)[:print_length] - target_print = target[i].ljust(print_length)[:print_length] - logging.info(f"Target: {target_print} Output: {output_print}") - - cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] - cers = sum(cers) - n = sum(len(t) for t in target) - metric["cer"] += cers - metric["total chars"] += n - metric["cer over target length"] = metric["cer"] / metric["total chars"] - - # Compute WER - - output = [o.split(language_model.char_space) for o in output] - target = [t.split(language_model.char_space) for t in target] - - wers = [levenshtein_distance(t, o) for t, o in zip(target, output)] - wers = sum(wers) - n = sum(len(t) for t in target) - metric["wer"] += wers - metric["total words"] += n - metric["wer over target length"] = metric["wer"] / metric["total words"] - - def train_one_epoch( model, criterion, From 68d0ac1da6037ef76a60553514bcfbc4cfc3df31 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 28 Jul 2020 14:34:05 -0700 Subject: [PATCH 099/129] lint. --- examples/pipeline_wav2letter/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index fda40477ba..cdb24c23fa 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -3,7 +3,6 @@ import os import signal import string -from collections import defaultdict from datetime import datetime from time import time From b087ff5159a2a1eb013a8917a595d3cd24c52f54 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 28 Jul 2020 15:26:01 -0700 Subject: [PATCH 100/129] quick summary of files in folder. --- examples/pipeline_wav2letter/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/pipeline_wav2letter/README.md b/examples/pipeline_wav2letter/README.md index a9baf39095..91546b724e 100644 --- a/examples/pipeline_wav2letter/README.md +++ b/examples/pipeline_wav2letter/README.md @@ -36,3 +36,12 @@ def read_json(filename): data = [json.loads(l) for l in data.splitlines()] return pandas.DataFrame(data) ``` + +## Structure of pipeline + +* `main.py` -- the entry point +* `ctc_decoders.py` -- the greedy CTC decoder +* `datasets.py` -- the function to split and process librispeech, a collate factory function +* `languagemodels.py` -- a class to encode and decode strings +* `metrics.py` -- the levenshtein edit distance +* `utils.py` -- functions to log metrics, save checkpoint, and count parameters From f5bcead066711beac97f8b473bc001ca0d52323b Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 28 Jul 2020 16:32:03 -0700 Subject: [PATCH 101/129] pass clip_grad explicitly.
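The explicit clip_grad argument below threads through to torch.nn.utils.clip_grad_norm_, which clips the global gradient norm in place and returns the pre-clip norm. A minimal sketch of the pattern, with a toy model and illustrative values:

```python
import torch

model = torch.nn.Linear(13, 29)
optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0)
clip_grad = 0.5

loss = model(torch.randn(8, 13)).sum()
optimizer.zero_grad()
loss.backward()
if clip_grad > 0:
    # Returns the total norm computed before clipping, handy to log as a metric.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
    print(float(grad_norm))
optimizer.step()
```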
--- examples/pipeline_wav2letter/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index cdb24c23fa..dfc1eadbc7 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -238,6 +238,7 @@ def train_one_epoch( language_model, device, epoch, + clip_grad, ): model.train() @@ -267,9 +268,9 @@ def train_one_epoch( optimizer.zero_grad() loss.backward() - if args.clip_grad > 0: + if clip_grad > 0: metric["gradient"] = torch.nn.utils.clip_grad_norm_( - model.parameters(), args.clip_grad + model.parameters(), clip_grad ) optimizer.step() @@ -558,6 +559,7 @@ def main(args, rank=0): language_model, devices[0], epoch, + args.clip_grad, ) if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: From 91a06a64dea8545e9e1b28d81c72ba70df24b178 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 10:43:52 -0700 Subject: [PATCH 102/129] typo in default dataset name. --- examples/pipeline_wav2letter/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index dfc1eadbc7..4e01be3e36 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -133,7 +133,7 @@ def parse_args(): parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) parser.add_argument( "--dataset-train", - default=["train-100"], + default=["train-clean-100"], nargs="+", type=str, help="select which part of librispeech to train with", From 2167f27d70dda8f75204ef20f656a15b48a107dc Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 12:02:02 -0700 Subject: [PATCH 103/129] option to disable logger. 
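With the disable flag below, every rank constructs the same logger while only the main rank produces output, keeping call sites branch-free. A toy sketch of the behavior (the class name here is hypothetical, standing in for MetricLogger):

```python
import json

# A dict-backed logger whose output can be switched off per process.
class QuietableLogger(dict):
    def __init__(self, name, disable=False):
        super().__init__(name=name)
        self.disable = disable

    def __call__(self):
        if not self.disable:
            print(json.dumps(self), flush=True)

for rank in range(3):
    metric = QuietableLogger("validation", disable=rank != 0)
    metric["rank"] = rank
    metric()  # only rank 0 prints
```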
--- examples/pipeline_wav2letter/main.py | 24 ++++++++++++++++++------ examples/pipeline_wav2letter/utils.py | 9 +++++---- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 4e01be3e36..6c6f7170be 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -239,11 +239,12 @@ def train_one_epoch( device, epoch, clip_grad, + disable_logger=False, ): model.train() - metric = MetricLogger("train") + metric = MetricLogger("train", disable=disable_logger) metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( @@ -303,14 +304,21 @@ def train_one_epoch( def evaluate( - model, criterion, data_loader, decoder, language_model, device, epoch, + model, + criterion, + data_loader, + decoder, + language_model, + device, + epoch, + disable_logger=False, ): with torch.no_grad(): model.eval() start = time() - metric = MetricLogger("validation") + metric = MetricLogger("validation", disable=disable_logger) metric["epoch"] = epoch for inputs, targets, tensors_lengths, target_lengths in bg_iterator( @@ -354,6 +362,8 @@ def main(args, rank=0): if args.distributed: setup_distributed(rank, args.world_size) + main_rank = rank == 0 + logging.info("Start time: {}".format(str(datetime.now()))) # Install signal handler @@ -539,7 +549,7 @@ def main(args, rank=0): }, False, args.checkpoint, - rank, + not main_rank, ) if args.distributed: @@ -560,6 +570,7 @@ def main(args, rank=0): devices[0], epoch, args.clip_grad, + not main_rank, ) if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: @@ -572,6 +583,7 @@ def main(args, rank=0): language_model, devices[0], epoch, + not main_rank, ) is_best = loss < best_loss @@ -586,7 +598,7 @@ def main(args, rank=0): }, is_best, args.checkpoint, - rank, + not main_rank, ) # TODO Remove before merge pull request @@ -601,7 +613,7 @@ def main(args, rank=0): }, False, args.checkpoint, - rank, + not main_rank, ) trigger_job_requeue() diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index 0b7ef94bd5..ef6f3f7730 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -8,8 +8,9 @@ class MetricLogger(defaultdict): - def __init__(self, name, print_freq=1): + def __init__(self, name, print_freq=1, disable=False): super().__init__(lambda: 0.0) + self.disable = disable self.print_freq = print_freq self._iter = 0 self["name"] = name @@ -19,18 +20,18 @@ def __str__(self): def __call__(self): self._iter = (self._iter + 1) % self.print_freq - if not self._iter: + if not self.disable and not self._iter: print(self, flush=True) -def save_checkpoint(state, is_best, filename, rank): +def save_checkpoint(state, is_best, filename, disable): """ Save the model to a temporary file first, then copy it to filename, in case the signal interrupts the torch.save() process. """ - if rank != 0: + if disable: return if filename == "": From 10ef47ceaaf672109668f61ffea311592810063d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 12:13:11 -0700 Subject: [PATCH 104/129] ergonomics for distributed. 
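torch.multiprocessing.spawn invokes its target as fn(rank, *args), prepending the process index, which is why main now takes rank as its first argument. A self-contained sketch, with a plain dict standing in for the parsed args:

```python
import torch

def main(rank, args):
    print(f"worker {rank} of {args['world_size']}")

def spawn_main(main, args):
    if args["distributed"]:
        # spawn prepends the process index: each worker runs main(rank, args).
        torch.multiprocessing.spawn(main, args=(args,), nprocs=args["world_size"], join=True)
    else:
        main(0, args)

if __name__ == "__main__":
    spawn_main(main, {"distributed": False, "world_size": 2})
```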
--- examples/pipeline_wav2letter/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 6c6f7170be..f8c774d35e 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -357,7 +357,7 @@ def evaluate( return metric["average loss"] -def main(args, rank=0): +def main(rank, args): if args.distributed: setup_distributed(rank, args.world_size) @@ -376,7 +376,7 @@ def main(args, rank=0): # Empty CUDA cache torch.cuda.empty_cache() - # Change backend + # Change backend for flac files torchaudio.set_audio_backend("soundfile") if args.distributed: @@ -623,17 +623,17 @@ def main(args, rank=0): torch.distributed.destroy_process_group() -def spawn_main(args, main): +def spawn_main(main, args): if args.distributed: torch.multiprocessing.spawn( - lambda x: main(args, x), nprocs=args.world_size, join=True + main, args=(args,), nprocs=args.world_size, join=True ) else: - main(args) + main(0, args) if __name__ == "__main__": logging.basicConfig(level=logging.INFO) args = parse_args() - spawn_main(args, main) + spawn_main(main, args) From 6e6b2ead3854ebcf6a45035f276a98d850a7a4c3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 12:20:51 -0700 Subject: [PATCH 105/129] reminder about signal handler. --- examples/pipeline_wav2letter/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index f8c774d35e..3421e5e279 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -367,6 +367,7 @@ def main(rank, args): logging.info("Start time: {}".format(str(datetime.now()))) # Install signal handler + # TODO Remove before merge pull request signal.signal(signal.SIGUSR1, signal_handler) # Explicitly setting seed to make sure that models created in two processes From 8d6e27d369fb5614fd6c84b4e4bb8a8bf201ba28 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 12:47:07 -0700 Subject: [PATCH 106/129] minor refactor of main in main. 
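The refactor below keeps the explicit torch.manual_seed call before model construction: with DistributedDataParallel, every spawned process must start from identical weights. A quick demonstration of why seeding achieves that:

```python
import torch

torch.manual_seed(0)
a = torch.nn.Linear(4, 4).weight.clone()
torch.manual_seed(0)
b = torch.nn.Linear(4, 4).weight
print(torch.equal(a, b))  # True: same seed, same initial weights
```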
--- examples/pipeline_wav2letter/main.py | 69 ++++++++++++++++++---------- 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 3421e5e279..baf3097717 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -131,6 +131,18 @@ def parse_args(): parser.add_argument("--eps", metavar="EPS", type=float, default=1e-8) parser.add_argument("--rho", metavar="RHO", type=float, default=0.95) parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) + parser.add_argument( + "--dataset-root", + default="/datasets01/", + type=str, + help="specify dataset root folder", + ) + parser.add_argument( + "--dataset-folder-in-archive", + default="librispeech/062419/", + type=str, + help="specify dataset folder in archive", + ) parser.add_argument( "--dataset-train", default=["train-clean-100"], @@ -359,19 +371,21 @@ def evaluate( def main(rank, args): + # Distributed setup + if args.distributed: setup_distributed(rank, args.world_size) - main_rank = rank == 0 - - logging.info("Start time: {}".format(str(datetime.now()))) + main_rank = not args.distributed or rank == 0 # Install signal handler # TODO Remove before merge pull request signal.signal(signal.SIGUSR1, signal_handler) - # Explicitly setting seed to make sure that models created in two processes - # start from same random weights and biases. + logging.info("Start time: {}".format(str(datetime.now()))) + + # Explicitly set seed to make sure models created in separate processes + # start from same random weights and biases torch.manual_seed(args.seed) # Empty CUDA cache @@ -380,22 +394,7 @@ def main(rank, args): # Change backend for flac files torchaudio.set_audio_backend("soundfile") - if args.distributed: - n = torch.cuda.device_count() // args.world_size - devices = list(range(rank * n, (rank + 1) * n)) - else: - devices = ["cuda" if torch.cuda.is_available() else "cpu"] - - loader_training_params = { - "num_workers": args.workers, - "pin_memory": True, - "shuffle": True, - "drop_last": True, - } - loader_validation_params = loader_training_params.copy() - loader_validation_params["shuffle"] = False - - # audio + # Transforms melkwargs = { "n_fft": 512, @@ -433,17 +432,23 @@ def main(rank, args): labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) + # Dataset + training, validation = split_process_librispeech( [args.dataset_train, args.dataset_valid], [transforms_train, transforms_valid], language_model, - root="/datasets01/", - folder_in_archive="librispeech/062419/", + root=args.dataset_root, + folder_in_archive=args.dataset_folder_in_archive, ) + # Decoder + if args.decoder == "greedy": decoder = GreedyDecoder() + # Model + model = Wav2Letter( num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins ) @@ -452,10 +457,13 @@ def main(rank, args): model = torch.jit.script(model) if args.distributed: + n = torch.cuda.device_count() // args.world_size + devices = list(range(rank * n, (rank + 1) * n)) model.cuda() model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) else: + devices = ["cuda" if torch.cuda.is_available() else "cpu"] model = torch.nn.DataParallel(model) model = model.to(devices[0], non_blocking=True) @@ -499,10 +507,19 @@ def main(rank, args): # criterion = torch.nn.MSELoss() # 
criterion = torch.nn.NLLLoss() - torch.autograd.set_detect_anomaly(False) + # Data Loader collate_fn = collate_factory(model_length_function) + loader_training_params = { + "num_workers": args.workers, + "pin_memory": True, + "shuffle": True, + "drop_last": True, + } + loader_validation_params = loader_training_params.copy() + loader_validation_params["shuffle"] = False + loader_training = DataLoader( training, batch_size=args.batch_size, @@ -516,6 +533,8 @@ def main(rank, args): **loader_validation_params, ) + # Setup checkpoint + best_loss = 1.0 load_checkpoint = args.checkpoint and os.path.isfile(args.checkpoint) @@ -556,6 +575,8 @@ def main(rank, args): if args.distributed: torch.distributed.barrier() + torch.autograd.set_detect_anomaly(False) + for epoch in range(args.start_epoch, args.epochs): logging.info(f"Epoch: {epoch}") From 6f5f7cde8d96427bb6e3c6b3304b4da1698e6603 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 12:54:00 -0700 Subject: [PATCH 107/129] replace by not_main_rank. --- examples/pipeline_wav2letter/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index baf3097717..9773de4db9 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -376,7 +376,7 @@ def main(rank, args): if args.distributed: setup_distributed(rank, args.world_size) - main_rank = not args.distributed or rank == 0 + not_main_rank = args.distributed and rank != 0 # Install signal handler # TODO Remove before merge pull request @@ -569,7 +569,7 @@ def main(rank, args): }, False, args.checkpoint, - not main_rank, + not_main_rank, ) if args.distributed: @@ -592,7 +592,7 @@ def main(rank, args): devices[0], epoch, args.clip_grad, - not main_rank, + not_main_rank, ) if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: @@ -605,7 +605,7 @@ def main(rank, args): language_model, devices[0], epoch, - not main_rank, + not_main_rank, ) is_best = loss < best_loss @@ -620,7 +620,7 @@ def main(rank, args): }, is_best, args.checkpoint, - not main_rank, + not_main_rank, ) # TODO Remove before merge pull request @@ -635,7 +635,7 @@ def main(rank, args): }, False, args.checkpoint, - not main_rank, + not_main_rank, ) trigger_job_requeue() From d7ebdb3a9ef35791ef824ae987ad2923d9a4abe3 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 15:39:05 -0700 Subject: [PATCH 108/129] raising error if parameter not supported. 
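The chains below end in an explicit raise so that an unsupported decoder, optimizer, or scheduler string fails at configuration time instead of surfacing later as an unbound name. A minimal sketch of the pattern (build_optimizer is an illustrative helper, not code from main.py):

```python
from torch.optim import SGD, Adadelta


def build_optimizer(name, params, lr):
    if name == "sgd":
        return SGD(params, lr=lr)
    elif name == "adadelta":
        return Adadelta(params, lr=lr)
    else:
        # Without this branch a misspelled name only fails later, far
        # from the real cause, as an UnboundLocalError on `optimizer`.
        raise ValueError("Selected optimizer not supported")
```

argparse already validates command-line values through choices=; the raise additionally covers programmatic callers of main.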
--- examples/pipeline_wav2letter/main.py | 6 ++++++ examples/pipeline_wav2letter/utils.py | 1 + 2 files changed, 7 insertions(+) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 9773de4db9..b41305777e 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -446,6 +446,8 @@ def main(rank, args): if args.decoder == "greedy": decoder = GreedyDecoder() + else: + raise ValueError("Selected decoder not supported") # Model @@ -495,11 +497,15 @@ def main(rank, args): momentum=args.momentum, weight_decay=args.weight_decay, ) + else: + raise ValueError("Selected optimizer not supported") if args.scheduler == "exponential": scheduler = ExponentialLR(optimizer, gamma=args.gamma) elif args.scheduler == "reduceonplateau": scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3) + else: + raise ValueError("Selected scheduler not supported") criterion = torch.nn.CTCLoss( blank=language_model.mapping[char_blank], zero_infinity=False diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index ef6f3f7730..2c112a1be4 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -19,6 +19,7 @@ def __str__(self): return json.dumps(self) def __call__(self): + logging.info(self.name, self.disable) self._iter = (self._iter + 1) % self.print_freq if not self.disable and not self._iter: print(self, flush=True) From b67ba51b47b5c866c6afe59afa4b790a1298f02a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 16:21:47 -0700 Subject: [PATCH 109/129] move model before invoking DDP. --- examples/pipeline_wav2letter/main.py | 5 ++--- examples/pipeline_wav2letter/utils.py | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index b41305777e..1c402c9e49 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -461,15 +461,14 @@ def main(rank, args): if args.distributed: n = torch.cuda.device_count() // args.world_size devices = list(range(rank * n, (rank + 1) * n)) - model.cuda() + model = model.to(devices[0]) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) else: devices = ["cuda" if torch.cuda.is_available() else "cpu"] + model = model.to(devices[0], non_blocking=True) model = torch.nn.DataParallel(model) - model = model.to(devices[0], non_blocking=True) - n = count_parameters(model) logging.info(f"Number of parameters: {n}") diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index 2c112a1be4..ef6f3f7730 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ -19,7 +19,6 @@ def __str__(self): return json.dumps(self) def __call__(self): - logging.info(self.name, self.disable) self._iter = (self._iter + 1) % self.print_freq if not self.disable and not self._iter: print(self, flush=True) From ecd8d738b6496907ac825acee3093d9d7b05a8c4 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 29 Jul 2020 16:37:53 -0700 Subject: [PATCH 110/129] changing log level. using python 2 style string for logging. 
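"python 2 style" here means %-style placeholders handed to the logging call, which logging formats lazily, rather than f-strings formatted eagerly at the call site. A short illustration of the difference:

```python
import logging
import os

logging.basicConfig(level=logging.WARNING)

# Eager: the f-string is built even though INFO is filtered out here.
logging.info(f"PID: {os.getpid()}")

# Lazy: the "%s" fields are only formatted if the record is emitted,
# which is the form the diff below switches to.
logging.warning("PID: %s. PPID: %s.", os.getpid(), os.getppid())
```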
--- examples/pipeline_wav2letter/main.py | 24 ++++++++++++------------ examples/pipeline_wav2letter/utils.py | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 1c402c9e49..0ba4b02038 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -174,7 +174,7 @@ def parse_args(): # TODO Remove before merge pull request def signal_handler(a, b): global SIGNAL_RECEIVED - logging.info(f"Signal received on {datetime.now()}") + logging.warning("Signal received on %s", datetime.now()) SIGNAL_RECEIVED = True @@ -182,13 +182,13 @@ def signal_handler(a, b): def trigger_job_requeue(): # Submit a new job to resume from checkpoint. if os.environ["SLURM_PROCID"] == "0" and os.getpid() == MAIN_PID: - logging.info(f"PID: {os.getpid()}. PPID: {os.getppid()}.") - logging.info("Resubmitting job") + logging.warning("PID: %s. PPID: %s.", os.getpid(), os.getppid()) + logging.warning("Resubmitting job") command = "scontrol requeue " + os.environ["SLURM_JOB_ID"] - logging.info(command) + logging.warning(command) if os.system(command): raise RuntimeError("Fail to resubmit") - logging.info("New job submitted to the queue") + logging.warning("New job submitted to the queue") exit(0) @@ -218,7 +218,7 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric): # Print a few examples output_print = output[i].ljust(print_length)[:print_length] target_print = target[i].ljust(print_length)[:print_length] - logging.info(f"Target: {target_print} Output: {output_print}") + logging.info("Target: %s Output: %s", target_print, output_print) cers = [levenshtein_distance(t, o) for t, o in zip(target, output)] cers = sum(cers) @@ -382,7 +382,7 @@ def main(rank, args): # TODO Remove before merge pull request signal.signal(signal.SIGUSR1, signal_handler) - logging.info("Start time: {}".format(str(datetime.now()))) + logging.info("Start time: %s", datetime.now()) # Explicitly set seed to make sure models created in separate processes # start from same random weights and biases @@ -470,7 +470,7 @@ def main(rank, args): model = torch.nn.DataParallel(model) n = count_parameters(model) - logging.info(f"Number of parameters: {n}") + logging.info("Number of parameters: %s", n) # Optimizer @@ -548,7 +548,7 @@ def main(rank, args): torch.distributed.barrier() if load_checkpoint: - logging.info(f"Checkpoint: loading '{args.checkpoint}'") + logging.info("Checkpoint: loading %s", args.checkpoint) checkpoint = torch.load(args.checkpoint) args.start_epoch = checkpoint["epoch"] @@ -559,7 +559,7 @@ def main(rank, args): scheduler.load_state_dict(checkpoint["scheduler"]) logging.info( - f"Checkpoint: loaded '{args.checkpoint}' at epoch {checkpoint['epoch']}" + "Checkpoint: loaded '%s' at epoch %s", args.checkpoint, checkpoint["epoch"] ) else: logging.info("Checkpoint: not found") @@ -584,7 +584,7 @@ def main(rank, args): for epoch in range(args.start_epoch, args.epochs): - logging.info(f"Epoch: {epoch}") + logging.info("Epoch: %s", epoch) train_one_epoch( model, @@ -644,7 +644,7 @@ def main(rank, args): ) trigger_job_requeue() - logging.info(f"End time: {datetime.now()}") + logging.info("End time: %s", datetime.now()) if args.distributed: torch.distributed.destroy_process_group() diff --git a/examples/pipeline_wav2letter/utils.py b/examples/pipeline_wav2letter/utils.py index ef6f3f7730..7cd07a2a80 100644 --- a/examples/pipeline_wav2letter/utils.py +++ b/examples/pipeline_wav2letter/utils.py @@ 
-48,7 +48,7 @@ def save_checkpoint(state, is_best, filename, disable): os.rename(tempfile, filename) if is_best: shutil.copyfile(filename, "model_best.pth.tar") - logging.info("Checkpoint: saved") + logging.warning("Checkpoint: saved") def count_parameters(model): From af2eb0cf7bdd04f8b1d0f16d757e7218061b8a48 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 31 Jul 2020 06:40:37 -0700 Subject: [PATCH 111/129] dynamic augmentations. --- examples/pipeline_wav2letter/datasets.py | 8 ++++++-- examples/pipeline_wav2letter/main.py | 22 ++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index 9ebba13eab..7434cbc365 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -88,10 +88,14 @@ def create(tags, cache=True): return tuple(create(dataset) for dataset in datasets) -def collate_factory(model_length_function): +def collate_factory(model_length_function, transforms=None): + + if transforms is None: + transforms = torch.nn.Sequential() + def collate_fn(batch): - tensors = [b[0] for b in batch if b] + tensors = [transforms(b[0]) for b in batch if b] tensors_lengths = torch.tensor( [model_length_function(t) for t in tensors], diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 0ba4b02038..907508d680 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -404,7 +404,7 @@ def main(rank, args): sample_rate_original = 16000 - transforms_valid = torch.nn.Sequential( + transforms = torch.nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs), torchaudio.transforms.MelSpectrogram( @@ -412,15 +412,15 @@ def main(rank, args): ), ) - transforms_train = transforms_valid + augmentations = torch.nn.Sequential() if args.freq_mask: - transforms_train = torch.nn.Sequential( - transforms_train, + augmentations = torch.nn.Sequential( + augmentations, torchaudio.transforms.FrequencyMasking(freq_mask_param=args.freq_mask), ) if args.time_mask: - transforms_train = torch.nn.Sequential( - transforms_train, + augmentations = torch.nn.Sequential( + augmentations, torchaudio.transforms.TimeMasking(time_mask_param=args.time_mask), ) @@ -436,7 +436,8 @@ def main(rank, args): training, validation = split_process_librispeech( [args.dataset_train, args.dataset_valid], - [transforms_train, transforms_valid], + # [transforms_train, transforms_valid], + [transforms, transforms], language_model, root=args.dataset_root, folder_in_archive=args.dataset_folder_in_archive, @@ -514,7 +515,8 @@ def main(rank, args): # Data Loader - collate_fn = collate_factory(model_length_function) + collate_fn_train = collate_factory(model_length_function, augmentations) + collate_fn_valid = collate_factory(model_length_function) loader_training_params = { "num_workers": args.workers, @@ -528,13 +530,13 @@ def main(rank, args): loader_training = DataLoader( training, batch_size=args.batch_size, - collate_fn=collate_fn, + collate_fn=collate_fn_train, **loader_training_params, ) loader_validation = DataLoader( validation, batch_size=args.batch_size, - collate_fn=collate_fn, + collate_fn=collate_fn_valid, **loader_validation_params, ) From 25524eb8ebeb66c4d65176caa6b90b71dfe61cac Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 
31 Jul 2020 13:51:09 -0700
Subject: [PATCH 112/129] update metric log. batch cer/wer metric. correct typo in time. adding other dimensions in metric.

---
 examples/pipeline_wav2letter/main.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py
index 907508d680..810f78be72 100644
--- a/examples/pipeline_wav2letter/main.py
+++ b/examples/pipeline_wav2letter/main.py
@@ -223,9 +223,10 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     cers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     cers = sum(cers)
     n = sum(len(t) for t in target)
-    metric["cer"] += cers
+    metric["cer over target length"] = cers / n
+    metric["cumulative cer"] += cers
     metric["total chars"] += n
-    metric["cer over target length"] = metric["cer"] / metric["total chars"]
+    metric["cumulative cer over target length"] = metric["cumulative cer"] / metric["total chars"]

     # Compute WER

@@ -235,9 +236,10 @@ def compute_error_rates(outputs, targets, decoder, language_model, metric):
     wers = [levenshtein_distance(t, o) for t, o in zip(target, output)]
     wers = sum(wers)
     n = sum(len(t) for t in target)
-    metric["wer"] += wers
+    metric["wer over target length"] = wers / n
+    metric["cumulative wer"] += wers
     metric["total words"] += n
-    metric["wer over target length"] = metric["wer"] / metric["total words"]
+    metric["cumulative wer over target length"] = metric["cumulative wer"] / metric["total words"]


 def train_one_epoch(
@@ -296,13 +298,15 @@ def train_one_epoch(
             pass

         metric["batch size"] = len(inputs)
+        metric["n_channel"] = inputs.shape[1]
+        metric["n_time"] = inputs.shape[-1]
         metric["dataset length"] += metric["batch size"]
         metric["iteration"] += 1
         metric["loss"] = loss.item()
         metric["cumulative loss"] += metric["loss"]
         metric["average loss"] = metric["cumulative loss"] / metric["iteration"]
         metric["iteration time"] = time() - start
-        metric["epoch time"] += metric["time"]
+        metric["epoch time"] += metric["iteration time"]
         metric()

         # TODO Remove before merge pull request

From 01438030ef727c581f7057c230032afb6d73d1df Mon Sep 17 00:00:00 2001
From: Vincent Quenneville-Belair
Date: Sat, 1 Aug 2020 14:54:02 -0700
Subject: [PATCH 113/129] save learning rate even if function not available.

---
 examples/pipeline_wav2letter/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py
index 810f78be72..2429a1233a 100644
--- a/examples/pipeline_wav2letter/main.py
+++ b/examples/pipeline_wav2letter/main.py
@@ -295,7 +295,7 @@ def train_one_epoch(
         try:
             metric["lr"] = scheduler.get_last_lr()[0]
         except AttributeError:
-            pass
+            metric["lr"] = optimizer.param_groups[0]["lr"]

         metric["batch size"] = len(inputs)
         metric["n_channel"] = inputs.shape[1]

From 406d2a3fc5449414cfd82cde9faff184d8338e3c Mon Sep 17 00:00:00 2001
From: Vincent Quenneville-Belair
Date: Tue, 4 Aug 2020 12:55:51 -0700
Subject: [PATCH 114/129] add type option to model.
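The torchaudio Wav2Letter model only distinguishes waveform, power_spectrum, and mfcc inputs, so the new "mel" option reuses the model's mfcc path and only swaps the front-end transform. A condensed sketch of the dispatch added below (values are this pipeline's current defaults):

```python
import torch
import torchaudio

sample_rate_original = 16000
n_bins = 13
melkwargs = {"n_fft": 512, "n_mels": n_bins, "hop_length": 80}


def make_transform(kind):
    # "mel" feeds a MelSpectrogram into the model's mfcc-shaped input;
    # "waveform" is an identity Sequential, leaving the signal untouched.
    if kind == "mel":
        return torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate_original, **melkwargs
        )
    if kind == "mfcc":
        return torchaudio.transforms.MFCC(
            sample_rate=sample_rate_original, n_mfcc=n_bins, melkwargs=melkwargs
        )
    if kind == "waveform":
        return torch.nn.Sequential()
    raise ValueError("Model type not supported")


print(make_transform("mel")(torch.randn(16000)).shape)  # (n_bins, time)
```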
---
 examples/pipeline_wav2letter/main.py | 39 ++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py
index 2429a1233a..3a0fff3af7 100644
--- a/examples/pipeline_wav2letter/main.py
+++ b/examples/pipeline_wav2letter/main.py
@@ -29,6 +29,13 @@ def parse_args():

     parser = argparse.ArgumentParser()

+    parser.add_argument(
+        "--type",
+        metavar="T",
+        default="mel",
+        choices=["waveform", "mfcc", "mel"],
+        help="input type for model",
+    )
     parser.add_argument(
         "--freq-mask",
         default=0,
@@ -408,13 +415,26 @@ def main(rank, args):

     sample_rate_original = 16000

-    transforms = torch.nn.Sequential(
-        # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2),
-        # torchaudio.transforms.MFCC(sample_rate=sample_rate_original, n_mfcc=args.n_bins, melkwargs=melkwargs),
-        torchaudio.transforms.MelSpectrogram(
-            sample_rate=sample_rate_original, **melkwargs
-        ),
-    )
+    input_type = "mfcc" if args.type == "mel" else args.type
+    if args.type == "mel":
+        transforms = torch.nn.Sequential(
+            # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2),
+            torchaudio.transforms.MelSpectrogram(
+                sample_rate=sample_rate_original, **melkwargs
+            ),
+        )
+    elif args.type == "mfcc":
+        transforms = torch.nn.Sequential(
+            torchaudio.transforms.MFCC(
+                sample_rate=sample_rate_original,
+                n_mfcc=args.n_bins,
+                melkwargs=melkwargs,
+            ),
+        )
+    elif args.type == "waveform":
+        transforms = torch.nn.Sequential()
+    else:
+        raise ValueError("Model type not supported")

     augmentations = torch.nn.Sequential()
     if args.freq_mask:
@@ -456,8 +476,11 @@ def main(rank, args):

     # Model

+    input_type = "mfcc" if args.type == "mel" else args.type
     model = Wav2Letter(
-        num_classes=language_model.length, input_type="mfcc", num_features=args.n_bins
+        num_classes=language_model.length,
+        input_type=input_type,
+        num_features=args.n_bins,
     )

     if args.jit:

From 3716e9da163d02d4f7796e8dfa3219ef1b9e7eb6 Mon Sep 17 00:00:00 2001
From: Vincent Quenneville-Belair
Date: Tue, 4 Aug 2020 12:56:11 -0700
Subject: [PATCH 115/129] add adamw.

---
 examples/pipeline_wav2letter/main.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py
index 3a0fff3af7..64d57d40c9 100644
--- a/examples/pipeline_wav2letter/main.py
+++ b/examples/pipeline_wav2letter/main.py
@@ -8,7 +8,7 @@

 import torch
 import torchaudio
-from torch.optim import SGD, Adadelta, Adam
+from torch.optim import SGD, Adadelta, Adam, AdamW
 from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau
 from torch.utils.data import DataLoader
 from torchaudio.datasets.utils import bg_iterator
@@ -105,7 +105,7 @@ def parse_args():
         "--optimizer",
         metavar="OPT",
         default="adadelta",
-        choices=["sgd", "adadelta", "adam"],
+        choices=["sgd", "adadelta", "adam", "adamw"],
         help="optimizer to use",
     )
     parser.add_argument(
@@ -524,6 +524,13 @@ def main(rank, args):
             momentum=args.momentum,
             weight_decay=args.weight_decay,
         )
+    elif args.optimizer == "adamw":
+        # AdamW takes no momentum argument; its moments come from betas.
+        optimizer = AdamW(
+            model.parameters(),
+            lr=args.learning_rate,
+            weight_decay=args.weight_decay,
+        )
     else:
         raise ValueError("Selected optimizer not supported")

From f30f71384f8d2754c0c0cc7df2d8bece34e909cb Mon Sep 17 00:00:00 2001
From: Vincent Quenneville-Belair
Date: Tue, 4 Aug 2020 12:56:43 -0700
Subject: [PATCH 116/129] reduce lr on validation step or training step.
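ReduceLROnPlateau is the one scheduler used here whose step() consumes a metric, so a flag is threaded through to choose which loss drives it: the validation loss with --reduce-lr-valid, otherwise the running training loss. A self-contained sketch of the intended dispatch (the guard around the non-plateau branch is tightened later, in PATCH 122):

```python
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.SGD([param], lr=0.6)
scheduler = ReduceLROnPlateau(optimizer, patience=10, threshold=1e-3)


def step_scheduler(scheduler, train_loss, valid_loss=None, reduce_lr_valid=False):
    if isinstance(scheduler, ReduceLROnPlateau):
        # Plateau scheduling needs a loss value to detect stagnation...
        scheduler.step(valid_loss if reduce_lr_valid else train_loss)
    else:
        # ...while time-based schedulers such as ExponentialLR do not.
        scheduler.step()


step_scheduler(scheduler, train_loss=1.23)
```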
--- examples/pipeline_wav2letter/main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 64d57d40c9..d085f4199c 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -81,6 +81,11 @@ def parse_args(): metavar="N", help="print frequency in epochs", ) + parser.add_argument( + "--reduce-lr-valid", + action="store_true", + help="reduce learning rate based on validation loss", + ) parser.add_argument( "--progress-bar", action="store_true", help="use progress bar while training" ) @@ -261,6 +266,7 @@ def train_one_epoch( epoch, clip_grad, disable_logger=False, + reduce_lr_train=False, ): model.train() @@ -320,7 +326,7 @@ def train_one_epoch( if SIGNAL_RECEIVED: break - if isinstance(scheduler, ReduceLROnPlateau): + if reduce_lr_train and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(metric["average loss"]) else: scheduler.step() @@ -634,6 +640,7 @@ def main(rank, args): epoch, args.clip_grad, not_main_rank, + not args.reduce_lr_valid, ) if not (epoch + 1) % args.print_freq or epoch == args.epochs - 1: @@ -664,6 +671,9 @@ def main(rank, args): not_main_rank, ) + if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau): + scheduler.step(loss) + # TODO Remove before merge pull request if SIGNAL_RECEIVED: save_checkpoint( From 061dd400aac20cb2546cd0ae639d0ca5dfb97875 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 4 Aug 2020 14:33:46 -0700 Subject: [PATCH 117/129] specify hop-length and win-length. --- examples/pipeline_wav2letter/main.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index d085f4199c..2c98736b96 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -43,6 +43,20 @@ def parse_args(): metavar="N", help="maximal width of frequency mask", ) + parser.add_argument( + "--win-length", + default=512, + type=int, + metavar="N", + help="width of spectrogram window", + ) + parser.add_argument( + "--hop-length", + default=80, + type=int, + metavar="N", + help="width of spectrogram window", + ) parser.add_argument( "--time-mask", default=0, @@ -414,9 +428,9 @@ def main(rank, args): # Transforms melkwargs = { - "n_fft": 512, - "n_mels": args.n_bins, # 13, 20, 128 - "hop_length": 80, # (160, 80) + "n_fft": args.win_length, + "n_mels": args.n_bins, + "hop_length": args.hop_length, } sample_rate_original = 16000 From 8d49a70e58134ce7645b36d34468bf1e9e5a8a2a Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Tue, 4 Aug 2020 14:45:38 -0700 Subject: [PATCH 118/129] normalize option. 
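The Normalize module standardizes each feature row over the time axis, so every bin enters the network with roughly zero mean and unit variance. A quick check of that arithmetic, assuming input shaped (channel, time):

```python
import torch


class Normalize(torch.nn.Module):
    def forward(self, tensor):
        return (tensor - tensor.mean(-1, keepdim=True)) / tensor.std(-1, keepdim=True)


features = torch.randn(13, 100) * 5 + 3  # e.g. 13 mel bins over 100 frames
out = Normalize()(features)
print(out.mean(-1).abs().max())  # close to 0 for every bin
print(out.std(-1))               # close to 1 for every bin
```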
--- examples/pipeline_wav2letter/main.py | 7 +++++++ examples/pipeline_wav2letter/transforms.py | 6 ++++++ 2 files changed, 13 insertions(+) create mode 100644 examples/pipeline_wav2letter/transforms.py diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 2c98736b96..d9736366fe 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -19,6 +19,7 @@ from datasets import collate_factory, split_process_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance +from transforms import Normalize from utils import MetricLogger, count_parameters, save_checkpoint # TODO Remove before merge pull request @@ -100,6 +101,9 @@ def parse_args(): action="store_true", help="reduce learning rate based on validation loss", ) + parser.add_argument( + "--normalize", action="store_true", help="normalize model input" + ) parser.add_argument( "--progress-bar", action="store_true", help="use progress bar while training" ) @@ -456,6 +460,9 @@ def main(rank, args): else: raise ValueError("Model type not supported") + if args.normalize: + transforms = torch.nn.Sequential(transforms, Normalize()) + augmentations = torch.nn.Sequential() if args.freq_mask: augmentations = torch.nn.Sequential( diff --git a/examples/pipeline_wav2letter/transforms.py b/examples/pipeline_wav2letter/transforms.py new file mode 100644 index 0000000000..2109efe0bc --- /dev/null +++ b/examples/pipeline_wav2letter/transforms.py @@ -0,0 +1,6 @@ +import torch + + +class Normalize(torch.nn.Module): + def forward(self, tensor): + return (tensor - tensor.mean(-1, keepdim=True)) / tensor.std(-1, keepdim=True) From 4ea2596a313bfe3d6682ad6ac0fa3bf260904eb1 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 6 Aug 2020 15:24:45 -0700 Subject: [PATCH 119/129] rename parameter. --- examples/pipeline_wav2letter/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index d9736366fe..9506af1446 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -284,7 +284,7 @@ def train_one_epoch( epoch, clip_grad, disable_logger=False, - reduce_lr_train=False, + reduce_lr_on_plateau=False, ): model.train() @@ -344,7 +344,7 @@ def train_one_epoch( if SIGNAL_RECEIVED: break - if reduce_lr_train and isinstance(scheduler, ReduceLROnPlateau): + if reduce_lr_on_plateau and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(metric["average loss"]) else: scheduler.step() From 340df0a4f3e53f4efb96b5c23587f5b9f893c175 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 6 Aug 2020 15:25:05 -0700 Subject: [PATCH 120/129] add dropout and tweak to number of channels. 
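The copied model (next patch) exposes two knobs: the width of the two dense-style convolutions at the end, and dropout around them. A hedged sketch of how those knobs enter the tail of the Conv1d stack (shapes follow the copied model; the tail helper is illustrative):

```python
import torch
from torch import nn


def tail(num_hidden_channels=2000, dropout=0.0, num_classes=40):
    # Dropout sits around the widest layers, where most parameters live:
    # 250*32*H weights in the first conv below, H*H in the second.
    return nn.Sequential(
        nn.Conv1d(250, num_hidden_channels, kernel_size=32, stride=1, padding=16),
        nn.Dropout(p=dropout),
        nn.ReLU(inplace=True),
        nn.Conv1d(num_hidden_channels, num_hidden_channels, kernel_size=1),
        nn.Dropout(p=dropout),
        nn.ReLU(inplace=True),
        nn.Conv1d(num_hidden_channels, num_classes, kernel_size=1),
    )


x = torch.randn(1, 250, 100)  # (batch, channel, time)
print(tail()(x).shape)        # torch.Size([1, 40, 101])
```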
--- examples/pipeline_wav2letter/main.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 9506af1446..9e6227ce55 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -12,7 +12,6 @@ from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader from torchaudio.datasets.utils import bg_iterator -from torchaudio.models.wav2letter import Wav2Letter from torchaudio.transforms import MFCC, Resample from ctc_decoders import GreedyDecoder @@ -22,6 +21,9 @@ from transforms import Normalize from utils import MetricLogger, count_parameters, save_checkpoint +# from torchaudio.models.wav2letter import Wav2Letter +from wav2letter import Wav2Letter + # TODO Remove before merge pull request MAIN_PID = os.getpid() SIGNAL_RECEIVED = False @@ -37,6 +39,13 @@ def parse_args(): choices=["waveform", "mfcc", "mel"], help="input type for model", ) + parser.add_argument( + "--n-hidden-channels", + default=2000, + type=int, + metavar="N", + help="number of hidden channels in wav2letter", + ) parser.add_argument( "--freq-mask", default=0, @@ -124,6 +133,13 @@ def parse_args(): metavar="N", help="number of bins in transforms", ) + parser.add_argument( + "--dropout", + default=0.0, + type=float, + metavar="D", + help="probability of an element to be zeroed", + ) parser.add_argument( "--optimizer", metavar="OPT", @@ -508,6 +524,8 @@ def main(rank, args): num_classes=language_model.length, input_type=input_type, num_features=args.n_bins, + num_hidden_channels=args.n_hidden_channels, + dropout=args.dropout, ) if args.jit: From f5b1b1b17780f5d20fb1e25a5db7a4aec094880f Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 6 Aug 2020 15:26:12 -0700 Subject: [PATCH 121/129] copy model in pipeline folder for experimentation. --- examples/pipeline_wav2letter/wav2letter.py | 123 +++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 examples/pipeline_wav2letter/wav2letter.py diff --git a/examples/pipeline_wav2letter/wav2letter.py b/examples/pipeline_wav2letter/wav2letter.py new file mode 100644 index 0000000000..261b14febd --- /dev/null +++ b/examples/pipeline_wav2letter/wav2letter.py @@ -0,0 +1,123 @@ +from torch import Tensor, nn + +__all__ = ["Wav2Letter"] + + +class Wav2Letter(nn.Module): + r"""Wav2Letter model architecture from the `Wav2Letter an End-to-End ConvNet-based Speech Recognition System`_. + + .. _Wav2Letter an End-to-End ConvNet-based Speech Recognition System: https://arxiv.org/abs/1609.03193 + + :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}` + + Args: + num_classes (int, optional): Number of classes to be classified. (Default: ``40``) + input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum`` + or ``mfcc`` (Default: ``waveform``). + num_features (int, optional): Number of input features that the network will receive (Default: ``1``). 
+ """ + + def __init__( + self, + num_classes: int = 40, + input_type: str = "waveform", + num_features: int = 1, + num_hidden_channels=2000, + dropout=0.0, + ) -> None: + super(Wav2Letter, self).__init__() + + acoustic_num_features = 250 if input_type == "waveform" else num_features + acoustic_model = nn.Sequential( + nn.Conv1d( + in_channels=acoustic_num_features, + out_channels=250, + kernel_size=48, + stride=2, + padding=23, + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 + ), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=250, + out_channels=num_hidden_channels, + kernel_size=32, + stride=1, + padding=16, + ), + nn.Dropout(p=dropout), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=num_hidden_channels, + out_channels=num_hidden_channels, + kernel_size=1, + stride=1, + padding=0, + ), + nn.Dropout(p=dropout), + nn.ReLU(inplace=True), + nn.Conv1d( + in_channels=num_hidden_channels, + out_channels=num_classes, + kernel_size=1, + stride=1, + padding=0, + ), + nn.ReLU(inplace=True), + ) + + if input_type == "waveform": + waveform_model = nn.Sequential( + nn.Conv1d( + in_channels=num_features, + out_channels=250, + kernel_size=250, + stride=160, + padding=45, + ), + nn.ReLU(inplace=True), + ) + self.acoustic_model = nn.Sequential(waveform_model, acoustic_model) + + if input_type in ["power_spectrum", "mfcc"]: + self.acoustic_model = acoustic_model + + def forward(self, x: Tensor) -> Tensor: + r""" + Args: + x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length). + + Returns: + Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length). + """ + + x = self.acoustic_model(x) + x = nn.functional.log_softmax(x, dim=1) + return x From e5f733d742008a6a4cd536ab43f9c757afa852be Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Thu, 6 Aug 2020 15:40:13 -0700 Subject: [PATCH 122/129] fix scheduler stepping. --- examples/pipeline_wav2letter/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 9e6227ce55..8cf4b0c546 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -362,7 +362,7 @@ def train_one_epoch( if reduce_lr_on_plateau and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(metric["average loss"]) - else: + elif not isinstance(scheduler, ReduceLROnPlateau): scheduler.step() From fe752498fa17ce97ed89d1071e2ac73e5f0f52fc Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 7 Aug 2020 08:00:26 -0700 Subject: [PATCH 123/129] fix input_type and num_features. 
--- examples/pipeline_wav2letter/main.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 8cf4b0c546..0d2f24cf5f 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -455,7 +455,9 @@ def main(rank, args): sample_rate_original = 16000 - input_type = "mfcc" if args.type == "mel" else args.type + input_type = args.type + num_features = args.n_bins + if args.type == "mel": transforms = torch.nn.Sequential( # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), @@ -463,6 +465,7 @@ def main(rank, args): sample_rate=sample_rate_original, **melkwargs ), ) + input_type = "mfcc" elif args.type == "mfcc": transforms = torch.nn.Sequential( torchaudio.transforms.MFCC( @@ -473,6 +476,7 @@ def main(rank, args): ) elif args.type == "waveform": transforms = torch.nn.Sequential() + num_features = 1 else: raise ValueError("Model type not supported") @@ -519,11 +523,10 @@ def main(rank, args): # Model - input_type = "mfcc" if args.type == "mel" else args.type model = Wav2Letter( num_classes=language_model.length, input_type=input_type, - num_features=args.n_bins, + num_features=num_features, num_hidden_channels=args.n_hidden_channels, dropout=args.dropout, ) From 4d2119ab95884bc6ac8f1f3e1f33cbb1fd9dca3b Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 7 Aug 2020 12:00:48 -0700 Subject: [PATCH 124/129] waveform mode changes shape more. --- examples/pipeline_wav2letter/main.py | 7 +++++-- examples/pipeline_wav2letter/transforms.py | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 0d2f24cf5f..0dc5677d88 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -18,7 +18,7 @@ from datasets import collate_factory, split_process_librispeech from languagemodels import LanguageModel from metrics import levenshtein_distance -from transforms import Normalize +from transforms import Normalize, UnsqueezeFirst from utils import MetricLogger, count_parameters, save_checkpoint # from torchaudio.models.wav2letter import Wav2Letter @@ -247,6 +247,9 @@ def setup_distributed(rank, world_size): def model_length_function(tensor): + if tensor.shape[1] == 1: + # waveform mode + return int(tensor.shape[0]) // 160 // 2 + 1 return int(tensor.shape[0]) // 2 + 1 @@ -475,7 +478,7 @@ def main(rank, args): ), ) elif args.type == "waveform": - transforms = torch.nn.Sequential() + transforms = torch.nn.Sequential(UnsqueezeFirst()) num_features = 1 else: raise ValueError("Model type not supported") diff --git a/examples/pipeline_wav2letter/transforms.py b/examples/pipeline_wav2letter/transforms.py index 2109efe0bc..f1d9115c87 100644 --- a/examples/pipeline_wav2letter/transforms.py +++ b/examples/pipeline_wav2letter/transforms.py @@ -4,3 +4,8 @@ class Normalize(torch.nn.Module): def forward(self, tensor): return (tensor - tensor.mean(-1, keepdim=True)) / tensor.std(-1, keepdim=True) + + +class UnsqueezeFirst(torch.nn.Module): + def forward(self, tensor): + return tensor.unsqueeze(0) From e63a6162657e3d49de28cd24e8868887f415b549 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 19 Aug 2020 13:49:05 -0700 Subject: [PATCH 125/129] adding best character error rate with current implementation of model with mfcc. 
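The 13.8% figure below is "cer over target length": total character edit distance divided by total target characters. A compact sketch of that metric, with a standard dynamic-programming edit distance standing in for metrics.levenshtein_distance:

```python
def levenshtein_distance(r, h):
    # Classic edit-distance DP over prefixes of reference r, hypothesis h.
    prev = list(range(len(h) + 1))
    for i, cr in enumerate(r, 1):
        cur = [i] + [0] * len(h)
        for j, ch in enumerate(h, 1):
            cur[j] = min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (cr != ch))
        prev = cur
    return prev[len(h)]


target, output = "hello world", "helo wrld"
print(levenshtein_distance(target, output) / len(target))  # per-length CER
```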
--- examples/pipeline_wav2letter/README.md | 28 ++++++++++++++++++++------ examples/pipeline_wav2letter/main.py | 10 ++++----- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/examples/pipeline_wav2letter/README.md b/examples/pipeline_wav2letter/README.md index 91546b724e..b703a51593 100644 --- a/examples/pipeline_wav2letter/README.md +++ b/examples/pipeline_wav2letter/README.md @@ -2,22 +2,38 @@ This is an example pipeline for speech recognition using a greedy or Viterbi CTC ### Usage -More information about each command line parameters is available with the `--help` option. An example can be invoked as follows. +More information about each command line parameters is available with the `--help` option. An example reproducing the demo can be invoked as follows. ``` python main.py \ + --reduce-lr-valid \ + --dataset-train train-clean-100 train-clean-360 train-other-500 \ + --dataset-valid dev-clean \ --batch-size 128 \ --learning-rate .6 \ - --gamma .99 \ - --n-bins 13 \ --momentum .8 \ + --weight-decay .00001 \ --clip-grad 0. \ - --optimizer "adadelta" \ - --scheduler "reduceonplateau" + --gamma .99 \ + --hop-length 160 \ + --n-hidden-channels 2000 \ + --win-length 400 \ + --n-bins 13 \ + --normalize \ + --optimizer adadelta \ + --scheduler reduceonplateau \ + --epochs 30 ``` +With these default parameters, we get a character error rate of 13.8% on dev-clean after 30 epochs. ### Output -The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line. Further information is reported to standard error. Here is an example python function to parse the standard output when saved to a file. +The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line, e.g. 
+``` +{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 23317.0, "total chars": 23317.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 4446.0, "total words": 4446.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 2453, "dataset length": 128.0, "iteration": 1.0, "loss": 8.712121963500977, "cumulative loss": 8.712121963500977, "average loss": 8.712121963500977, "iteration time": 41.46276903152466, "epoch time": 41.46276903152466} +{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 46005.0, "total chars": 46005.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 8762.0, "total words": 8762.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1703, "dataset length": 256.0, "iteration": 2.0, "loss": 8.918599128723145, "cumulative loss": 17.63072109222412, "average loss": 8.81536054611206, "iteration time": 1.2905676364898682, "epoch time": 42.753336668014526} +{"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 70030.0, "total chars": 70030.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 13348.0, "total words": 13348.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1713, "dataset length": 384.0, "iteration": 3.0, "loss": 8.550191879272461, "cumulative loss": 26.180912971496582, "average loss": 8.726970990498861, "iteration time": 1.2109291553497314, "epoch time": 43.96426582336426} +``` +Further information is reported to standard error. Here is an example python function to parse the standard output when saved to a file. ```python def read_json(filename): """ diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 0dc5677d88..741d374ee9 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--type", metavar="T", - default="mel", + default="mfcc", choices=["waveform", "mfcc", "mel"], help="input type for model", ) @@ -55,14 +55,14 @@ def parse_args(): ) parser.add_argument( "--win-length", - default=512, + default=400, type=int, metavar="N", help="width of spectrogram window", ) parser.add_argument( "--hop-length", - default=80, + default=160, type=int, metavar="N", help="width of spectrogram window", @@ -124,7 +124,7 @@ def parse_args(): help="decoder to use", ) parser.add_argument( - "--batch-size", default=64, type=int, metavar="N", help="mini-batch size" + "--batch-size", default=128, type=int, metavar="N", help="mini-batch size" ) parser.add_argument( "--n-bins", @@ -179,13 +179,11 @@ def parse_args(): parser.add_argument("--clip-grad", metavar="NORM", type=float, default=0.0) parser.add_argument( "--dataset-root", - default="/datasets01/", type=str, help="specify dataset root folder", ) parser.add_argument( "--dataset-folder-in-archive", - default="librispeech/062419/", type=str, help="specify dataset folder in archive", ) From 4a9381f6e31b614af798c993ace7a6d0debd3e0d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 19 Aug 2020 13:52:35 -0700 Subject: [PATCH 126/129] comment update. 
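The comment corrected below documents the caching strategy: every processed split is wrapped in MapMemoryCache, so the transform and target encoding in process_datapoint are paid once per item and then served from memory. An illustrative reimplementation of that wrapper (the real one lives in datasets.py; this sketch only assumes the usual map-style dataset contract):

```python
import torch


class MapMemoryCache(torch.utils.data.Dataset):
    """Cache the outcome of a map-style dataset in memory."""

    def __init__(self, dataset):
        self.dataset = dataset
        self._cache = [None] * len(dataset)

    def __getitem__(self, n):
        if self._cache[n] is None:
            # First access computes and stores; later epochs reuse it.
            self._cache[n] = self.dataset[n]
        return self._cache[n]

    def __len__(self):
        return len(self.dataset)
```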
--- examples/pipeline_wav2letter/datasets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index 7434cbc365..0a702055c2 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -82,9 +82,7 @@ def create(tags, cache=True): data = MapMemoryCache(data) return data - # FIXME For performance, we cache all datasets - # Do not cache first dataset - # return tuple(create(dataset, cache=i > 0) for i, dataset in enumerate(datasets)) + # For performance, we cache all datasets return tuple(create(dataset) for dataset in datasets) From 4795a72fc52182412f22c20c53a757ef927ea884 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 19 Aug 2020 14:00:32 -0700 Subject: [PATCH 127/129] remove signal. remove custom wav2letter model. --- examples/pipeline_wav2letter/main.py | 93 +--------------- examples/pipeline_wav2letter/wav2letter.py | 123 --------------------- 2 files changed, 5 insertions(+), 211 deletions(-) delete mode 100644 examples/pipeline_wav2letter/wav2letter.py diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index 741d374ee9..aaa4d30a2c 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -1,7 +1,6 @@ import argparse import logging import os -import signal import string from datetime import datetime from time import time @@ -12,7 +11,7 @@ from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau from torch.utils.data import DataLoader from torchaudio.datasets.utils import bg_iterator -from torchaudio.transforms import MFCC, Resample +from torchaudio.models.wav2letter import Wav2Letter from ctc_decoders import GreedyDecoder from datasets import collate_factory, split_process_librispeech @@ -21,13 +20,6 @@ from transforms import Normalize, UnsqueezeFirst from utils import MetricLogger, count_parameters, save_checkpoint -# from torchaudio.models.wav2letter import Wav2Letter -from wav2letter import Wav2Letter - -# TODO Remove before merge pull request -MAIN_PID = os.getpid() -SIGNAL_RECEIVED = False - def parse_args(): parser = argparse.ArgumentParser() @@ -36,16 +28,9 @@ def parse_args(): "--type", metavar="T", default="mfcc", - choices=["waveform", "mfcc", "mel"], + choices=["waveform", "mfcc"], help="input type for model", ) - parser.add_argument( - "--n-hidden-channels", - default=2000, - type=int, - metavar="N", - help="number of hidden channels in wav2letter", - ) parser.add_argument( "--freq-mask", default=0, @@ -133,13 +118,6 @@ def parse_args(): metavar="N", help="number of bins in transforms", ) - parser.add_argument( - "--dropout", - default=0.0, - type=float, - metavar="D", - help="probability of an element to be zeroed", - ) parser.add_argument( "--optimizer", metavar="OPT", @@ -215,27 +193,6 @@ def parse_args(): return args -# TODO Remove before merge pull request -def signal_handler(a, b): - global SIGNAL_RECEIVED - logging.warning("Signal received on %s", datetime.now()) - SIGNAL_RECEIVED = True - - -# TODO Remove before merge pull request -def trigger_job_requeue(): - # Submit a new job to resume from checkpoint. - if os.environ["SLURM_PROCID"] == "0" and os.getpid() == MAIN_PID: - logging.warning("PID: %s. 
PPID: %s.", os.getpid(), os.getppid()) - logging.warning("Resubmitting job") - command = "scontrol requeue " + os.environ["SLURM_JOB_ID"] - logging.warning(command) - if os.system(command): - raise RuntimeError("Fail to resubmit") - logging.warning("New job submitted to the queue") - exit(0) - - def setup_distributed(rank, world_size): os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" @@ -357,10 +314,6 @@ def train_one_epoch( metric["epoch time"] += metric["iteration time"] metric() - # TODO Remove before merge pull request - if SIGNAL_RECEIVED: - break - if reduce_lr_on_plateau and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(metric["average loss"]) elif not isinstance(scheduler, ReduceLROnPlateau): @@ -410,10 +363,6 @@ def evaluate( compute_error_rates(outputs, targets, decoder, language_model, metric) - # TODO Remove before merge pull request - if SIGNAL_RECEIVED: - break - metric["average loss"] = metric["cumulative loss"] / metric["iteration"] metric["validation time"] = time() - start metric() @@ -430,10 +379,6 @@ def main(rank, args): not_main_rank = args.distributed and rank != 0 - # Install signal handler - # TODO Remove before merge pull request - signal.signal(signal.SIGUSR1, signal_handler) - logging.info("Start time: %s", datetime.now()) # Explicitly set seed to make sure models created in separate processes @@ -456,18 +401,7 @@ def main(rank, args): sample_rate_original = 16000 - input_type = args.type - num_features = args.n_bins - - if args.type == "mel": - transforms = torch.nn.Sequential( - # torchaudio.transforms.Resample(sample_rate_original, sample_rate_original//2), - torchaudio.transforms.MelSpectrogram( - sample_rate=sample_rate_original, **melkwargs - ), - ) - input_type = "mfcc" - elif args.type == "mfcc": + if args.type == "mfcc": transforms = torch.nn.Sequential( torchaudio.transforms.MFCC( sample_rate=sample_rate_original, @@ -475,6 +409,7 @@ def main(rank, args): melkwargs=melkwargs, ), ) + num_features = args.n_bins elif args.type == "waveform": transforms = torch.nn.Sequential(UnsqueezeFirst()) num_features = 1 @@ -526,10 +461,8 @@ def main(rank, args): model = Wav2Letter( num_classes=language_model.length, - input_type=input_type, + input_type=args.type, num_features=num_features, - num_hidden_channels=args.n_hidden_channels, - dropout=args.dropout, ) if args.jit: @@ -717,22 +650,6 @@ def main(rank, args): if args.reduce_lr_valid and isinstance(scheduler, ReduceLROnPlateau): scheduler.step(loss) - # TODO Remove before merge pull request - if SIGNAL_RECEIVED: - save_checkpoint( - { - "epoch": epoch + 1, - "state_dict": model.state_dict(), - "best_loss": best_loss, - "optimizer": optimizer.state_dict(), - "scheduler": scheduler.state_dict(), - }, - False, - args.checkpoint, - not_main_rank, - ) - trigger_job_requeue() - logging.info("End time: %s", datetime.now()) if args.distributed: diff --git a/examples/pipeline_wav2letter/wav2letter.py b/examples/pipeline_wav2letter/wav2letter.py deleted file mode 100644 index 261b14febd..0000000000 --- a/examples/pipeline_wav2letter/wav2letter.py +++ /dev/null @@ -1,123 +0,0 @@ -from torch import Tensor, nn - -__all__ = ["Wav2Letter"] - - -class Wav2Letter(nn.Module): - r"""Wav2Letter model architecture from the `Wav2Letter an End-to-End ConvNet-based Speech Recognition System`_. - - .. 
_Wav2Letter an End-to-End ConvNet-based Speech Recognition System: https://arxiv.org/abs/1609.03193 - - :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}` - - Args: - num_classes (int, optional): Number of classes to be classified. (Default: ``40``) - input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum`` - or ``mfcc`` (Default: ``waveform``). - num_features (int, optional): Number of input features that the network will receive (Default: ``1``). - """ - - def __init__( - self, - num_classes: int = 40, - input_type: str = "waveform", - num_features: int = 1, - num_hidden_channels=2000, - dropout=0.0, - ) -> None: - super(Wav2Letter, self).__init__() - - acoustic_num_features = 250 if input_type == "waveform" else num_features - acoustic_model = nn.Sequential( - nn.Conv1d( - in_channels=acoustic_num_features, - out_channels=250, - kernel_size=48, - stride=2, - padding=23, - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3 - ), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=250, - out_channels=num_hidden_channels, - kernel_size=32, - stride=1, - padding=16, - ), - nn.Dropout(p=dropout), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=num_hidden_channels, - out_channels=num_hidden_channels, - kernel_size=1, - stride=1, - padding=0, - ), - nn.Dropout(p=dropout), - nn.ReLU(inplace=True), - nn.Conv1d( - in_channels=num_hidden_channels, - out_channels=num_classes, - kernel_size=1, - stride=1, - padding=0, - ), - nn.ReLU(inplace=True), - ) - - if input_type == "waveform": - waveform_model = nn.Sequential( - nn.Conv1d( - in_channels=num_features, - out_channels=250, - kernel_size=250, - stride=160, - padding=45, - ), - nn.ReLU(inplace=True), - ) - self.acoustic_model = nn.Sequential(waveform_model, acoustic_model) - - if input_type in ["power_spectrum", "mfcc"]: - self.acoustic_model = acoustic_model - - def forward(self, x: Tensor) -> Tensor: - r""" - Args: - x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length). - - Returns: - Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length). - """ - - x = self.acoustic_model(x) - x = nn.functional.log_softmax(x, dim=1) - return x From cc4db154e7f003404b0c1fd12b5df27853cbf9ee Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 19 Aug 2020 14:08:43 -0700 Subject: [PATCH 128/129] remove comment. 
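One of the stale comments removed below sat next to the target-encoding step, which turns each transcript into integer labels over the alphabet assembled in main.py (char_blank + char_space + char_apostrophe + ascii lowercase). A minimal sketch of that encoding, assuming "*" as the blank symbol (an illustrative stand-in for the LanguageModel encoding path):

```python
import string

char_blank, char_space, char_apostrophe = "*", " ", "'"
labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
mapping = {char: i for i, char in enumerate(labels)}


def encode(text):
    # One integer per character; the CTC blank keeps index 0.
    return [mapping[char] for char in text.lower()]


print(encode("hello world"))  # [10, 7, 14, 14, 17, 1, 25, 17, 20, 14, 6]
```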
--- examples/pipeline_wav2letter/datasets.py | 6 +----- examples/pipeline_wav2letter/main.py | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/pipeline_wav2letter/datasets.py b/examples/pipeline_wav2letter/datasets.py index 0a702055c2..79b05b2c5b 100644 --- a/examples/pipeline_wav2letter/datasets.py +++ b/examples/pipeline_wav2letter/datasets.py @@ -38,18 +38,15 @@ def __len__(self): return len(self.dataset) def process_datapoint(self, item): - transformed = item[0] # .to(device) + transformed = item[0] target = item[2].lower() transformed = self.transforms(transformed) transformed = transformed[0, ...].transpose(0, -1) - # target = " " + target + " " target = self.encode(target) target = torch.tensor(target, dtype=torch.long, device=transformed.device) - # transformed = transformed.to("cpu") - # target = target.to("cpu") return transformed, target @@ -78,7 +75,6 @@ def create(tags, cache=True): ] ) - # data = diskcache_iterator(data) data = MapMemoryCache(data) return data diff --git a/examples/pipeline_wav2letter/main.py b/examples/pipeline_wav2letter/main.py index aaa4d30a2c..8f52ec6c3c 100644 --- a/examples/pipeline_wav2letter/main.py +++ b/examples/pipeline_wav2letter/main.py @@ -443,7 +443,6 @@ def main(rank, args): training, validation = split_process_librispeech( [args.dataset_train, args.dataset_valid], - # [transforms_train, transforms_valid], [transforms, transforms], language_model, root=args.dataset_root, @@ -473,7 +472,6 @@ def main(rank, args): devices = list(range(rank * n, (rank + 1) * n)) model = model.to(devices[0]) model = torch.nn.parallel.DistributedDataParallel(model, device_ids=devices) - # model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) else: devices = ["cuda" if torch.cuda.is_available() else "cpu"] model = model.to(devices[0], non_blocking=True) @@ -526,8 +524,6 @@ def main(rank, args): criterion = torch.nn.CTCLoss( blank=language_model.mapping[char_blank], zero_infinity=False ) - # criterion = torch.nn.MSELoss() - # criterion = torch.nn.NLLLoss() # Data Loader From a2b6ad235e69f0276b8804d06a307c83acae4448 Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Wed, 19 Aug 2020 14:19:38 -0700 Subject: [PATCH 129/129] simpler import with pandas. --- examples/pipeline_wav2letter/README.md | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/examples/pipeline_wav2letter/README.md b/examples/pipeline_wav2letter/README.md index b703a51593..8117d19c91 100644 --- a/examples/pipeline_wav2letter/README.md +++ b/examples/pipeline_wav2letter/README.md @@ -2,7 +2,7 @@ This is an example pipeline for speech recognition using a greedy or Viterbi CTC ### Usage -More information about each command line parameters is available with the `--help` option. An example reproducing the demo can be invoked as follows. +More information about each command line parameters is available with the `--help` option. An example can be invoked as follows. ``` python main.py \ --reduce-lr-valid \ @@ -28,30 +28,12 @@ With these default parameters, we get a character error rate of 13.8% on dev-cle ### Output The information reported at each iteration and epoch (e.g. loss, character error rate, word error rate) is printed to standard output in the form of one json per line, e.g. 
-``` +```python {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 23317.0, "total chars": 23317.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 4446.0, "total words": 4446.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 2453, "dataset length": 128.0, "iteration": 1.0, "loss": 8.712121963500977, "cumulative loss": 8.712121963500977, "average loss": 8.712121963500977, "iteration time": 41.46276903152466, "epoch time": 41.46276903152466} {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 46005.0, "total chars": 46005.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 8762.0, "total words": 8762.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1703, "dataset length": 256.0, "iteration": 2.0, "loss": 8.918599128723145, "cumulative loss": 17.63072109222412, "average loss": 8.81536054611206, "iteration time": 1.2905676364898682, "epoch time": 42.753336668014526} {"name": "train", "epoch": 0, "cer over target length": 1.0, "cumulative cer": 70030.0, "total chars": 70030.0, "cer": 0.0, "cumulative cer over target length": 0.0, "wer over target length": 1.0, "cumulative wer": 13348.0, "total words": 13348.0, "wer": 0.0, "cumulative wer over target length": 0.0, "lr": 0.6, "batch size": 128, "n_channel": 13, "n_time": 1713, "dataset length": 384.0, "iteration": 3.0, "loss": 8.550191879272461, "cumulative loss": 26.180912971496582, "average loss": 8.726970990498861, "iteration time": 1.2109291553497314, "epoch time": 43.96426582336426} ``` -Further information is reported to standard error. Here is an example python function to parse the standard output when saved to a file. -```python -def read_json(filename): - """ - Convert the standard output saved to filename into a pandas dataframe for analysis. - """ - - import pandas - import json - - with open(filename, "r") as f: - data = f.read() - - # pandas doesn't read single quotes for json - data = data.replace("'", '"') - - data = [json.loads(l) for l in data.splitlines()] - return pandas.DataFrame(data) -``` +One way to import the output in python with pandas is by saving the standard output to a file, and then using `pandas.read_json(filename, lines=True)`. ## Structure of pipeline