From 35f577b3798d22135cb980b04ec3e7362414a4ab Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 09:36:56 -0800
Subject: [PATCH 1/4] Bunch of cleanups

Clean up the repo to:
1. remove unused files
2. delete unused code comments
---
 torchtrain/datasets/pad_batch_sequence.py    | 77 --------------------
 torchtrain/parallelisms/parallelize_llama.py |  6 --
 2 files changed, 83 deletions(-)
 delete mode 100644 torchtrain/datasets/pad_batch_sequence.py

diff --git a/torchtrain/datasets/pad_batch_sequence.py b/torchtrain/datasets/pad_batch_sequence.py
deleted file mode 100644
index ffdbf555fc..0000000000
--- a/torchtrain/datasets/pad_batch_sequence.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# copied from torchtune
-
-from typing import List, Tuple
-
-import torch
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-
-# TokenPair is a pair (tuple) of two lists: tokenized text inputs and labels.
-TokenPair = Tuple[List[int], List[int]]
-
-_DEFAULT_INPUT_PADDING_IDX: int = 0
-_DEFAULT_LABEL_PADDING_IDX: int = -100
-
-
-def pad_batch_to_longest_seq(
-    batch: List[TokenPair],
-    input_padding_idx: int = _DEFAULT_INPUT_PADDING_IDX,
-    label_padding_idx: int = _DEFAULT_LABEL_PADDING_IDX,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Pad a batch of sequences to the longest sequence length in the batch, and
-    convert integer lists to tensors.
-
-    Args:
-        batch (List[TokenPair]): A list of tuples containing input, label pairs.
-        input_padding_idx (int): Padding index for input ids. Defaults to 0.
-        label_padding_idx (int): Padding index for labels. Defaults to -100.
-    Returns:
-        Collated input and label tensors.
-
-    Example:
-        token_pairs = [
-            ([1, 2, 3], [4, 5, 6]),
-            ([7,], [10,],),
-        ]
-        inputs, labels = batch_pad_to_longest_seq(
-            batch=token_pairs,
-            input_padding_idx=input_padding_idx,
-            label_padding_idx=label_padding_idx,
-        )
-        >>> inputs
-        tensor([[1, 2, 3], [7, 0, 0]])
-        >>> labels
-        tensor([[4,5,6], [10,-100,-100]])
-    """
-    input_ids = pad_sequence(
-        [torch.tensor(x[0]) for x in batch],
-        batch_first=True,
-        padding_value=input_padding_idx,
-    )
-    labels = pad_sequence(
-        [torch.tensor(x[1]) for x in batch],
-        batch_first=True,
-        padding_value=label_padding_idx,
-    )
-
-    input_ids_seq_len = input_ids.shape[-1]
-    labels_seq_len = labels.shape[-1]
-
-    # Hack to pad correctly and not use max_seq_len, which is costly
-    if input_ids_seq_len > labels_seq_len:
-        labels = F.pad(
-            labels, (0, input_ids_seq_len - labels_seq_len), value=label_padding_idx
-        )
-    elif labels_seq_len > input_ids_seq_len:
-        input_ids = F.pad(
-            input_ids,
-            (0, labels_seq_len - input_ids_seq_len),
-            value=input_padding_idx,
-        )
-    return input_ids, labels
diff --git a/torchtrain/parallelisms/parallelize_llama.py b/torchtrain/parallelisms/parallelize_llama.py
index 399533e2f0..6538d418fe 100644
--- a/torchtrain/parallelisms/parallelize_llama.py
+++ b/torchtrain/parallelisms/parallelize_llama.py
@@ -130,12 +130,6 @@ def parallelize_llama(model, world_mesh, parallel_dims, args):
                 "feed_forward.w2": RowwiseParallel(output_layouts=Shard(0)),
                 "feed_forward.w3": ColwiseParallel(),
             }
-            # if layer_id == 0:
-            #     # in first transformer block we need to shard the input
-            #     layer_plan[""] = PrepareModuleInput(
-            #         input_layouts=(Replicate(), None),
-            #         desired_input_layouts=(Shard(0), None),
-            #     )
 
             # adjust num_heads in attention layer to local heads
             attn_layer = transformer_block.attention

From 27e40832872a6e155e240d84061535eed4260d10 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 09:58:28 -0800
Subject: [PATCH 2/4] add design principles

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 35930f6d94..c867eeac4b 100644
--- a/README.md
+++ b/README.md
@@ -67,3 +67,12 @@ If your gpu count per node is not 8, adjust:
 ```#SBATCH --gpus-per-task```
 
 in the SBATCH command section.
+
+
+## Design Principles
+
+TorchTrain is a native PyTorch library with various training techniques, it utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.
+
+* Designed to be easy to understand, use and extend for different training purposes.
+* Minimal code changes to the model code, when applying 2D/3D Parallelisms.
+* Modular components instead of monolithic codebase

From bee50b072db3c568197dc0590ab51d5d6099f90e Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 10:52:20 -0800
Subject: [PATCH 3/4] fix imports

---
 torchtrain/datasets/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchtrain/datasets/__init__.py b/torchtrain/datasets/__init__.py
index 8d91e584cc..338b5e1fa4 100644
--- a/torchtrain/datasets/__init__.py
+++ b/torchtrain/datasets/__init__.py
@@ -2,7 +2,6 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 from torchtrain.datasets.alpaca import build_alpaca_data_loader
-from torchtrain.datasets.pad_batch_sequence import pad_batch_to_longest_seq
 from torchtrain.datasets.tokenizer import create_tokenizer
 
 __all__ = ["build_alpaca_data_loader", "create_tokenizer", "pad_batch_to_longest_seq"]

From 52b8a82e2106a5c38fad944cc19c772dd46ce6b1 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 11:11:55 -0800
Subject: [PATCH 4/4] address comments

---
 README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c867eeac4b..14996ced79 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,14 @@ Note: This repository is currently under heavy development.
 
 torchtrain contains PyTorch native parallelisms, tools and utilities to train large models.
 
+## Design Principles
+
+TorchTrain is a native PyTorch library with various training techniques. While it utilizes the PyTorch ecosystem for things like data loading (e.g., HuggingFace datasets), the core functionality is written in PyTorch.
+
+* Designed to be easy to understand, use, and extend for different training purposes.
+* Minimal changes to the model code when applying 1D, 2D, or 3D parallelisms.
+* Modular components instead of a monolithic codebase.
+
 # Installation
 
 Install PyTorch from source or install the latest pytorch nightly, then install requirements by
@@ -67,12 +75,3 @@ If your gpu count per node is not 8, adjust:
 ```#SBATCH --gpus-per-task```
 
 in the SBATCH command section.
-
-
-## Design Principles
-
-TorchTrain is a native PyTorch library with various training techniques, it utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.
-
-* Designed to be easy to understand, use and extend for different training purposes.
-* Minimal code changes to the model code, when applying 2D/3D Parallelisms.
-* Modular components instead of monolithic codebase
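
For context on the tensor-parallel plan kept in parallelize_llama.py above: a `layer_plan` dict of that shape is consumed by `torch.distributed.tensor.parallel.parallelize_module`. Below is a minimal, illustrative sketch of how such a plan could be applied to one transformer block; the `apply_tp_to_block` helper and the `tp_mesh` argument are assumptions made for this example, and only the `feed_forward` entries mirror the hunk above — it is not code from the patches.

```python
# Illustrative sketch, not part of the patches above.
import torch.nn as nn
from torch.distributed._tensor import Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)


def apply_tp_to_block(transformer_block: nn.Module, tp_mesh) -> nn.Module:
    """Shard one transformer block's feed-forward weights over tp_mesh."""
    layer_plan = {
        # w1/w3 are column-sharded, so their outputs stay sharded; w2 is
        # row-sharded and reduces them, producing its output sharded on
        # dim 0 to match output_layouts=Shard(0) in the hunk above.
        "feed_forward.w1": ColwiseParallel(),
        "feed_forward.w2": RowwiseParallel(output_layouts=Shard(0)),
        "feed_forward.w3": ColwiseParallel(),
    }
    # parallelize_module swaps the named submodules' parameters for DTensors
    # laid out according to layer_plan; the block's forward code is untouched.
    return parallelize_module(transformer_block, tp_mesh, layer_plan)
```

Here `tp_mesh` is assumed to be a 1-D `DeviceMesh` for the tensor-parallel group (for example, one built with `torch.distributed.device_mesh.init_device_mesh`). The sharding is expressed entirely in the plan rather than in the model definition, which is what the "minimal changes to the model code" principle in the README refers to.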