From 35f577b3798d22135cb980b04ec3e7362414a4ab Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 09:36:56 -0800
Subject: [PATCH 1/4] Bunch of cleanups

Clean up the repo to:
1. remove unused files
2. delete unused code comments
---
 torchtrain/datasets/pad_batch_sequence.py    | 77 --------------------
 torchtrain/parallelisms/parallelize_llama.py |  6 --
 2 files changed, 83 deletions(-)
 delete mode 100644 torchtrain/datasets/pad_batch_sequence.py

diff --git a/torchtrain/datasets/pad_batch_sequence.py b/torchtrain/datasets/pad_batch_sequence.py
deleted file mode 100644
index ffdbf555fc..0000000000
--- a/torchtrain/datasets/pad_batch_sequence.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# copied from torchtune
-
-from typing import List, Tuple
-
-import torch
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-
-# TokenPair is a pair (tuple) of two lists: tokenized text inputs and labels.
-TokenPair = Tuple[List[int], List[int]]
-
-_DEFAULT_INPUT_PADDING_IDX: int = 0
-_DEFAULT_LABEL_PADDING_IDX: int = -100
-
-
-def pad_batch_to_longest_seq(
-    batch: List[TokenPair],
-    input_padding_idx: int = _DEFAULT_INPUT_PADDING_IDX,
-    label_padding_idx: int = _DEFAULT_LABEL_PADDING_IDX,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """Pad a batch of sequences to the longest sequence length in the batch, and
-    convert integer lists to tensors.
-
-    Args:
-        batch (List[TokenPair]): A list of tuples containing input, label pairs.
-        input_padding_idx (int): Padding index for input ids. Defaults to 0.
-        label_padding_idx (int): Padding index for labels. Defaults to -100.
-    Returns:
-        Collated input and label tensors.
-
-    Example:
-        token_pairs = [
-            ([1, 2, 3], [4, 5, 6]),
-            ([7,], [10,],),
-        ]
-        inputs, labels = batch_pad_to_longest_seq(
-            batch=token_pairs,
-            input_padding_idx=input_padding_idx,
-            label_padding_idx=label_padding_idx,
-        )
-        >>> inputs
-        tensor([[1, 2, 3], [7, 0, 0]])
-        >>> labels
-        tensor([[4,5,6], [10,-100,-100]])
-    """
-    input_ids = pad_sequence(
-        [torch.tensor(x[0]) for x in batch],
-        batch_first=True,
-        padding_value=input_padding_idx,
-    )
-    labels = pad_sequence(
-        [torch.tensor(x[1]) for x in batch],
-        batch_first=True,
-        padding_value=label_padding_idx,
-    )
-
-    input_ids_seq_len = input_ids.shape[-1]
-    labels_seq_len = labels.shape[-1]
-
-    # Hack to pad correctly and not use max_seq_len, which is costly
-    if input_ids_seq_len > labels_seq_len:
-        labels = F.pad(
-            labels, (0, input_ids_seq_len - labels_seq_len), value=label_padding_idx
-        )
-    elif labels_seq_len > input_ids_seq_len:
-        input_ids = F.pad(
-            input_ids,
-            (0, labels_seq_len - input_ids_seq_len),
-            value=input_padding_idx,
-        )
-    return input_ids, labels
diff --git a/torchtrain/parallelisms/parallelize_llama.py b/torchtrain/parallelisms/parallelize_llama.py
index 399533e2f0..6538d418fe 100644
--- a/torchtrain/parallelisms/parallelize_llama.py
+++ b/torchtrain/parallelisms/parallelize_llama.py
@@ -130,12 +130,6 @@ def parallelize_llama(model, world_mesh, parallel_dims, args):
                 "feed_forward.w2": RowwiseParallel(output_layouts=Shard(0)),
                 "feed_forward.w3": ColwiseParallel(),
             }
-            # if layer_id == 0:
-            #     # in first transformer block we need to shard the input
-            #     layer_plan[""] = PrepareModuleInput(
-            #         input_layouts=(Replicate(), None),
-            #         desired_input_layouts=(Shard(0), None),
-            #     )
 
             # adjust num_heads in attention layer to local heads
             attn_layer = transformer_block.attention

From 27e40832872a6e155e240d84061535eed4260d10 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 09:58:28 -0800
Subject: [PATCH 2/4] add design principles

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 35930f6d94..c867eeac4b 100644
--- a/README.md
+++ b/README.md
@@ -67,3 +67,12 @@ If your gpu count per node is not 8, adjust:
 ```#SBATCH --gpus-per-task```
 
 in the SBATCH command section.
+
+
+## Design Principles
+
+TorchTrain is a native PyTorch library with various training techniques, it utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.
+
+* Designed to be easy to understand, use and extend for different training purposes.
+* Minimal code changes to the model code, when applying 2D/3D Parallelisms.
+* Modular components instead of monolithic codebase

From bee50b072db3c568197dc0590ab51d5d6099f90e Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 10:52:20 -0800
Subject: [PATCH 3/4] fix imports

---
 torchtrain/datasets/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/torchtrain/datasets/__init__.py b/torchtrain/datasets/__init__.py
index 8d91e584cc..338b5e1fa4 100644
--- a/torchtrain/datasets/__init__.py
+++ b/torchtrain/datasets/__init__.py
@@ -2,7 +2,6 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 from torchtrain.datasets.alpaca import build_alpaca_data_loader
-from torchtrain.datasets.pad_batch_sequence import pad_batch_to_longest_seq
 from torchtrain.datasets.tokenizer import create_tokenizer
 
 __all__ = ["build_alpaca_data_loader", "create_tokenizer", "pad_batch_to_longest_seq"]

From 52b8a82e2106a5c38fad944cc19c772dd46ce6b1 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 23 Feb 2024 11:11:55 -0800
Subject: [PATCH 4/4] address comments

---
 README.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c867eeac4b..14996ced79 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,14 @@ Note: This repository is currently under heavy development.
 
 torchtrain contains PyTorch native parallelisms, tools and utilities to train large models.
 
+## Design Principles
+
+TorchTrain is a native PyTorch library with various training techniques. While it utilizes the PyTorch ecosystem for things like data loading (e.g., HuggingFace datasets), the core functionality is written in PyTorch.
+
+* Designed to be easy to understand, use, and extend for different training purposes.
+* Minimal changes to the model code when applying 1D, 2D, or 3D parallelisms.
+* Modular components instead of a monolithic codebase.
+
 # Installation
 
 Install PyTorch from source or install the latest pytorch nightly, then install requirements by
@@ -67,12 +75,3 @@ If your gpu count per node is not 8, adjust:
 ```#SBATCH --gpus-per-task```
 
 in the SBATCH command section.
-
-
-## Design Principles
-
-TorchTrain is a native PyTorch library with various training techniques, it utilizes the PyTorch ecosystem for things like data loading (i.e. HuggingFace datasets), the core functionality is written in PyTorch.
-
-* Designed to be easy to understand, use and extend for different training purposes.
-* Minimal code changes to the model code, when applying 2D/3D Parallelisms.
-* Modular components instead of monolithic codebase
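
For context on the tensor-parallel plan kept in parallelize_llama.py above: a `layer_plan` dict of that shape is consumed by `torch.distributed.tensor.parallel.parallelize_module`. Below is a minimal, illustrative sketch of how such a plan could be applied to one transformer block; the `apply_tp_to_block` helper and the `tp_mesh` argument are assumptions made for this example, and only the `feed_forward` entries mirror the hunk above — it is not code from the patches.

```python
# Illustrative sketch, not part of the patches above.
import torch.nn as nn
from torch.distributed._tensor import Shard
from torch.distributed.tensor.parallel import (
    ColwiseParallel,
    RowwiseParallel,
    parallelize_module,
)


def apply_tp_to_block(transformer_block: nn.Module, tp_mesh) -> nn.Module:
    """Shard one transformer block's feed-forward weights over tp_mesh."""
    layer_plan = {
        # w1/w3 are column-sharded, so their outputs stay sharded; w2 is
        # row-sharded and reduces them, producing its output sharded on
        # dim 0 to match output_layouts=Shard(0) in the hunk above.
        "feed_forward.w1": ColwiseParallel(),
        "feed_forward.w2": RowwiseParallel(output_layouts=Shard(0)),
        "feed_forward.w3": ColwiseParallel(),
    }
    # parallelize_module swaps the named submodules' parameters for DTensors
    # laid out according to layer_plan; the block's forward code is untouched.
    return parallelize_module(transformer_block, tp_mesh, layer_plan)
```

Here `tp_mesh` is assumed to be a 1-D `DeviceMesh` for the tensor-parallel group (for example, one built with `torch.distributed.device_mesh.init_device_mesh`). The sharding is expressed entirely in the plan rather than in the model definition, which is what the "minimal changes to the model code" principle in the README refers to.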