From 794d61d58de799d0a1d11c69c66fb9490d506271 Mon Sep 17 00:00:00 2001
From: Philip Bontrager
Date: Thu, 30 Oct 2025 08:45:28 -0700
Subject: [PATCH 1/3] initial gpt_oss config

---
 apps/grpo/gpt_oss_20b.yaml | 184 +++++++++++++++++++++++++++++++++++++
 pyproject.toml             |   8 +-
 2 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 apps/grpo/gpt_oss_20b.yaml

diff --git a/apps/grpo/gpt_oss_20b.yaml b/apps/grpo/gpt_oss_20b.yaml
new file mode 100644
index 000000000..ab38bb21c
--- /dev/null
+++ b/apps/grpo/gpt_oss_20b.yaml
@@ -0,0 +1,184 @@
+# Grouped Relative Policy Optimization (GRPO) for GPT-OSS 20B
+# >>> python -m apps.grpo.main --config apps/grpo/gpt_oss_20b.yaml
+
+# Global configuration
+group_size: 4 # Reduced for initial testing to avoid OOM
+local_batch_size: 1 # per-device batch size (reduced for 20B model to avoid OOM)
+max_req_tokens: 512 # Reduced for initial testing
+max_res_tokens: 512 # Reduced for initial testing
+model: "openai/gpt-oss-20b"
+off_by_n: 1 # Off by one by default
+
+# GPU allocation for single-node (8 GPUs total):
+# - Trainer: 4 GPUs (EP=4 for 32 experts -> 8 experts per GPU)
+# - Policy: 2 GPUs (EP=2 for 32 experts -> 16 experts per GPU)
+# - Ref Model: 2 GPUs (EP=2 for 32 experts -> 16 experts per GPU)
+
+# Main loop configuration
+rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+
+# Observability configuration
+metric_logging:
+  wandb:
+    project: grpo-training
+    group: gpt_oss_exp_${oc.env:USER}
+    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
+  console:
+    logging_mode: global_reduce
+
+# Dataset configuration
+dataset:
+  path: "openai/gsm8k"
+  revision: "main"
+  data_split: "train"
+  streaming: true
+  model: ${model}
+
+# Policy configuration (uses vLLM for generation)
+policy:
+  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
+    model: ${model}
+    tensor_parallel_size: 2 # 2 GPUs for policy
+    pipeline_parallel_size: 1
+    enable_expert_parallel: true # Enable expert parallelism for MoE (shards 32 experts across 2 GPUs = 16 experts/GPU)
+    enforce_eager: false
+  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
+    n: ${group_size}
+    max_tokens: ${max_res_tokens}
+    temperature: 1.0
+    top_p: 1.0
+
+# Trainer configuration
+trainer:
+  model:
+    name: gpt_oss
+    flavor: 20b
+    hf_assets_path: hf://${model} # Update when HF checkpoint available
+  optimizer:
+    name: AdamW
+    lr: 8e-5
+    eps: 1e-8
+  lr_scheduler:
+    warmup_steps: 100
+    decay_ratio: 0.8
+    decay_type: "linear"
+    min_lr_factor: 0.0
+  training:
+    local_batch_size: ${local_batch_size}
+    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
+    max_norm: 1.0
+    steps: 1000000
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+    components: ["model", "loss"]
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 4 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (4)
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 4 # EP borrows from dp_shard: 32 experts / 4 = 8 experts per GPU
+    expert_tensor_parallel_degree: 1
+    disable_loss_parallel: true
+  checkpoint:
+    enable: true
+    folder: ./checkpoint # The folder to save checkpoints to.
+    initial_load_path: hf://${model} # Update when HF checkpoint available
+    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
+    last_save_in_hf: false # Disabled since gpt_oss doesn't provide state_dict_adapter
+    interval: 500
+    async_mode: "disabled"
+  activation_checkpoint:
+    mode: selective
+    selective_ac_option: op
+  quantize:
+    linear:
+      float8:
+        enable_fsdp_float8_all_gather: false
+        precompute_float8_dynamic_scale_for_fsdp: false
+        filter_fqns: ["output", "router.gate"]
+    grouped_mm:
+      float8:
+        fqns: ["experts"]
+  comm:
+    trace_buf_size: 0
+
+# Replay buffer configuration
+replay_buffer:
+  batch_size: ${local_batch_size}
+  max_policy_age: ${off_by_n}
+  dp_size: 4 # Total DP degree: dp_replicate * dp_shard = 1 * 4 = 4
+
+# Reference model configuration
+ref_model:
+  model:
+    name: gpt_oss
+    flavor: 20b
+    hf_assets_path: hf://${model}
+  training:
+    seq_len: ${trainer.training.seq_len}
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 2 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (2)
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 2 # EP borrows from dp_shard: 32 experts / 2 = 16 experts per GPU
+    expert_tensor_parallel_degree: 1
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+  quantize:
+    linear:
+      float8:
+        enable_fsdp_float8_all_gather: false
+        precompute_float8_dynamic_scale_for_fsdp: false
+        filter_fqns: ["output", "router.gate"]
+    grouped_mm:
+      float8:
+        fqns: ["experts"]
+  comm:
+    trace_buf_size: 0
+
+# All resource allocations
+services:
+  policy:
+    procs: 2 # 2 GPUs for policy with expert parallelism
+    num_replicas: 1
+    mesh_name: policy
+    with_gpus: true
+  ref_model:
+    procs: 2 # 2 GPUs for reference model with expert parallelism
+    num_replicas: 1
+    mesh_name: ref_model
+    with_gpus: true
+  reward_actor:
+    procs: 1
+    num_replicas: 1
+    mesh_name: reward_actor
+    with_gpus: false
+
+actors:
+  dataset:
+    procs: 1
+    with_gpus: false
+    mesh_name: dataset
+  trainer:
+    procs: 4 # 4 GPUs for trainer with expert parallelism
+    with_gpus: true
+    mesh_name: trainer
+  replay_buffer:
+    procs: 1
+    with_gpus: false
+    mesh_name: replay_buffer
+  compute_advantages:
+    procs: 1
+    with_gpus: false
+    mesh_name: compute_advantages
diff --git a/pyproject.toml b/pyproject.toml
index 8460b5b78..9ad7363b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
     # PyTorch
     "torch==2.9.0",
     "torchdata>=0.8.0",
-    "torchtitan==0.2.0",
+    "torchtitan==0.1.0.dev20251029",
     "torchmonarch==0.1.2",
     "torchstore==0.1.2",
     # vLLM
@@ -83,6 +83,11 @@ members = [
 name = "pytorch-cu128"
 url = "https://download.pytorch.org/whl/cu128"
 
+# pytorch nightly
+[[tool.uv.index]]
+name = "pytorch-nightly-cu128"
+url = "https://download.pytorch.org/whl/nightly/cu128"
+
 # vllm
 [[tool.uv.index]]
 name = "vllm-forge"
@@ -90,6 +95,7 @@ url = "https://download.pytorch.org/whl/preview/forge"
 
 [tool.uv.sources]
 torch = { index = "pytorch-cu128" }
+torchtitan = { index = "pytorch-nightly-cu128" }
 vllm = { index = "vllm-forge" }
 
 [tool.uv]

From 37e0231079607b5dea8ea4d2353d877c052178c1 Mon Sep 17 00:00:00 2001
From: Philip Bontrager
Date: Mon, 10 Nov 2025 07:26:58 -0800
Subject: [PATCH 2/3] moe config

---
 apps/grpo/qwen3_30b_a3b.yaml | 186 +++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 apps/grpo/qwen3_30b_a3b.yaml

diff --git a/apps/grpo/qwen3_30b_a3b.yaml b/apps/grpo/qwen3_30b_a3b.yaml
new file mode 100644
index 000000000..cf155c2b7
--- /dev/null
+++ b/apps/grpo/qwen3_30b_a3b.yaml
@@ -0,0 +1,186 @@
+# Grouped Relative Policy Optimization (GRPO) for Qwen3 30B-A3B
+# >>> python -m apps.grpo.main --config apps/grpo/qwen3_30b_a3b.yaml
+
+# Global configuration
+group_size: 4 # Reduced for initial testing to avoid OOM
+local_batch_size: 1 # per-device batch size (reduced for 30B MoE model to avoid OOM)
+max_req_tokens: 512 # Reduced for initial testing
+max_res_tokens: 512 # Reduced for initial testing
+model: "Qwen/Qwen3-30B-A3B"
+off_by_n: 1 # Off by one by default
+
+# GPU allocation for single-node (8 GPUs total):
+# - Trainer: 4 GPUs (EP=4 for MoE experts)
+# - Policy: 2 GPUs (EP=2 for MoE experts)
+# - Ref Model: 2 GPUs (EP=2 for MoE experts)
+
+# Main loop configuration
+rollout_threads: 1 # Recommended to set equal to policy.num_replicas
+
+# Observability configuration
+metric_logging:
+  wandb:
+    project: grpo-training
+    group: qwen3_30b_a3b_exp_${oc.env:USER}
+    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
+  console:
+    logging_mode: global_reduce
+
+# Dataset configuration
+dataset:
+  path: "openai/gsm8k"
+  revision: "main"
+  data_split: "train"
+  streaming: true
+  model: ${model}
+
+# Policy configuration (uses vLLM for generation)
+policy:
+  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
+    model: ${model}
+    tensor_parallel_size: 2 # 2 GPUs for policy
+    pipeline_parallel_size: 1
+    enable_expert_parallel: true # Enable expert parallelism for MoE
+    enforce_eager: false
+  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
+    n: ${group_size}
+    max_tokens: ${max_res_tokens}
+    temperature: 1.0
+    top_p: 1.0
+
+# Trainer configuration
+trainer:
+  model:
+    name: qwen3
+    flavor: 30B-A3B
+    hf_assets_path: hf://${model}
+  optimizer:
+    name: AdamW
+    lr: 8e-5
+    eps: 1e-8
+  lr_scheduler:
+    warmup_steps: 100
+    decay_ratio: 0.8
+    decay_type: "linear"
+    min_lr_factor: 0.0
+  training:
+    local_batch_size: ${local_batch_size}
+    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
+    max_norm: 1.0
+    steps: 1000000
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+    components: ["model", "loss"]
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 4 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (4)
+    tensor_parallel_degree: 1
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 4 # EP borrows from dp_shard for MoE
+    expert_tensor_parallel_degree: 1
+    disable_loss_parallel: true
+  checkpoint:
+    enable: true
+    folder: ./checkpoint # The folder to save checkpoints to.
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
+    last_save_in_hf: true
+    interval: 500
+    async_mode: "disabled"
+  activation_checkpoint:
+    mode: selective
+    selective_ac_option: op
+  quantize:
+    linear:
+      float8:
+        enable_fsdp_float8_all_gather: false
+        precompute_float8_dynamic_scale_for_fsdp: false
+        recipe_name: "rowwise"
+        filter_fqns: ["output", "router.gate"]
+    grouped_mm:
+      float8:
+        fqns: ["experts"]
+  comm:
+    trace_buf_size: 0
+
+# Replay buffer configuration
+replay_buffer:
+  batch_size: ${local_batch_size}
+  max_policy_age: ${off_by_n}
+  dp_size: 4 # Total DP degree: dp_replicate * dp_shard = 1 * 4 = 4
+
+# Reference model configuration
+ref_model:
+  model:
+    name: qwen3
+    flavor: 30B-A3B
+    hf_assets_path: hf://${model}
+  training:
+    seq_len: ${trainer.training.seq_len}
+    dtype: bfloat16
+    gc_freq: 1
+  compile:
+    enable: false
+  parallelism:
+    data_parallel_replicate_degree: 1
+    data_parallel_shard_degree: 1 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (2)
+    tensor_parallel_degree: 2
+    pipeline_parallel_degree: 1
+    context_parallel_degree: 1
+    expert_parallel_degree: 2 # EP borrows from dp_shard for MoE
+    expert_tensor_parallel_degree: 1
+  checkpoint:
+    enable: true
+    initial_load_path: hf://${model}
+    initial_load_in_hf: true
+  quantize:
+    linear:
+      float8:
+        enable_fsdp_float8_all_gather: false
+        precompute_float8_dynamic_scale_for_fsdp: false
+        recipe_name: "rowwise"
+        filter_fqns: ["output", "router.gate"]
+    grouped_mm:
+      float8:
+        fqns: ["experts"]
+  comm:
+    trace_buf_size: 0
+
+# All resource allocations
+services:
+  policy:
+    procs: 2 # 2 GPUs for policy with expert parallelism
+    num_replicas: 1
+    mesh_name: policy
+    with_gpus: true
+  ref_model:
+    procs: 2 # 2 GPUs for reference model with expert parallelism
+    num_replicas: 1
+    mesh_name: ref_model
+    with_gpus: true
+  reward_actor:
+    procs: 1
+    num_replicas: 1
+    mesh_name: reward_actor
+    with_gpus: false
+
+actors:
+  dataset:
+    procs: 1
+    with_gpus: false
+    mesh_name: dataset
+  trainer:
+    procs: 4 # 4 GPUs for trainer with expert parallelism
+    with_gpus: true
+    mesh_name: trainer
+  replay_buffer:
+    procs: 1
+    with_gpus: false
+    mesh_name: replay_buffer
+  compute_advantages:
+    procs: 1
+    with_gpus: false
+    mesh_name: compute_advantages

From 3665832e400deae8c0270baaf96b93fcaa1bc169 Mon Sep 17 00:00:00 2001
From: Philip Bontrager
Date: Mon, 10 Nov 2025 07:33:01 -0800
Subject: [PATCH 3/3] include missing edits

---
 apps/grpo/gpt_oss_20b.yaml          | 184 ----------------------------
 src/forge/actors/reference_model.py |   3 +
 2 files changed, 3 insertions(+), 184 deletions(-)
 delete mode 100644 apps/grpo/gpt_oss_20b.yaml

diff --git a/apps/grpo/gpt_oss_20b.yaml b/apps/grpo/gpt_oss_20b.yaml
deleted file mode 100644
index ab38bb21c..000000000
--- a/apps/grpo/gpt_oss_20b.yaml
+++ /dev/null
@@ -1,184 +0,0 @@
-# Grouped Relative Policy Optimization (GRPO) for GPT-OSS 20B
-# >>> python -m apps.grpo.main --config apps/grpo/gpt_oss_20b.yaml
-
-# Global configuration
-group_size: 4 # Reduced for initial testing to avoid OOM
-local_batch_size: 1 # per-device batch size (reduced for 20B model to avoid OOM)
-max_req_tokens: 512 # Reduced for initial testing
-max_res_tokens: 512 # Reduced for initial testing
-model: "openai/gpt-oss-20b"
-off_by_n: 1 # Off by one by default
-
-# GPU allocation for single-node (8 GPUs total):
-# - Trainer: 4 GPUs (EP=4 for 32 experts -> 8 experts per GPU)
-# - Policy: 2 GPUs (EP=2 for 32 experts -> 16 experts per GPU)
-# - Ref Model: 2 GPUs (EP=2 for 32 experts -> 16 experts per GPU)
-
-# Main loop configuration
-rollout_threads: 1 # Recommended to set equal to policy.num_replicas
-
-# Observability configuration
-metric_logging:
-  wandb:
-    project: grpo-training
-    group: gpt_oss_exp_${oc.env:USER}
-    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
-  console:
-    logging_mode: global_reduce
-
-# Dataset configuration
-dataset:
-  path: "openai/gsm8k"
-  revision: "main"
-  data_split: "train"
-  streaming: true
-  model: ${model}
-
-# Policy configuration (uses vLLM for generation)
-policy:
-  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
-    model: ${model}
-    tensor_parallel_size: 2 # 2 GPUs for policy
-    pipeline_parallel_size: 1
-    enable_expert_parallel: true # Enable expert parallelism for MoE (shards 32 experts across 2 GPUs = 16 experts/GPU)
-    enforce_eager: false
-  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
-    n: ${group_size}
-    max_tokens: ${max_res_tokens}
-    temperature: 1.0
-    top_p: 1.0
-
-# Trainer configuration
-trainer:
-  model:
-    name: gpt_oss
-    flavor: 20b
-    hf_assets_path: hf://${model} # Update when HF checkpoint available
-  optimizer:
-    name: AdamW
-    lr: 8e-5
-    eps: 1e-8
-  lr_scheduler:
-    warmup_steps: 100
-    decay_ratio: 0.8
-    decay_type: "linear"
-    min_lr_factor: 0.0
-  training:
-    local_batch_size: ${local_batch_size}
-    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
-    max_norm: 1.0
-    steps: 1000000
-    dtype: bfloat16
-    gc_freq: 1
-  compile:
-    enable: false
-    components: ["model", "loss"]
-  parallelism:
-    data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 4 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (4)
-    tensor_parallel_degree: 1
-    pipeline_parallel_degree: 1
-    context_parallel_degree: 1
-    expert_parallel_degree: 4 # EP borrows from dp_shard: 32 experts / 4 = 8 experts per GPU
-    expert_tensor_parallel_degree: 1
-    disable_loss_parallel: true
-  checkpoint:
-    enable: true
-    folder: ./checkpoint # The folder to save checkpoints to.
-    initial_load_path: hf://${model} # Update when HF checkpoint available
-    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
-    last_save_in_hf: false # Disabled since gpt_oss doesn't provide state_dict_adapter
-    interval: 500
-    async_mode: "disabled"
-  activation_checkpoint:
-    mode: selective
-    selective_ac_option: op
-  quantize:
-    linear:
-      float8:
-        enable_fsdp_float8_all_gather: false
-        precompute_float8_dynamic_scale_for_fsdp: false
-        filter_fqns: ["output", "router.gate"]
-    grouped_mm:
-      float8:
-        fqns: ["experts"]
-  comm:
-    trace_buf_size: 0
-
-# Replay buffer configuration
-replay_buffer:
-  batch_size: ${local_batch_size}
-  max_policy_age: ${off_by_n}
-  dp_size: 4 # Total DP degree: dp_replicate * dp_shard = 1 * 4 = 4
-
-# Reference model configuration
-ref_model:
-  model:
-    name: gpt_oss
-    flavor: 20b
-    hf_assets_path: hf://${model}
-  training:
-    seq_len: ${trainer.training.seq_len}
-    dtype: bfloat16
-    gc_freq: 1
-  compile:
-    enable: false
-  parallelism:
-    data_parallel_replicate_degree: 1
-    data_parallel_shard_degree: 2 # Must satisfy: dp_replicate * dp_shard * cp * tp * pp = world_size (2)
-    tensor_parallel_degree: 1
-    pipeline_parallel_degree: 1
-    context_parallel_degree: 1
-    expert_parallel_degree: 2 # EP borrows from dp_shard: 32 experts / 2 = 16 experts per GPU
-    expert_tensor_parallel_degree: 1
-  checkpoint:
-    enable: true
-    initial_load_path: hf://${model}
-    initial_load_in_hf: true
-  quantize:
-    linear:
-      float8:
-        enable_fsdp_float8_all_gather: false
-        precompute_float8_dynamic_scale_for_fsdp: false
-        filter_fqns: ["output", "router.gate"]
-    grouped_mm:
-      float8:
-        fqns: ["experts"]
-  comm:
-    trace_buf_size: 0
-
-# All resource allocations
-services:
-  policy:
-    procs: 2 # 2 GPUs for policy with expert parallelism
-    num_replicas: 1
-    mesh_name: policy
-    with_gpus: true
-  ref_model:
-    procs: 2 # 2 GPUs for reference model with expert parallelism
-    num_replicas: 1
-    mesh_name: ref_model
-    with_gpus: true
-  reward_actor:
-    procs: 1
-    num_replicas: 1
-    mesh_name: reward_actor
-    with_gpus: false
-
-actors:
-  dataset:
-    procs: 1
-    with_gpus: false
-    mesh_name: dataset
-  trainer:
-    procs: 4 # 4 GPUs for trainer with expert parallelism
-    with_gpus: true
-    mesh_name: trainer
-  replay_buffer:
-    procs: 1
-    with_gpus: false
-    mesh_name: replay_buffer
-  compute_advantages:
-    procs: 1
-    with_gpus: false
-    mesh_name: compute_advantages
diff --git a/src/forge/actors/reference_model.py b/src/forge/actors/reference_model.py
index 02a6e1410..973220f6a 100644
--- a/src/forge/actors/reference_model.py
+++ b/src/forge/actors/reference_model.py
@@ -22,6 +22,7 @@
     Compile,
     Model,
     Parallelism,
+    Quantize,
     Training,
 )
 from torchtitan.experiments.forge.engine import ForgeEngine
@@ -61,6 +62,7 @@ class ReferenceModel(ForgeActor):
             (TP, PP, CP, DP)
         checkpoint (Checkpoint): Checkpoint loading configuration
         compile (Compile): Torch compilation settings
+        quantize (Quantize): Quantization settings (float8, etc.)
         comm (Comm): Communication backend configuration
         training (Training): Training-related settings (dtype, garbage collection,
             etc.)
@@ -71,6 +73,7 @@
     parallelism: Parallelism = field(default_factory=Parallelism)
     checkpoint: Checkpoint = field(default_factory=Checkpoint)
     compile: Compile = field(default_factory=Compile)
+    quantize: Quantize = field(default_factory=Quantize)
     comm: Comm = field(default_factory=Comm)
     training: Training = field(
         default_factory=Training