From d9b4f04b802b04a396e47079117b46900a1cff7a Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:27:41 +0100
Subject: [PATCH 1/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 49 +++++++++-----------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 4d35b18a360..2b6e4a884a9 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -253,12 +253,11 @@
 #
 #
 
-print("action_spec:", env.action_spec)
-print("reward_spec:", env.reward_spec)
-print("done_spec:", env.done_spec)
+print("action_spec:", env.full_action_spec)
+print("reward_spec:", env.full_reward_spec)
+print("done_spec:", env.full_done_spec)
 print("observation_spec:", env.observation_spec)
 
-
 ######################################################################
 # Using the commands just shown we can access the domain of each value.
 # Doing this we can see that all specs apart from done have a leading shape ``(num_vmas_envs, n_agents)``.
@@ -270,35 +269,20 @@
 # In fact, specs that have the additional agent dimension
 # (i.e., they vary for each agent) will be contained in a inner "agents" key.
 #
-# To access the full structure of the specs we can use
-#
-
-print("full_action_spec:", env.input_spec["full_action_spec"])
-print("full_reward_spec:", env.output_spec["full_reward_spec"])
-print("full_done_spec:", env.output_spec["full_done_spec"])
-
-######################################################################
 # As you can see the reward and action spec present the "agent" key,
 # meaning that entries in tensordicts belonging to those specs will be nested in an "agents" tensordict,
 # grouping all per-agent values.
 #
-# To quickly access the key for each of these values in tensordicts, we can simply ask the environment for the
-# respective key, and
+# To quickly access the keys for each of these values in tensordicts, we can simply ask the environment for the
+# respective keys, and
 # we will immediately understand which are per-agent and which shared.
 # This info will be useful in order to tell all other TorchRL components where to find each value
 #
 
-print("action_key:", env.action_key)
-print("reward_key:", env.reward_key)
-print("done_key:", env.done_key)
+print("action_key:", env.action_keys)
+print("reward_key:", env.reward_keys)
+print("done_key:", env.done_keys)
 
-######################################################################
-# To tie it all together, we can see that passing these keys to the full specs gives us the leaf domains
-#
-
-assert env.action_spec == env.input_spec["full_action_spec"][env.action_key]
-assert env.reward_spec == env.output_spec["full_reward_spec"][env.reward_key]
-assert env.done_spec == env.output_spec["full_done_spec"][env.done_key]
 
 ######################################################################
 # Transforms
@@ -615,6 +599,9 @@
     action=env.action_key,
     sample_log_prob=("agents", "sample_log_prob"),
    value=("agents", "state_value"),
+    # These last 2 keys will be expanded to match the reward shape
+    done=("agents", "done"),
+    terminated=("agents", "terminated"),
 )
 
 
@@ -649,11 +636,19 @@
 episode_reward_mean_list = []
 for tensordict_data in collector:
     tensordict_data.set(
-        ("next", "done"),
+        ("next", "agents", "done"),
         tensordict_data.get(("next", "done"))
         .unsqueeze(-1)
-        .expand(tensordict_data.get(("next", env.reward_key)).shape),
-    )  # We need to expand the done to match the reward shape (this is expected by the value estimator)
+        .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
+    )
+    tensordict_data.set(
+        ("next", "agents", "terminated"),
+        tensordict_data.get(("next", "terminated"))
+        .unsqueeze(-1)
+        .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
+    )
+
+    # We need to expand the done to match the reward shape (this is expected by the value estimator)
 
     with torch.no_grad():
         GAE(

From c522418f8e137a6316a6f4fe8a092b5e23442cb6 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:29:23 +0100
Subject: [PATCH 2/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 2b6e4a884a9..05e824b7e6e 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -647,8 +647,7 @@
         .unsqueeze(-1)
         .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
     )
-
-    # We need to expand the done to match the reward shape (this is expected by the value estimator)
+    # We need to expand the done and terminated to match the reward shape (this is expected by the value estimator)
 
     with torch.no_grad():
         GAE(

From 221e457391d77475f9f0129749d09e1a2bc8f0c3 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:30:07 +0100
Subject: [PATCH 3/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 05e824b7e6e..7b1061f6f99 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -279,9 +279,9 @@
 # This info will be useful in order to tell all other TorchRL components where to find each value
 #
 
-print("action_key:", env.action_keys)
-print("reward_key:", env.reward_keys)
-print("done_key:", env.done_keys)
+print("action_keys:", env.action_keys)
+print("reward_keys:", env.reward_keys)
+print("done_keys:", env.done_keys)
 
 
 ######################################################################

From d2f5c2c3cfab665b8b2e7bfd6eda0932137cdcd9 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:33:06 +0100
Subject: [PATCH 4/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 7b1061f6f99..c5ae154fcfd 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -682,7 +682,7 @@
     collector.update_policy_weights_()
 
     # Logging
-    done = tensordict_data.get(("next", "done"))
+    done = tensordict_data.get(("next", "agents", "done"))
     episode_reward_mean = (
         tensordict_data.get(("next", "agents", "episode_reward"))[done].mean().item()
     )

From 80f9d54770719eb7ecadfde259c5e94f206602ee Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Wed, 4 Oct 2023 08:51:45 +0100
Subject: [PATCH 5/5] update

Signed-off-by: Matteo Bettini
---
 examples/multiagent/sac.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/multiagent/sac.py b/examples/multiagent/sac.py
index 6fc063c2411..fb184291c90 100644
--- a/examples/multiagent/sac.py
+++ b/examples/multiagent/sac.py
@@ -258,7 +258,6 @@ def train(cfg: "DictConfig"):  # noqa: F821
                 loss_vals["loss_actor"]
                 + loss_vals["loss_alpha"]
                 + loss_vals["loss_qvalue"]
-                + loss_vals["loss_alpha"]
             )
 
             loss_value.backward()
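
Note on the pattern introduced in PATCH 1: the shared ("next", "done") and ("next", "terminated") flags are broadcast to the per-agent reward shape before GAE is computed. The snippet below is a minimal plain-torch sketch of that unsqueeze/expand step; the sizes are placeholders rather than the tutorial's actual batch dimensions, and only the shape manipulation mirrors the patch.

import torch

# Shared flags are stored once per environment: shape (n_envs, 1).
# Per-agent rewards carry an extra agent dimension: shape (n_envs, n_agents, 1).
n_envs, n_agents = 60, 3
done = torch.zeros(n_envs, 1, dtype=torch.bool)
reward = torch.zeros(n_envs, n_agents, 1)

# Insert an agent dimension, then broadcast to the reward shape, so the value
# estimator sees one done/terminated flag per agent. This mirrors what the patch
# does when it sets ("next", "agents", "done") via tensordict_data.set(...).
done_per_agent = done.unsqueeze(-1).expand(reward.shape)
assert done_per_agent.shape == (n_envs, n_agents, 1)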