From d9b4f04b802b04a396e47079117b46900a1cff7a Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:27:41 +0100
Subject: [PATCH 1/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 49 +++++++++-----------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 4d35b18a360..2b6e4a884a9 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -253,12 +253,11 @@
 #
 #
 
-print("action_spec:", env.action_spec)
-print("reward_spec:", env.reward_spec)
-print("done_spec:", env.done_spec)
+print("action_spec:", env.full_action_spec)
+print("reward_spec:", env.full_reward_spec)
+print("done_spec:", env.full_done_spec)
 print("observation_spec:", env.observation_spec)
 
-
 ######################################################################
 # Using the commands just shown we can access the domain of each value.
 # Doing this we can see that all specs apart from done have a leading shape ``(num_vmas_envs, n_agents)``.
@@ -270,35 +269,20 @@
 # In fact, specs that have the additional agent dimension
 # (i.e., they vary for each agent) will be contained in a inner "agents" key.
 #
-# To access the full structure of the specs we can use
-#
-
-print("full_action_spec:", env.input_spec["full_action_spec"])
-print("full_reward_spec:", env.output_spec["full_reward_spec"])
-print("full_done_spec:", env.output_spec["full_done_spec"])
-
-######################################################################
 # As you can see the reward and action spec present the "agent" key,
 # meaning that entries in tensordicts belonging to those specs will be nested in an "agents" tensordict,
 # grouping all per-agent values.
 #
-# To quickly access the key for each of these values in tensordicts, we can simply ask the environment for the
-# respective key, and
+# To quickly access the keys for each of these values in tensordicts, we can simply ask the environment for the
+# respective keys, and
 # we will immediately understand which are per-agent and which shared.
 # This info will be useful in order to tell all other TorchRL components where to find each value
 #
 
-print("action_key:", env.action_key)
-print("reward_key:", env.reward_key)
-print("done_key:", env.done_key)
+print("action_key:", env.action_keys)
+print("reward_key:", env.reward_keys)
+print("done_key:", env.done_keys)
 
-######################################################################
-# To tie it all together, we can see that passing these keys to the full specs gives us the leaf domains
-#
-
-assert env.action_spec == env.input_spec["full_action_spec"][env.action_key]
-assert env.reward_spec == env.output_spec["full_reward_spec"][env.reward_key]
-assert env.done_spec == env.output_spec["full_done_spec"][env.done_key]
 
 ######################################################################
 # Transforms
@@ -615,6 +599,9 @@
     action=env.action_key,
     sample_log_prob=("agents", "sample_log_prob"),
    value=("agents", "state_value"),
+    # These last 2 keys will be expanded to match the reward shape
+    done=("agents", "done"),
+    terminated=("agents", "terminated"),
 )
 
 
@@ -649,11 +636,19 @@
 episode_reward_mean_list = []
 for tensordict_data in collector:
     tensordict_data.set(
-        ("next", "done"),
+        ("next", "agents", "done"),
         tensordict_data.get(("next", "done"))
         .unsqueeze(-1)
-        .expand(tensordict_data.get(("next", env.reward_key)).shape),
-    )  # We need to expand the done to match the reward shape (this is expected by the value estimator)
+        .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
+    )
+    tensordict_data.set(
+        ("next", "agents", "terminated"),
+        tensordict_data.get(("next", "terminated"))
+        .unsqueeze(-1)
+        .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
+    )
+
+    # We need to expand the done to match the reward shape (this is expected by the value estimator)
 
     with torch.no_grad():
         GAE(

From c522418f8e137a6316a6f4fe8a092b5e23442cb6 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:29:23 +0100
Subject: [PATCH 2/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 2b6e4a884a9..05e824b7e6e 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -647,8 +647,7 @@
         .unsqueeze(-1)
         .expand(tensordict_data.get_item_shape(("next", env.reward_key))),
     )
-
-    # We need to expand the done to match the reward shape (this is expected by the value estimator)
+    # We need to expand the done and terminated to match the reward shape (this is expected by the value estimator)
 
     with torch.no_grad():
         GAE(

From 221e457391d77475f9f0129749d09e1a2bc8f0c3 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:30:07 +0100
Subject: [PATCH 3/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 05e824b7e6e..7b1061f6f99 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -279,9 +279,9 @@
 # This info will be useful in order to tell all other TorchRL components where to find each value
 #
 
-print("action_key:", env.action_keys)
-print("reward_key:", env.reward_keys)
-print("done_key:", env.done_keys)
+print("action_keys:", env.action_keys)
+print("reward_keys:", env.reward_keys)
+print("done_keys:", env.done_keys)
 
 
 ######################################################################

From d2f5c2c3cfab665b8b2e7bfd6eda0932137cdcd9 Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Tue, 3 Oct 2023 22:33:06 +0100
Subject: [PATCH 4/5] update

Signed-off-by: Matteo Bettini
---
 tutorials/sphinx-tutorials/multiagent_ppo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/sphinx-tutorials/multiagent_ppo.py b/tutorials/sphinx-tutorials/multiagent_ppo.py
index 7b1061f6f99..c5ae154fcfd 100644
--- a/tutorials/sphinx-tutorials/multiagent_ppo.py
+++ b/tutorials/sphinx-tutorials/multiagent_ppo.py
@@ -682,7 +682,7 @@
     collector.update_policy_weights_()
 
     # Logging
-    done = tensordict_data.get(("next", "done"))
+    done = tensordict_data.get(("next", "agents", "done"))
     episode_reward_mean = (
         tensordict_data.get(("next", "agents", "episode_reward"))[done].mean().item()
     )

From 80f9d54770719eb7ecadfde259c5e94f206602ee Mon Sep 17 00:00:00 2001
From: Matteo Bettini
Date: Wed, 4 Oct 2023 08:51:45 +0100
Subject: [PATCH 5/5] update

Signed-off-by: Matteo Bettini
---
 examples/multiagent/sac.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/multiagent/sac.py b/examples/multiagent/sac.py
index 6fc063c2411..fb184291c90 100644
--- a/examples/multiagent/sac.py
+++ b/examples/multiagent/sac.py
@@ -258,7 +258,6 @@ def train(cfg: "DictConfig"):  # noqa: F821
                 loss_vals["loss_actor"]
                 + loss_vals["loss_alpha"]
                 + loss_vals["loss_qvalue"]
-                + loss_vals["loss_alpha"]
             )
 
             loss_value.backward()
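
Note on the pattern introduced in PATCH 1: the shared ("next", "done") and ("next", "terminated") flags are broadcast to the per-agent reward shape before GAE is computed. The snippet below is a minimal plain-torch sketch of that unsqueeze/expand step; the sizes are placeholders rather than the tutorial's actual batch dimensions, and only the shape manipulation mirrors the patch.

import torch

# Shared flags are stored once per environment: shape (n_envs, 1).
# Per-agent rewards carry an extra agent dimension: shape (n_envs, n_agents, 1).
n_envs, n_agents = 60, 3
done = torch.zeros(n_envs, 1, dtype=torch.bool)
reward = torch.zeros(n_envs, n_agents, 1)

# Insert an agent dimension, then broadcast to the reward shape, so the value
# estimator sees one done/terminated flag per agent. This mirrors what the patch
# does when it sets ("next", "agents", "done") via tensordict_data.set(...).
done_per_agent = done.unsqueeze(-1).expand(reward.shape)
assert done_per_agent.shape == (n_envs, n_agents, 1)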