hyerra
diff --git a/‎.circleci/unittest/linux_examples/scripts/environment.yml‎
Lines changed: 1 addition & 0 deletions b/‎.circleci/unittest/linux_examples/scripts/environment.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.circleci/unittest/linux_examples/scripts/run_test.sh‎
Lines changed: 25 additions & 0 deletions b/‎.circleci/unittest/linux_examples/scripts/run_test.sh‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎examples/multiagent/README.md‎
Lines changed: 69 additions & 0 deletions b/‎examples/multiagent/README.md‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎examples/multiagent/iql.py‎
Lines changed: 231 additions & 0 deletions b/‎examples/multiagent/iql.py‎
Lines changed: 231 additions & 0 deletions
diff --git a/‎examples/multiagent/iql.yaml‎
Lines changed: 38 additions & 0 deletions b/‎examples/multiagent/iql.yaml‎
Lines changed: 38 additions & 0 deletions
@@ -27,3 +27,4 @@ dependencies:
     - mlflow
     - av
     - coverage
+    - vmas
@@ -246,6 +246,31 @@ python .circleci/unittest/helpers/coverage_run_parallel.py examples/td3/td3.py \
   collector.collector_device=cuda:0 \
   env.name=Pendulum-v1 \
   logger.backend=
+python .circleci/unittest/helpers/coverage_run_parallel.py examples/multiagent/mappo_ippo.py \
+  collector.n_iters=2 \
+  collector.frames_per_batch=200 \
+  train.num_epochs=3 \
+  train.minibatch_size=100 \
+  logger.backend=
+python .circleci/unittest/helpers/coverage_run_parallel.py examples/multiagent/maddpg_iddpg.py \
+  collector.n_iters=2 \
+  collector.frames_per_batch=200 \
+  train.num_epochs=3 \
+  train.minibatch_size=100 \
+  logger.backend=
+python .circleci/unittest/helpers/coverage_run_parallel.py examples/multiagent/iql.py \
+  collector.n_iters=2 \
+  collector.frames_per_batch=200 \
+  train.num_epochs=3 \
+  train.minibatch_size=100 \
+  logger.backend=
+python .circleci/unittest/helpers/coverage_run_parallel.py examples/multiagent/qmix_vdn.py \
+  collector.n_iters=2 \
+  collector.frames_per_batch=200 \
+  train.num_epochs=3 \
+  train.minibatch_size=100 \
+  logger.backend=
+
 
 python .circleci/unittest/helpers/coverage_run_parallel.py examples/bandits/dqn.py --n_steps=100
 
 
@@ -0,0 +1,69 @@
+# Multi-agent examples
+
+In this folder we provide a set of multi-agent example scripts using the [VMAS](https://github.com/proroklab/VectorizedMultiAgentSimulator) simulator.
+
+<p align="center">
+<img src="https://pytorch.s3.amazonaws.com/torchrl/github-artifacts/img/marl_vmas.png" width="600px">
+</p>
+
+<center><i>The MARL algorithms contained in the scripts of this folder run on three multi-robot tasks in VMAS.</i></center>
+
+For more details on the experiment setup and the environments please refer to the corresponding section of the appendix in the [TorchRL paper](https://arxiv.org/abs/2306.00577).
+
+## Using the scripts
+
+### Install
+
+First you need to install vmas and the dependencies of the scripts.
+
+Install torchrl and tensordict following repo instructions.
+
+Install vmas and dependencies:
+
+```bash
+pip install vmas
+pip install wandb moviepy
+pip install hydra-core
+```
+
+### Run
+
+To run a script, first adjust its config to your needs and then execute the corresponding Python file.
+The config for each script is the .yaml file with the same name.
+
+For example:
+```bash
+python mappo_ippo.py
+```
+
+You can even change the config from the command line like:
+
+```bash
+python mappo_ippo.py --m env.scenario_name=navigation
+```
+
+### Computational demand
+The scripts are set up to collect many frames. If your compute is limited, you can lower the "frames_per_batch"
+and "num_epochs" parameters to reduce compute requirements.
+
+### Script structure
+
+The scripts are self-contained.
+This means that all the code you will need to look at is contained in the script file. 
+No helper functions are used.
+
+The structure of scripts follows this order:
+- Configuration dictionary for the script
+- Environment creation
+- Modules creation
+- Collector instantiation
+- Replay buffer instantiation
+- Loss module creation
+- Training loop (with inner minibatch loops)
+- Evaluation run (at the desired frequency)
+
+Logging is done by default to wandb.
+The logging backend can be changed in the config files to one of "wandb", "tensorboard", "csv", "mlflow".
+
+All the scripts follow the same on-policy training structure so that results can be compared across different algorithms.
@@ -0,0 +1,231 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import time
+
+import hydra
+import torch
+
+from tensordict.nn import TensorDictModule
+from torch import nn
+from torchrl.collectors import SyncDataCollector
+from torchrl.data import TensorDictReplayBuffer
+from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
+from torchrl.data.replay_buffers.storages import LazyTensorStorage
+from torchrl.envs import RewardSum, TransformedEnv
+from torchrl.envs.libs.vmas import VmasEnv
+from torchrl.envs.utils import ExplorationType, set_exploration_type
+from torchrl.modules import EGreedyWrapper, QValueModule, SafeSequential
+from torchrl.modules.models.multiagent import MultiAgentMLP
+from torchrl.objectives import DQNLoss, SoftUpdate, ValueEstimators
+from utils.logging import init_logging, log_evaluation, log_training
+
+
+def rendering_callback(env, td):
+    env.frames.append(env.render(mode="rgb_array", agent_index_focus=None))
+
+
+@hydra.main(version_base="1.1", config_path=".", config_name="iql")
+def train(cfg: "DictConfig"):  # noqa: F821
+    # Device
+    cfg.train.device = "cpu" if not torch.has_cuda else "cuda:0"
+    cfg.env.device = cfg.train.device
+
+    # Seeding
+    torch.manual_seed(cfg.seed)
+
+    # Sampling
+    cfg.env.vmas_envs = cfg.collector.frames_per_batch // cfg.env.max_steps
+    cfg.collector.total_frames = cfg.collector.frames_per_batch * cfg.collector.n_iters
+    cfg.buffer.memory_size = cfg.collector.frames_per_batch
+
+    # Create env and env_test
+    env = VmasEnv(
+        scenario=cfg.env.scenario_name,
+        num_envs=cfg.env.vmas_envs,
+        continuous_actions=False,
+        max_steps=cfg.env.max_steps,
+        device=cfg.env.device,
+        seed=cfg.seed,
+        # Scenario kwargs
+        **cfg.env.scenario,
+    )
+    env = TransformedEnv(
+        env,
+        RewardSum(in_keys=[env.reward_key], out_keys=[("agents", "episode_reward")]),
+    )
+
+    env_test = VmasEnv(
+        scenario=cfg.env.scenario_name,
+        num_envs=cfg.eval.evaluation_episodes,
+        continuous_actions=False,
+        max_steps=cfg.env.max_steps,
+        device=cfg.env.device,
+        seed=cfg.seed,
+        # Scenario kwargs
+        **cfg.env.scenario,
+    )
+
+    # Policy
+    net = MultiAgentMLP(
+        n_agent_inputs=env.observation_spec["agents", "observation"].shape[-1],
+        n_agent_outputs=env.action_spec.space.n,
+        n_agents=env.n_agents,
+        centralised=False,
+        share_params=cfg.model.shared_parameters,
+        device=cfg.train.device,
+        depth=2,
+        num_cells=256,
+        activation_class=nn.Tanh,
+    )
+    module = TensorDictModule(
+        net, in_keys=[("agents", "observation")], out_keys=[("agents", "action_value")]
+    )
+    value_module = QValueModule(
+        action_value_key=("agents", "action_value"),
+        out_keys=[
+            env.action_key,
+            ("agents", "action_value"),
+            ("agents", "chosen_action_value"),
+        ],
+        spec=env.unbatched_action_spec,
+        action_space=None,
+    )
+    qnet = SafeSequential(module, value_module)
+
+    qnet_explore = EGreedyWrapper(
+        qnet,
+        eps_init=0.3,
+        eps_end=0,
+        annealing_num_steps=int(cfg.collector.total_frames * (1 / 2)),
+        action_key=env.action_key,
+        spec=env.unbatched_action_spec[env.action_key],
+    )
+
+    collector = SyncDataCollector(
+        env,
+        qnet_explore,
+        device=cfg.env.device,
+        storing_device=cfg.train.device,
+        frames_per_batch=cfg.collector.frames_per_batch,
+        total_frames=cfg.collector.total_frames,
+    )
+
+    replay_buffer = TensorDictReplayBuffer(
+        storage=LazyTensorStorage(cfg.buffer.memory_size, device=cfg.train.device),
+        sampler=SamplerWithoutReplacement(),
+        batch_size=cfg.train.minibatch_size,
+    )
+
+    loss_module = DQNLoss(qnet, delay_value=True)
+    loss_module.set_keys(
+        action_value=("agents", "action_value"),
+        action=env.action_key,
+        value=("agents", "chosen_action_value"),
+        reward=env.reward_key,
+    )
+    loss_module.make_value_estimator(ValueEstimators.TD0, gamma=cfg.loss.gamma)
+    target_net_updater = SoftUpdate(loss_module, eps=1 - cfg.loss.tau)
+
+    optim = torch.optim.Adam(loss_module.parameters(), cfg.train.lr)
+
+    # Logging
+    if cfg.logger.backend:
+        model_name = ("Het" if not cfg.model.shared_parameters else "") + "IQL"
+        logger = init_logging(cfg, model_name)
+
+    total_time = 0
+    total_frames = 0
+    sampling_start = time.time()
+    for i, tensordict_data in enumerate(collector):
+        print(f"\nIteration {i}")
+
+        sampling_time = time.time() - sampling_start
+
+        tensordict_data.set(
+            ("next", "done"),
+            tensordict_data.get(("next", "done"))
+            .unsqueeze(-1)
+            .expand(tensordict_data.get(("next", env.reward_key)).shape),
+        )  # We need to expand the done to match the reward shape
+
+        current_frames = tensordict_data.numel()
+        total_frames += current_frames
+        data_view = tensordict_data.reshape(-1)
+        replay_buffer.extend(data_view)
+
+        training_tds = []
+        training_start = time.time()
+        for _ in range(cfg.train.num_epochs):
+            for _ in range(cfg.collector.frames_per_batch // cfg.train.minibatch_size):
+                subdata = replay_buffer.sample()
+                loss_vals = loss_module(subdata)
+                training_tds.append(loss_vals.detach())
+
+                loss_value = loss_vals["loss"]
+
+                loss_value.backward()
+
+                total_norm = torch.nn.utils.clip_grad_norm_(
+                    loss_module.parameters(), cfg.train.max_grad_norm
+                )
+                training_tds[-1].set("grad_norm", total_norm.mean())
+
+                optim.step()
+                optim.zero_grad()
+                target_net_updater.step()
+
+        qnet_explore.step(frames=current_frames)  # Update exploration annealing
+        collector.update_policy_weights_()
+
+        training_time = time.time() - training_start
+
+        iteration_time = sampling_time + training_time
+        total_time += iteration_time
+        training_tds = torch.stack(training_tds)
+
+        # More logs
+        if cfg.logger.backend:
+            log_training(
+                logger,
+                training_tds,
+                tensordict_data,
+                sampling_time,
+                training_time,
+                total_time,
+                i,
+                current_frames,
+                total_frames,
+                step=i,
+            )
+
+        if (
+            cfg.eval.evaluation_episodes > 0
+            and i % cfg.eval.evaluation_interval == 0
+            and cfg.logger.backend
+        ):
+            evaluation_start = time.time()
+            with torch.no_grad() and set_exploration_type(ExplorationType.MEAN):
+                env_test.frames = []
+                rollouts = env_test.rollout(
+                    max_steps=cfg.env.max_steps,
+                    policy=qnet,
+                    callback=rendering_callback,
+                    auto_cast_to_device=True,
+                    break_when_any_done=False,
+                    # We are running vectorized evaluation we do not want it to stop when just one env is done
+                )
+
+                evaluation_time = time.time() - evaluation_start
+
+                log_evaluation(logger, rollouts, env_test, evaluation_time, step=i)
+
+        if cfg.logger.backend == "wandb":
+            logger.experiment.log({}, commit=True)
+        sampling_start = time.time()
+
+
+if __name__ == "__main__":
+    train()
@@ -0,0 +1,38 @@
+seed: 0
+
+env:
+  max_steps: 100
+  scenario_name: "balance"
+  scenario:
+    n_agents: 3
+  device: ??? # These values will be populated dynamically
+  vmas_envs: ???
+
+model:
+  shared_parameters: True
+
+collector:
+  frames_per_batch: 60_000 # Frames sampled each sampling iteration
+  n_iters: 500 # Number of sampling/training iterations
+  total_frames: ???
+
+buffer:
+  memory_size: ???
+
+loss:
+  gamma: 0.9
+  tau: 0.005 # For target net
+
+train:
+  num_epochs: 45  # number of passes (epochs) over each collected batch of data
+  minibatch_size: 4096 # size of minibatches used in each epoch
+  lr: 5e-5
+  max_grad_norm: 40.0
+  device: ???
+
+eval:
+  evaluation_interval: 20
+  evaluation_episodes: 200
+
+logger:
+  backend:  wandb # Delete to remove logging
-Original file line number
+Diff line change
     - mlflow
     - av
     - coverage
 +    - vmas