Merged
Commits
112 commits
9fbf53b
init
vmoens Mar 20, 2023
5488d4d
lint
vmoens Mar 20, 2023
93deeeb
amend
vmoens Mar 20, 2023
00b1e5a
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 21, 2023
f511020
dqn (1)
vmoens Mar 21, 2023
2586b74
amend
vmoens Mar 21, 2023
b330b16
edit training dqn
vmoens Mar 21, 2023
bfef8ee
dqn
vmoens Mar 21, 2023
972217a
amend
vmoens Mar 23, 2023
fb81fc3
empty
vmoens Mar 24, 2023
d137048
init
vmoens Mar 23, 2023
adad97d
init
vmoens Mar 23, 2023
ea20603
amend
vmoens Mar 24, 2023
e8bf4c1
amend
vmoens Mar 24, 2023
d178f93
value_estimate and sac init
vmoens Mar 24, 2023
efb57a8
temp
vmoens Mar 24, 2023
48c227a
tmp
vmoens Mar 27, 2023
1e34ef9
SAC
vmoens Mar 27, 2023
dd6ac56
Merge branch 'main' into add_adv_module_lossesd
vmoens Mar 27, 2023
01f1ae7
amend
vmoens Mar 27, 2023
acd0ec1
test
vmoens Mar 27, 2023
7855ef6
smooth deprecation
vmoens Mar 28, 2023
55361cc
amend
vmoens Mar 28, 2023
fd87457
amend
vmoens Mar 28, 2023
11946ee
Merge branch 'main' into add_adv_module_lossesd
vmoens Mar 28, 2023
d8ceb3d
Merge branch 'add_adv_module_lossesd' into ddpg_loss_tuto
vmoens Mar 28, 2023
f008034
amend
vmoens Mar 28, 2023
400cfd1
amend
vmoens Mar 28, 2023
64768b0
amend
vmoens Mar 28, 2023
f5550df
amend
vmoens Mar 28, 2023
218ab1a
amend
vmoens Mar 28, 2023
6cdcc8e
amend
vmoens Mar 28, 2023
b47dee2
amend
vmoens Mar 28, 2023
bbf97ad
Merge branch 'add_adv_module_lossesd' into ddpg_loss_tuto
vmoens Mar 28, 2023
e9bb239
amend
vmoens Mar 28, 2023
34469f2
differentiable=True
vmoens Mar 28, 2023
aae2bbe
differentiable=True
vmoens Mar 28, 2023
f6d2da4
Merge branch 'add_adv_module_lossesd' into ddpg_loss_tuto
vmoens Mar 28, 2023
c9c106b
amend
vmoens Mar 28, 2023
efe9b09
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 28, 2023
c957916
fix trainer
vmoens Mar 28, 2023
0300728
no grad
vmoens Mar 29, 2023
86915fe
init
vmoens Mar 29, 2023
d23af8b
tests
vmoens Mar 29, 2023
68c3442
empty commit
vmoens Mar 29, 2023
dec5c56
tests
vmoens Mar 29, 2023
7612098
tests
vmoens Mar 29, 2023
69eb921
amend
vmoens Mar 29, 2023
5d32e10
amend
vmoens Mar 29, 2023
c471b96
fix examples
vmoens Mar 29, 2023
d9ab477
fix dqn updater
vmoens Mar 29, 2023
fb7d5de
fix doc
vmoens Mar 29, 2023
180b5b2
print td shape
vmoens Mar 29, 2023
921b91b
fix recorder
vmoens Mar 29, 2023
8984654
fix examples
vmoens Mar 29, 2023
a3f76d1
tmp
vmoens Mar 29, 2023
a10900a
tmp
vmoens Mar 29, 2023
2e65eef
tmp
vmoens Mar 29, 2023
aca6946
tmp
vmoens Mar 29, 2023
206830a
tmp
vmoens Mar 29, 2023
3b4e0e7
tmp
vmoens Mar 29, 2023
6d2ff4b
amend
vmoens Mar 29, 2023
965f77d
Merge branch 'fix_rb' into ddpg_loss_tuto
vmoens Mar 29, 2023
a0caddb
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 29, 2023
1411cf4
amend
vmoens Mar 29, 2023
259a1be
amend
vmoens Mar 30, 2023
bad0d6a
init
vmoens Mar 30, 2023
713869c
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 30, 2023
9960792
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 30, 2023
2fe0f82
amend
vmoens Mar 30, 2023
370134f
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 30, 2023
91fa500
amend
vmoens Mar 30, 2023
1d7ffa8
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 30, 2023
c251d1a
bf
vmoens Mar 31, 2023
6e28e5f
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 31, 2023
495acff
bf
vmoens Mar 31, 2023
59da7a2
amend
vmoens Mar 31, 2023
3025b07
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 31, 2023
2498b5f
amend
vmoens Mar 31, 2023
91cab8a
Merge branch 'fix_explo' into ddpg_loss_tuto
vmoens Mar 31, 2023
388552a
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 31, 2023
f3537fd
Merge branch 'main' into ddpg_loss_tuto
vmoens Mar 31, 2023
f1da081
stateful functional modules
vmoens Apr 2, 2023
00e75f7
Merge branch 'main' into ddpg_loss_tuto
vmoens Apr 2, 2023
14e0a73
amend
vmoens Apr 3, 2023
94ec94e
amend
vmoens Apr 3, 2023
3f16a49
amend
vmoens Apr 3, 2023
833bf58
revert
vmoens Apr 3, 2023
094d49b
amend
vmoens Apr 3, 2023
effa4fc
log_dir
vmoens Apr 3, 2023
4afd785
amend
vmoens Apr 3, 2023
e50f578
amend
vmoens Apr 3, 2023
ac6c83b
amend
vmoens Apr 4, 2023
7180e6c
amend
vmoens Apr 4, 2023
6223494
init
vmoens Apr 4, 2023
b0d9629
empty commit
vmoens Apr 4, 2023
822f518
amend
vmoens Apr 4, 2023
4ad5fb9
amend
vmoens Apr 4, 2023
8f3bf1c
Merge branch 'fix_collector_reset' into ddpg_loss_tuto
vmoens Apr 4, 2023
ff54f0a
amend
vmoens Apr 4, 2023
d978824
amend
vmoens Apr 4, 2023
cf1ba97
amend
vmoens Apr 4, 2023
0f59058
Merge branch 'main' into ddpg_loss_tuto
vmoens Apr 4, 2023
0e4e6b4
theme
vmoens Apr 4, 2023
66da336
amend
vmoens Apr 5, 2023
89e7b1b
amend
vmoens Apr 5, 2023
7d65ca4
remove prints
vmoens Apr 5, 2023
7144490
Merge branch 'main' into ddpg_loss_tuto
vmoens Apr 5, 2023
0d238d5
amend
vmoens Apr 5, 2023
33133cb
amend
vmoens Apr 6, 2023
c221982
amend
vmoens Apr 6, 2023
86aefc2
Merge branch 'main' into ddpg_loss_tuto
vmoens Apr 6, 2023
Binary file added docs/source/_static/img/replaybuffer_traj.png
3,824 changes: 2 additions & 3,822 deletions docs/source/_static/js/theme.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/source/reference/data.rst
@@ -218,7 +218,7 @@ Check the :obj:`torchrl.envs.utils.check_env_specs` method for a sanity check.
Utils
-----

.. currentmodule:: torchrl.data.datasets
.. currentmodule:: torchrl.data

.. autosummary::
:toctree: generated/
1 change: 0 additions & 1 deletion docs/source/reference/envs.rst
@@ -114,7 +114,6 @@ provides more information on how to design a custom environment from scratch.
EnvBase
GymLikeEnv
EnvMetaData
Specs

Vectorized envs
---------------
2 changes: 1 addition & 1 deletion docs/source/reference/modules.rst
@@ -32,7 +32,7 @@ TensorDict modules

Hooks
-----
.. currentmodule:: torchrl.modules.tensordict_module.actors
.. currentmodule:: torchrl.modules

.. autosummary::
:toctree: generated/
8 changes: 5 additions & 3 deletions docs/source/reference/objectives.rst
@@ -16,13 +16,15 @@ The main characteristics of TorchRL losses are:
method will receive a tensordict as input that contains all the necessary
information to return a loss value.
- They output a :class:`tensordict.TensorDict` instance with the loss values
written under a ``"loss_<smth>`` where ``smth`` is a string describing the
written under a ``"loss_<smth>"`` where ``smth`` is a string describing the
loss. Additional keys in the tensordict may be useful metrics to log during
training time.
.. note::
The reason we return independent losses is to let the user use a different
optimizer for different sets of parameters, for instance. Summing the losses
can be simply done via ``sum(loss for key, loss in loss_vals.items() if key.startswith("loss_")``.
can be simply done via

>>> loss_val = sum(loss for key, loss in loss_vals.items() if key.startswith("loss_"))

Training value functions
------------------------
@@ -216,5 +218,5 @@ Utils
next_state_value
SoftUpdate
HardUpdate
ValueFunctions
ValueEstimators
default_value_kwargs
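
A quick illustration of the note above (a self-contained sketch, not part of the diff — the loss names and parameter grouping are hypothetical):

```python
import torch
from tensordict import TensorDict

# Stand-in for what a loss module's forward() returns: one entry per loss
# term, plus extra metrics that should not enter the backward pass.
params_actor = torch.nn.Parameter(torch.randn(3))
params_value = torch.nn.Parameter(torch.randn(3))
loss_vals = TensorDict(
    {
        "loss_actor": params_actor.pow(2).sum(),
        "loss_value": params_value.pow(2).sum(),
        "entropy": torch.tensor(0.1),  # a logged metric, not a loss
    },
    batch_size=[],
)

# Pattern 1: sum the "loss_" entries and use a single optimizer.
loss_val = sum(loss for key, loss in loss_vals.items() if key.startswith("loss_"))
loss_val.backward()

# Pattern 2: keep the terms separate, one optimizer per parameter group.
optim_actor = torch.optim.Adam([params_actor], lr=3e-4)
optim_value = torch.optim.Adam([params_value], lr=1e-3)
```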
2 changes: 1 addition & 1 deletion docs/source/reference/trainers.rst
@@ -73,7 +73,7 @@ Hooks can be split into 3 categories: **data processing** (:obj:`"batch_process"`
- **Data processing** hooks update a tensordict of data. A hook's :obj:`__call__` method should accept
a :obj:`TensorDict` object as input and update it given some strategy.
Examples of such hooks include Replay Buffer extension (:obj:`ReplayBufferTrainer.extend`), data normalization (including normalization
constants update), data subsampling (:doc:`BatchSubSampler`) and such.
constants update), data subsampling (:class:`torchrl.trainers.BatchSubSampler`) and such.

- **Logging** hooks take a batch of data presented as a :obj:`TensorDict` and write in the logger
some information retrieved from that data. Examples include the :obj:`Recorder` hook, the reward
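
To make the hook contract concrete, here is a minimal sketch of a data-processing hook (the reward scaling and the trainer instance are assumptions, not part of this PR):

```python
from tensordict import TensorDict


def scale_reward_hook(batch: TensorDict) -> TensorDict:
    # A "batch_process" hook: receive the batch tensordict and update it
    # given some strategy -- here, plain reward scaling.
    batch.set(("next", "reward"), batch.get(("next", "reward")) * 0.01)
    return batch


# Registration at the "batch_process" entry point, assuming a Trainer
# instance named `trainer`:
# trainer.register_op("batch_process", scale_reward_hook)
```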
17 changes: 8 additions & 9 deletions test/test_trainer.py
@@ -89,11 +89,10 @@ class MockingLossModule(nn.Module):

def mocking_trainer(file=None, optimizer=_mocking_optim) -> Trainer:
trainer = Trainer(
MockingCollector(),
*[
None,
]
* 2,
collector=MockingCollector(),
total_frames=None,
frame_skip=None,
optim_steps_per_batch=None,
loss_module=MockingLossModule(),
optimizer=optimizer,
save_trainer_file=file,
@@ -862,7 +861,7 @@ def test_recorder(self, N=8):
with tempfile.TemporaryDirectory() as folder:
logger = TensorboardLogger(exp_name=folder)

recorder = transformed_env_constructor(
environment = transformed_env_constructor(
args,
video_tag="tmp",
norm_obs_only=True,
@@ -874,7 +873,7 @@
record_frames=args.record_frames,
frame_skip=args.frame_skip,
policy_exploration=None,
recorder=recorder,
environment=environment,
record_interval=args.record_interval,
)
trainer = mocking_trainer()
@@ -936,7 +935,7 @@ def _make_recorder_and_trainer(tmpdirname):
raise NotImplementedError
trainer = mocking_trainer(file)

recorder = transformed_env_constructor(
environment = transformed_env_constructor(
args,
video_tag="tmp",
norm_obs_only=True,
@@ -948,7 +947,7 @@
record_frames=args.record_frames,
frame_skip=args.frame_skip,
policy_exploration=None,
recorder=recorder,
environment=environment,
record_interval=args.record_interval,
)
recorder.register(trainer)
1 change: 1 addition & 0 deletions torchrl/data/__init__.py
@@ -3,6 +3,7 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from . import datasets
from .postprocs import MultiStep
from .replay_buffers import (
LazyMemmapStorage,
1 change: 1 addition & 0 deletions torchrl/data/datasets/__init__.py
@@ -1 +1,2 @@
from .d4rl import D4RLExperienceReplay
from .openml import OpenMLExperienceReplay
9 changes: 7 additions & 2 deletions torchrl/data/datasets/openml.py
@@ -8,8 +8,13 @@
import numpy as np
from tensordict.tensordict import TensorDict

from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer
from torchrl.data.replay_buffers import Sampler, SamplerWithoutReplacement, Writer
from torchrl.data.replay_buffers import (
LazyMemmapStorage,
Sampler,
SamplerWithoutReplacement,
TensorDictReplayBuffer,
Writer,
)


class OpenMLExperienceReplay(TensorDictReplayBuffer):
14 changes: 7 additions & 7 deletions torchrl/data/postprocs/postprocs.py
@@ -82,9 +82,9 @@ def _get_reward(
class MultiStep(nn.Module):
"""Multistep reward transform.

Presented in 'Sutton, R. S. 1988. Learning to
predict by the methods of temporal differences. Machine learning 3(
1):9–44.'
Presented in

| Sutton, R. S. 1988. Learning to predict by the methods of temporal differences. Machine learning 3(1):9–44.

This module maps the "next" observation to the t + n "next" observation.
It is an identity transform whenever :attr:`n_steps` is 0.
@@ -153,6 +153,10 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
"""
tensordict = tensordict.clone(False)
done = tensordict.get(("next", "done"))
truncated = tensordict.get(
("next", "truncated"), torch.zeros((), dtype=done.dtype, device=done.device)
)
done = done | truncated

# we'll be using the done states to index the tensordict.
# if the shapes don't match we're in trouble.
@@ -175,10 +179,6 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
"(trailing singleton dimension excluded)."
) from err

truncated = tensordict.get(
("next", "truncated"), torch.zeros((), dtype=done.dtype, device=done.device)
)
done = done | truncated
mask = tensordict.get(("collector", "mask"), None)
reward = tensordict.get(("next", "reward"))
*batch, T = tensordict.batch_size
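
As a usage illustration only (not from the diff; the constructor argument names are assumed from the docstring above):

```python
import torch
from tensordict import TensorDict
from torchrl.data.postprocs import MultiStep

T = 5  # a fake single trajectory with batch shape [1, T]
done = torch.zeros(1, T, 1, dtype=torch.bool)
done[0, -1] = True  # the trajectory ends on the last step
data = TensorDict(
    {
        "observation": torch.arange(T, dtype=torch.float32).reshape(1, T, 1),
        "next": TensorDict(
            {
                "observation": torch.arange(1, T + 1, dtype=torch.float32).reshape(1, T, 1),
                "reward": torch.ones(1, T, 1),
                "done": done,
            },
            batch_size=[1, T],
        ),
    },
    batch_size=[1, T],
)
# After the transform, ("next", ...) entries point up to n steps ahead and
# the reward holds the discounted multi-step return.
out = MultiStep(gamma=0.99, n_steps=2)(data)
```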
18 changes: 4 additions & 14 deletions torchrl/data/replay_buffers/replay_buffers.py
@@ -11,7 +11,7 @@

import torch
from tensordict.tensordict import LazyStackedTensorDict, TensorDict, TensorDictBase
from tensordict.utils import expand_right
from tensordict.utils import expand_as_right

from torchrl.data.utils import DEVICE_TYPING

@@ -708,6 +708,8 @@ def extend(self, tensordicts: Union[List, TensorDictBase]) -> torch.Tensor:
return index

def update_tensordict_priority(self, data: TensorDictBase) -> None:
if not isinstance(self._sampler, PrioritizedSampler):
return
priority = torch.tensor(
[self._get_priority(td) for td in data],
dtype=torch.float,
@@ -753,19 +755,7 @@ def sample(
data, info = super().sample(batch_size, return_info=True)
if include_info in (True, None):
for k, v in info.items():
data.set(k, torch.tensor(v, device=data.device))
if "_batch_size" in data.keys():
# we need to reset the batch-size
shape = data.pop("_batch_size")
shape = shape[0]
shape = torch.Size([data.shape[0], *shape])
# we may need to update some values in the data
for key, value in data.items():
if value.ndim >= len(shape):
continue
value = expand_right(value, shape)
data.set(key, value)
data.batch_size = shape
data.set(k, expand_as_right(torch.tensor(v, device=data.device), data))
if return_info:
return data, info
return data
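
For context, a hedged sketch of the workflow this early return protects: ``update_tensordict_priority`` is only meaningful when the buffer samples with a ``PrioritizedSampler``; with a uniform sampler it is now a silent no-op instead of an error.

```python
import torch
from tensordict import TensorDict
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer
from torchrl.data.replay_buffers import PrioritizedSampler

size = 100
rb = TensorDictReplayBuffer(
    storage=LazyMemmapStorage(size),
    sampler=PrioritizedSampler(max_capacity=size, alpha=0.7, beta=0.5),
    priority_key="td_error",
)
rb.extend(TensorDict({"obs": torch.randn(10, 4), "td_error": torch.rand(10)}, [10]))

sampled = rb.sample(4)                  # carries an "index" entry
sampled.set("td_error", torch.rand(4))  # e.g. refreshed TD errors
rb.update_tensordict_priority(sampled)  # re-weights the sampled items
```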
37 changes: 35 additions & 2 deletions torchrl/data/replay_buffers/storages.py
@@ -14,6 +14,7 @@
from tensordict.memmap import MemmapTensor
from tensordict.prototype import is_tensorclass
from tensordict.tensordict import is_tensor_collection, TensorDict, TensorDictBase
from tensordict.utils import expand_right

from torchrl._utils import _CKPT_BACKEND, VERBOSE
from torchrl.data.replay_buffers.utils import INT_CLASSES
@@ -423,10 +424,42 @@ def _mem_map_tensor_as_tensor(mem_map_tensor: MemmapTensor) -> torch.Tensor:
return mem_map_tensor._tensor


def _reset_batch_size(x):
"""Resets the batch size of a tensordict.

In some cases we save the original shape of the tensordict as a tensor (or memmap tensor).

This function will read that tensor, extract its items and reset the shape
of the tensordict to it. If items have an incompatible shape (e.g. "index")
they will be expanded to the right to match it.

"""
shape = x.pop("_batch_size", None)
if shape is not None:
# we need to reset the batch-size
if isinstance(shape, MemmapTensor):
shape = shape.as_tensor()
locked = x.is_locked
if locked:
x.unlock_()
shape = [s.item() for s in shape[0]]
shape = torch.Size([x.shape[0], *shape])
# we may need to update some values in the data
for key, value in x.items():
if value.ndim >= len(shape):
continue
value = expand_right(value, shape)
x.set(key, value)
x.batch_size = shape
if locked:
x.lock_()
return x


def _collate_list_tensordict(x):
out = torch.stack(x, 0)
if isinstance(out, TensorDictBase):
return out.to_tensordict()
return _reset_batch_size(out.to_tensordict())
return out


@@ -436,7 +469,7 @@ def _collate_list_tensors(*x):

def _collate_contiguous(x):
if isinstance(x, TensorDictBase):
return x.to_tensordict()
return _reset_batch_size(x).to_tensordict()
return x.clone()


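
The key step in ``_reset_batch_size`` is ``expand_right``, which broadcasts a tensor over new trailing dimensions. A small self-contained illustration:

```python
import torch
from tensordict.utils import expand_right

idx = torch.arange(3)                 # shape [3], e.g. an "index" entry
expanded = expand_right(idx, (3, 4))  # shape [3, 4]: values repeated rightward
assert (expanded[:, 0] == idx).all() and (expanded[:, 3] == idx).all()
```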
60 changes: 51 additions & 9 deletions torchrl/envs/transforms/transforms.py
@@ -2602,6 +2602,13 @@ class VecNorm(Transform):
default: 0.99
eps (number, optional): lower bound of the running standard
deviation (for numerical underflow). Default is 1e-4.
shapes (List[torch.Size], optional): if provided, gives the shape of
each entry pointed to by ``in_keys``. Its length must match that of
``in_keys``, and each shape must match the trailing dimensions of the
corresponding entry. If not provided, the feature dimensions of each
entry (i.e. all dims that do not belong to the tensordict batch size)
are treated as the feature shape.

Examples:
>>> from torchrl.envs.libs.gym import GymEnv
@@ -2629,6 +2636,7 @@ def __init__(
lock: mp.Lock = None,
decay: float = 0.9999,
eps: float = 1e-4,
shapes: List[torch.Size] = None,
) -> None:
if lock is None:
lock = mp.Lock()
@@ -2656,8 +2664,14 @@

self.lock = lock
self.decay = decay
self.shapes = shapes
self.eps = eps

def _key_str(self, key):
if not isinstance(key, str):
key = "_".join(key)
return key

def _call(self, tensordict: TensorDictBase) -> TensorDictBase:
if self.lock is not None:
self.lock.acquire()
@@ -2681,17 +2695,44 @@ def _call(self, tensordict: TensorDictBase) -> TensorDictBase:
forward = _call

def _init(self, tensordict: TensorDictBase, key: str) -> None:
if self._td is None or key + "_sum" not in self._td.keys():
td_view = tensordict.view(-1)
td_select = td_view[0]
d = {key + "_sum": torch.zeros_like(td_select.get(key))}
d.update({key + "_ssq": torch.zeros_like(td_select.get(key))})
key_str = self._key_str(key)
if self._td is None or key_str + "_sum" not in self._td.keys():
if key is not key_str and key_str in tensordict.keys():
raise RuntimeError(
f"Conflicting key names: {key_str} from VecNorm and input tensordict keys."
)
if self.shapes is None:
td_view = tensordict.view(-1)
td_select = td_view[0]
item = td_select.get(key)
d = {key_str + "_sum": torch.zeros_like(item)}
d.update({key_str + "_ssq": torch.zeros_like(item)})
else:
idx = 0
for in_key in self.in_keys:
if in_key != key:
idx += 1
else:
break
shape = self.shapes[idx]
item = tensordict.get(key)
d = {
key_str
+ "_sum": torch.zeros(shape, device=item.device, dtype=item.dtype)
}
d.update(
{
key_str
+ "_ssq": torch.zeros(
shape, device=item.device, dtype=item.dtype
)
}
)

d.update(
{
key
+ "_count": torch.zeros(
1, device=td_select.get(key).device, dtype=torch.float
)
key_str
+ "_count": torch.zeros(1, device=item.device, dtype=torch.float)
}
)
if self._td is None:
@@ -2702,6 +2743,7 @@ def _init(self, tensordict: TensorDictBase, key: str) -> None:
pass

def _update(self, key, value, N) -> torch.Tensor:
key = self._key_str(key)
_sum = self._td.get(key + "_sum")
_ssq = self._td.get(key + "_ssq")
_count = self._td.get(key + "_count")
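
A hedged usage sketch for the new ``shapes`` argument (the key and sizes are illustrative):

```python
import torch
from tensordict import TensorDict
from torchrl.envs.transforms import VecNorm

# Pin the feature shape of "observation" explicitly instead of inferring it
# from the tensordict batch size.
vecnorm = VecNorm(
    in_keys=["observation"], decay=0.9999, eps=1e-4, shapes=[torch.Size([4])]
)
td = TensorDict({"observation": torch.randn(8, 4)}, batch_size=[8])
td = vecnorm(td)  # running sum/ssq/count statistics are created, then updated
```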
2 changes: 2 additions & 0 deletions torchrl/modules/__init__.py
@@ -41,10 +41,12 @@
ActorValueOperator,
AdditiveGaussianWrapper,
DistributionalQValueActor,
DistributionalQValueHook,
EGreedyWrapper,
OrnsteinUhlenbeckProcessWrapper,
ProbabilisticActor,
QValueActor,
QValueHook,
SafeModule,
SafeProbabilisticModule,
SafeProbabilisticTensorDictSequential,
2 changes: 2 additions & 0 deletions torchrl/modules/tensordict_module/__init__.py
@@ -9,8 +9,10 @@
ActorCriticWrapper,
ActorValueOperator,
DistributionalQValueActor,
DistributionalQValueHook,
ProbabilisticActor,
QValueActor,
QValueHook,
ValueOperator,
)
from .common import SafeModule
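
With ``QValueHook`` and ``DistributionalQValueHook`` now re-exported at the package level, they can be attached to a plain module. A minimal sketch (the ``action_space`` string is an assumption):

```python
import torch
from torch import nn
from torchrl.modules import QValueHook

value_net = nn.Linear(4, 3)  # one Q-value per discrete action
value_net.register_forward_hook(QValueHook(action_space="one_hot"))
action, values, chosen = value_net(torch.randn(1, 4))
# action: one-hot argmax over the 3 Q-values; chosen: the selected Q-value
```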