From 9fbf53b94616dc5cba6397b4701f6c0feb7d8b64 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 20 Mar 2023 18:07:52 +0000 Subject: [PATCH 01/89] init --- torchrl/collectors/collectors.py | 5 +- torchrl/data/postprocs/postprocs.py | 8 +- torchrl/modules/tensordict_module/actors.py | 3 +- tutorials/sphinx-tutorials/coding_ddpg.py | 756 +++++++++----------- 4 files changed, 345 insertions(+), 427 deletions(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 9e2640522ca..d7dabbf70fc 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -750,7 +750,10 @@ def rollout(self) -> TensorDictBase: self._tensordict_out.lock() self._step_and_maybe_reset() - if self.interruptor is not None and self.interruptor.collection_stopped(): + if ( + self.interruptor is not None + and self.interruptor.collection_stopped() + ): break return self._tensordict_out diff --git a/torchrl/data/postprocs/postprocs.py b/torchrl/data/postprocs/postprocs.py index 26cdc470824..c157fd81977 100644 --- a/torchrl/data/postprocs/postprocs.py +++ b/torchrl/data/postprocs/postprocs.py @@ -153,6 +153,10 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: """ tensordict = tensordict.clone(False) done = tensordict.get(("next", "done")) + truncated = tensordict.get( + ("next", "truncated"), torch.zeros((), dtype=done.dtype, device=done.device) + ) + done = done | truncated # we'll be using the done states to index the tensordict. # if the shapes don't match we're in trouble. @@ -175,10 +179,6 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: "(trailing singleton dimension excluded)." ) from err - truncated = tensordict.get( - ("next", "truncated"), torch.zeros((), dtype=done.dtype, device=done.device) - ) - done = done | truncated mask = tensordict.get(("collector", "mask"), None) reward = tensordict.get(("next", "reward")) *batch, T = tensordict.batch_size diff --git a/torchrl/modules/tensordict_module/actors.py b/torchrl/modules/tensordict_module/actors.py index bca7a34090d..1fe6a32c5e5 100644 --- a/torchrl/modules/tensordict_module/actors.py +++ b/torchrl/modules/tensordict_module/actors.py @@ -6,7 +6,7 @@ from typing import Optional, Sequence, Tuple, Union import torch -from tensordict.nn import TensorDictModuleWrapper +from tensordict.nn import get_functional, TensorDictModuleWrapper from torch import nn from torchrl.data.tensor_specs import ( @@ -911,6 +911,7 @@ def __init__( policy_operator, value_operator, ) + get_functional(self) def get_policy_operator(self) -> SafeSequential: """Returns a stand-alone policy operator that maps an observation to an action.""" diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 5a4c9e5f4b0..f2ef9a292ae 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -1,25 +1,28 @@ # -*- coding: utf-8 -*- """ -Coding DDPG using TorchRL -========================= +TorchRL objectives: Coding a DDPG loss +====================================== **Author**: `Vincent Moens `_ """ ############################################################################## -# This tutorial will guide you through the steps to code DDPG from scratch. +# TorchRL separates the training of RL algorithms in various pieces that will be +# assembled in your training script: the environment, the data collection and +# storage, the model and finally the loss function. 
# +# TorchRL losses (or "objectives") are stateful objects that contain the +# trainable parameters (policy and value models). +# This tutorial will guide you through the steps to code a loss from the ground up +# using torchrl. +# +# To this aim, we will be focusing on DDPG, which is a relatively straightforward +# algorithm to code. # DDPG (`Deep Deterministic Policy Gradient _`_) # is a simple continuous control algorithm. It consists in learning a # parametric value function for an action-observation pair, and # then learning a policy that outputs actions that maximise this value # function given a certain observation. # -# This tutorial is more than the PPO tutorial: it covers -# multiple topics that were left aside. We strongly advise the reader to go -# through the PPO tutorial first before trying out this one. The goal is to -# show how flexible torchrl is when it comes to writing scripts that can cover -# multiple use cases. -# # Key learnings: # # - how to build an environment in TorchRL, including transforms @@ -30,6 +33,10 @@ # - how to store trajectories (and not transitions) in your replay buffer); # - and finally how to evaluate your model. # +# This tutorial assumes that you have completed the PPO tutorial which gives +# an overview of the TorchRL components. +# +# # This tutorial assumes the reader is familiar with some of TorchRL primitives, # such as :class:`tensordict.TensorDict` and # :class:`tensordict.nn.TensorDictModules`, although it should be @@ -46,6 +53,10 @@ # sphinx_gallery_start_ignore import warnings +from typing import Tuple + +from torchrl.objectives import LossModule +from torchrl.objectives.value import TDEstimate, TDLambdaEstimate warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore @@ -58,6 +69,7 @@ import tqdm from matplotlib import pyplot as plt from tensordict.nn import TensorDictModule +from tensordict.tensordict import TensorDict, TensorDictBase from torch import nn, optim from torchrl.collectors import MultiaSyncDataCollector from torchrl.data import CompositeSpec, TensorDictReplayBuffer @@ -76,21 +88,256 @@ from torchrl.envs.transforms import RewardScaling, TransformedEnv from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.modules import ( + Actor, + ActorCriticWrapper, MLP, OrnsteinUhlenbeckProcessWrapper, ProbabilisticActor, ValueOperator, ) from torchrl.modules.distributions.continuous import TanhDelta -from torchrl.objectives.utils import hold_out_net +from torchrl.objectives.utils import ( + distance_loss, + hold_out_net, + hold_out_params, + SoftUpdate, +) from torchrl.trainers import Recorder ############################################################################### +# TorchRL LossModule +# ------------------ +# +# The ``__init__`` method +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# The parent class of all losses is :class:`torchrl.objectives.LossModule`. +# As many other components of the library, its :meth:`__call__` method expects +# as input a :class:`tensordict.TensorDict` instance sampled from an expenrience +# replay buffer. Using this format makes it possible to re-use the module across +# modalities, or in complex settings where the model needs to read multiple +# entries for instance. +# +# To keep the tutorial as didactic as we can, we'll be displaying each method +# of the class independently and we'll be populating the class at a later stage. +# +# Let us start with the :meth:`__init__` method. 
DDPG aims at a simple goal: +# training a policy to output actions that maximise the value predicted by +# a value network. Hence, our loss module needs to receive two networks in its +# constructor: an actor and a value networks. We expect both of these to be +# tensordict-compatible objects, such as :class:`tensordict.nn.TensorDictModule`. +# +# The crucial step of the :meth:`LossModule.__init__` method is the call to +# :meth:`LossModule.convert_to_functional`. This method will extract the +# parameters from the module and convert it to a functional module. +# The reason TorchRL does this is that RL algorithms often execute the same +# model with different sets of parameters, called "trainable" and "target" parameters. +# The "trainable" parameters are those that the optimizer needs to fit. The +# "target" parameters are usually a copy of the formers with some time lag +# (absolute or diluted through a moving average). These target parameters +# are used to compute the value associated with the next observation. +# One the advantages of using a set of target parameters for the value model +# that do not match exactly the current configuration is that they provide +# a pessimistic bound on the value function being computed. +# Pay attention to the ``create_target_params`` keyword argument below: this +# argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` +# method to create a set of target parameters in the loss module to be used +# for target value computation. If this is set to ``False`` (see the actor network +# for instance) the ``target_actor_network_params`` attribute will still be +# accessible but this will just return a detached version of the actor parameters. +# +# Later, we will see how the target parameters should be updated in TorchRL. +# +# We also incorporate an advantage module. This will be used to compute the +# next state value using our value network. We'll see later in this tutorial +# how various advantage modules can be used. If none is provided, we'll +# be using the TD(lambda) method. +# + + +def _init( + self, + actor_network: TensorDictModule, + value_network: TensorDictModule, + advantage="td(lambda)", +) -> None: + super(type(self), self).__init__() + + self.convert_to_functional( + actor_network, + "actor_network", + create_target_params=False, + ) + self.convert_to_functional( + value_network, + "value_network", + create_target_params=True, + compare_against=list(actor_network.parameters()), + ) + + self.actor_in_keys = actor_network.in_keys + + # Since the value we'll be using is based on the actor and value network, + # we put them together in a single actor-critic container. + actor_critic = ActorCriticWrapper(actor_network, value_network) + if advantage == "td(lambda)": + advantage_module = TDLambdaEstimate( + gamma=0.99, + lmbda=0.95, + value_network=actor_critic, + value_key="state_action_value", + ) + elif advantage == "td(0)": + advantage_module = TDEstimate( + gamma=0.99, value_network=actor_critic, value_key="state_action_value" + ) + else: + raise NotImplementedError("advantage must be one of 'td(lambda)' or 'td(0)'.") + self.advantage = advantage + self.advantage_module = advantage_module + + self.loss_funtion = "l2" + + +############################################################################### +# The actor loss method +# ~~~~~~~~~~~~~~~~~~~~~ +# +# The central piece of an RL algorithm is the training loss for the actor. 
+# In the case of DDPG, this function is quite simple: we just need to compute +# the value associated with an action computed using the policy and optimize +# the actor weights to maximise this value. +# +# When computing this value, we must make sure to take the value parameters out +# of the graph, otherwise the actor and value loss will be mixed up. +# For this, the :func:`torchrl.objectives.utils.hold_out_params` function +# can be used. + + +def _loss_actor( + self, + tensordict, +) -> torch.Tensor: + td_copy = tensordict.select(*self.actor_in_keys).detach() + # Get an action from the actor network + td_copy = self.actor_network( + td_copy, + params=self.actor_network_params, + ) + # get the value associated with that action + with hold_out_params(self.value_network_params) as params: + td_copy = self.value_network( + td_copy, + params=params, + ) + return -td_copy.get("state_action_value") + + +############################################################################### +# The value loss method +# ~~~~~~~~~~~~~~~~~~~~~ +# +# We now need to optimize our value network parameters. +# To do this, we will rely on the advantage module provided during +# the loss construction. + + +def _loss_value( + self, + tensordict, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + td_copy = tensordict.detach() + + # we manually reconstruct the parameters of the actor-critic, where the first + # set of parameters belongs to the actor and the second to the value function. + params = TensorDict( + { + "module": { + "0": self.actor_network_params.detach(), + "1": self.value_network_params, + } + }, + batch_size=self.target_actor_network_params.batch_size, + device=self.target_actor_network_params.device, + ) + target_params = TensorDict( + { + "module": { + "0": self.target_actor_network_params, + "1": self.target_value_network_params, + } + }, + batch_size=self.target_actor_network_params.batch_size, + device=self.target_actor_network_params.device, + ) + with set_exploration_mode("mode"): + self.advantage_module(td_copy, params=params, target_params=target_params) + target_value = td_copy.get(self.advantage_module.value_target_key) + pred_val = td_copy.get("state_action_value") + # td_error = pred_val - target_value + loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) + + return loss_value, (pred_val - target_value).pow(2), pred_val, target_value + + +############################################################################### +# Putting things together in a forward call +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The only missing piece is the forward method, which will glue together the +# value and actor loss, collect the cost values and write them in a tensordict +# delivered to the user. 
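+#
+# As a purely indicative sketch (assuming ``loss`` is an instance of the
+# ``DDPGLoss`` class assembled below and ``batch`` is a tensordict sampled
+# from our replay buffer), the returned tensordict could be consumed as
+# follows::
+#
+#     loss_td = loss(batch)
+#     total_loss = loss_td["loss_actor"] + loss_td["loss_value"]
+#     total_loss.backward()
+#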
+ + +def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: + if not input_tensordict.device == self.device: + raise RuntimeError( + f"Got device={input_tensordict.device} but " + f"actor_network.device={self.device} (self.device={self.device})" + ) + + loss_value, td_error, pred_val, target_value = self.loss_value( + input_tensordict, + ) + td_error = td_error.detach() + td_error = td_error.unsqueeze(input_tensordict.ndimension()) + if input_tensordict.device is not None: + td_error = td_error.to(input_tensordict.device) + input_tensordict.set( + "td_error", + td_error, + inplace=True, + ) + loss_actor = self.loss_actor(input_tensordict) + return TensorDict( + source={ + "loss_actor": loss_actor.mean(), + "loss_value": loss_value.mean(), + "pred_value": pred_val.mean().detach(), + "target_value": target_value.mean().detach(), + "pred_value_max": pred_val.max().detach(), + "target_value_max": target_value.max().detach(), + }, + batch_size=[], + ) + + +class DDPGLoss(LossModule): + __init__ = _init + forward = _forward + loss_value = _loss_value + loss_actor = _loss_actor + + +############################################################################### +# Now that we have our loss, we can use it to train a policy to solve a +# control task. +# # Environment # ----------- # # In most algorithms, the first thing that needs to be taken care of is the -# construction of the environmet as it conditions the remainder of the +# construction of the environment as it conditions the remainder of the # training script. # # For this example, we will be using the ``"cheetah"`` task. The goal is to make @@ -118,7 +365,7 @@ # # env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True) # -# We write a :func:`make_env` helper funciton that will create an environment +# We write a :func:`make_env` helper function that will create an environment # with either one of the two backends considered above (dm-control or gym). # @@ -155,7 +402,7 @@ def make_env(): ############################################################################### # Transforms -# ^^^^^^^^^^ +# ~~~~~~~~~~ # # Now that we have a base environment, we may want to modify its representation # to make it more policy-friendly. In TorchRL, transforms are appended to the @@ -232,7 +479,7 @@ def make_transformed_env( ############################################################################### # Normalization of the observations -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # To compute the normalizing statistics, we run an arbitrary number of random # steps in the environment and compute the mean and standard deviation of the @@ -256,7 +503,7 @@ def get_env_stats(): ############################################################################### # Parallel execution -# ^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~ # # The following helper function allows us to run environments in parallel. # Running environments in parallel can significantly speed up the collection @@ -314,27 +561,19 @@ def make_t_env(): # Building the model # ------------------ # -# We now turn to the setup of the model and loss function. DDPG requires a +# We now turn to the setup of the model. As we have seen, DDPG requires a # value network, trained to estimate the value of a state-action pair, and a # parametric actor that learns how to select actions that maximize this value. -# In this tutorial, we will be using two independent networks for these -# components. 
# # Recall that building a torchrl module requires two steps: # -# - writing the :class:`torch.nn.Module` that will be used as network +# - writing the :class:`torch.nn.Module` that will be used as network, # - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the # data flow is handled by specifying the input and output keys. # # In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can # also be used. # -# In :func:`make_ddpg_actor`, we use a :class:`torchrl.modules.ProbabilisticActor` -# object to wrap our policy network. Since DDPG is a deterministic algorithm, -# this is not strictly necessary. We rely on this class to map the output -# action to the appropriate domain. Alternatively, one could perfectly use a -# non-linearity such as :class:`torch.tanh` to map the output to the right -# domain. # # The Q-Value network is wrapped in a :class:`torchrl.modules.ValueOperator` # that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value @@ -357,36 +596,34 @@ def make_ddpg_actor( proof_environment.transform[2].load_state_dict(transform_state_dict) env_specs = proof_environment.specs - out_features = env_specs["input_spec"]["action"].shape[0] + in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ + -1 + ] + out_features = env_specs["input_spec"]["action"].shape[-1] actor_net = MLP( + in_features=in_features, + out_features=out_features, num_cells=[num_cells] * num_layers, activation_class=nn.Tanh, - out_features=out_features, + activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain ) in_keys = ["observation_vector"] - out_keys = ["param"] - - actor_module = TensorDictModule(actor_net, in_keys=in_keys, out_keys=out_keys) + out_keys = ["action"] - # We use a ProbabilisticActor to make sure that we map the network output - # to the right space using a TanhDelta distribution. 
- actor = ProbabilisticActor( - module=actor_module, - in_keys=["param"], + actor = Actor( + actor_net, + in_keys=in_keys, + out_keys=out_keys, spec=CompositeSpec(action=env_specs["input_spec"]["action"]), - safe=True, - distribution_class=TanhDelta, - distribution_kwargs={ - "min": env_specs["input_spec"]["action"].space.minimum, - "max": env_specs["input_spec"]["action"].space.maximum, - }, ).to(device) q_net = MLP( + in_features=in_features + + out_features, # receives an action and an observation as input + out_features=1, num_cells=[num_cells] * num_layers, activation_class=nn.Tanh, - out_features=1, ) in_keys = in_keys + ["action"] @@ -395,15 +632,6 @@ def make_ddpg_actor( module=q_net, ).to(device) - # init: since we have lazy layers, we should run the network - # once to initialize them - with torch.no_grad(), set_exploration_mode("random"): - td = proof_environment.fake_tensordict() - td = td.expand((*td.shape, 2)) - td = td.to(device) - actor(td) - qnet(td) - return actor, qnet @@ -484,7 +712,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Environment -# ^^^^^^^^^^^ +# ~~~~~~~~~~~ # The backend can be gym or dm_control backend = "gym" @@ -509,7 +737,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Collection -# ^^^^^^^^^^ +# ~~~~~~~~~~ # We will execute the policy on cuda if available device = ( @@ -521,8 +749,10 @@ def make_replay_buffer(buffer_size, prefetch=3): # Total frames we will use during training. Scale up to 500K - 1M for a more # meaningful training -total_frames = 5000 // frame_skip -# Number of frames returned by the collector at each iteration of the outer loop +total_frames = 10000 // frame_skip + +# Number of frames returned by the collector at each iteration of the outer loop. 
+# We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] frames_per_batch = env_per_collector * 1000 // frame_skip max_frames_per_traj = 1000 // frame_skip init_random_frames = 0 @@ -535,7 +765,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Optimizer and optimization -# ^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ lr = 5e-4 weight_decay = 0.0 @@ -545,7 +775,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Model -# ^^^^^ +# ~~~~~ gamma = 0.99 tau = 0.005 # Decay factor for the target network @@ -556,12 +786,13 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Replay buffer -# ^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~ # If True, a Prioritized replay buffer will be used prb = True # Number of frames stored in the buffer -buffer_size = min(total_frames, 1000000 // frame_skip) +traj_len_collector = frames_per_batch // env_per_collector +buffer_size = min(total_frames, 1_000_000 // traj_len_collector) buffer_scratch_dir = "/tmp/" seed = 0 @@ -582,13 +813,13 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Normalization stats -# ^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~ transform_state_dict = get_env_stats() ############################################################################### # Models: policy and q-value network -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ actor, qnet = make_ddpg_actor( transform_state_dict=transform_state_dict, @@ -597,10 +828,17 @@ def make_replay_buffer(buffer_size, prefetch=3): if device == torch.device("cpu"): actor.share_memory() -############################################################################### -# We create a copy of the q-value network to be used as target network -qnet_target = deepcopy(qnet).requires_grad_(False) +############################################################################### +# Loss module +# ~~~~~~~~~~~ +# We build our loss module with the actor and qnet we've just created. +# Because we have target parameters to update, we _must_ create a target network +# updater. +# +loss_module = DDPGLoss(actor, qnet) +target_net_updater = SoftUpdate(loss_module, eps=0.98) +target_net_updater.init_() ############################################################################### # The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` @@ -615,7 +853,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Parallel environment creation -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # We pass the stats computed earlier to normalize the output of our # environment: @@ -626,7 +864,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Data collector -# ^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~ # # TorchRL provides specialized classes to help you collect data by executing # the policy in the environment. 
These "data collectors" iteratively compute @@ -684,9 +922,8 @@ def make_replay_buffer(buffer_size, prefetch=3): reset_at_each_iter=False, postproc=multistep, split_trajs=True, - devices=[device, device], # device for execution + device=device, # device for execution storing_devices=[device, device], # device where data will be stored and passed - pin_memory=False, update_at_each_batch=False, exploration_mode="random", ) @@ -695,83 +932,38 @@ def make_replay_buffer(buffer_size, prefetch=3): ############################################################################### # Replay buffer -# ^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~ # replay_buffer = make_replay_buffer(buffer_size, prefetch=3) ############################################################################### # Recorder -# ^^^^^^^^ +# ~~~~~~~~ recorder = make_recorder(actor_model_explore, transform_state_dict) ############################################################################### # Optimizer -# ^^^^^^^^^ +# ~~~~~~~~~ # # Finally, we will use the Adam optimizer for the policy and value network, # with the same learning rate for both. -optimizer_actor = optim.Adam(actor.parameters(), lr=lr, weight_decay=weight_decay) -optimizer_qnet = optim.Adam(qnet.parameters(), lr=lr, weight_decay=weight_decay) +optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) total_collection_steps = total_frames // frames_per_batch -scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer_actor, T_max=total_collection_steps -) -scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer_qnet, T_max=total_collection_steps +scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=total_collection_steps ) ############################################################################### # Time to train the policy # ------------------------ # -# Some notes about the following training loop: -# -# - :func:`torchrl.objectives.utils.hold_out_net` is a TorchRL context manager -# that temporarily sets :func:`torch.Tensor.requires_grad_()` to False for -# a designated set of network parameters. This is used to -# prevent :func:`torch.Tensor.backward()`` from writing gradients on -# parameters that need not to be differentiated given the loss at hand. -# - The value network is designed using the -# :class:`torchrl.modules.ValueOperator` subclass from -# :class:`tensordict.nn.TensorDictModule` class. As explained earlier, -# this class will write a ``"state_action_value"`` entry if one of its -# ``in_keys`` is named ``"action"``, otherwise it will assume that only the -# state-value is returned and the output key will simply be ``"state_value"``. -# In the case of DDPG, the value if of the state-action pair, -# hence the ``"state_action_value"`` will be used. -# - The :func:`torchrl.envs.utils.step_mdp(tensordict)` helper function is the -# equivalent of the ``obs = next_obs`` command found in multiple RL -# algorithms. It will return a new :class:`tensordict.TensorDict` instance -# that contains all the data that will need to be used in the next iteration. -# This makes it possible to pass this new tensordict to the policy or -# value network. -# - When using prioritized replay buffer, a priority key is added to the -# sampled tensordict (named ``"td_error"`` by default). Then, this -# TensorDict will be fed back to the replay buffer using the -# :func:`torchrl.data.replay_buffers.TensorDictReplayBuffer.update_tensordict_priority` -# method. 
Under the hood, this method will read the index present in the -# TensorDict as well as the priority value, and update its list of priorities -# at these indices. -# - TorchRL provides optimized versions of the loss functions (such as this one) -# where one only needs to pass a sampled tensordict and obtains a dictionary -# of losses and metadata in return (see :mod:`torchrl.objectives` for more -# context). Here we write the full loss function in the optimization loop -# for transparency. -# Similarly, the target network updates are written explicitly but -# TorchRL provides a couple of dedicated classes for this -# (see :class:`torchrl.objectives.SoftUpdate` and -# :class:`torchrl.objectives.HardUpdate`). -# - After each collection of data, we call :func:`collector.update_policy_weights_()`, -# which will update the policy network weights on the data collector. If the -# code is executed on cpu or with a single cuda device, this part can be -# omitted. If the collector is executed on another device, then its weights -# must be synced with those on the main, training process and this method -# should be incorporated in the training loop (ideally early in the loop in -# async settings, and at the end of it in sync settings). +# The training loop is pretty straightforward now that we have built all the +# modules we need. +# rewards = [] rewards_eval = [] @@ -794,13 +986,7 @@ def make_replay_buffer(buffer_size, prefetch=3): pbar.update(tensordict.numel()) # extend the replay buffer with the new data - if ("collector", "mask") in tensordict.keys(True): - # if multi-step, a mask is present to help filter padded values - current_frames = tensordict["collector", "mask"].sum() - tensordict = tensordict[tensordict.get(("collector", "mask"))] - else: - tensordict = tensordict.view(-1) - current_frames = tensordict.numel() + current_frames = tensordict.numel() collected_frames += current_frames replay_buffer.extend(tensordict.cpu()) @@ -810,49 +996,22 @@ def make_replay_buffer(buffer_size, prefetch=3): # sample from replay buffer sampled_tensordict = replay_buffer.sample(batch_size).clone() - # compute loss for qnet and backprop - with hold_out_net(actor): - # get next state value - next_tensordict = step_mdp(sampled_tensordict) - qnet_target(actor(next_tensordict)) - next_value = next_tensordict["state_action_value"] - assert not next_value.requires_grad - value_est = ( - sampled_tensordict["next", "reward"] - + gamma * (1 - sampled_tensordict["next", "done"].float()) * next_value + # Compute loss + loss_dict = loss_module(sampled_tensordict) + + # optimize + loss_val = sum( + value for key, value in loss_dict.items() if key.startswith("loss") ) - value = qnet(sampled_tensordict)["state_action_value"] - value_loss = (value - value_est).pow(2).mean() - # we write the td_error in the sampled_tensordict for priority update - # because the indices of the samples is tracked in sampled_tensordict - # and the replay buffer will know which priorities to update. - sampled_tensordict["td_error"] = (value - value_est).pow(2).detach() - value_loss.backward() - - optimizer_qnet.step() - optimizer_qnet.zero_grad() - - # compute loss for actor and backprop: - # the actor must maximise the state-action value, hence the loss - # is the neg value of this. 
- sampled_tensordict_actor = sampled_tensordict.select(*actor.in_keys) - with hold_out_net(qnet): - qnet(actor(sampled_tensordict_actor)) - actor_loss = -sampled_tensordict_actor["state_action_value"] - actor_loss.mean().backward() - - optimizer_actor.step() - optimizer_actor.zero_grad() - - # update qnet_target params - for (p_in, p_dest) in zip(qnet.parameters(), qnet_target.parameters()): - p_dest.data.copy_(tau * p_in.data + (1 - tau) * p_dest.data) - for (b_in, b_dest) in zip(qnet.buffers(), qnet_target.buffers()): - b_dest.data.copy_(tau * b_in.data + (1 - tau) * b_dest.data) + loss_val.backward() + optimizer.step() + optimizer.zero_grad() # update priority if prb: replay_buffer.update_tensordict_priority(sampled_tensordict) + # update target network + target_net_updater.step() rewards.append( ( @@ -873,8 +1032,7 @@ def make_replay_buffer(buffer_size, prefetch=3): # update the exploration strategy actor_model_explore.step(current_frames) if collected_frames >= init_random_frames: - scheduler1.step() - scheduler2.step() + scheduler.step() collector.shutdown() del collector @@ -898,259 +1056,15 @@ def make_replay_buffer(buffer_size, prefetch=3): plt.tight_layout() ############################################################################### -# Sampling trajectories and using TD(lambda) -# ------------------------------------------ -# -# TD(lambda) is known to be less biased than the regular TD-error we used in -# the previous example. To use it, however, we need to sample trajectories and -# not single transitions. -# -# We modify the previous example to make this possible. -# -# The first modification consists in building a replay buffer that stores -# trajectories (and not transitions). -# -# Specifically, we'll collect trajectories of (at most) -# 250 steps (note that the total trajectory length is actually 1000 frames, but -# we collect batches of 500 transitions obtained over 2 environments running in -# parallel, hence only 250 steps per trajectory are collected at any given -# time). 
Hence, we'll divide our replay buffer size by 250: - -buffer_size = 100000 // frame_skip // 250 -print("the new buffer size is", buffer_size) -batch_size_traj = max(4, batch_size // 250) -print("the new batch size for trajectories is", batch_size_traj) - -n_steps_forward = 0 # disable multi-step for simplicity - -############################################################################### -# The following code is identical to the initialization we made earlier: - -torch.manual_seed(seed) -np.random.seed(seed) - -# get stats for normalization -transform_state_dict = get_env_stats() - -# Actor and qnet instantiation -actor, qnet = make_ddpg_actor( - transform_state_dict=transform_state_dict, - device=device, -) -if device == torch.device("cpu"): - actor.share_memory() - -# Target network -qnet_target = deepcopy(qnet).requires_grad_(False) - -# Exploration wrappers: -actor_model_explore = OrnsteinUhlenbeckProcessWrapper( - actor, - annealing_num_steps=annealing_frames, -).to(device) -if device == torch.device("cpu"): - actor_model_explore.share_memory() - -# Environment setting: -create_env_fn = parallel_env_constructor( - transform_state_dict=transform_state_dict, -) -# Batch collector: -collector = MultiaSyncDataCollector( - create_env_fn=[create_env_fn, create_env_fn], - policy=actor_model_explore, - total_frames=total_frames, - max_frames_per_traj=max_frames_per_traj, - frames_per_batch=frames_per_batch, - init_random_frames=init_random_frames, - reset_at_each_iter=False, - postproc=None, - split_trajs=False, - devices=[device, device], # device for execution - storing_devices=[device, device], # device where data will be stored and passed - seed=None, - pin_memory=False, - update_at_each_batch=False, - exploration_mode="random", -) -collector.set_seed(seed) - -# Replay buffer: -replay_buffer = make_replay_buffer(buffer_size, prefetch=0) - -# trajectory recorder -recorder = make_recorder(actor_model_explore, transform_state_dict) - -# Optimizers -optimizer_actor = optim.Adam(actor.parameters(), lr=lr, weight_decay=weight_decay) -optimizer_qnet = optim.Adam(qnet.parameters(), lr=lr, weight_decay=weight_decay) -total_collection_steps = total_frames // frames_per_batch - -scheduler1 = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer_actor, T_max=total_collection_steps -) -scheduler2 = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer_qnet, T_max=total_collection_steps -) - -############################################################################### -# The training loop needs to be slightly adapted. -# First, whereas before extending the replay buffer we used to flatten the -# collected data, this won't be the case anymore. To understand why, let's -# check the output shape of the data collector: - -for data in collector: - print(data.shape) - break - -############################################################################### -# We see that our data has shape ``[2, 250]`` as expected: 2 envs, each -# returning 250 frames. +# Conclusion +# ---------- # -# Let's import the td_lambda function: +# In this tutorial, we have learnt how to code a loss module in TorchRL given +# the concrete example of DDPG. # - -from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate - -lmbda = 0.95 - -############################################################################### -# The training loop is roughly the same as before, with the exception that we -# don't flatten the collected data. 
Also, the sampling from the replay buffer -# is slightly different: We will collect at minimum four trajectories, compute -# the returns (TD(lambda)), then sample from these the values we'll be using -# to compute gradients. This ensures that do not have batches that are -# 'too big' but still compute an accurate return. +# The key takeaways are: # - -rewards = [] -rewards_eval = [] - -# Main loop -norm_factor_training = ( - sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 -) - -collected_frames = 0 -# # if tqdm is to be used -# pbar = tqdm.tqdm(total=total_frames) -r0 = None -for i, tensordict in enumerate(collector): - - # update weights of the inference policy - collector.update_policy_weights_() - - if r0 is None: - r0 = tensordict["next", "reward"].mean().item() - - # extend the replay buffer with the new data - current_frames = tensordict.numel() - collected_frames += current_frames - replay_buffer.extend(tensordict.cpu()) - - # optimization steps - if collected_frames >= init_random_frames: - for _ in range(update_to_data): - # sample from replay buffer - sampled_tensordict = replay_buffer.sample(batch_size_traj) - # reset the batch size temporarily, and exclude index - # whose shape is incompatible with the new size - index = sampled_tensordict.get("index") - sampled_tensordict.exclude("index", inplace=True) - - # compute loss for qnet and backprop - with hold_out_net(actor): - # get next state value - next_tensordict = step_mdp(sampled_tensordict) - qnet_target(actor(next_tensordict.view(-1))).view( - sampled_tensordict.shape - ) - next_value = next_tensordict["state_action_value"] - assert not next_value.requires_grad - - # This is the crucial part: we'll compute the TD(lambda) - # instead of a simple single step estimate - done = sampled_tensordict["next", "done"] - reward = sampled_tensordict["next", "reward"] - value = qnet(sampled_tensordict.view(-1)).view(sampled_tensordict.shape)[ - "state_action_value" - ] - advantage = vec_td_lambda_advantage_estimate( - gamma, lmbda, value, next_value, reward, done - ) - # we sample from the values we have computed - rand_idx = torch.randint(0, advantage.numel(), (batch_size,)) - value_loss = advantage.view(-1)[rand_idx].pow(2).mean() - - # we write the td_error in the sampled_tensordict for priority update - # because the indices of the samples is tracked in sampled_tensordict - # and the replay buffer will know which priorities to update. - value_loss.backward() - - optimizer_qnet.step() - optimizer_qnet.zero_grad() - - # compute loss for actor and backprop: the actor must maximise the state-action value, hence the loss is the neg value of this. 
- sampled_tensordict_actor = sampled_tensordict.select(*actor.in_keys) - with hold_out_net(qnet): - qnet(actor(sampled_tensordict_actor.view(-1))).view( - sampled_tensordict.shape - ) - actor_loss = -sampled_tensordict_actor["state_action_value"] - actor_loss.view(-1)[rand_idx].mean().backward() - - optimizer_actor.step() - optimizer_actor.zero_grad() - - # update qnet_target params - for (p_in, p_dest) in zip(qnet.parameters(), qnet_target.parameters()): - p_dest.data.copy_(tau * p_in.data + (1 - tau) * p_dest.data) - for (b_in, b_dest) in zip(qnet.buffers(), qnet_target.buffers()): - b_dest.data.copy_(tau * b_in.data + (1 - tau) * b_dest.data) - - # update priority - sampled_tensordict.batch_size = [batch_size_traj] - sampled_tensordict["td_error"] = advantage.detach().pow(2).mean(1) - sampled_tensordict["index"] = index - if prb: - replay_buffer.update_tensordict_priority(sampled_tensordict) - - rewards.append( - ( - i, - tensordict["next", "reward"].mean().item() - / norm_factor_training - / frame_skip, - ) - ) - td_record = recorder(None) - if td_record is not None: - rewards_eval.append((i, td_record["r_evaluation"].item())) - # if len(rewards_eval): - # pbar.set_description(f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}") - - # update the exploration strategy - actor_model_explore.step(current_frames) - if collected_frames >= init_random_frames: - scheduler1.step() - scheduler2.step() - -collector.shutdown() -del create_env_fn -del collector - -############################################################################### -# We can observe that using TD(lambda) made our results considerably more -# stable for a similar training speed: +# - How to use the :class:`torchrl.objectives.LossModule` class to register components; +# - How to use (or not) a target network, and how to update its parameters; +# - How to create an optimizer associated with a loss module. # -# **Note**: As already mentioned above, to get a more reasonable performance, -# use a greater value for ``total_frames`` e.g. 1000000. 
- -plt.figure() -plt.plot(*zip(*rewards), label="training") -plt.plot(*zip(*rewards_eval), label="eval") -plt.legend() -plt.xlabel("iter") -plt.ylabel("reward") -plt.tight_layout() -plt.title("TD-labmda DDPG results") From 5488d4d7effa5de75fb4ad3369cbf4cf925f4a92 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 20 Mar 2023 18:08:18 +0000 Subject: [PATCH 02/89] lint --- tutorials/sphinx-tutorials/coding_ddpg.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index f2ef9a292ae..dc568b63fe2 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -61,10 +61,8 @@ warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore -from copy import deepcopy import numpy as np -import torch import torch.cuda import tqdm from matplotlib import pyplot as plt @@ -86,19 +84,16 @@ from torchrl.envs.libs.dm_control import DMControlEnv from torchrl.envs.libs.gym import GymEnv from torchrl.envs.transforms import RewardScaling, TransformedEnv -from torchrl.envs.utils import set_exploration_mode, step_mdp +from torchrl.envs.utils import set_exploration_mode from torchrl.modules import ( Actor, ActorCriticWrapper, MLP, OrnsteinUhlenbeckProcessWrapper, - ProbabilisticActor, ValueOperator, ) -from torchrl.modules.distributions.continuous import TanhDelta from torchrl.objectives.utils import ( distance_loss, - hold_out_net, hold_out_params, SoftUpdate, ) From 93deeebfe9b2b47122675587461ad63b2e9fce29 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 20 Mar 2023 18:10:19 +0000 Subject: [PATCH 03/89] amend --- tutorials/sphinx-tutorials/coding_ddpg.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index dc568b63fe2..c35bd87c41e 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -146,7 +146,10 @@ # We also incorporate an advantage module. This will be used to compute the # next state value using our value network. We'll see later in this tutorial # how various advantage modules can be used. If none is provided, we'll -# be using the TD(lambda) method. +# be using the TD(lambda) method, which is usually preferable to TD(0). +# Notice that this choice makes it necessary that the tensordict provided +# has its last dimension representing the time span of the experiment (ie +# our replay buffer must be populated using non-flatten data). # From f511020d81b551cf230dc40b9ad3938e8bf32daf Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 21 Mar 2023 10:22:45 +0000 Subject: [PATCH 04/89] dqn (1) --- tutorials/sphinx-tutorials/coding_dqn.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index a50ac553b21..8455811898c 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -1,16 +1,25 @@ # -*- coding: utf-8 -*- """ -Coding a pixel-based DQN using TorchRL -====================================== +TorchRL trainer: A DQN example +============================== **Author**: `Vincent Moens `_ """ ############################################################################## -# This tutorial will guide you through the steps to code DQN to solve the -# CartPole task from scratch. 
DQN -# (`Deep Q-Learning `_) was +# TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle +# your training loop. The trainer executes a nested loop where the outer loop +# is the data collection and the inner loop consumes this data or some data +# retrieved from the replay buffer to train the model. +# At various points in this training loop, hooks can be attached and executed at +# given intervals. +# +# In this tutorial, we will be using the trainer class to train a DQN algorithm +# to solve the CartPole task from scratch. +# +# DQN (`Deep Q-Learning `_) was # the founding work in deep reinforcement learning. +# # On a high level, the algorithm is quite simple: Q-learning consists in learning a table of # state-action values in such a way that, when encountering any particular state, # we know which action to pick just by searching for the action with the From 2586b74164a78ee8aa0353be4cb7532867840a2b Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 21 Mar 2023 11:58:31 +0000 Subject: [PATCH 05/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 392 ++++++++++++----------- 1 file changed, 201 insertions(+), 191 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 8455811898c..331128eee9a 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -17,13 +17,41 @@ # In this tutorial, we will be using the trainer class to train a DQN algorithm # to solve the CartPole task from scratch. # +# Main takeaways: +# +# - Building a trainer with its essential components: data collector, loss +# module, replay buffer and optimizer. +# - Adding hooks to a trainer, such as loggers, target network updaters and such. +# +# We will also focus on some other aspects of the library: +# +# - how to build an environment in TorchRL, including transforms (e.g. data +# normalization, frame concatenation, resizing and turning to grayscale) +# and parallel execution. Unlike what we did in the +# `DDPG tutorial `_, we +# will normalize the pixels and not the state vector. +# - how to design a ``QValueActor``, i.e. an actor that estimates the action +# values and picks up the action with the highest estimated return; +# - how to collect data from your environment efficiently and store them +# in a replay buffer; +# - how to store trajectories (and not transitions) in your replay buffer), +# and how to estimate returns using TD(lambda); +# - and finally how to evaluate your model. +# +# **Prerequisites**: We encourage you to get familiar with torchrl through the +# `PPO tutorial `_ first. +# +# DQN +# --- +# # DQN (`Deep Q-Learning `_) was # the founding work in deep reinforcement learning. # -# On a high level, the algorithm is quite simple: Q-learning consists in learning a table of -# state-action values in such a way that, when encountering any particular state, -# we know which action to pick just by searching for the action with the -# highest value. This simple setting requires the actions and states to be +# On a high level, the algorithm is quite simple: Q-learning consists in +# learning a table of state-action values in such a way that, when +# encountering any particular state, we know which action to pick just by +# searching for the action with the highest value. This simple setting +# requires the actions and states to be # discrete, otherwise a lookup table cannot be built. 
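+#
+# As a toy, purely indicative sketch (not part of the code used in this
+# tutorial), a tabular Q-learning update over a ``[n_states, n_actions]``
+# table could look like this, with ``gamma`` the discount factor and ``lr``
+# a learning rate::
+#
+#     action = q_table[state].argmax()
+#     target = reward + gamma * q_table[next_state].max()
+#     q_table[state, action] += lr * (target - q_table[state, action])
+#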
# # DQN uses a neural network that encodes a map from the state-action space to @@ -44,32 +72,6 @@ # .. figure:: /_static/img/cartpole_demo.gif # :alt: Cart Pole # -# **Prerequisites**: We encourage you to get familiar with torchrl through the -# `PPO tutorial `_ first. -# This tutorial is more complex and full-fleshed, but it may be . -# -# In this tutorial, you will learn: -# -# - how to build an environment in TorchRL, including transforms (e.g. data -# normalization, frame concatenation, resizing and turning to grayscale) -# and parallel execution. Unlike what we did in the -# `DDPG tutorial `_, we -# will normalize the pixels and not the state vector. -# - how to design a QValue actor, i.e. an actor that estimates the action -# values and picks up the action with the highest estimated return; -# - how to collect data from your environment efficiently and store them -# in a replay buffer; -# - how to store trajectories (and not transitions) in your replay buffer), -# and how to estimate returns using TD(lambda); -# - how to make a module functional and use ; -# - and finally how to evaluate your model. -# -# This tutorial assumes the reader is familiar with some of TorchRL -# primitives, such as :class:`tensordict.TensorDict` and -# :class:`tensordict.TensorDictModules`, although it -# should be sufficiently transparent to be understood without a deep -# understanding of these classes. -# # We do not aim at giving a SOTA implementation of the algorithm, but rather # to provide a high-level illustration of TorchRL features in the context # of this algorithm. @@ -120,102 +122,6 @@ def is_notebook() -> bool: ############################################################################### -# Hyperparameters -# --------------- -# -# Let's start with our hyperparameters. The following setting should work well -# in practice, and the performance of the algorithm should hopefully not be -# too sensitive to slight variations of these. - -device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - -############################################################################### -# Optimizer -# ^^^^^^^^^ - -# the learning rate of the optimizer -lr = 2e-3 -# the beta parameters of Adam -betas = (0.9, 0.999) -# Optimization steps per batch collected (aka UPD or updates per data) -n_optim = 8 - -############################################################################### -# DQN parameters -# ^^^^^^^^^^^^^^ - -############################################################################### -# gamma decay factor -gamma = 0.99 - -############################################################################### -# lambda decay factor (see second the part with TD(:math:`\lambda`) -lmbda = 0.95 - -############################################################################### -# Smooth target network update decay parameter. -# This loosely corresponds to a 1/(1-tau) interval with hard target network -# update -tau = 0.005 - -############################################################################### -# Data collection and replay buffer -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Values to be used for proper training have been commented. -# -# Total frames collected in the environment. In other implementations, the -# user defines a maximum number of episodes. -# This is harder to do with our data collectors since they return batches -# of N collected frames, where N is a constant. 
-# However, one can easily get the same restriction on number of episodes by -# breaking the training loop when a certain number -# episodes has been collected. -total_frames = 5000 # 500000 - -############################################################################### -# Random frames used to initialize the replay buffer. -init_random_frames = 100 # 1000 - -############################################################################### -# Frames in each batch collected. -frames_per_batch = 32 # 128 - -############################################################################### -# Frames sampled from the replay buffer at each optimization step -batch_size = 32 # 256 - -############################################################################### -# Size of the replay buffer in terms of frames -buffer_size = min(total_frames, 100000) - -############################################################################### -# Number of environments run in parallel in each data collector -num_workers = 2 # 8 -num_collectors = 2 # 4 - - -############################################################################### -# Environment and exploration -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# -# We set the initial and final value of the epsilon factor in Epsilon-greedy -# exploration. -# Since our policy is deterministic, exploration is crucial: without it, the -# only source of randomness would be the environment reset. - -eps_greedy_val = 0.1 -eps_greedy_val_env = 0.005 - -############################################################################### -# To speed up learning, we set the bias of the last layer of our value network -# to a predefined value (this is not mandatory) -init_bias = 2.0 - -############################################################################### -# **Note**: for fast rendering of the tutorial ``total_frames`` hyperparameter -# was set to a very low number. To get a reasonable performance, use a greater -# value e.g. 500000 -# # Building the environment # ------------------------ # @@ -283,7 +189,7 @@ def make_env(parallel=False, observation_norm_state_dict=None): ############################################################################### # Compute normalizing constants -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # To normalize images, we don't want to normalize each pixel independently # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` @@ -292,16 +198,16 @@ def make_env(parallel=False, observation_norm_state_dict=None): # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that # not all dimensions disappear in the process: -test_env = make_env() -test_env.transform[-1].init_stats( +def get_norm_const(): + test_env = make_env() + test_env.transform[-1].init_stats( num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) ) -observation_norm_state_dict = test_env.transform[-1].state_dict() - -############################################################################### -# let's check that normalizing constants have a size of ``[C, 1, 1]`` where -# ``C=4`` (because of :class:`torchrl.envs.CatFrames`). -print(observation_norm_state_dict) + observation_norm_state_dict = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). 
+ print(observation_norm_state_dict) + return observation_norm_state_dict ############################################################################### # Building the model (Deep Q-network) @@ -324,7 +230,7 @@ def make_env(parallel=False, observation_norm_state_dict=None): # in the input :class:`tensordict.TensorDict`. # # Target parameters -# ^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~ # # Many off-policy RL algorithms use the concept of "target parameters" when it # comes to estimate the value of the ``t+1`` state or state-action pair. @@ -335,7 +241,7 @@ def make_env(parallel=False, observation_norm_state_dict=None): # in similar algorithms. # # Functionalizing modules -# ^^^^^^^^^^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~~~~~~~~~~ # # One of the features of torchrl is its usage of functional modules: as the # same architecture is often used with multiple sets of parameters (e.g. @@ -401,40 +307,12 @@ def make_model(dummy_env): return factor, actor, actor_explore, params, params_target -( - factor, - actor, - actor_explore, - params, - params_target, -) = make_model(test_env) - -############################################################################### -# We represent the parameters and targets as flat structures, but unflattening -# them is quite easy: - -params_flat = params.flatten_keys(".") - -############################################################################### -# We will be using the adam optimizer: - -optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) - -############################################################################### -# We create a test environment for evaluation of the policy: - -test_env = make_env( - parallel=False, observation_norm_state_dict=observation_norm_state_dict -) -# sanity check: -print(actor_explore(test_env.reset())) - ############################################################################### # Collecting and storing data # --------------------------- # # Replay buffers -# ^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~ # # Replay buffers play a central role in off-policy RL algorithms such as DQN. # They constitute the dataset we will be sampling from during training. @@ -450,14 +328,16 @@ def make_model(dummy_env): # The only requirement of this storage is that the data passed to it at write # time must always have the same shape. -replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, -) +def get_replay_buffer(buffer_size, n_optim): + replay_buffer = TensorDictReplayBuffer( + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, + ) + return replay_buffer ############################################################################### # Data collector -# ^^^^^^^^^^^^^^ +# ~~~~~~~~~~~~~~ # # As in `PPO ` and # `DDPG `, we will be using @@ -485,27 +365,157 @@ def make_model(dummy_env): # out training loop must account for. For simplicity, we set the devices to # the same value for all sub-collectors. -data_collector = MultiaSyncDataCollector( - # ``num_collectors`` collectors, each with an set of `num_workers` environments being run in parallel - [ - make_env( - parallel=True, observation_norm_state_dict=observation_norm_state_dict - ), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. 
Below is an example of - # heterogeneous devices - devices=[device] * num_collectors, - storing_devices=[device] * num_collectors, - # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - split_trajs=False, +def get_collector(observation_norm_state_dict, num_collectors, actor_explore, frames_per_batch, total_frames, device): + data_collector = MultiaSyncDataCollector( + [ + make_env( + parallel=True, observation_norm_state_dict=observation_norm_state_dict + ), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + ) + return data_collector + + + + +############################################################################### +# Hyperparameters +# --------------- +# +# Let's start with our hyperparameters. The following setting should work well +# in practice, and the performance of the algorithm should hopefully not be +# too sensitive to slight variations of these. + +device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" + +############################################################################### +# Optimizer +# ~~~~~~~~~ + +# the learning rate of the optimizer +lr = 2e-3 +# the beta parameters of Adam +betas = (0.9, 0.999) +# Optimization steps per batch collected (aka UPD or updates per data) +n_optim = 8 + +############################################################################### +# DQN parameters +# ~~~~~~~~~~~~~~ + +############################################################################### +# gamma decay factor +gamma = 0.99 + +############################################################################### +# lambda decay factor (see second the part with TD(:math:`\lambda`) +lmbda = 0.95 + +############################################################################### +# Smooth target network update decay parameter. +# This loosely corresponds to a 1/(1-tau) interval with hard target network +# update +tau = 0.005 + +############################################################################### +# Data collection and replay buffer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Values to be used for proper training have been commented. +# +# Total frames collected in the environment. In other implementations, the +# user defines a maximum number of episodes. +# This is harder to do with our data collectors since they return batches +# of N collected frames, where N is a constant. +# However, one can easily get the same restriction on number of episodes by +# breaking the training loop when a certain number +# episodes has been collected. +total_frames = 5000 # 500000 + +############################################################################### +# Random frames used to initialize the replay buffer. +init_random_frames = 100 # 1000 + +############################################################################### +# Frames in each batch collected. 
+frames_per_batch = 32 # 128 + +############################################################################### +# Frames sampled from the replay buffer at each optimization step +batch_size = 32 # 256 + +############################################################################### +# Size of the replay buffer in terms of frames +buffer_size = min(total_frames, 100000) + +############################################################################### +# Number of environments run in parallel in each data collector +num_workers = 2 # 8 +num_collectors = 2 # 4 + + +############################################################################### +# Environment and exploration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We set the initial and final value of the epsilon factor in Epsilon-greedy +# exploration. +# Since our policy is deterministic, exploration is crucial: without it, the +# only source of randomness would be the environment reset. + +eps_greedy_val = 0.1 +eps_greedy_val_env = 0.005 + +############################################################################### +# To speed up learning, we set the bias of the last layer of our value network +# to a predefined value (this is not mandatory) +init_bias = 2.0 + +############################################################################### +# .. note:: +# For fast rendering of the tutorial ``total_frames`` hyperparameter +# was set to a very low number. To get a reasonable performance, use a greater +# value e.g. 500000 +# + + +( + factor, + actor, + actor_explore, + params, + params_target, +) = make_model(test_env) + +############################################################################### +# We represent the parameters and targets as flat structures, but unflattening +# them is quite easy: + +params_flat = params.flatten_keys(".") + +############################################################################### +# We will be using the adam optimizer: + +optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) + +############################################################################### +# We create a test environment for evaluation of the policy: + +test_env = make_env( + parallel=False, observation_norm_state_dict=observation_norm_state_dict ) +# sanity check: +print(actor_explore(test_env.reset())) ############################################################################### # Training loop of a regular DQN From b330b166384f4e7405af150f13be1fc7e5e6da7f Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 21 Mar 2023 13:26:26 +0000 Subject: [PATCH 06/89] edit training dqn --- tutorials/sphinx-tutorials/coding_dqn.py | 1928 +++++++++++----------- 1 file changed, 961 insertions(+), 967 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 331128eee9a..cc6bc232513 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -76,982 +76,976 @@ # to provide a high-level illustration of TorchRL features in the context # of this algorithm. 
-# sphinx_gallery_start_ignore -import warnings -from collections import defaultdict - -warnings.filterwarnings("ignore") -# sphinx_gallery_end_ignore - -import torch -import tqdm -from functorch import vmap -from matplotlib import pyplot as plt -from tensordict import TensorDict -from tensordict.nn import get_functional -from torch import nn -from torchrl.collectors import MultiaSyncDataCollector -from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer -from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.transforms import ( - CatFrames, - CatTensors, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, -) -from torchrl.envs.utils import set_exploration_mode, step_mdp -from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor +if __name__ == "__main__": + # sphinx_gallery_start_ignore + import warnings + from collections import defaultdict + + from torchrl.objectives import DQNLoss + from torchrl.trainers import Trainer + + warnings.filterwarnings("ignore") + # sphinx_gallery_end_ignore + + import torch + import tqdm + from functorch import vmap + from matplotlib import pyplot as plt + from tensordict import TensorDict + from tensordict.nn import get_functional + from torch import nn + from torchrl.collectors import MultiaSyncDataCollector + from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer + from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter + from torchrl.envs.libs.gym import GymEnv + from torchrl.envs.transforms import ( + CatFrames, + CatTensors, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, + ) + from torchrl.envs.utils import set_exploration_mode, step_mdp + from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor -def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython + def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + + ############################################################################### + # Building the environment + # ------------------------ + # + # Our environment builder has two arguments: + # + # - ``parallel``: determines whether multiple environments have to be run in + # parallel. We stack the transforms after the + # :class:`torchrl.envs.ParallelEnv` to take advantage + # of vectorization of the operations on device, although this would + # technically work with every single environment attached to its own set of + # transforms. + # - ``observation_norm_state_dict`` will contain the normalizing constants for + # the :class:`torchrl.envs.ObservationNorm` tranform. 
+ # + # We will be using five transforms: + # + # - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 + # tensor in a floating point tensor in the ``[0, 1]`` space with shape + # ``[C, W, H]``; + # - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; + # - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; + # - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; + # - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of + # successive frames (``N=4``) in a single tensor along the channel dimension. + # This is useful as a single image does not carry information about the + # motion of the cartpole. Some memory about past observations and actions + # is needed, either via a recurrent neural network or using a stack of + # frames. + # - :class:`torchrl.envs.ObservationNorm` which will normalize our observations + # given some custom summary statistics. + # + + + def make_env(parallel=False, observation_norm_state_dict=None, frame_skip=1): + if observation_norm_state_dict is None: + observation_norm_state_dict = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, frame_skip=frame_skip + ) + ), + ) else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter - - -############################################################################### -# Building the environment -# ------------------------ -# -# Our environment builder has two arguments: -# -# - ``parallel``: determines whether multiple environments have to be run in -# parallel. We stack the transforms after the -# :class:`torchrl.envs.ParallelEnv` to take advantage -# of vectorization of the operations on device, although this would -# technically work with every single environment attached to its own set of -# transforms. -# - ``observation_norm_state_dict`` will contain the normalizing constants for -# the :class:`torchrl.envs.ObservationNorm` tranform. -# -# We will be using five transforms: -# -# - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 -# tensor in a floating point tensor in the ``[0, 1]`` space with shape -# ``[C, W, H]``; -# - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; -# - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; -# - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; -# - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of -# successive frames (``N=4``) in a single tensor along the channel dimension. -# This is useful as a single image does not carry information about the -# motion of the cartpole. Some memory about past observations and actions -# is needed, either via a recurrent neural network or using a stack of -# frames. -# - :class:`torchrl.envs.ObservationNorm` which will normalize our observations -# given some custom summary statistics. 
-# - + base_env = GymEnv( + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, frame_skip=frame_skip, + ) -def make_env(parallel=False, observation_norm_state_dict=None): - if observation_norm_state_dict is None: - observation_norm_state_dict = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device - ) + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **observation_norm_state_dict), ), ) - else: - base_env = GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device - ) - - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **observation_norm_state_dict), - ), - ) - return env - - -############################################################################### -# Compute normalizing constants -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# To normalize images, we don't want to normalize each pixel independently -# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` -# shaped loc and scale parameters. We will be using the ``reduce_dim`` argument -# of :func:`torchrl.envs.ObservationNorm.init_stats` to instruct which -# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that -# not all dimensions disappear in the process: - -def get_norm_const(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) -) - observation_norm_state_dict = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print(observation_norm_state_dict) - return observation_norm_state_dict - -############################################################################### -# Building the model (Deep Q-network) -# ----------------------------------- -# -# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` -# object which is a simple CNN followed by a two-layer MLP. The only trick used -# here is that the action values (i.e. left and right action value) are -# computed using -# -# .. math:: -# -# val = b(obs) + v(obs) - \mathbb{E}[v(obs)] -# -# where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a -# :math:`\# obs \rightarrow num_actions` function. -# -# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, which will read the state-action -# values, pick up the one with the maximum value and write all those results -# in the input :class:`tensordict.TensorDict`. -# -# Target parameters -# ~~~~~~~~~~~~~~~~~ -# -# Many off-policy RL algorithms use the concept of "target parameters" when it -# comes to estimate the value of the ``t+1`` state or state-action pair. -# The target parameters are lagged copies of the model parameters. Because -# their predictions mismatch those of the current model configuration, they -# help learning by putting a pessimistic bound on the value being estimated. 
-# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous -# in similar algorithms. -# -# Functionalizing modules -# ~~~~~~~~~~~~~~~~~~~~~~~ -# -# One of the features of torchrl is its usage of functional modules: as the -# same architecture is often used with multiple sets of parameters (e.g. -# trainable and target parameters), we functionalize the modules and isolate -# the various sets of parameters in separate tensordicts. -# -# To this aim, we use :func:`tensordict.nn.get_functional`, which augments -# our modules with some extra feature that make them compatible with parameters -# passed in the ``TensorDict`` format. - - -def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) - - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. - tensordict = dummy_env.fake_tensordict() - actor(tensordict) - - # Make functional: - # here's an explicit way of creating the parameters and buffer tensordict. - # Alternatively, we could have used `params = make_functional(actor)` from - # tensordict.nn - params = TensorDict({k: v for k, v in actor.named_parameters()}, []) - buffers = TensorDict({k: v for k, v in actor.named_buffers()}, []) - params = params.update(buffers) - params = params.unflatten_keys(".") # creates a nested TensorDict - factor = get_functional(actor) - - # creating the target parameters is fairly easy with tensordict: - params_target = params.clone().detach() - - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, - ) - - return factor, actor, actor_explore, params, params_target - - -############################################################################### -# Collecting and storing data -# --------------------------- -# -# Replay buffers -# ~~~~~~~~~~~~~~ -# -# Replay buffers play a central role in off-policy RL algorithms such as DQN. -# They constitute the dataset we will be sampling from during training. -# -# Here, we will use a regular sampling strategy, although a prioritized RB -# could improve the performance significantly. -# -# We place the storage on disk using -# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This -# storage is created in a lazy manner: it will only be instantiated once the -# first batch of data is passed to it. -# -# The only requirement of this storage is that the data passed to it at write -# time must always have the same shape. 
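# A minimal usage sketch of the storage behaviour described above (the toy
# shapes and capacity are illustrative assumptions, not values taken from this
# tutorial): the memory-mapped buffers are only allocated on the first write,
# and every ``extend`` call afterwards must pass data with the same structure
# and trailing shape.

import torch
from tensordict import TensorDict
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

rb = TensorDictReplayBuffer(storage=LazyMemmapStorage(1000))
fake_batch = TensorDict({"pixels": torch.zeros(32, 4, 64, 64)}, batch_size=[32])
rb.extend(fake_batch)  # the first write instantiates the storage on disk
sample = rb.sample(8)  # returns a TensorDict with batch_size [8]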
- -def get_replay_buffer(buffer_size, n_optim): - replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer - -############################################################################### -# Data collector -# ~~~~~~~~~~~~~~ -# -# As in `PPO ` and -# `DDPG `, we will be using -# a data collector as a dataloader in the outer loop. -# -# We choose the following configuration: we will be running a series of -# parallel environments synchronously in parallel in different collectors, -# themselves running in parallel but asynchronously. -# The advantage of this configuration is that we can balance the amount of -# compute that is executed in batch with what we want to be executed -# asynchronously. We encourage the reader to experiment how the collection -# speed is impacted by modifying the number of collectors (ie the number of -# environment constructors passed to the collector) and the number of -# environment executed in parallel in each collector (controlled by the -# ``num_workers`` hyperparameter). -# -# When building the collector, we can choose on which device we want the -# environment and policy to execute the operations through the ``device`` -# keyword argument. The ``storing_devices`` argument will modify the -# location of the data being collected: if the batches that we are gathering -# have a considerable size, we may want to store them on a different location -# than the device where the computation is happening. For asynchronous data -# collectors such as ours, different storing devices mean that the data that -# we collect won't sit on the same device each time, which is something that -# out training loop must account for. For simplicity, we set the devices to -# the same value for all sub-collectors. - -def get_collector(observation_norm_state_dict, num_collectors, actor_explore, frames_per_batch, total_frames, device): - data_collector = MultiaSyncDataCollector( - [ - make_env( - parallel=True, observation_norm_state_dict=observation_norm_state_dict - ), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices - device=device, - storing_device=device, - split_trajs=False, + return env + + + ############################################################################### + # Compute normalizing constants + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # To normalize images, we don't want to normalize each pixel independently + # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` + # shaped loc and scale parameters. We will be using the ``reduce_dim`` argument + # of :func:`torchrl.envs.ObservationNorm.init_stats` to instruct which + # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that + # not all dimensions disappear in the process: + + def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) ) - return data_collector - - - - -############################################################################### -# Hyperparameters -# --------------- -# -# Let's start with our hyperparameters. 
The following setting should work well -# in practice, and the performance of the algorithm should hopefully not be -# too sensitive to slight variations of these. - -device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - -############################################################################### -# Optimizer -# ~~~~~~~~~ - -# the learning rate of the optimizer -lr = 2e-3 -# the beta parameters of Adam -betas = (0.9, 0.999) -# Optimization steps per batch collected (aka UPD or updates per data) -n_optim = 8 - -############################################################################### -# DQN parameters -# ~~~~~~~~~~~~~~ - -############################################################################### -# gamma decay factor -gamma = 0.99 - -############################################################################### -# lambda decay factor (see second the part with TD(:math:`\lambda`) -lmbda = 0.95 - -############################################################################### -# Smooth target network update decay parameter. -# This loosely corresponds to a 1/(1-tau) interval with hard target network -# update -tau = 0.005 - -############################################################################### -# Data collection and replay buffer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Values to be used for proper training have been commented. -# -# Total frames collected in the environment. In other implementations, the -# user defines a maximum number of episodes. -# This is harder to do with our data collectors since they return batches -# of N collected frames, where N is a constant. -# However, one can easily get the same restriction on number of episodes by -# breaking the training loop when a certain number -# episodes has been collected. -total_frames = 5000 # 500000 - -############################################################################### -# Random frames used to initialize the replay buffer. -init_random_frames = 100 # 1000 - -############################################################################### -# Frames in each batch collected. -frames_per_batch = 32 # 128 - -############################################################################### -# Frames sampled from the replay buffer at each optimization step -batch_size = 32 # 256 - -############################################################################### -# Size of the replay buffer in terms of frames -buffer_size = min(total_frames, 100000) - -############################################################################### -# Number of environments run in parallel in each data collector -num_workers = 2 # 8 -num_collectors = 2 # 4 - - -############################################################################### -# Environment and exploration -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We set the initial and final value of the epsilon factor in Epsilon-greedy -# exploration. -# Since our policy is deterministic, exploration is crucial: without it, the -# only source of randomness would be the environment reset. - -eps_greedy_val = 0.1 -eps_greedy_val_env = 0.005 - -############################################################################### -# To speed up learning, we set the bias of the last layer of our value network -# to a predefined value (this is not mandatory) -init_bias = 2.0 - -############################################################################### -# .. note:: -# For fast rendering of the tutorial ``total_frames`` hyperparameter -# was set to a very low number. 
To get a reasonable performance, use a greater -# value e.g. 500000 -# - - -( - factor, - actor, - actor_explore, - params, - params_target, -) = make_model(test_env) - -############################################################################### -# We represent the parameters and targets as flat structures, but unflattening -# them is quite easy: - -params_flat = params.flatten_keys(".") - -############################################################################### -# We will be using the adam optimizer: - -optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) - -############################################################################### -# We create a test environment for evaluation of the policy: - -test_env = make_env( - parallel=False, observation_norm_state_dict=observation_norm_state_dict -) -# sanity check: -print(actor_explore(test_env.reset())) - -############################################################################### -# Training loop of a regular DQN -# ------------------------------ -# -# We'll start with a simple implementation of DQN where the returns are -# computed without bootstrapping, i.e. -# -# .. math:: -# -# Q_{t}(s, a) = R(s, a) + \gamma * V_{t+1}(s) -# -# where :math:`Q(s, a)` is the Q-value of the current state-action pair, -# :math:`R(s, a)` is the result of the reward function, and :math:`V(s)` is a -# value function that returns 0 for terminating states. -# -# We store the logs in a defaultdict: - -logs_exp1 = defaultdict(list) -prev_traj_count = 0 - -pbar = tqdm.tqdm(total=total_frames) -for j, data in enumerate(data_collector): - current_frames = data.numel() - pbar.update(current_frames) - data = data.view(-1) - - # We store the values on the replay buffer, after placing them on CPU. - # When called for the first time, this will instantiate our storage - # object which will print its content. - replay_buffer.extend(data.cpu()) - - # some logging - if len(logs_exp1["frames"]): - logs_exp1["frames"].append(current_frames + logs_exp1["frames"][-1]) - else: - logs_exp1["frames"].append(current_frames) - - if data["next", "done"].any(): - done = data["next", "done"].squeeze(-1) - logs_exp1["traj_lengths"].append( - data["next", "step_count"][done].float().mean().item() + observation_norm_state_dict = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). + print(observation_norm_state_dict) + return observation_norm_state_dict + + ############################################################################### + # Building the model (Deep Q-network) + # ----------------------------------- + # + # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` + # object which is a simple CNN followed by a two-layer MLP. The only trick used + # here is that the action values (i.e. left and right action value) are + # computed using + # + # .. math:: + # + # val = b(obs) + v(obs) - \mathbb{E}[v(obs)] + # + # where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a + # :math:`\# obs \rightarrow num_actions` function. + # + # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, which will read the state-action + # values, pick up the one with the maximum value and write all those results + # in the input :class:`tensordict.TensorDict`. 
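# A minimal sketch of the aggregation above in plain PyTorch. ``TinyDuelingHead``
# and the toy feature/action sizes are illustrative assumptions, not the actual
# :class:`torchrl.modules.DuelingCnnDQNet` implementation.

import torch
from torch import nn


class TinyDuelingHead(nn.Module):
    def __init__(self, in_features, num_actions):
        super().__init__()
        self.baseline = nn.Linear(in_features, 1)  # b(obs): state value
        self.advantage = nn.Linear(in_features, num_actions)  # v(obs): per-action term

    def forward(self, obs):
        adv = self.advantage(obs)
        # subtracting the mean advantage makes the decomposition identifiable
        return self.baseline(obs) + adv - adv.mean(dim=-1, keepdim=True)


q_values = TinyDuelingHead(in_features=8, num_actions=2)(torch.randn(5, 8))
print(q_values.shape)  # torch.Size([5, 2])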
+ # + # Target parameters + # ~~~~~~~~~~~~~~~~~ + # + # Many off-policy RL algorithms use the concept of "target parameters" when it + # comes to estimate the value of the ``t+1`` state or state-action pair. + # The target parameters are lagged copies of the model parameters. Because + # their predictions mismatch those of the current model configuration, they + # help learning by putting a pessimistic bound on the value being estimated. + # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous + # in similar algorithms. + # + + def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) + + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) + + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, ) - # check that we have enough data to start training - if sum(logs_exp1["frames"]) > init_random_frames: - for _ in range(n_optim): - # sample from the RB and send to device - sampled_data = replay_buffer.sample(batch_size) - sampled_data = sampled_data.to(device, non_blocking=True) - - # collect data from RB - reward = sampled_data["next", "reward"].squeeze(-1) - done = sampled_data["next", "done"].squeeze(-1).to(reward.dtype) - action = sampled_data["action"].clone() - - # Compute action value (of the action actually taken) at time t - # By default, TorchRL uses one-hot encodings for discrete actions - sampled_data_out = sampled_data.select(*actor.in_keys) - sampled_data_out = factor(sampled_data_out, params=params) - action_value = sampled_data_out["action_value"] - action_value = (action_value * action.to(action_value.dtype)).sum(-1) - with torch.no_grad(): - # compute best action value for the next step, using target parameters - tdstep = step_mdp(sampled_data) - next_value = factor( - tdstep.select(*actor.in_keys), - params=params_target, - )["chosen_action_value"].squeeze(-1) - exp_value = reward + gamma * next_value * (1 - done) - assert exp_value.shape == action_value.shape - # we use MSE loss but L1 or smooth L1 should also work - error = nn.functional.mse_loss(exp_value, action_value).mean() - error.backward() - - gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) - - optim.step() - optim.zero_grad() - - # update of the target parameters - params_target.apply( - lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), - params.detach(), - inplace=True, - ) - - actor_explore.step(current_frames) - - # Logging - logs_exp1["grad_vals"].append(float(gv)) - logs_exp1["losses"].append(error.item()) - logs_exp1["values"].append(action_value.mean().item()) - logs_exp1["traj_count"].append( - prev_traj_count + data["next", "done"].sum().item() + return 
actor, actor_explore + + + ############################################################################### + # Collecting and storing data + # --------------------------- + # + # Replay buffers + # ~~~~~~~~~~~~~~ + # + # Replay buffers play a central role in off-policy RL algorithms such as DQN. + # They constitute the dataset we will be sampling from during training. + # + # Here, we will use a regular sampling strategy, although a prioritized RB + # could improve the performance significantly. + # + # We place the storage on disk using + # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This + # storage is created in a lazy manner: it will only be instantiated once the + # first batch of data is passed to it. + # + # The only requirement of this storage is that the data passed to it at write + # time must always have the same shape. + + def get_replay_buffer(buffer_size, n_optim): + replay_buffer = TensorDictReplayBuffer( + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, ) - prev_traj_count = logs_exp1["traj_count"][-1] - - if j % 10 == 0: - with set_exploration_mode("mode"), torch.no_grad(): - # execute a rollout. The `set_exploration_mode("mode")` has no effect here since the policy is deterministic, but we add it for completeness - eval_rollout = test_env.rollout( - max_steps=10000, - policy=actor, - ).cpu() - logs_exp1["traj_lengths_eval"].append(eval_rollout.shape[-1]) - logs_exp1["evals"].append(eval_rollout["next", "reward"].sum().item()) - if len(logs_exp1["mavgs"]): - logs_exp1["mavgs"].append( - logs_exp1["evals"][-1] * 0.05 + logs_exp1["mavgs"][-1] * 0.95 - ) - else: - logs_exp1["mavgs"].append(logs_exp1["evals"][-1]) - logs_exp1["traj_count_eval"].append(logs_exp1["traj_count"][-1]) - pbar.set_description( - f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp1['evals'][-1]: 4.4f}" - ) - - # update policy weights - data_collector.update_policy_weights_() - -############################################################################### -# We write a custom plot function to display the performance of our algorithm -# - - -def plot(logs, name): - plt.figure(figsize=(15, 10)) - plt.subplot(2, 3, 1) - plt.plot( - logs["frames"][-len(logs["evals"]) :], - logs["evals"], - label="return (eval)", - ) - plt.plot( - logs["frames"][-len(logs["mavgs"]) :], - logs["mavgs"], - label="mavg of returns (eval)", - ) - plt.xlabel("frames collected") - plt.ylabel("trajectory length (= return)") - plt.subplot(2, 3, 2) - plt.plot( - logs["traj_count"][-len(logs["evals"]) :], - logs["evals"], - label="return", - ) - plt.plot( - logs["traj_count"][-len(logs["mavgs"]) :], - logs["mavgs"], - label="mavg", - ) - plt.xlabel("trajectories collected") - plt.legend() - plt.subplot(2, 3, 3) - plt.plot(logs["frames"][-len(logs["losses"]) :], logs["losses"]) - plt.xlabel("frames collected") - plt.title("loss") - plt.subplot(2, 3, 4) - plt.plot(logs["frames"][-len(logs["values"]) :], logs["values"]) - plt.xlabel("frames collected") - plt.title("value") - plt.subplot(2, 3, 5) - plt.plot( - logs["frames"][-len(logs["grad_vals"]) :], - logs["grad_vals"], - ) - plt.xlabel("frames collected") - plt.title("grad norm") - if len(logs["traj_lengths"]): - plt.subplot(2, 3, 6) - plt.plot(logs["traj_lengths"]) - plt.xlabel("batches") - plt.title("traj length (training)") - plt.savefig(name) - if is_notebook(): - plt.show() - - -############################################################################### -# The performance of the policy can be measured as 
the length of trajectories. -# As we can see on the results of the :func:`plot` function, the performance -# of the policy increases, albeit slowly. -# -# .. code-block:: python -# -# plot(logs_exp1, "dqn_td0.png") -# -# .. figure:: /_static/img/dqn_td0.png -# :alt: Cart Pole results with TD(0) -# - -print("shutting down") -data_collector.shutdown() -del data_collector - -############################################################################### -# DQN with TD(:math:`\lambda`) -# ---------------------------- -# -# We can improve the above algorithm by getting a better estimate of the -# return, using not only the next state value but the whole sequence of rewards -# and values that follow a particular step. -# -# TorchRL provides a vectorized version of TD(lambda) named -# :func:`torchrl.objectives.value.functional.vec_td_lambda_advantage_estimate`. -# We'll use this to obtain a target value that the value network will be -# trained to match. -# -# The big difference in this implementation is that we'll store entire -# trajectories and not single steps in the replay buffer. This will be done -# automatically as long as we're not "flattening" the tensordict collected: -# by keeping a shape ``[Batch x timesteps]`` and giving this -# to the RB, we'll be creating a replay buffer of size -# ``[Capacity x timesteps]``. - - -from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate - -############################################################################### -# We reset the actor parameters: -# - -( - factor, - actor, - actor_explore, - params, - params_target, -) = make_model(test_env) -params_flat = params.flatten_keys(".") - -optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) -test_env = make_env( - parallel=False, observation_norm_state_dict=observation_norm_state_dict -) -print(actor_explore(test_env.reset())) - -############################################################################### -# Data: Replay buffer and collector -# --------------------------------- -# -# We need to build a new replay buffer of the appropriate size: -# - -max_size = frames_per_batch // num_workers - -replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage(-(-buffer_size // max_size)), - prefetch=n_optim, -) - -data_collector = MultiaSyncDataCollector( - [ - make_env( - parallel=True, observation_norm_state_dict=observation_norm_state_dict - ), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - exploration_mode="random", - devices=[device] * num_collectors, - storing_devices=[device] * num_collectors, - # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - split_trajs=False, -) - - -logs_exp2 = defaultdict(list) -prev_traj_count = 0 - -############################################################################### -# Training loop -# ------------- -# -# There are very few differences with the training loop above: -# -# - The tensordict received by the collector is used as-is, without being -# flattened (recall the ``data.view(-1)`` above), to keep the temporal -# relation between consecutive steps. -# - We use :func:`vec_td_lambda_advantage_estimate` to compute the target -# value. 
- -pbar = tqdm.tqdm(total=total_frames) -for j, data in enumerate(data_collector): - current_frames = data.numel() - pbar.update(current_frames) - - replay_buffer.extend(data.cpu()) - if len(logs_exp2["frames"]): - logs_exp2["frames"].append(current_frames + logs_exp2["frames"][-1]) - else: - logs_exp2["frames"].append(current_frames) - - if data["next", "done"].any(): - done = data["next", "done"].squeeze(-1) - logs_exp2["traj_lengths"].append( - data["next", "step_count"][done].float().mean().item() + return replay_buffer + + ############################################################################### + # Data collector + # ~~~~~~~~~~~~~~ + # + # As in `PPO ` and + # `DDPG `, we will be using + # a data collector as a dataloader in the outer loop. + # + # We choose the following configuration: we will be running a series of + # parallel environments synchronously in parallel in different collectors, + # themselves running in parallel but asynchronously. + # The advantage of this configuration is that we can balance the amount of + # compute that is executed in batch with what we want to be executed + # asynchronously. We encourage the reader to experiment how the collection + # speed is impacted by modifying the number of collectors (ie the number of + # environment constructors passed to the collector) and the number of + # environment executed in parallel in each collector (controlled by the + # ``num_workers`` hyperparameter). + # + # When building the collector, we can choose on which device we want the + # environment and policy to execute the operations through the ``device`` + # keyword argument. The ``storing_devices`` argument will modify the + # location of the data being collected: if the batches that we are gathering + # have a considerable size, we may want to store them on a different location + # than the device where the computation is happening. For asynchronous data + # collectors such as ours, different storing devices mean that the data that + # we collect won't sit on the same device each time, which is something that + # out training loop must account for. For simplicity, we set the devices to + # the same value for all sub-collectors. + + def get_collector(observation_norm_state_dict, num_collectors, actor_explore, frames_per_batch, total_frames, device): + data_collector = MultiaSyncDataCollector( + [ + make_env( + parallel=True, observation_norm_state_dict=observation_norm_state_dict + ), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. 
Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, ) - - if sum(logs_exp2["frames"]) > init_random_frames: - for _ in range(n_optim): - sampled_data = replay_buffer.sample(batch_size // max_size) - sampled_data = sampled_data.clone().to(device, non_blocking=True) - - reward = sampled_data["next", "reward"] - done = sampled_data["next", "done"].to(reward.dtype) - action = sampled_data["action"].clone() - - sampled_data_out = sampled_data.select(*actor.in_keys) - sampled_data_out = vmap(factor, (0, None))(sampled_data_out, params) - action_value = sampled_data_out["action_value"] - action_value = (action_value * action.to(action_value.dtype)).sum(-1, True) - with torch.no_grad(): - tdstep = step_mdp(sampled_data) - next_value = vmap(factor, (0, None))( - tdstep.select(*actor.in_keys), params - ) - next_value = next_value["chosen_action_value"] - error = vec_td_lambda_advantage_estimate( - gamma, - lmbda, - action_value, - next_value, - reward, - done, - ).pow(2) - error = error.mean() - error.backward() - - gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) - - optim.step() - optim.zero_grad() - - # update of the target parameters - params_target.apply( - lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), - params.detach(), - inplace=True, - ) - - actor_explore.step(current_frames) - - # Logging - logs_exp2["grad_vals"].append(float(gv)) - - logs_exp2["losses"].append(error.item()) - logs_exp2["values"].append(action_value.mean().item()) - logs_exp2["traj_count"].append( - prev_traj_count + data["next", "done"].sum().item() + return data_collector + + + + + ############################################################################### + # Hyperparameters + # --------------- + # + # Let's start with our hyperparameters. The following setting should work well + # in practice, and the performance of the algorithm should hopefully not be + # too sensitive to slight variations of these. + + device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" + + ############################################################################### + # Optimizer + # ~~~~~~~~~ + + # the learning rate of the optimizer + lr = 2e-3 + # weight decay + wd = 1e-5 + # the beta parameters of Adam + betas = (0.9, 0.999) + # Optimization steps per batch collected (aka UPD or updates per data) + n_optim = 8 + + ############################################################################### + # DQN parameters + # ~~~~~~~~~~~~~~ + + ############################################################################### + # gamma decay factor + gamma = 0.99 + + ############################################################################### + # lambda decay factor (see second the part with TD(:math:`\lambda`) + lmbda = 0.95 + + ############################################################################### + # Smooth target network update decay parameter. + # This loosely corresponds to a 1/(1-tau) interval with hard target network + # update + tau = 0.005 + + ############################################################################### + # Data collection and replay buffer + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # Values to be used for proper training have been commented. + # + # Total frames collected in the environment. In other implementations, the + # user defines a maximum number of episodes. + # This is harder to do with our data collectors since they return batches + # of N collected frames, where N is a constant. 
+ # However, one can easily get the same restriction on number of episodes by + # breaking the training loop when a certain number + # episodes has been collected. + total_frames = 4096 # 500000 + + ############################################################################### + # Random frames used to initialize the replay buffer. + init_random_frames = 100 # 1000 + + ############################################################################### + # Frames in each batch collected. + frames_per_batch = 32 # 128 + + ############################################################################### + # Frames sampled from the replay buffer at each optimization step + batch_size = 32 # 256 + + ############################################################################### + # Size of the replay buffer in terms of frames + buffer_size = min(total_frames, 100000) + + ############################################################################### + # Number of environments run in parallel in each data collector + num_workers = 2 # 8 + num_collectors = 2 # 4 + + + ############################################################################### + # Environment and exploration + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # We set the initial and final value of the epsilon factor in Epsilon-greedy + # exploration. + # Since our policy is deterministic, exploration is crucial: without it, the + # only source of randomness would be the environment reset. + + eps_greedy_val = 0.1 + eps_greedy_val_env = 0.005 + + ############################################################################### + # To speed up learning, we set the bias of the last layer of our value network + # to a predefined value (this is not mandatory) + init_bias = 2.0 + + ############################################################################### + # .. note:: + # For fast rendering of the tutorial ``total_frames`` hyperparameter + # was set to a very low number. To get a reasonable performance, use a greater + # value e.g. 500000 + # + + def get_trainer(): + stats = get_norm_stats() + test_env = make_env(parallel=False, observation_norm_state_dict=stats) + # Get model + actor, actor_explore = make_model(test_env) + loss_module = DQNLoss(actor, gamma=0.99) + collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) + optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) + trainer = Trainer( + collector=collector, + total_frames=total_frames, + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=None, + optim_steps_per_batch = n_optim, ) - prev_traj_count = logs_exp2["traj_count"][-1] - if j % 10 == 0: - with set_exploration_mode("mode"), torch.no_grad(): - # execute a rollout. 
The `set_exploration_mode("mode")` has - # no effect here since the policy is deterministic, but we add - # it for completeness - eval_rollout = test_env.rollout( - max_steps=10000, - policy=actor, - ).cpu() - logs_exp2["traj_lengths_eval"].append(eval_rollout.shape[-1]) - logs_exp2["evals"].append(eval_rollout["next", "reward"].sum().item()) - if len(logs_exp2["mavgs"]): - logs_exp2["mavgs"].append( - logs_exp2["evals"][-1] * 0.05 + logs_exp2["mavgs"][-1] * 0.95 - ) - else: - logs_exp2["mavgs"].append(logs_exp2["evals"][-1]) - logs_exp2["traj_count_eval"].append(logs_exp2["traj_count"][-1]) - pbar.set_description( - f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp2['evals'][-1]: 4.4f}" - ) - - # update policy weights - data_collector.update_policy_weights_() - - -############################################################################### -# TD(:math:`\lambda`) performs significantly better than TD(0) because it -# retrieves a much less biased estimate of the state-action value. -# -# .. code-block:: python -# -# plot(logs_exp2, "dqn_tdlambda.png") -# -# .. figure:: /_static/img/dqn_tdlambda.png -# :alt: Cart Pole results with TD(lambda) -# - - -print("shutting down") -data_collector.shutdown() -del data_collector - -############################################################################### -# Let's compare the results on a single plot. Because the TD(lambda) version -# works better, we'll have fewer episodes collected for a given number of -# frames (as there are more frames per episode). -# -# **Note**: As already mentioned above, to get a more reasonable performance, -# use a greater value for ``total_frames`` e.g. 500000. - - -def plot_both(): - frames_td0 = logs_exp1["frames"] - frames_tdlambda = logs_exp2["frames"] - evals_td0 = logs_exp1["evals"] - evals_tdlambda = logs_exp2["evals"] - mavgs_td0 = logs_exp1["mavgs"] - mavgs_tdlambda = logs_exp2["mavgs"] - traj_count_td0 = logs_exp1["traj_count_eval"] - traj_count_tdlambda = logs_exp2["traj_count_eval"] - - plt.figure(figsize=(15, 10)) - plt.subplot(1, 2, 1) - plt.plot(frames_td0[-len(evals_td0) :], evals_td0, label="return (td0)", alpha=0.5) - plt.plot( - frames_tdlambda[-len(evals_tdlambda) :], - evals_tdlambda, - label="return (td(lambda))", - alpha=0.5, - ) - plt.plot(frames_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") - plt.plot( - frames_tdlambda[-len(mavgs_tdlambda) :], - mavgs_tdlambda, - label="mavg (td(lambda))", - ) - plt.xlabel("frames collected") - plt.ylabel("trajectory length (= return)") - - plt.subplot(1, 2, 2) - plt.plot( - traj_count_td0[-len(evals_td0) :], - evals_td0, - label="return (td0)", - alpha=0.5, - ) - plt.plot( - traj_count_tdlambda[-len(evals_tdlambda) :], - evals_tdlambda, - label="return (td(lambda))", - alpha=0.5, - ) - plt.plot(traj_count_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") - plt.plot( - traj_count_tdlambda[-len(mavgs_tdlambda) :], - mavgs_tdlambda, - label="mavg (td(lambda))", - ) - plt.xlabel("trajectories collected") - plt.legend() - - plt.savefig("dqn.png") - - -############################################################################### -# .. code-block:: python -# -# plot_both() -# -# .. figure:: /_static/img/dqn.png -# :alt: Cart Pole results from the TD(:math:`lambda`) trained policy. -# -# Finally, we generate a new video to check what the algorithm has learnt. -# If all goes well, the duration should be significantly longer than with a -# random rollout. 
-# -# To get the raw pixels of the rollout, we insert a -# :class:`torchrl.envs.CatTensors` transform that precedes all others and copies -# the ``"pixels"`` key onto a ``"pixels_save"`` key. This is necessary because -# the other transforms that modify this key will update its value in-place in -# the output tensordict. -# - -test_env.transform.insert(0, CatTensors(["pixels"], "pixels_save", del_keys=False)) -eval_rollout = test_env.rollout(max_steps=10000, policy=actor, auto_reset=True).cpu() - -# sphinx_gallery_start_ignore -import imageio - -imageio.mimwrite("cartpole.gif", eval_rollout["pixels_save"].numpy(), fps=30) -# sphinx_gallery_end_ignore - -del test_env - -############################################################################### -# The video of the rollout can be saved using the imageio package: -# -# .. code-block:: -# -# import imageio -# imageio.mimwrite('cartpole.mp4', eval_rollout["pixels_save"].numpy(), fps=30); -# -# .. figure:: /_static/img/cartpole.gif -# :alt: Cart Pole results from the TD(:math:`\lambda`) trained policy. - -############################################################################### -# Conclusion and possible improvements -# ------------------------------------ -# -# In this tutorial we have learnt: -# -# - How to train a policy that read pixel-based states, what transforms to -# include and how to normalize the data; -# - How to create a policy that picks up the action with the highest value -# with :class:`torchrl.modules.QValueNetwork`; -# - How to build a multiprocessed data collector; -# - How to train a DQN with TD(:math:`\lambda`) returns. -# -# We have seen that using TD(:math:`\lambda`) greatly improved the performance -# of DQN. Other possible improvements could include: -# -# - Using the Multi-Step post-processing. Multi-step will project an action -# to the nth following step, and create a discounted sum of the rewards in -# between. This trick can make the algorithm noticebly less myopic. To use -# this, simply create the collector with -# -# from torchrl.data.postprocs.postprocs import MultiStep -# collector = CollectorClass(..., postproc=MultiStep(gamma, n)) -# -# where ``n`` is the number of looking-forward steps. Pay attention to the -# fact that the ``gamma`` factor has to be corrected by the number of -# steps till the next observation when being passed to -# ``vec_td_lambda_advantage_estimate``: -# -# gamma = gamma ** tensordict["steps_to_next_obs"] -# - A prioritized replay buffer could also be used. This will give a -# higher priority to samples that have the worst value accuracy. -# - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` -# for more information). -# - More fancy exploration techniques, such as NoisyLinear layers and such -# (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the -# ``MLP`` class used in our Dueling DQN). 
+ return trainer + + trainer = get_trainer() + trainer.train() + + # ############################################################################### + # # We represent the parameters and targets as flat structures, but unflattening + # # them is quite easy: + # + # params_flat = params.flatten_keys(".") + # + # ############################################################################### + # # We will be using the adam optimizer: + # + # optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) + # + # ############################################################################### + # # We create a test environment for evaluation of the policy: + # + # test_env = make_env( + # parallel=False, observation_norm_state_dict=observation_norm_state_dict + # ) + # # sanity check: + # print(actor_explore(test_env.reset())) + # + # ############################################################################### + # # Training loop of a regular DQN + # # ------------------------------ + # # + # # We'll start with a simple implementation of DQN where the returns are + # # computed without bootstrapping, i.e. + # # + # # .. math:: + # # + # # Q_{t}(s, a) = R(s, a) + \gamma * V_{t+1}(s) + # # + # # where :math:`Q(s, a)` is the Q-value of the current state-action pair, + # # :math:`R(s, a)` is the result of the reward function, and :math:`V(s)` is a + # # value function that returns 0 for terminating states. + # # + # # We store the logs in a defaultdict: + # + # logs_exp1 = defaultdict(list) + # prev_traj_count = 0 + # + # pbar = tqdm.tqdm(total=total_frames) + # for j, data in enumerate(data_collector): + # current_frames = data.numel() + # pbar.update(current_frames) + # data = data.view(-1) + # + # # We store the values on the replay buffer, after placing them on CPU. + # # When called for the first time, this will instantiate our storage + # # object which will print its content. 
+ # replay_buffer.extend(data.cpu()) + # + # # some logging + # if len(logs_exp1["frames"]): + # logs_exp1["frames"].append(current_frames + logs_exp1["frames"][-1]) + # else: + # logs_exp1["frames"].append(current_frames) + # + # if data["next", "done"].any(): + # done = data["next", "done"].squeeze(-1) + # logs_exp1["traj_lengths"].append( + # data["next", "step_count"][done].float().mean().item() + # ) + # + # # check that we have enough data to start training + # if sum(logs_exp1["frames"]) > init_random_frames: + # for _ in range(n_optim): + # # sample from the RB and send to device + # sampled_data = replay_buffer.sample(batch_size) + # sampled_data = sampled_data.to(device, non_blocking=True) + # + # # collect data from RB + # reward = sampled_data["next", "reward"].squeeze(-1) + # done = sampled_data["next", "done"].squeeze(-1).to(reward.dtype) + # action = sampled_data["action"].clone() + # + # # Compute action value (of the action actually taken) at time t + # # By default, TorchRL uses one-hot encodings for discrete actions + # sampled_data_out = sampled_data.select(*actor.in_keys) + # sampled_data_out = factor(sampled_data_out, params=params) + # action_value = sampled_data_out["action_value"] + # action_value = (action_value * action.to(action_value.dtype)).sum(-1) + # with torch.no_grad(): + # # compute best action value for the next step, using target parameters + # tdstep = step_mdp(sampled_data) + # next_value = factor( + # tdstep.select(*actor.in_keys), + # params=params_target, + # )["chosen_action_value"].squeeze(-1) + # exp_value = reward + gamma * next_value * (1 - done) + # assert exp_value.shape == action_value.shape + # # we use MSE loss but L1 or smooth L1 should also work + # error = nn.functional.mse_loss(exp_value, action_value).mean() + # error.backward() + # + # gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) + # + # optim.step() + # optim.zero_grad() + # + # # update of the target parameters + # params_target.apply( + # lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), + # params.detach(), + # inplace=True, + # ) + # + # actor_explore.step(current_frames) + # + # # Logging + # logs_exp1["grad_vals"].append(float(gv)) + # logs_exp1["losses"].append(error.item()) + # logs_exp1["values"].append(action_value.mean().item()) + # logs_exp1["traj_count"].append( + # prev_traj_count + data["next", "done"].sum().item() + # ) + # prev_traj_count = logs_exp1["traj_count"][-1] + # + # if j % 10 == 0: + # with set_exploration_mode("mode"), torch.no_grad(): + # # execute a rollout. 
The `set_exploration_mode("mode")` has no effect here since the policy is deterministic, but we add it for completeness + # eval_rollout = test_env.rollout( + # max_steps=10000, + # policy=actor, + # ).cpu() + # logs_exp1["traj_lengths_eval"].append(eval_rollout.shape[-1]) + # logs_exp1["evals"].append(eval_rollout["next", "reward"].sum().item()) + # if len(logs_exp1["mavgs"]): + # logs_exp1["mavgs"].append( + # logs_exp1["evals"][-1] * 0.05 + logs_exp1["mavgs"][-1] * 0.95 + # ) + # else: + # logs_exp1["mavgs"].append(logs_exp1["evals"][-1]) + # logs_exp1["traj_count_eval"].append(logs_exp1["traj_count"][-1]) + # pbar.set_description( + # f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp1['evals'][-1]: 4.4f}" + # ) + # + # # update policy weights + # data_collector.update_policy_weights_() + # + # ############################################################################### + # # We write a custom plot function to display the performance of our algorithm + # # + # + # + # def plot(logs, name): + # plt.figure(figsize=(15, 10)) + # plt.subplot(2, 3, 1) + # plt.plot( + # logs["frames"][-len(logs["evals"]) :], + # logs["evals"], + # label="return (eval)", + # ) + # plt.plot( + # logs["frames"][-len(logs["mavgs"]) :], + # logs["mavgs"], + # label="mavg of returns (eval)", + # ) + # plt.xlabel("frames collected") + # plt.ylabel("trajectory length (= return)") + # plt.subplot(2, 3, 2) + # plt.plot( + # logs["traj_count"][-len(logs["evals"]) :], + # logs["evals"], + # label="return", + # ) + # plt.plot( + # logs["traj_count"][-len(logs["mavgs"]) :], + # logs["mavgs"], + # label="mavg", + # ) + # plt.xlabel("trajectories collected") + # plt.legend() + # plt.subplot(2, 3, 3) + # plt.plot(logs["frames"][-len(logs["losses"]) :], logs["losses"]) + # plt.xlabel("frames collected") + # plt.title("loss") + # plt.subplot(2, 3, 4) + # plt.plot(logs["frames"][-len(logs["values"]) :], logs["values"]) + # plt.xlabel("frames collected") + # plt.title("value") + # plt.subplot(2, 3, 5) + # plt.plot( + # logs["frames"][-len(logs["grad_vals"]) :], + # logs["grad_vals"], + # ) + # plt.xlabel("frames collected") + # plt.title("grad norm") + # if len(logs["traj_lengths"]): + # plt.subplot(2, 3, 6) + # plt.plot(logs["traj_lengths"]) + # plt.xlabel("batches") + # plt.title("traj length (training)") + # plt.savefig(name) + # if is_notebook(): + # plt.show() + # + # + # ############################################################################### + # # The performance of the policy can be measured as the length of trajectories. + # # As we can see on the results of the :func:`plot` function, the performance + # # of the policy increases, albeit slowly. + # # + # # .. code-block:: python + # # + # # plot(logs_exp1, "dqn_td0.png") + # # + # # .. figure:: /_static/img/dqn_td0.png + # # :alt: Cart Pole results with TD(0) + # # + # + # print("shutting down") + # data_collector.shutdown() + # del data_collector + # + # ############################################################################### + # # DQN with TD(:math:`\lambda`) + # # ---------------------------- + # # + # # We can improve the above algorithm by getting a better estimate of the + # # return, using not only the next state value but the whole sequence of rewards + # # and values that follow a particular step. + # # + # # TorchRL provides a vectorized version of TD(lambda) named + # # :func:`torchrl.objectives.value.functional.vec_td_lambda_advantage_estimate`. 
+ # # We'll use this to obtain a target value that the value network will be + # # trained to match. + # # + # # The big difference in this implementation is that we'll store entire + # # trajectories and not single steps in the replay buffer. This will be done + # # automatically as long as we're not "flattening" the tensordict collected: + # # by keeping a shape ``[Batch x timesteps]`` and giving this + # # to the RB, we'll be creating a replay buffer of size + # # ``[Capacity x timesteps]``. + # + # + # from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate + # + # ############################################################################### + # # We reset the actor parameters: + # # + # + # ( + # factor, + # actor, + # actor_explore, + # params, + # params_target, + # ) = make_model(test_env) + # params_flat = params.flatten_keys(".") + # + # optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) + # test_env = make_env( + # parallel=False, observation_norm_state_dict=observation_norm_state_dict + # ) + # print(actor_explore(test_env.reset())) + # + # ############################################################################### + # # Data: Replay buffer and collector + # # --------------------------------- + # # + # # We need to build a new replay buffer of the appropriate size: + # # + # + # max_size = frames_per_batch // num_workers + # + # replay_buffer = TensorDictReplayBuffer( + # storage=LazyMemmapStorage(-(-buffer_size // max_size)), + # prefetch=n_optim, + # ) + # + # data_collector = MultiaSyncDataCollector( + # [ + # make_env( + # parallel=True, observation_norm_state_dict=observation_norm_state_dict + # ), + # ] + # * num_collectors, + # policy=actor_explore, + # frames_per_batch=frames_per_batch, + # total_frames=total_frames, + # exploration_mode="random", + # devices=[device] * num_collectors, + # storing_devices=[device] * num_collectors, + # # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], + # # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], + # split_trajs=False, + # ) + # + # + # logs_exp2 = defaultdict(list) + # prev_traj_count = 0 + # + # ############################################################################### + # # Training loop + # # ------------- + # # + # # There are very few differences with the training loop above: + # # + # # - The tensordict received by the collector is used as-is, without being + # # flattened (recall the ``data.view(-1)`` above), to keep the temporal + # # relation between consecutive steps. + # # - We use :func:`vec_td_lambda_advantage_estimate` to compute the target + # # value. 
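###############################################################################
# Before the (commented-out) training loop below, here is a small standalone
# sketch of how ``vec_td_lambda_advantage_estimate`` is called, using dummy
# tensors rather than data from the tutorial. Values, next values, rewards
# and done flags are shaped ``[batch, time, 1]``, matching the
# trajectory-wise storage described above; the done flags are cast to the
# reward dtype as in the loop below.

import torch
from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate

B, T = 2, 5  # two dummy trajectories of five steps each
value = torch.randn(B, T, 1)
next_value = torch.randn(B, T, 1)
reward = torch.randn(B, T, 1)
done = torch.zeros(B, T, 1)  # no terminal step in this toy batch

advantage = vec_td_lambda_advantage_estimate(
    0.99, 0.95, value, next_value, reward, done
)
print(advantage.shape)  # expected: torch.Size([2, 5, 1])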
+ # + # pbar = tqdm.tqdm(total=total_frames) + # for j, data in enumerate(data_collector): + # current_frames = data.numel() + # pbar.update(current_frames) + # + # replay_buffer.extend(data.cpu()) + # if len(logs_exp2["frames"]): + # logs_exp2["frames"].append(current_frames + logs_exp2["frames"][-1]) + # else: + # logs_exp2["frames"].append(current_frames) + # + # if data["next", "done"].any(): + # done = data["next", "done"].squeeze(-1) + # logs_exp2["traj_lengths"].append( + # data["next", "step_count"][done].float().mean().item() + # ) + # + # if sum(logs_exp2["frames"]) > init_random_frames: + # for _ in range(n_optim): + # sampled_data = replay_buffer.sample(batch_size // max_size) + # sampled_data = sampled_data.clone().to(device, non_blocking=True) + # + # reward = sampled_data["next", "reward"] + # done = sampled_data["next", "done"].to(reward.dtype) + # action = sampled_data["action"].clone() + # + # sampled_data_out = sampled_data.select(*actor.in_keys) + # sampled_data_out = vmap(factor, (0, None))(sampled_data_out, params) + # action_value = sampled_data_out["action_value"] + # action_value = (action_value * action.to(action_value.dtype)).sum(-1, True) + # with torch.no_grad(): + # tdstep = step_mdp(sampled_data) + # next_value = vmap(factor, (0, None))( + # tdstep.select(*actor.in_keys), params + # ) + # next_value = next_value["chosen_action_value"] + # error = vec_td_lambda_advantage_estimate( + # gamma, + # lmbda, + # action_value, + # next_value, + # reward, + # done, + # ).pow(2) + # error = error.mean() + # error.backward() + # + # gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) + # + # optim.step() + # optim.zero_grad() + # + # # update of the target parameters + # params_target.apply( + # lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), + # params.detach(), + # inplace=True, + # ) + # + # actor_explore.step(current_frames) + # + # # Logging + # logs_exp2["grad_vals"].append(float(gv)) + # + # logs_exp2["losses"].append(error.item()) + # logs_exp2["values"].append(action_value.mean().item()) + # logs_exp2["traj_count"].append( + # prev_traj_count + data["next", "done"].sum().item() + # ) + # prev_traj_count = logs_exp2["traj_count"][-1] + # if j % 10 == 0: + # with set_exploration_mode("mode"), torch.no_grad(): + # # execute a rollout. The `set_exploration_mode("mode")` has + # # no effect here since the policy is deterministic, but we add + # # it for completeness + # eval_rollout = test_env.rollout( + # max_steps=10000, + # policy=actor, + # ).cpu() + # logs_exp2["traj_lengths_eval"].append(eval_rollout.shape[-1]) + # logs_exp2["evals"].append(eval_rollout["next", "reward"].sum().item()) + # if len(logs_exp2["mavgs"]): + # logs_exp2["mavgs"].append( + # logs_exp2["evals"][-1] * 0.05 + logs_exp2["mavgs"][-1] * 0.95 + # ) + # else: + # logs_exp2["mavgs"].append(logs_exp2["evals"][-1]) + # logs_exp2["traj_count_eval"].append(logs_exp2["traj_count"][-1]) + # pbar.set_description( + # f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp2['evals'][-1]: 4.4f}" + # ) + # + # # update policy weights + # data_collector.update_policy_weights_() + # + # + # ############################################################################### + # # TD(:math:`\lambda`) performs significantly better than TD(0) because it + # # retrieves a much less biased estimate of the state-action value. + # # + # # .. code-block:: python + # # + # # plot(logs_exp2, "dqn_tdlambda.png") + # # + # # .. 
figure:: /_static/img/dqn_tdlambda.png + # # :alt: Cart Pole results with TD(lambda) + # # + # + # + # print("shutting down") + # data_collector.shutdown() + # del data_collector + # + # ############################################################################### + # # Let's compare the results on a single plot. Because the TD(lambda) version + # # works better, we'll have fewer episodes collected for a given number of + # # frames (as there are more frames per episode). + # # + # # **Note**: As already mentioned above, to get a more reasonable performance, + # # use a greater value for ``total_frames`` e.g. 500000. + # + # + # def plot_both(): + # frames_td0 = logs_exp1["frames"] + # frames_tdlambda = logs_exp2["frames"] + # evals_td0 = logs_exp1["evals"] + # evals_tdlambda = logs_exp2["evals"] + # mavgs_td0 = logs_exp1["mavgs"] + # mavgs_tdlambda = logs_exp2["mavgs"] + # traj_count_td0 = logs_exp1["traj_count_eval"] + # traj_count_tdlambda = logs_exp2["traj_count_eval"] + # + # plt.figure(figsize=(15, 10)) + # plt.subplot(1, 2, 1) + # plt.plot(frames_td0[-len(evals_td0) :], evals_td0, label="return (td0)", alpha=0.5) + # plt.plot( + # frames_tdlambda[-len(evals_tdlambda) :], + # evals_tdlambda, + # label="return (td(lambda))", + # alpha=0.5, + # ) + # plt.plot(frames_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") + # plt.plot( + # frames_tdlambda[-len(mavgs_tdlambda) :], + # mavgs_tdlambda, + # label="mavg (td(lambda))", + # ) + # plt.xlabel("frames collected") + # plt.ylabel("trajectory length (= return)") + # + # plt.subplot(1, 2, 2) + # plt.plot( + # traj_count_td0[-len(evals_td0) :], + # evals_td0, + # label="return (td0)", + # alpha=0.5, + # ) + # plt.plot( + # traj_count_tdlambda[-len(evals_tdlambda) :], + # evals_tdlambda, + # label="return (td(lambda))", + # alpha=0.5, + # ) + # plt.plot(traj_count_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") + # plt.plot( + # traj_count_tdlambda[-len(mavgs_tdlambda) :], + # mavgs_tdlambda, + # label="mavg (td(lambda))", + # ) + # plt.xlabel("trajectories collected") + # plt.legend() + # + # plt.savefig("dqn.png") + # + # + # ############################################################################### + # # .. code-block:: python + # # + # # plot_both() + # # + # # .. figure:: /_static/img/dqn.png + # # :alt: Cart Pole results from the TD(:math:`lambda`) trained policy. + # # + # # Finally, we generate a new video to check what the algorithm has learnt. + # # If all goes well, the duration should be significantly longer than with a + # # random rollout. + # # + # # To get the raw pixels of the rollout, we insert a + # # :class:`torchrl.envs.CatTensors` transform that precedes all others and copies + # # the ``"pixels"`` key onto a ``"pixels_save"`` key. This is necessary because + # # the other transforms that modify this key will update its value in-place in + # # the output tensordict. + # # + # + # test_env.transform.insert(0, CatTensors(["pixels"], "pixels_save", del_keys=False)) + # eval_rollout = test_env.rollout(max_steps=10000, policy=actor, auto_reset=True).cpu() + # + # # sphinx_gallery_start_ignore + # import imageio + # + # imageio.mimwrite("cartpole.gif", eval_rollout["pixels_save"].numpy(), fps=30) + # # sphinx_gallery_end_ignore + # + # del test_env + # + # ############################################################################### + # # The video of the rollout can be saved using the imageio package: + # # + # # .. 
code-block:: + # # + # # import imageio + # # imageio.mimwrite('cartpole.mp4', eval_rollout["pixels_save"].numpy(), fps=30); + # # + # # .. figure:: /_static/img/cartpole.gif + # # :alt: Cart Pole results from the TD(:math:`\lambda`) trained policy. + # + # ############################################################################### + # # Conclusion and possible improvements + # # ------------------------------------ + # # + # # In this tutorial we have learnt: + # # + # # - How to train a policy that read pixel-based states, what transforms to + # # include and how to normalize the data; + # # - How to create a policy that picks up the action with the highest value + # # with :class:`torchrl.modules.QValueNetwork`; + # # - How to build a multiprocessed data collector; + # # - How to train a DQN with TD(:math:`\lambda`) returns. + # # + # # We have seen that using TD(:math:`\lambda`) greatly improved the performance + # # of DQN. Other possible improvements could include: + # # + # # - Using the Multi-Step post-processing. Multi-step will project an action + # # to the nth following step, and create a discounted sum of the rewards in + # # between. This trick can make the algorithm noticebly less myopic. To use + # # this, simply create the collector with + # # + # # from torchrl.data.postprocs.postprocs import MultiStep + # # collector = CollectorClass(..., postproc=MultiStep(gamma, n)) + # # + # # where ``n`` is the number of looking-forward steps. Pay attention to the + # # fact that the ``gamma`` factor has to be corrected by the number of + # # steps till the next observation when being passed to + # # ``vec_td_lambda_advantage_estimate``: + # # + # # gamma = gamma ** tensordict["steps_to_next_obs"] + # # - A prioritized replay buffer could also be used. This will give a + # # higher priority to samples that have the worst value accuracy. + # # - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` + # # for more information). + # # - More fancy exploration techniques, such as NoisyLinear layers and such + # # (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the + # # ``MLP`` class used in our Dueling DQN). From bfef8eefae9f7578c79a0ce90760d5de510d244e Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 21 Mar 2023 14:12:46 +0000 Subject: [PATCH 07/89] dqn --- torchrl/trainers/trainers.py | 15 +-- tutorials/sphinx-tutorials/coding_dqn.py | 147 ++++++++++++++--------- 2 files changed, 96 insertions(+), 66 deletions(-) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index b6c53b03ab7..0f7aa74cd0f 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -95,17 +95,16 @@ class Trainer: optimizer (optim.Optimizer): An optimizer that trains the parameters of the model. logger (Logger, optional): a Logger that will handle the logging. - optim_steps_per_batch (int, optional): number of optimization steps + optim_steps_per_batch (int): number of optimization steps per collection of data. An trainer works as follows: a main loop collects batches of data (epoch loop), and a sub-loop (training loop) performs model updates in between two collections of data. - Default is 500 clip_grad_norm (bool, optional): If True, the gradients will be clipped based on the total norm of the model parameters. If False, all the partial derivatives will be clamped to (-clip_norm, clip_norm). Default is :obj:`True`. clip_norm (Number, optional): value to be used for clipping gradients. - Default is 100.0. 
+ Default is None (no clip norm). progress_bar (bool, optional): If True, a progress bar will be displayed using tqdm. If tqdm is not installed, this option won't have any effect. Default is :obj:`True` @@ -131,15 +130,16 @@ def __new__(cls, *args, **kwargs): def __init__( self, + *, collector: _DataCollector, total_frames: int, frame_skip: int, + optim_steps_per_batch: int, loss_module: Union[LossModule, Callable[[TensorDictBase], TensorDictBase]], optimizer: Optional[optim.Optimizer] = None, logger: Optional[Logger] = None, - optim_steps_per_batch: int = 500, clip_grad_norm: bool = True, - clip_norm: float = 100.0, + clip_norm: float = None, progress_bar: bool = True, seed: int = 42, save_trainer_interval: int = 10000, @@ -726,11 +726,12 @@ def _grad_clip(self, clip_grad_norm: bool, clip_norm: float) -> float: for param_group in self.optimizer.param_groups: params += param_group["params"] - if clip_grad_norm: + if clip_grad_norm and clip_norm is not None: gn = nn.utils.clip_grad_norm_(params, clip_norm) else: gn = sum([p.grad.pow(2).sum() for p in params if p.grad is not None]).sqrt() - nn.utils.clip_grad_value_(params, clip_norm) + if clip_norm is not None: + nn.utils.clip_grad_value_(params, clip_norm) return float(gn) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index cc6bc232513..9ad17095ee3 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -79,20 +79,14 @@ if __name__ == "__main__": # sphinx_gallery_start_ignore import warnings - from collections import defaultdict - from torchrl.objectives import DQNLoss - from torchrl.trainers import Trainer + from torchrl.objectives import DQNLoss, SoftUpdate + from torchrl.trainers import Trainer, ReplayBufferTrainer, UpdateWeights warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore import torch - import tqdm - from functorch import vmap - from matplotlib import pyplot as plt - from tensordict import TensorDict - from tensordict.nn import get_functional from torch import nn from torchrl.collectors import MultiaSyncDataCollector from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer @@ -100,7 +94,6 @@ from torchrl.envs.libs.gym import GymEnv from torchrl.envs.transforms import ( CatFrames, - CatTensors, Compose, GrayScale, ObservationNorm, @@ -108,7 +101,6 @@ ToTensorImage, TransformedEnv, ) - from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor @@ -137,11 +129,12 @@ def is_notebook() -> bool: # of vectorization of the operations on device, although this would # technically work with every single environment attached to its own set of # transforms. - # - ``observation_norm_state_dict`` will contain the normalizing constants for - # the :class:`torchrl.envs.ObservationNorm` tranform. + # - ``obs_norm_sd`` will contain the normalizing constants for + # the :class:`torchrl.envs.ObservationNorm` transform. 
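###############################################################################
# As a hedged, hypothetical example of what can be passed through
# ``obs_norm_sd``: a per-channel location and scale of shape ``[C, 1, 1]``.
# The values below are placeholders; the real constants are computed further
# down with :meth:`torchrl.envs.ObservationNorm.init_stats`.

import torch

dummy_obs_norm_sd = {
    "loc": torch.zeros(4, 1, 1),
    "scale": torch.ones(4, 1, 1),
    "standard_normal": True,
}
# env = make_env(parallel=False, obs_norm_sd=dummy_obs_norm_sd)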
# # We will be using five transforms: # + # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; # - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 # tensor in a floating point tensor in the ``[0, 1]`` space with shape # ``[C, W, H]``; @@ -159,21 +152,21 @@ def is_notebook() -> bool: # - def make_env(parallel=False, observation_norm_state_dict=None, frame_skip=1): - if observation_norm_state_dict is None: - observation_norm_state_dict = {"standard_normal": True} + def make_env(parallel=False, obs_norm_sd=None, ): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} if parallel: base_env = ParallelEnv( num_workers, EnvCreator( lambda: GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, frame_skip=frame_skip + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, ) ), ) else: base_env = GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, frame_skip=frame_skip, + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, ) env = TransformedEnv( @@ -185,7 +178,7 @@ def make_env(parallel=False, observation_norm_state_dict=None, frame_skip=1): GrayScale(), Resize(64, 64), CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **observation_norm_state_dict), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), ), ) return env @@ -197,21 +190,23 @@ def make_env(parallel=False, observation_norm_state_dict=None, frame_skip=1): # # To normalize images, we don't want to normalize each pixel independently # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` - # shaped loc and scale parameters. We will be using the ``reduce_dim`` argument + # shaped set of normalizing constants (loc and scale parameters). + # We will be using the ``reduce_dim`` argument # of :func:`torchrl.envs.ObservationNorm.init_stats` to instruct which # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that # not all dimensions disappear in the process: + # def get_norm_stats(): test_env = make_env() test_env.transform[-1].init_stats( num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) ) - observation_norm_state_dict = test_env.transform[-1].state_dict() + obs_norm_sd = test_env.transform[-1].state_dict() # let's check that normalizing constants have a size of ``[C, 1, 1]`` where # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print(observation_norm_state_dict) - return observation_norm_state_dict + print(obs_norm_sd) + return obs_norm_sd ############################################################################### # Building the model (Deep Q-network) @@ -229,21 +224,11 @@ def get_norm_stats(): # where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a # :math:`\# obs \rightarrow num_actions` function. # - # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, which will read the state-action + # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, + # which will read the state-action # values, pick up the one with the maximum value and write all those results # in the input :class:`tensordict.TensorDict`. # - # Target parameters - # ~~~~~~~~~~~~~~~~~ - # - # Many off-policy RL algorithms use the concept of "target parameters" when it - # comes to estimate the value of the ``t+1`` state or state-action pair. - # The target parameters are lagged copies of the model parameters. 
Because - # their predictions mismatch those of the current model configuration, they - # help learning by putting a pessimistic bound on the value being estimated. - # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous - # in similar algorithms. - # def make_model(dummy_env): cnn_kwargs = { @@ -344,11 +329,11 @@ def get_replay_buffer(buffer_size, n_optim): # out training loop must account for. For simplicity, we set the devices to # the same value for all sub-collectors. - def get_collector(observation_norm_state_dict, num_collectors, actor_explore, frames_per_batch, total_frames, device): + def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device): data_collector = MultiaSyncDataCollector( [ make_env( - parallel=True, observation_norm_state_dict=observation_norm_state_dict + parallel=True, obs_norm_sd=obs_norm_sd ), ] * num_collectors, @@ -365,8 +350,29 @@ def get_collector(observation_norm_state_dict, num_collectors, actor_explore, fr ) return data_collector + ############################################################################### + # Loss function + # ------------- + # + # Building our loss function is straightforward: we only need to provide + # the model and a bunch of hyperparameters to the DQNLoss class. + # + # Target parameters + # ~~~~~~~~~~~~~~~~~ + # + # Many off-policy RL algorithms use the concept of "target parameters" when it + # comes to estimate the value of the next state or state-action pair. + # The target parameters are lagged copies of the model parameters. Because + # their predictions mismatch those of the current model configuration, they + # help learning by putting a pessimistic bound on the value being estimated. + # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous + # in similar algorithms. + # - + def get_loss_module(actor, gamma): + loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) + target_updater = SoftUpdate(loss_module) + return loss_module, target_updater ############################################################################### # Hyperparameters @@ -469,26 +475,49 @@ def get_collector(observation_norm_state_dict, num_collectors, actor_explore, fr # value e.g. 500000 # - def get_trainer(): - stats = get_norm_stats() - test_env = make_env(parallel=False, observation_norm_state_dict=stats) - # Get model - actor, actor_explore = make_model(test_env) - loss_module = DQNLoss(actor, gamma=0.99) - collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) - optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) - trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=None, - optim_steps_per_batch = n_optim, - ) - return trainer + ############################################################################### + # Building a Trainer + # ------------------ + # + # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the + # following keyword-only arguments: + # + # - ``collector`` + # - ``loss_module`` + # - ``optimizer`` + # - ``logger``: A logger can be + # - ``total_frames``: this parameter defines the lifespan of the trainer. + # - ``frame_skip``: when a frame-skip is used, the collector must be made + # aware of it in order to accurately count the number of frames + # collected etc. 
Making the trainer aware of this parameter is not + # mandatory but helps to have a fairer comparison between settings where + # the total number of frames (budget) is fixed but the frame-skip is + # variable. + + stats = get_norm_stats() + test_env = make_env(parallel=False, obs_norm_sd=stats) + # Get model + actor, actor_explore = make_model(test_env) + loss_module, target_net_updater = get_loss_module(actor, gamma) + collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) + optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) + trainer = Trainer( + collector=collector, + total_frames=total_frames, + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=None, + optim_steps_per_batch = n_optim, + ) + + buffer_hook = ReplayBufferTrainer(get_replay_buffer(buffer_size, n_optim)) + buffer_hook.register(trainer) + weight_updater = UpdateWeights(collector, update_weights_interval=1) + weight_updater.register(trainer) + + trainer.register_op("post_optim", target_net_updater.step) - trainer = get_trainer() trainer.train() # ############################################################################### @@ -506,7 +535,7 @@ def get_trainer(): # # We create a test environment for evaluation of the policy: # # test_env = make_env( - # parallel=False, observation_norm_state_dict=observation_norm_state_dict + # parallel=False, obs_norm_sd=obs_norm_sd # ) # # sanity check: # print(actor_explore(test_env.reset())) @@ -744,7 +773,7 @@ def get_trainer(): # # optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) # test_env = make_env( - # parallel=False, observation_norm_state_dict=observation_norm_state_dict + # parallel=False, obs_norm_sd=obs_norm_sd # ) # print(actor_explore(test_env.reset())) # @@ -765,7 +794,7 @@ def get_trainer(): # data_collector = MultiaSyncDataCollector( # [ # make_env( - # parallel=True, observation_norm_state_dict=observation_norm_state_dict + # parallel=True, obs_norm_sd=obs_norm_sd # ), # ] # * num_collectors, From 972217a2135c0c63e871f53ee6445991785af5a9 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 23 Mar 2023 15:48:57 +0000 Subject: [PATCH 08/89] amend --- docs/source/reference/data.rst | 9 +- torchrl/data/__init__.py | 1 + tutorials/sphinx-tutorials/coding_dqn.py | 1933 +++++++++++----------- 3 files changed, 987 insertions(+), 956 deletions(-) diff --git a/docs/source/reference/data.rst b/docs/source/reference/data.rst index c115514650c..72b66b3ab1e 100644 --- a/docs/source/reference/data.rst +++ b/docs/source/reference/data.rst @@ -24,11 +24,12 @@ Composable Replay Buffers We also give users the ability to compose a replay buffer using the following components: +.. currentmodule:: torchrl.data.replay_buffers + .. autosummary:: :toctree: generated/ :template: rl_template.rst - .. currentmodule:: torchrl.data.replay_buffers Sampler PrioritizedSampler @@ -176,11 +177,12 @@ Here's an example: `the repository `_ is needed as the latest wheels are not published on PyPI. +.. currentmodule:: torchrl.data.datasets + .. autosummary:: :toctree: generated/ :template: rl_template.rst - .. currentmodule:: torchrl.data.datasets D4RLExperienceReplay @@ -193,6 +195,7 @@ It is important that your environment specs match the input and output that it s :obj:`ParallelEnv` will create buffers from these specs to communicate with the spawn processes. Check the :obj:`torchrl.envs.utils.check_env_specs` method for a sanity check. +.. currentmodule:: torchrl.data .. 
autosummary:: :toctree: generated/ @@ -213,6 +216,8 @@ Check the :obj:`torchrl.envs.utils.check_env_specs` method for a sanity check. Utils ----- +.. currentmodule:: torchrl.data.datasets + .. autosummary:: :toctree: generated/ :template: rl_template.rst diff --git a/torchrl/data/__init__.py b/torchrl/data/__init__.py index 788a2cce27d..6608b49cade 100644 --- a/torchrl/data/__init__.py +++ b/torchrl/data/__init__.py @@ -30,3 +30,4 @@ UnboundedContinuousTensorSpec, UnboundedDiscreteTensorSpec, ) +from . import datasets diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 9ad17095ee3..4124d87a492 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -23,15 +23,21 @@ # module, replay buffer and optimizer. # - Adding hooks to a trainer, such as loggers, target network updaters and such. # -# We will also focus on some other aspects of the library: +# The trainer is fully customisable and offers a large set of functionalities. +# The tutorial is organised around its construction. +# We will be detailing how to build each of the components of the library first, +# and then put the pieces together using the `torchrl.trainers.Trainer` class. +# +# Along the road, we will also focus on some other aspects of the library: # # - how to build an environment in TorchRL, including transforms (e.g. data # normalization, frame concatenation, resizing and turning to grayscale) # and parallel execution. Unlike what we did in the # `DDPG tutorial `_, we # will normalize the pixels and not the state vector. -# - how to design a ``QValueActor``, i.e. an actor that estimates the action -# values and picks up the action with the highest estimated return; +# - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor +# that estimates the action values and picks up the action with the highest +# estimated return; # - how to collect data from your environment efficiently and store them # in a replay buffer; # - how to store trajectories (and not transitions) in your replay buffer), @@ -76,1005 +82,1024 @@ # to provide a high-level illustration of TorchRL features in the context # of this algorithm. 
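###############################################################################
# As a preview of the hook mechanism mentioned in the key learnings above,
# the sketch below registers a custom callable on the trainer. The
# ``count_optim_steps`` helper is ours (not a TorchRL component), and the
# registration line is kept commented out because the ``trainer`` instance is
# only built later in this tutorial.

optim_step_counter = {"count": 0}


def count_optim_steps():
    # a real hook could log metrics, anneal hyperparameters, etc.
    optim_step_counter["count"] += 1


# trainer.register_op("post_optim", count_optim_steps)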
-if __name__ == "__main__": - # sphinx_gallery_start_ignore - import warnings - - from torchrl.objectives import DQNLoss, SoftUpdate - from torchrl.trainers import Trainer, ReplayBufferTrainer, UpdateWeights +# sphinx_gallery_start_ignore +import warnings - warnings.filterwarnings("ignore") - # sphinx_gallery_end_ignore +from torchrl.objectives import DQNLoss, SoftUpdate +from torchrl.trainers import Trainer, ReplayBufferTrainer, UpdateWeights - import torch - from torch import nn - from torchrl.collectors import MultiaSyncDataCollector - from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer - from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter - from torchrl.envs.libs.gym import GymEnv - from torchrl.envs.transforms import ( - CatFrames, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, - ) - from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor +warnings.filterwarnings("ignore") +# sphinx_gallery_end_ignore +import torch +from torch import nn +from torchrl.collectors import MultiaSyncDataCollector +from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer +from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter +from torchrl.envs.libs.gym import GymEnv +from torchrl.envs.transforms import ( + CatFrames, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, +) +from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor - def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython - else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter +def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter - ############################################################################### - # Building the environment - # ------------------------ - # - # Our environment builder has two arguments: - # - # - ``parallel``: determines whether multiple environments have to be run in - # parallel. We stack the transforms after the - # :class:`torchrl.envs.ParallelEnv` to take advantage - # of vectorization of the operations on device, although this would - # technically work with every single environment attached to its own set of - # transforms. - # - ``obs_norm_sd`` will contain the normalizing constants for - # the :class:`torchrl.envs.ObservationNorm` transform. 
- # - # We will be using five transforms: - # - # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; - # - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 - # tensor in a floating point tensor in the ``[0, 1]`` space with shape - # ``[C, W, H]``; - # - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; - # - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; - # - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; - # - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of - # successive frames (``N=4``) in a single tensor along the channel dimension. - # This is useful as a single image does not carry information about the - # motion of the cartpole. Some memory about past observations and actions - # is needed, either via a recurrent neural network or using a stack of - # frames. - # - :class:`torchrl.envs.ObservationNorm` which will normalize our observations - # given some custom summary statistics. - # +############################################################################### +# Let's get started with the various pieces we need for our algorithm: +# +# - An environment; +# - A policy (and related modules that we group under the "model" umbrella); +# - A data collector, which makes the policy play in the environment and +# delivers training data; +# - A replay buffer to store the training data; +# - A loss module, which computes the objective function to train our policy +# to maximise the return; +# - An optimizer, which performs parameter updates based on our loss. +# +# Additional modules include a logger, a recorder (executes the policy in +# "eval" mode) and a target network updater. With all these components into +# place, it is easy to see how one could misplace or misuse one component in +# the training script. The trainer is there to orchestrate everything for you! +# +# Building the environment +# ------------------------ +# +# First let's write a helper function that will output an environment. As usual, +# the "raw" environment may be too simple to be used in practice and we'll need +# some data transformation to expose its output to the policy. +# +# We will be using five transforms: +# +# - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; +# - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 +# tensor in a floating point tensor in the ``[0, 1]`` space with shape +# ``[C, W, H]``; +# - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; +# - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; +# - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; +# - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of +# successive frames (``N=4``) in a single tensor along the channel dimension. +# This is useful as a single image does not carry information about the +# motion of the cartpole. Some memory about past observations and actions +# is needed, either via a recurrent neural network or using a stack of +# frames. +# - :class:`torchrl.envs.ObservationNorm` which will normalize our observations +# given some custom summary statistics. +# +# In practice, our environment builder has two arguments: +# +# - ``parallel``: determines whether multiple environments have to be run in +# parallel. 
We stack the transforms after the +# :class:`torchrl.envs.ParallelEnv` to take advantage +# of vectorization of the operations on device, although this would +# technically work with every single environment attached to its own set of +# transforms. +# - ``obs_norm_sd`` will contain the normalizing constants for +# the :class:`torchrl.envs.ObservationNorm` transform. +# - def make_env(parallel=False, obs_norm_sd=None, ): - if obs_norm_sd is None: - obs_norm_sd = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, - ) - ), - ) - else: - base_env = GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, - ) - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **obs_norm_sd), +def make_env(parallel=False, obs_norm_sd=None, ): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, + ) ), ) - return env + else: + base_env = GymEnv( + "CartPole-v1", from_pixels=True, pixels_only=True, device=device, + ) + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), + ), + ) + return env - ############################################################################### - # Compute normalizing constants - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # To normalize images, we don't want to normalize each pixel independently - # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` - # shaped set of normalizing constants (loc and scale parameters). - # We will be using the ``reduce_dim`` argument - # of :func:`torchrl.envs.ObservationNorm.init_stats` to instruct which - # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that - # not all dimensions disappear in the process: - # - def get_norm_stats(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) - ) - obs_norm_sd = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print(obs_norm_sd) - return obs_norm_sd +############################################################################### +# Compute normalizing constants +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# To normalize images, we don't want to normalize each pixel independently +# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` +# shaped set of normalizing constants (loc and scale parameters). 
+# We will be using the ``reduce_dim`` argument +# of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which +# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that +# not all dimensions disappear in the process: +# - ############################################################################### - # Building the model (Deep Q-network) - # ----------------------------------- - # - # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` - # object which is a simple CNN followed by a two-layer MLP. The only trick used - # here is that the action values (i.e. left and right action value) are - # computed using - # - # .. math:: - # - # val = b(obs) + v(obs) - \mathbb{E}[v(obs)] - # - # where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a - # :math:`\# obs \rightarrow num_actions` function. - # - # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, - # which will read the state-action - # values, pick up the one with the maximum value and write all those results - # in the input :class:`tensordict.TensorDict`. - # +def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) +) + obs_norm_sd = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). + print(obs_norm_sd) + return obs_norm_sd - def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) +############################################################################### +# Building the model (Deep Q-network) +# ----------------------------------- +# +# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` +# object which is a simple CNN followed by a two-layer MLP. The only trick used +# here is that the action values (i.e. left and right action value) are +# computed using +# +# .. math:: +# +# val = b(obs) + v(obs) - \mathbb{E}[v(obs)] +# +# where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a +# :math:`\# obs \rightarrow num_actions` function. +# +# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, +# which will read the state-action +# values, pick up the one with the maximum value and write all those results +# in the input :class:`tensordict.TensorDict`. +# - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. 
- tensordict = dummy_env.fake_tensordict() - actor(tensordict) +def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, - ) + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) - return actor, actor_explore + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, + ) + return actor, actor_explore - ############################################################################### - # Collecting and storing data - # --------------------------- - # - # Replay buffers - # ~~~~~~~~~~~~~~ - # - # Replay buffers play a central role in off-policy RL algorithms such as DQN. - # They constitute the dataset we will be sampling from during training. - # - # Here, we will use a regular sampling strategy, although a prioritized RB - # could improve the performance significantly. - # - # We place the storage on disk using - # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This - # storage is created in a lazy manner: it will only be instantiated once the - # first batch of data is passed to it. - # - # The only requirement of this storage is that the data passed to it at write - # time must always have the same shape. - def get_replay_buffer(buffer_size, n_optim): - replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer +############################################################################### +# Collecting and storing data +# --------------------------- +# +# Replay buffers +# ~~~~~~~~~~~~~~ +# +# Replay buffers play a central role in off-policy RL algorithms such as DQN. +# They constitute the dataset we will be sampling from during training. +# +# Here, we will use a regular sampling strategy, although a prioritized RB +# could improve the performance significantly. +# +# We place the storage on disk using +# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This +# storage is created in a lazy manner: it will only be instantiated once the +# first batch of data is passed to it. +# +# The only requirement of this storage is that the data passed to it at write +# time must always have the same shape. - ############################################################################### - # Data collector - # ~~~~~~~~~~~~~~ - # - # As in `PPO ` and - # `DDPG `, we will be using - # a data collector as a dataloader in the outer loop. 
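###############################################################################
# The "collector as dataloader" idea can be sketched in isolation from the
# tutorial's pipeline: iterating over a collector yields TensorDict batches
# of transitions until ``total_frames`` is reached. The snippet below uses a
# single-process :class:`torchrl.collectors.SyncDataCollector` on a
# state-based CartPole for brevity; the batch shape reported may vary with
# the collector configuration.

from torchrl.collectors import SyncDataCollector
from torchrl.envs.libs.gym import GymEnv

sketch_collector = SyncDataCollector(
    GymEnv("CartPole-v1"),
    policy=None,  # a random policy is used when none is provided
    frames_per_batch=64,
    total_frames=128,
    split_trajs=False,
)
for batch in sketch_collector:
    print(batch.batch_size)  # number of frames collected in this batch
sketch_collector.shutdown()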
- # - # We choose the following configuration: we will be running a series of - # parallel environments synchronously in parallel in different collectors, - # themselves running in parallel but asynchronously. - # The advantage of this configuration is that we can balance the amount of - # compute that is executed in batch with what we want to be executed - # asynchronously. We encourage the reader to experiment how the collection - # speed is impacted by modifying the number of collectors (ie the number of - # environment constructors passed to the collector) and the number of - # environment executed in parallel in each collector (controlled by the - # ``num_workers`` hyperparameter). - # - # When building the collector, we can choose on which device we want the - # environment and policy to execute the operations through the ``device`` - # keyword argument. The ``storing_devices`` argument will modify the - # location of the data being collected: if the batches that we are gathering - # have a considerable size, we may want to store them on a different location - # than the device where the computation is happening. For asynchronous data - # collectors such as ours, different storing devices mean that the data that - # we collect won't sit on the same device each time, which is something that - # out training loop must account for. For simplicity, we set the devices to - # the same value for all sub-collectors. +def get_replay_buffer(buffer_size, n_optim): + replay_buffer = TensorDictReplayBuffer( + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, + ) + return replay_buffer - def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device): - data_collector = MultiaSyncDataCollector( - [ - make_env( - parallel=True, obs_norm_sd=obs_norm_sd - ), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices - device=device, - storing_device=device, - split_trajs=False, - ) - return data_collector +############################################################################### +# Data collector +# ~~~~~~~~~~~~~~ +# +# As in `PPO ` and +# `DDPG `, we will be using +# a data collector as a dataloader in the outer loop. +# +# We choose the following configuration: we will be running a series of +# parallel environments synchronously in parallel in different collectors, +# themselves running in parallel but asynchronously. +# The advantage of this configuration is that we can balance the amount of +# compute that is executed in batch with what we want to be executed +# asynchronously. We encourage the reader to experiment how the collection +# speed is impacted by modifying the number of collectors (ie the number of +# environment constructors passed to the collector) and the number of +# environment executed in parallel in each collector (controlled by the +# ``num_workers`` hyperparameter). +# +# When building the collector, we can choose on which device we want the +# environment and policy to execute the operations through the ``device`` +# keyword argument. 
The ``storing_devices`` argument will modify the +# location of the data being collected: if the batches that we are gathering +# have a considerable size, we may want to store them on a different location +# than the device where the computation is happening. For asynchronous data +# collectors such as ours, different storing devices mean that the data that +# we collect won't sit on the same device each time, which is something that +# out training loop must account for. For simplicity, we set the devices to +# the same value for all sub-collectors. - ############################################################################### - # Loss function - # ------------- - # - # Building our loss function is straightforward: we only need to provide - # the model and a bunch of hyperparameters to the DQNLoss class. - # - # Target parameters - # ~~~~~~~~~~~~~~~~~ - # - # Many off-policy RL algorithms use the concept of "target parameters" when it - # comes to estimate the value of the next state or state-action pair. - # The target parameters are lagged copies of the model parameters. Because - # their predictions mismatch those of the current model configuration, they - # help learning by putting a pessimistic bound on the value being estimated. - # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous - # in similar algorithms. - # +def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device): + data_collector = MultiaSyncDataCollector( + [ + make_env( + parallel=True, obs_norm_sd=obs_norm_sd + ), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + ) + return data_collector - def get_loss_module(actor, gamma): - loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) - target_updater = SoftUpdate(loss_module) - return loss_module, target_updater +############################################################################### +# Loss function +# ------------- +# +# Building our loss function is straightforward: we only need to provide +# the model and a bunch of hyperparameters to the DQNLoss class. +# +# Target parameters +# ~~~~~~~~~~~~~~~~~ +# +# Many off-policy RL algorithms use the concept of "target parameters" when it +# comes to estimate the value of the next state or state-action pair. +# The target parameters are lagged copies of the model parameters. Because +# their predictions mismatch those of the current model configuration, they +# help learning by putting a pessimistic bound on the value being estimated. +# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous +# in similar algorithms. +# - ############################################################################### - # Hyperparameters - # --------------- - # - # Let's start with our hyperparameters. The following setting should work well - # in practice, and the performance of the algorithm should hopefully not be - # too sensitive to slight variations of these. 
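###############################################################################
# To make the "lagged parameters" idea above concrete, here is a tiny
# numerical sketch of the soft (Polyak) update rule that the target updater
# applies, up to implementation details: the target weights track an
# exponential moving average of the online weights.

import torch

theta = torch.ones(3)  # online parameters (dummy values)
theta_target = torch.zeros(3)  # lagged copy
tau = 0.005  # same role as the ``tau`` hyperparameter defined below

# one soft update: theta_target <- tau * theta + (1 - tau) * theta_target
theta_target = tau * theta + (1 - tau) * theta_target
print(theta_target)  # tensor([0.0050, 0.0050, 0.0050])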
+def get_loss_module(actor, gamma): + loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) + target_updater = SoftUpdate(loss_module) + return loss_module, target_updater - device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" +############################################################################### +# Hyperparameters +# --------------- +# +# Let's start with our hyperparameters. The following setting should work well +# in practice, and the performance of the algorithm should hopefully not be +# too sensitive to slight variations of these. - ############################################################################### - # Optimizer - # ~~~~~~~~~ +device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - # the learning rate of the optimizer - lr = 2e-3 - # weight decay - wd = 1e-5 - # the beta parameters of Adam - betas = (0.9, 0.999) - # Optimization steps per batch collected (aka UPD or updates per data) - n_optim = 8 +############################################################################### +# Optimizer +# ~~~~~~~~~ - ############################################################################### - # DQN parameters - # ~~~~~~~~~~~~~~ +# the learning rate of the optimizer +lr = 2e-3 +# weight decay +wd = 1e-5 +# the beta parameters of Adam +betas = (0.9, 0.999) +# Optimization steps per batch collected (aka UPD or updates per data) +n_optim = 8 - ############################################################################### - # gamma decay factor - gamma = 0.99 +############################################################################### +# DQN parameters +# ~~~~~~~~~~~~~~ - ############################################################################### - # lambda decay factor (see second the part with TD(:math:`\lambda`) - lmbda = 0.95 +############################################################################### +# gamma decay factor +gamma = 0.99 - ############################################################################### - # Smooth target network update decay parameter. - # This loosely corresponds to a 1/(1-tau) interval with hard target network - # update - tau = 0.005 +############################################################################### +# lambda decay factor (see second the part with TD(:math:`\lambda`) +lmbda = 0.95 - ############################################################################### - # Data collection and replay buffer - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Values to be used for proper training have been commented. - # - # Total frames collected in the environment. In other implementations, the - # user defines a maximum number of episodes. - # This is harder to do with our data collectors since they return batches - # of N collected frames, where N is a constant. - # However, one can easily get the same restriction on number of episodes by - # breaking the training loop when a certain number - # episodes has been collected. - total_frames = 4096 # 500000 +############################################################################### +# Smooth target network update decay parameter. +# This loosely corresponds to a 1/(1-tau) interval with hard target network +# update +tau = 0.005 - ############################################################################### - # Random frames used to initialize the replay buffer. 
- init_random_frames = 100 # 1000 +############################################################################### +# Data collection and replay buffer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Values to be used for proper training have been commented. +# +# Total frames collected in the environment. In other implementations, the +# user defines a maximum number of episodes. +# This is harder to do with our data collectors since they return batches +# of N collected frames, where N is a constant. +# However, one can easily get the same restriction on number of episodes by +# breaking the training loop when a certain number +# episodes has been collected. +total_frames = 4096 # 500000 - ############################################################################### - # Frames in each batch collected. - frames_per_batch = 32 # 128 +############################################################################### +# Random frames used to initialize the replay buffer. +init_random_frames = 100 # 1000 - ############################################################################### - # Frames sampled from the replay buffer at each optimization step - batch_size = 32 # 256 +############################################################################### +# Frames in each batch collected. +frames_per_batch = 32 # 128 - ############################################################################### - # Size of the replay buffer in terms of frames - buffer_size = min(total_frames, 100000) +############################################################################### +# Frames sampled from the replay buffer at each optimization step +batch_size = 32 # 256 - ############################################################################### - # Number of environments run in parallel in each data collector - num_workers = 2 # 8 - num_collectors = 2 # 4 +############################################################################### +# Size of the replay buffer in terms of frames +buffer_size = min(total_frames, 100000) +############################################################################### +# Number of environments run in parallel in each data collector +num_workers = 2 # 8 +num_collectors = 2 # 4 - ############################################################################### - # Environment and exploration - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # We set the initial and final value of the epsilon factor in Epsilon-greedy - # exploration. - # Since our policy is deterministic, exploration is crucial: without it, the - # only source of randomness would be the environment reset. - eps_greedy_val = 0.1 - eps_greedy_val_env = 0.005 +############################################################################### +# Environment and exploration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We set the initial and final value of the epsilon factor in Epsilon-greedy +# exploration. +# Since our policy is deterministic, exploration is crucial: without it, the +# only source of randomness would be the environment reset. - ############################################################################### - # To speed up learning, we set the bias of the last layer of our value network - # to a predefined value (this is not mandatory) - init_bias = 2.0 +eps_greedy_val = 0.1 +eps_greedy_val_env = 0.005 - ############################################################################### - # .. note:: - # For fast rendering of the tutorial ``total_frames`` hyperparameter - # was set to a very low number. 
To get a reasonable performance, use a greater - # value e.g. 500000 - # +############################################################################### +# To speed up learning, we set the bias of the last layer of our value network +# to a predefined value (this is not mandatory) +init_bias = 2.0 - ############################################################################### - # Building a Trainer - # ------------------ - # - # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the - # following keyword-only arguments: - # - # - ``collector`` - # - ``loss_module`` - # - ``optimizer`` - # - ``logger``: A logger can be - # - ``total_frames``: this parameter defines the lifespan of the trainer. - # - ``frame_skip``: when a frame-skip is used, the collector must be made - # aware of it in order to accurately count the number of frames - # collected etc. Making the trainer aware of this parameter is not - # mandatory but helps to have a fairer comparison between settings where - # the total number of frames (budget) is fixed but the frame-skip is - # variable. +############################################################################### +# .. note:: +# For fast rendering of the tutorial ``total_frames`` hyperparameter +# was set to a very low number. To get a reasonable performance, use a greater +# value e.g. 500000 +# - stats = get_norm_stats() - test_env = make_env(parallel=False, obs_norm_sd=stats) - # Get model - actor, actor_explore = make_model(test_env) - loss_module, target_net_updater = get_loss_module(actor, gamma) - collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) - optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) - trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=None, - optim_steps_per_batch = n_optim, - ) +############################################################################### +# Building a Trainer +# ------------------ +# +# TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the +# following keyword-only arguments: +# +# - ``collector`` +# - ``loss_module`` +# - ``optimizer`` +# - ``logger``: A logger can be +# - ``total_frames``: this parameter defines the lifespan of the trainer. +# - ``frame_skip``: when a frame-skip is used, the collector must be made +# aware of it in order to accurately count the number of frames +# collected etc. Making the trainer aware of this parameter is not +# mandatory but helps to have a fairer comparison between settings where +# the total number of frames (budget) is fixed but the frame-skip is +# variable. 
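+#
+# .. note::
+#   The trainer below is built with ``logger=None`` to keep the tutorial
+#   self-contained. As a hedged sketch of how a logger could be plugged in
+#   (the import path of the logger classes has changed across TorchRL
+#   versions, so treat the paths and arguments below as assumptions to be
+#   checked against your installed version):
+#
+#   .. code-block:: python
+#
+#      try:
+#          from torchrl.record.loggers import CSVLogger
+#      except ImportError:  # older layouts expose the loggers elsewhere
+#          from torchrl.trainers.loggers import CSVLogger
+#
+#      logger = CSVLogger(exp_name="dqn_tutorial")
+#      # ... then pass ``logger=logger`` instead of ``logger=None`` below.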
+ +stats = get_norm_stats() +test_env = make_env(parallel=False, obs_norm_sd=stats) +# Get model +actor, actor_explore = make_model(test_env) +loss_module, target_net_updater = get_loss_module(actor, gamma) +collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) +optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) +trainer = Trainer( + collector=collector, + total_frames=total_frames, + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=None, + optim_steps_per_batch = n_optim, +) - buffer_hook = ReplayBufferTrainer(get_replay_buffer(buffer_size, n_optim)) - buffer_hook.register(trainer) - weight_updater = UpdateWeights(collector, update_weights_interval=1) - weight_updater.register(trainer) +buffer_hook = ReplayBufferTrainer(get_replay_buffer(buffer_size, n_optim)) +buffer_hook.register(trainer) +weight_updater = UpdateWeights(collector, update_weights_interval=1) +weight_updater.register(trainer) - trainer.register_op("post_optim", target_net_updater.step) +trainer.register_op("post_optim", target_net_updater.step) - trainer.train() +trainer.train() - # ############################################################################### - # # We represent the parameters and targets as flat structures, but unflattening - # # them is quite easy: - # - # params_flat = params.flatten_keys(".") - # - # ############################################################################### - # # We will be using the adam optimizer: - # - # optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) - # - # ############################################################################### - # # We create a test environment for evaluation of the policy: - # - # test_env = make_env( - # parallel=False, obs_norm_sd=obs_norm_sd - # ) - # # sanity check: - # print(actor_explore(test_env.reset())) - # - # ############################################################################### - # # Training loop of a regular DQN - # # ------------------------------ - # # - # # We'll start with a simple implementation of DQN where the returns are - # # computed without bootstrapping, i.e. - # # - # # .. math:: - # # - # # Q_{t}(s, a) = R(s, a) + \gamma * V_{t+1}(s) - # # - # # where :math:`Q(s, a)` is the Q-value of the current state-action pair, - # # :math:`R(s, a)` is the result of the reward function, and :math:`V(s)` is a - # # value function that returns 0 for terminating states. - # # - # # We store the logs in a defaultdict: - # - # logs_exp1 = defaultdict(list) - # prev_traj_count = 0 - # - # pbar = tqdm.tqdm(total=total_frames) - # for j, data in enumerate(data_collector): - # current_frames = data.numel() - # pbar.update(current_frames) - # data = data.view(-1) - # - # # We store the values on the replay buffer, after placing them on CPU. - # # When called for the first time, this will instantiate our storage - # # object which will print its content. 
- # replay_buffer.extend(data.cpu()) - # - # # some logging - # if len(logs_exp1["frames"]): - # logs_exp1["frames"].append(current_frames + logs_exp1["frames"][-1]) - # else: - # logs_exp1["frames"].append(current_frames) - # - # if data["next", "done"].any(): - # done = data["next", "done"].squeeze(-1) - # logs_exp1["traj_lengths"].append( - # data["next", "step_count"][done].float().mean().item() - # ) - # - # # check that we have enough data to start training - # if sum(logs_exp1["frames"]) > init_random_frames: - # for _ in range(n_optim): - # # sample from the RB and send to device - # sampled_data = replay_buffer.sample(batch_size) - # sampled_data = sampled_data.to(device, non_blocking=True) - # - # # collect data from RB - # reward = sampled_data["next", "reward"].squeeze(-1) - # done = sampled_data["next", "done"].squeeze(-1).to(reward.dtype) - # action = sampled_data["action"].clone() - # - # # Compute action value (of the action actually taken) at time t - # # By default, TorchRL uses one-hot encodings for discrete actions - # sampled_data_out = sampled_data.select(*actor.in_keys) - # sampled_data_out = factor(sampled_data_out, params=params) - # action_value = sampled_data_out["action_value"] - # action_value = (action_value * action.to(action_value.dtype)).sum(-1) - # with torch.no_grad(): - # # compute best action value for the next step, using target parameters - # tdstep = step_mdp(sampled_data) - # next_value = factor( - # tdstep.select(*actor.in_keys), - # params=params_target, - # )["chosen_action_value"].squeeze(-1) - # exp_value = reward + gamma * next_value * (1 - done) - # assert exp_value.shape == action_value.shape - # # we use MSE loss but L1 or smooth L1 should also work - # error = nn.functional.mse_loss(exp_value, action_value).mean() - # error.backward() - # - # gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) - # - # optim.step() - # optim.zero_grad() - # - # # update of the target parameters - # params_target.apply( - # lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), - # params.detach(), - # inplace=True, - # ) - # - # actor_explore.step(current_frames) - # - # # Logging - # logs_exp1["grad_vals"].append(float(gv)) - # logs_exp1["losses"].append(error.item()) - # logs_exp1["values"].append(action_value.mean().item()) - # logs_exp1["traj_count"].append( - # prev_traj_count + data["next", "done"].sum().item() - # ) - # prev_traj_count = logs_exp1["traj_count"][-1] - # - # if j % 10 == 0: - # with set_exploration_mode("mode"), torch.no_grad(): - # # execute a rollout. 
The `set_exploration_mode("mode")` has no effect here since the policy is deterministic, but we add it for completeness - # eval_rollout = test_env.rollout( - # max_steps=10000, - # policy=actor, - # ).cpu() - # logs_exp1["traj_lengths_eval"].append(eval_rollout.shape[-1]) - # logs_exp1["evals"].append(eval_rollout["next", "reward"].sum().item()) - # if len(logs_exp1["mavgs"]): - # logs_exp1["mavgs"].append( - # logs_exp1["evals"][-1] * 0.05 + logs_exp1["mavgs"][-1] * 0.95 - # ) - # else: - # logs_exp1["mavgs"].append(logs_exp1["evals"][-1]) - # logs_exp1["traj_count_eval"].append(logs_exp1["traj_count"][-1]) - # pbar.set_description( - # f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp1['evals'][-1]: 4.4f}" - # ) - # - # # update policy weights - # data_collector.update_policy_weights_() - # - # ############################################################################### - # # We write a custom plot function to display the performance of our algorithm - # # - # - # - # def plot(logs, name): - # plt.figure(figsize=(15, 10)) - # plt.subplot(2, 3, 1) - # plt.plot( - # logs["frames"][-len(logs["evals"]) :], - # logs["evals"], - # label="return (eval)", - # ) - # plt.plot( - # logs["frames"][-len(logs["mavgs"]) :], - # logs["mavgs"], - # label="mavg of returns (eval)", - # ) - # plt.xlabel("frames collected") - # plt.ylabel("trajectory length (= return)") - # plt.subplot(2, 3, 2) - # plt.plot( - # logs["traj_count"][-len(logs["evals"]) :], - # logs["evals"], - # label="return", - # ) - # plt.plot( - # logs["traj_count"][-len(logs["mavgs"]) :], - # logs["mavgs"], - # label="mavg", - # ) - # plt.xlabel("trajectories collected") - # plt.legend() - # plt.subplot(2, 3, 3) - # plt.plot(logs["frames"][-len(logs["losses"]) :], logs["losses"]) - # plt.xlabel("frames collected") - # plt.title("loss") - # plt.subplot(2, 3, 4) - # plt.plot(logs["frames"][-len(logs["values"]) :], logs["values"]) - # plt.xlabel("frames collected") - # plt.title("value") - # plt.subplot(2, 3, 5) - # plt.plot( - # logs["frames"][-len(logs["grad_vals"]) :], - # logs["grad_vals"], - # ) - # plt.xlabel("frames collected") - # plt.title("grad norm") - # if len(logs["traj_lengths"]): - # plt.subplot(2, 3, 6) - # plt.plot(logs["traj_lengths"]) - # plt.xlabel("batches") - # plt.title("traj length (training)") - # plt.savefig(name) - # if is_notebook(): - # plt.show() - # - # - # ############################################################################### - # # The performance of the policy can be measured as the length of trajectories. - # # As we can see on the results of the :func:`plot` function, the performance - # # of the policy increases, albeit slowly. - # # - # # .. code-block:: python - # # - # # plot(logs_exp1, "dqn_td0.png") - # # - # # .. figure:: /_static/img/dqn_td0.png - # # :alt: Cart Pole results with TD(0) - # # - # - # print("shutting down") - # data_collector.shutdown() - # del data_collector - # - # ############################################################################### - # # DQN with TD(:math:`\lambda`) - # # ---------------------------- - # # - # # We can improve the above algorithm by getting a better estimate of the - # # return, using not only the next state value but the whole sequence of rewards - # # and values that follow a particular step. - # # - # # TorchRL provides a vectorized version of TD(lambda) named - # # :func:`torchrl.objectives.value.functional.vec_td_lambda_advantage_estimate`. 
- # # We'll use this to obtain a target value that the value network will be - # # trained to match. - # # - # # The big difference in this implementation is that we'll store entire - # # trajectories and not single steps in the replay buffer. This will be done - # # automatically as long as we're not "flattening" the tensordict collected: - # # by keeping a shape ``[Batch x timesteps]`` and giving this - # # to the RB, we'll be creating a replay buffer of size - # # ``[Capacity x timesteps]``. - # - # - # from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate - # - # ############################################################################### - # # We reset the actor parameters: - # # - # - # ( - # factor, - # actor, - # actor_explore, - # params, - # params_target, - # ) = make_model(test_env) - # params_flat = params.flatten_keys(".") - # - # optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) - # test_env = make_env( - # parallel=False, obs_norm_sd=obs_norm_sd - # ) - # print(actor_explore(test_env.reset())) - # - # ############################################################################### - # # Data: Replay buffer and collector - # # --------------------------------- - # # - # # We need to build a new replay buffer of the appropriate size: - # # - # - # max_size = frames_per_batch // num_workers - # - # replay_buffer = TensorDictReplayBuffer( - # storage=LazyMemmapStorage(-(-buffer_size // max_size)), - # prefetch=n_optim, - # ) - # - # data_collector = MultiaSyncDataCollector( - # [ - # make_env( - # parallel=True, obs_norm_sd=obs_norm_sd - # ), - # ] - # * num_collectors, - # policy=actor_explore, - # frames_per_batch=frames_per_batch, - # total_frames=total_frames, - # exploration_mode="random", - # devices=[device] * num_collectors, - # storing_devices=[device] * num_collectors, - # # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - # # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], - # split_trajs=False, - # ) - # - # - # logs_exp2 = defaultdict(list) - # prev_traj_count = 0 - # - # ############################################################################### - # # Training loop - # # ------------- - # # - # # There are very few differences with the training loop above: - # # - # # - The tensordict received by the collector is used as-is, without being - # # flattened (recall the ``data.view(-1)`` above), to keep the temporal - # # relation between consecutive steps. - # # - We use :func:`vec_td_lambda_advantage_estimate` to compute the target - # # value. 
- # - # pbar = tqdm.tqdm(total=total_frames) - # for j, data in enumerate(data_collector): - # current_frames = data.numel() - # pbar.update(current_frames) - # - # replay_buffer.extend(data.cpu()) - # if len(logs_exp2["frames"]): - # logs_exp2["frames"].append(current_frames + logs_exp2["frames"][-1]) - # else: - # logs_exp2["frames"].append(current_frames) - # - # if data["next", "done"].any(): - # done = data["next", "done"].squeeze(-1) - # logs_exp2["traj_lengths"].append( - # data["next", "step_count"][done].float().mean().item() - # ) - # - # if sum(logs_exp2["frames"]) > init_random_frames: - # for _ in range(n_optim): - # sampled_data = replay_buffer.sample(batch_size // max_size) - # sampled_data = sampled_data.clone().to(device, non_blocking=True) - # - # reward = sampled_data["next", "reward"] - # done = sampled_data["next", "done"].to(reward.dtype) - # action = sampled_data["action"].clone() - # - # sampled_data_out = sampled_data.select(*actor.in_keys) - # sampled_data_out = vmap(factor, (0, None))(sampled_data_out, params) - # action_value = sampled_data_out["action_value"] - # action_value = (action_value * action.to(action_value.dtype)).sum(-1, True) - # with torch.no_grad(): - # tdstep = step_mdp(sampled_data) - # next_value = vmap(factor, (0, None))( - # tdstep.select(*actor.in_keys), params - # ) - # next_value = next_value["chosen_action_value"] - # error = vec_td_lambda_advantage_estimate( - # gamma, - # lmbda, - # action_value, - # next_value, - # reward, - # done, - # ).pow(2) - # error = error.mean() - # error.backward() - # - # gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) - # - # optim.step() - # optim.zero_grad() - # - # # update of the target parameters - # params_target.apply( - # lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), - # params.detach(), - # inplace=True, - # ) - # - # actor_explore.step(current_frames) - # - # # Logging - # logs_exp2["grad_vals"].append(float(gv)) - # - # logs_exp2["losses"].append(error.item()) - # logs_exp2["values"].append(action_value.mean().item()) - # logs_exp2["traj_count"].append( - # prev_traj_count + data["next", "done"].sum().item() - # ) - # prev_traj_count = logs_exp2["traj_count"][-1] - # if j % 10 == 0: - # with set_exploration_mode("mode"), torch.no_grad(): - # # execute a rollout. The `set_exploration_mode("mode")` has - # # no effect here since the policy is deterministic, but we add - # # it for completeness - # eval_rollout = test_env.rollout( - # max_steps=10000, - # policy=actor, - # ).cpu() - # logs_exp2["traj_lengths_eval"].append(eval_rollout.shape[-1]) - # logs_exp2["evals"].append(eval_rollout["next", "reward"].sum().item()) - # if len(logs_exp2["mavgs"]): - # logs_exp2["mavgs"].append( - # logs_exp2["evals"][-1] * 0.05 + logs_exp2["mavgs"][-1] * 0.95 - # ) - # else: - # logs_exp2["mavgs"].append(logs_exp2["evals"][-1]) - # logs_exp2["traj_count_eval"].append(logs_exp2["traj_count"][-1]) - # pbar.set_description( - # f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp2['evals'][-1]: 4.4f}" - # ) - # - # # update policy weights - # data_collector.update_policy_weights_() - # - # - # ############################################################################### - # # TD(:math:`\lambda`) performs significantly better than TD(0) because it - # # retrieves a much less biased estimate of the state-action value. - # # - # # .. code-block:: python - # # - # # plot(logs_exp2, "dqn_tdlambda.png") - # # - # # .. 
figure:: /_static/img/dqn_tdlambda.png - # # :alt: Cart Pole results with TD(lambda) - # # - # - # - # print("shutting down") - # data_collector.shutdown() - # del data_collector - # - # ############################################################################### - # # Let's compare the results on a single plot. Because the TD(lambda) version - # # works better, we'll have fewer episodes collected for a given number of - # # frames (as there are more frames per episode). - # # - # # **Note**: As already mentioned above, to get a more reasonable performance, - # # use a greater value for ``total_frames`` e.g. 500000. - # - # - # def plot_both(): - # frames_td0 = logs_exp1["frames"] - # frames_tdlambda = logs_exp2["frames"] - # evals_td0 = logs_exp1["evals"] - # evals_tdlambda = logs_exp2["evals"] - # mavgs_td0 = logs_exp1["mavgs"] - # mavgs_tdlambda = logs_exp2["mavgs"] - # traj_count_td0 = logs_exp1["traj_count_eval"] - # traj_count_tdlambda = logs_exp2["traj_count_eval"] - # - # plt.figure(figsize=(15, 10)) - # plt.subplot(1, 2, 1) - # plt.plot(frames_td0[-len(evals_td0) :], evals_td0, label="return (td0)", alpha=0.5) - # plt.plot( - # frames_tdlambda[-len(evals_tdlambda) :], - # evals_tdlambda, - # label="return (td(lambda))", - # alpha=0.5, - # ) - # plt.plot(frames_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") - # plt.plot( - # frames_tdlambda[-len(mavgs_tdlambda) :], - # mavgs_tdlambda, - # label="mavg (td(lambda))", - # ) - # plt.xlabel("frames collected") - # plt.ylabel("trajectory length (= return)") - # - # plt.subplot(1, 2, 2) - # plt.plot( - # traj_count_td0[-len(evals_td0) :], - # evals_td0, - # label="return (td0)", - # alpha=0.5, - # ) - # plt.plot( - # traj_count_tdlambda[-len(evals_tdlambda) :], - # evals_tdlambda, - # label="return (td(lambda))", - # alpha=0.5, - # ) - # plt.plot(traj_count_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") - # plt.plot( - # traj_count_tdlambda[-len(mavgs_tdlambda) :], - # mavgs_tdlambda, - # label="mavg (td(lambda))", - # ) - # plt.xlabel("trajectories collected") - # plt.legend() - # - # plt.savefig("dqn.png") - # - # - # ############################################################################### - # # .. code-block:: python - # # - # # plot_both() - # # - # # .. figure:: /_static/img/dqn.png - # # :alt: Cart Pole results from the TD(:math:`lambda`) trained policy. - # # - # # Finally, we generate a new video to check what the algorithm has learnt. - # # If all goes well, the duration should be significantly longer than with a - # # random rollout. - # # - # # To get the raw pixels of the rollout, we insert a - # # :class:`torchrl.envs.CatTensors` transform that precedes all others and copies - # # the ``"pixels"`` key onto a ``"pixels_save"`` key. This is necessary because - # # the other transforms that modify this key will update its value in-place in - # # the output tensordict. - # # - # - # test_env.transform.insert(0, CatTensors(["pixels"], "pixels_save", del_keys=False)) - # eval_rollout = test_env.rollout(max_steps=10000, policy=actor, auto_reset=True).cpu() - # - # # sphinx_gallery_start_ignore - # import imageio - # - # imageio.mimwrite("cartpole.gif", eval_rollout["pixels_save"].numpy(), fps=30) - # # sphinx_gallery_end_ignore - # - # del test_env - # - # ############################################################################### - # # The video of the rollout can be saved using the imageio package: - # # - # # .. 
code-block:: - # # - # # import imageio - # # imageio.mimwrite('cartpole.mp4', eval_rollout["pixels_save"].numpy(), fps=30); - # # - # # .. figure:: /_static/img/cartpole.gif - # # :alt: Cart Pole results from the TD(:math:`\lambda`) trained policy. - # - # ############################################################################### - # # Conclusion and possible improvements - # # ------------------------------------ - # # - # # In this tutorial we have learnt: - # # - # # - How to train a policy that read pixel-based states, what transforms to - # # include and how to normalize the data; - # # - How to create a policy that picks up the action with the highest value - # # with :class:`torchrl.modules.QValueNetwork`; - # # - How to build a multiprocessed data collector; - # # - How to train a DQN with TD(:math:`\lambda`) returns. - # # - # # We have seen that using TD(:math:`\lambda`) greatly improved the performance - # # of DQN. Other possible improvements could include: - # # - # # - Using the Multi-Step post-processing. Multi-step will project an action - # # to the nth following step, and create a discounted sum of the rewards in - # # between. This trick can make the algorithm noticebly less myopic. To use - # # this, simply create the collector with - # # - # # from torchrl.data.postprocs.postprocs import MultiStep - # # collector = CollectorClass(..., postproc=MultiStep(gamma, n)) - # # - # # where ``n`` is the number of looking-forward steps. Pay attention to the - # # fact that the ``gamma`` factor has to be corrected by the number of - # # steps till the next observation when being passed to - # # ``vec_td_lambda_advantage_estimate``: - # # - # # gamma = gamma ** tensordict["steps_to_next_obs"] - # # - A prioritized replay buffer could also be used. This will give a - # # higher priority to samples that have the worst value accuracy. - # # - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` - # # for more information). - # # - More fancy exploration techniques, such as NoisyLinear layers and such - # # (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the - # # ``MLP`` class used in our Dueling DQN). +# ############################################################################### +# # We represent the parameters and targets as flat structures, but unflattening +# # them is quite easy: +# +# params_flat = params.flatten_keys(".") +# +# ############################################################################### +# # We will be using the adam optimizer: +# +# optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) +# +# ############################################################################### +# # We create a test environment for evaluation of the policy: +# +# test_env = make_env( +# parallel=False, obs_norm_sd=obs_norm_sd +# ) +# # sanity check: +# print(actor_explore(test_env.reset())) +# +# ############################################################################### +# # Training loop of a regular DQN +# # ------------------------------ +# # +# # We'll start with a simple implementation of DQN where the returns are +# # computed without bootstrapping, i.e. +# # +# # .. math:: +# # +# # Q_{t}(s, a) = R(s, a) + \gamma * V_{t+1}(s) +# # +# # where :math:`Q(s, a)` is the Q-value of the current state-action pair, +# # :math:`R(s, a)` is the result of the reward function, and :math:`V(s)` is a +# # value function that returns 0 for terminating states. 
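+# #
+# # As a small numerical illustration of that target (the numbers are made up
+# # for this example; the masking by ``(1 - done)`` is what zeroes the
+# # bootstrap term at terminating states):
+# #
+# # .. code-block:: python
+# #
+# #   import torch
+# #
+# #   gamma = 0.99
+# #   reward = torch.tensor([1.0, 1.0, 1.0])
+# #   next_value = torch.tensor([10.0, 5.0, 3.0])
+# #   done = torch.tensor([0.0, 0.0, 1.0])
+# #   target = reward + gamma * next_value * (1 - done)
+# #   # -> approximately [10.90, 5.95, 1.00]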
+# # +# # We store the logs in a defaultdict: +# +# logs_exp1 = defaultdict(list) +# prev_traj_count = 0 +# +# pbar = tqdm.tqdm(total=total_frames) +# for j, data in enumerate(data_collector): +# current_frames = data.numel() +# pbar.update(current_frames) +# data = data.view(-1) +# +# # We store the values on the replay buffer, after placing them on CPU. +# # When called for the first time, this will instantiate our storage +# # object which will print its content. +# replay_buffer.extend(data.cpu()) +# +# # some logging +# if len(logs_exp1["frames"]): +# logs_exp1["frames"].append(current_frames + logs_exp1["frames"][-1]) +# else: +# logs_exp1["frames"].append(current_frames) +# +# if data["next", "done"].any(): +# done = data["next", "done"].squeeze(-1) +# logs_exp1["traj_lengths"].append( +# data["next", "step_count"][done].float().mean().item() +# ) +# +# # check that we have enough data to start training +# if sum(logs_exp1["frames"]) > init_random_frames: +# for _ in range(n_optim): +# # sample from the RB and send to device +# sampled_data = replay_buffer.sample(batch_size) +# sampled_data = sampled_data.to(device, non_blocking=True) +# +# # collect data from RB +# reward = sampled_data["next", "reward"].squeeze(-1) +# done = sampled_data["next", "done"].squeeze(-1).to(reward.dtype) +# action = sampled_data["action"].clone() +# +# # Compute action value (of the action actually taken) at time t +# # By default, TorchRL uses one-hot encodings for discrete actions +# sampled_data_out = sampled_data.select(*actor.in_keys) +# sampled_data_out = factor(sampled_data_out, params=params) +# action_value = sampled_data_out["action_value"] +# action_value = (action_value * action.to(action_value.dtype)).sum(-1) +# with torch.no_grad(): +# # compute best action value for the next step, using target parameters +# tdstep = step_mdp(sampled_data) +# next_value = factor( +# tdstep.select(*actor.in_keys), +# params=params_target, +# )["chosen_action_value"].squeeze(-1) +# exp_value = reward + gamma * next_value * (1 - done) +# assert exp_value.shape == action_value.shape +# # we use MSE loss but L1 or smooth L1 should also work +# error = nn.functional.mse_loss(exp_value, action_value).mean() +# error.backward() +# +# gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) +# +# optim.step() +# optim.zero_grad() +# +# # update of the target parameters +# params_target.apply( +# lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), +# params.detach(), +# inplace=True, +# ) +# +# actor_explore.step(current_frames) +# +# # Logging +# logs_exp1["grad_vals"].append(float(gv)) +# logs_exp1["losses"].append(error.item()) +# logs_exp1["values"].append(action_value.mean().item()) +# logs_exp1["traj_count"].append( +# prev_traj_count + data["next", "done"].sum().item() +# ) +# prev_traj_count = logs_exp1["traj_count"][-1] +# +# if j % 10 == 0: +# with set_exploration_mode("mode"), torch.no_grad(): +# # execute a rollout. 
The `set_exploration_mode("mode")` has no effect here since the policy is deterministic, but we add it for completeness +# eval_rollout = test_env.rollout( +# max_steps=10000, +# policy=actor, +# ).cpu() +# logs_exp1["traj_lengths_eval"].append(eval_rollout.shape[-1]) +# logs_exp1["evals"].append(eval_rollout["next", "reward"].sum().item()) +# if len(logs_exp1["mavgs"]): +# logs_exp1["mavgs"].append( +# logs_exp1["evals"][-1] * 0.05 + logs_exp1["mavgs"][-1] * 0.95 +# ) +# else: +# logs_exp1["mavgs"].append(logs_exp1["evals"][-1]) +# logs_exp1["traj_count_eval"].append(logs_exp1["traj_count"][-1]) +# pbar.set_description( +# f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp1['evals'][-1]: 4.4f}" +# ) +# +# # update policy weights +# data_collector.update_policy_weights_() +# +# ############################################################################### +# # We write a custom plot function to display the performance of our algorithm +# # +# +# +# def plot(logs, name): +# plt.figure(figsize=(15, 10)) +# plt.subplot(2, 3, 1) +# plt.plot( +# logs["frames"][-len(logs["evals"]) :], +# logs["evals"], +# label="return (eval)", +# ) +# plt.plot( +# logs["frames"][-len(logs["mavgs"]) :], +# logs["mavgs"], +# label="mavg of returns (eval)", +# ) +# plt.xlabel("frames collected") +# plt.ylabel("trajectory length (= return)") +# plt.subplot(2, 3, 2) +# plt.plot( +# logs["traj_count"][-len(logs["evals"]) :], +# logs["evals"], +# label="return", +# ) +# plt.plot( +# logs["traj_count"][-len(logs["mavgs"]) :], +# logs["mavgs"], +# label="mavg", +# ) +# plt.xlabel("trajectories collected") +# plt.legend() +# plt.subplot(2, 3, 3) +# plt.plot(logs["frames"][-len(logs["losses"]) :], logs["losses"]) +# plt.xlabel("frames collected") +# plt.title("loss") +# plt.subplot(2, 3, 4) +# plt.plot(logs["frames"][-len(logs["values"]) :], logs["values"]) +# plt.xlabel("frames collected") +# plt.title("value") +# plt.subplot(2, 3, 5) +# plt.plot( +# logs["frames"][-len(logs["grad_vals"]) :], +# logs["grad_vals"], +# ) +# plt.xlabel("frames collected") +# plt.title("grad norm") +# if len(logs["traj_lengths"]): +# plt.subplot(2, 3, 6) +# plt.plot(logs["traj_lengths"]) +# plt.xlabel("batches") +# plt.title("traj length (training)") +# plt.savefig(name) +# if is_notebook(): +# plt.show() +# +# +# ############################################################################### +# # The performance of the policy can be measured as the length of trajectories. +# # As we can see on the results of the :func:`plot` function, the performance +# # of the policy increases, albeit slowly. +# # +# # .. code-block:: python +# # +# # plot(logs_exp1, "dqn_td0.png") +# # +# # .. figure:: /_static/img/dqn_td0.png +# # :alt: Cart Pole results with TD(0) +# # +# +# print("shutting down") +# data_collector.shutdown() +# del data_collector +# +# ############################################################################### +# # DQN with TD(:math:`\lambda`) +# # ---------------------------- +# # +# # We can improve the above algorithm by getting a better estimate of the +# # return, using not only the next state value but the whole sequence of rewards +# # and values that follow a particular step. +# # +# # TorchRL provides a vectorized version of TD(lambda) named +# # :func:`torchrl.objectives.value.functional.vec_td_lambda_advantage_estimate`. +# # We'll use this to obtain a target value that the value network will be +# # trained to match. 
+# # +# # The big difference in this implementation is that we'll store entire +# # trajectories and not single steps in the replay buffer. This will be done +# # automatically as long as we're not "flattening" the tensordict collected: +# # by keeping a shape ``[Batch x timesteps]`` and giving this +# # to the RB, we'll be creating a replay buffer of size +# # ``[Capacity x timesteps]``. +# +# +# from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate +# +# ############################################################################### +# # We reset the actor parameters: +# # +# +# ( +# factor, +# actor, +# actor_explore, +# params, +# params_target, +# ) = make_model(test_env) +# params_flat = params.flatten_keys(".") +# +# optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) +# test_env = make_env( +# parallel=False, obs_norm_sd=obs_norm_sd +# ) +# print(actor_explore(test_env.reset())) +# +# ############################################################################### +# # Data: Replay buffer and collector +# # --------------------------------- +# # +# # We need to build a new replay buffer of the appropriate size: +# # +# +# max_size = frames_per_batch // num_workers +# +# replay_buffer = TensorDictReplayBuffer( +# storage=LazyMemmapStorage(-(-buffer_size // max_size)), +# prefetch=n_optim, +# ) +# +# data_collector = MultiaSyncDataCollector( +# [ +# make_env( +# parallel=True, obs_norm_sd=obs_norm_sd +# ), +# ] +# * num_collectors, +# policy=actor_explore, +# frames_per_batch=frames_per_batch, +# total_frames=total_frames, +# exploration_mode="random", +# devices=[device] * num_collectors, +# storing_devices=[device] * num_collectors, +# # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], +# # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], +# split_trajs=False, +# ) +# +# +# logs_exp2 = defaultdict(list) +# prev_traj_count = 0 +# +# ############################################################################### +# # Training loop +# # ------------- +# # +# # There are very few differences with the training loop above: +# # +# # - The tensordict received by the collector is used as-is, without being +# # flattened (recall the ``data.view(-1)`` above), to keep the temporal +# # relation between consecutive steps. +# # - We use :func:`vec_td_lambda_advantage_estimate` to compute the target +# # value. 
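+# #
+# # As a quick, shape-oriented sketch of that call (the tensor shapes and the
+# # random inputs below are assumptions of this example; the argument order is
+# # the one used in the training loop that follows):
+# #
+# # .. code-block:: python
+# #
+# #   import torch
+# #   from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate
+# #
+# #   B, T = 4, 16  # trajectories x time steps, as stored in the replay buffer
+# #   value = torch.randn(B, T, 1)       # Q(s, a) of the action actually taken
+# #   next_value = torch.randn(B, T, 1)  # best next action value (target params)
+# #   reward = torch.randn(B, T, 1)
+# #   done = torch.zeros(B, T, 1)
+# #   adv = vec_td_lambda_advantage_estimate(0.99, 0.95, value, next_value, reward, done)
+# #   # ``adv`` keeps the batch/time layout of the inputs and is squared to form the loss below.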
+# +# pbar = tqdm.tqdm(total=total_frames) +# for j, data in enumerate(data_collector): +# current_frames = data.numel() +# pbar.update(current_frames) +# +# replay_buffer.extend(data.cpu()) +# if len(logs_exp2["frames"]): +# logs_exp2["frames"].append(current_frames + logs_exp2["frames"][-1]) +# else: +# logs_exp2["frames"].append(current_frames) +# +# if data["next", "done"].any(): +# done = data["next", "done"].squeeze(-1) +# logs_exp2["traj_lengths"].append( +# data["next", "step_count"][done].float().mean().item() +# ) +# +# if sum(logs_exp2["frames"]) > init_random_frames: +# for _ in range(n_optim): +# sampled_data = replay_buffer.sample(batch_size // max_size) +# sampled_data = sampled_data.clone().to(device, non_blocking=True) +# +# reward = sampled_data["next", "reward"] +# done = sampled_data["next", "done"].to(reward.dtype) +# action = sampled_data["action"].clone() +# +# sampled_data_out = sampled_data.select(*actor.in_keys) +# sampled_data_out = vmap(factor, (0, None))(sampled_data_out, params) +# action_value = sampled_data_out["action_value"] +# action_value = (action_value * action.to(action_value.dtype)).sum(-1, True) +# with torch.no_grad(): +# tdstep = step_mdp(sampled_data) +# next_value = vmap(factor, (0, None))( +# tdstep.select(*actor.in_keys), params +# ) +# next_value = next_value["chosen_action_value"] +# error = vec_td_lambda_advantage_estimate( +# gamma, +# lmbda, +# action_value, +# next_value, +# reward, +# done, +# ).pow(2) +# error = error.mean() +# error.backward() +# +# gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) +# +# optim.step() +# optim.zero_grad() +# +# # update of the target parameters +# params_target.apply( +# lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), +# params.detach(), +# inplace=True, +# ) +# +# actor_explore.step(current_frames) +# +# # Logging +# logs_exp2["grad_vals"].append(float(gv)) +# +# logs_exp2["losses"].append(error.item()) +# logs_exp2["values"].append(action_value.mean().item()) +# logs_exp2["traj_count"].append( +# prev_traj_count + data["next", "done"].sum().item() +# ) +# prev_traj_count = logs_exp2["traj_count"][-1] +# if j % 10 == 0: +# with set_exploration_mode("mode"), torch.no_grad(): +# # execute a rollout. The `set_exploration_mode("mode")` has +# # no effect here since the policy is deterministic, but we add +# # it for completeness +# eval_rollout = test_env.rollout( +# max_steps=10000, +# policy=actor, +# ).cpu() +# logs_exp2["traj_lengths_eval"].append(eval_rollout.shape[-1]) +# logs_exp2["evals"].append(eval_rollout["next", "reward"].sum().item()) +# if len(logs_exp2["mavgs"]): +# logs_exp2["mavgs"].append( +# logs_exp2["evals"][-1] * 0.05 + logs_exp2["mavgs"][-1] * 0.95 +# ) +# else: +# logs_exp2["mavgs"].append(logs_exp2["evals"][-1]) +# logs_exp2["traj_count_eval"].append(logs_exp2["traj_count"][-1]) +# pbar.set_description( +# f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp2['evals'][-1]: 4.4f}" +# ) +# +# # update policy weights +# data_collector.update_policy_weights_() +# +# +# ############################################################################### +# # TD(:math:`\lambda`) performs significantly better than TD(0) because it +# # retrieves a much less biased estimate of the state-action value. +# # +# # .. code-block:: python +# # +# # plot(logs_exp2, "dqn_tdlambda.png") +# # +# # .. 
figure:: /_static/img/dqn_tdlambda.png +# # :alt: Cart Pole results with TD(lambda) +# # +# +# +# print("shutting down") +# data_collector.shutdown() +# del data_collector +# +# ############################################################################### +# # Let's compare the results on a single plot. Because the TD(lambda) version +# # works better, we'll have fewer episodes collected for a given number of +# # frames (as there are more frames per episode). +# # +# # **Note**: As already mentioned above, to get a more reasonable performance, +# # use a greater value for ``total_frames`` e.g. 500000. +# +# +# def plot_both(): +# frames_td0 = logs_exp1["frames"] +# frames_tdlambda = logs_exp2["frames"] +# evals_td0 = logs_exp1["evals"] +# evals_tdlambda = logs_exp2["evals"] +# mavgs_td0 = logs_exp1["mavgs"] +# mavgs_tdlambda = logs_exp2["mavgs"] +# traj_count_td0 = logs_exp1["traj_count_eval"] +# traj_count_tdlambda = logs_exp2["traj_count_eval"] +# +# plt.figure(figsize=(15, 10)) +# plt.subplot(1, 2, 1) +# plt.plot(frames_td0[-len(evals_td0) :], evals_td0, label="return (td0)", alpha=0.5) +# plt.plot( +# frames_tdlambda[-len(evals_tdlambda) :], +# evals_tdlambda, +# label="return (td(lambda))", +# alpha=0.5, +# ) +# plt.plot(frames_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") +# plt.plot( +# frames_tdlambda[-len(mavgs_tdlambda) :], +# mavgs_tdlambda, +# label="mavg (td(lambda))", +# ) +# plt.xlabel("frames collected") +# plt.ylabel("trajectory length (= return)") +# +# plt.subplot(1, 2, 2) +# plt.plot( +# traj_count_td0[-len(evals_td0) :], +# evals_td0, +# label="return (td0)", +# alpha=0.5, +# ) +# plt.plot( +# traj_count_tdlambda[-len(evals_tdlambda) :], +# evals_tdlambda, +# label="return (td(lambda))", +# alpha=0.5, +# ) +# plt.plot(traj_count_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") +# plt.plot( +# traj_count_tdlambda[-len(mavgs_tdlambda) :], +# mavgs_tdlambda, +# label="mavg (td(lambda))", +# ) +# plt.xlabel("trajectories collected") +# plt.legend() +# +# plt.savefig("dqn.png") +# +# +# ############################################################################### +# # .. code-block:: python +# # +# # plot_both() +# # +# # .. figure:: /_static/img/dqn.png +# # :alt: Cart Pole results from the TD(:math:`lambda`) trained policy. +# # +# # Finally, we generate a new video to check what the algorithm has learnt. +# # If all goes well, the duration should be significantly longer than with a +# # random rollout. +# # +# # To get the raw pixels of the rollout, we insert a +# # :class:`torchrl.envs.CatTensors` transform that precedes all others and copies +# # the ``"pixels"`` key onto a ``"pixels_save"`` key. This is necessary because +# # the other transforms that modify this key will update its value in-place in +# # the output tensordict. +# # +# +# test_env.transform.insert(0, CatTensors(["pixels"], "pixels_save", del_keys=False)) +# eval_rollout = test_env.rollout(max_steps=10000, policy=actor, auto_reset=True).cpu() +# +# # sphinx_gallery_start_ignore +# import imageio +# +# imageio.mimwrite("cartpole.gif", eval_rollout["pixels_save"].numpy(), fps=30) +# # sphinx_gallery_end_ignore +# +# del test_env +# +# ############################################################################### +# # The video of the rollout can be saved using the imageio package: +# # +# # .. code-block:: +# # +# # import imageio +# # imageio.mimwrite('cartpole.mp4', eval_rollout["pixels_save"].numpy(), fps=30); +# # +# # .. 
figure:: /_static/img/cartpole.gif +# # :alt: Cart Pole results from the TD(:math:`\lambda`) trained policy. +# +# ############################################################################### +# # Conclusion and possible improvements +# # ------------------------------------ +# # +# # In this tutorial we have learnt: +# # +# # - How to train a policy that read pixel-based states, what transforms to +# # include and how to normalize the data; +# # - How to create a policy that picks up the action with the highest value +# # with :class:`torchrl.modules.QValueNetwork`; +# # - How to build a multiprocessed data collector; +# # - How to train a DQN with TD(:math:`\lambda`) returns. +# # +# # We have seen that using TD(:math:`\lambda`) greatly improved the performance +# # of DQN. Other possible improvements could include: +# # +# # - Using the Multi-Step post-processing. Multi-step will project an action +# # to the nth following step, and create a discounted sum of the rewards in +# # between. This trick can make the algorithm noticebly less myopic. To use +# # this, simply create the collector with +# # +# # from torchrl.data.postprocs.postprocs import MultiStep +# # collector = CollectorClass(..., postproc=MultiStep(gamma, n)) +# # +# # where ``n`` is the number of looking-forward steps. Pay attention to the +# # fact that the ``gamma`` factor has to be corrected by the number of +# # steps till the next observation when being passed to +# # ``vec_td_lambda_advantage_estimate``: +# # +# # gamma = gamma ** tensordict["steps_to_next_obs"] +# # - A prioritized replay buffer could also be used. This will give a +# # higher priority to samples that have the worst value accuracy. +# # - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` +# # for more information). +# # - More fancy exploration techniques, such as NoisyLinear layers and such +# # (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the +# # ``MLP`` class used in our Dueling DQN). From fb81fc318f97faf032c6ddf8c760e24f3fadbd73 Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 24 Mar 2023 17:10:33 +0000 Subject: [PATCH 09/89] empty From d13704849087fc8c36b2258c6c0d73eed9eee113 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 23 Mar 2023 17:31:38 +0000 Subject: [PATCH 10/89] init --- torchrl/modules/tensordict_module/sequence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchrl/modules/tensordict_module/sequence.py b/torchrl/modules/tensordict_module/sequence.py index ee7c181db5d..94bc3aa4970 100644 --- a/torchrl/modules/tensordict_module/sequence.py +++ b/torchrl/modules/tensordict_module/sequence.py @@ -13,7 +13,7 @@ class SafeSequential(TensorDictSequential, SafeModule): - """A sequence of TensorDictModules. + """A safe sequence of TensorDictModules. Similarly to :obj:`nn.Sequence` which passes a tensor through a chain of mappings that read and write a single tensor each, this module will read and write over a tensordict by querying each of the input modules. From adad97d884bac1ca7e7a5b5cc5d4ebc3d19eda11 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 23 Mar 2023 16:41:18 +0000 Subject: [PATCH 11/89] init --- torchrl/objectives/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 3db3df0e9cd..5ff7c171d7a 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -37,9 +37,12 @@ class LossModule(nn.Module): """A parent class for RL losses. 
- LossModule inherits from nn.Module. It is designed to read an input TensorDict and return another tensordict - with loss keys named "loss_*". - Splitting the loss in its component can then be used by the trainer to log the various loss values throughout + LossModule inherits from nn.Module. It is designed to read an input + TensorDict and return another tensordict + with loss keys named ``"loss_*"``. + + Splitting the loss in its component can then be used by the trainer to log + the various loss values throughout training. Other scalars present in the output tensordict will be logged too. """ @@ -75,6 +78,8 @@ def convert_to_functional( compare_against: Optional[List[Parameter]] = None, funs_to_decorate=None, ) -> None: + """Converts a module to functional to be used in the loss. + """ if funs_to_decorate is None: funs_to_decorate = ["forward"] # To make it robust to device casting, we must register list of From ea206037540abb02630cea02a86fe28339bb376d Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 24 Mar 2023 08:39:17 +0000 Subject: [PATCH 12/89] amend --- docs/source/reference/objectives.rst | 1 + torchrl/objectives/common.py | 10 +++++ torchrl/objectives/dqn.py | 54 ++++++++++++++++++-------- torchrl/objectives/utils.py | 3 ++ torchrl/objectives/value/__init__.py | 2 +- torchrl/objectives/value/advantages.py | 41 +++++++++++++++++-- 6 files changed, 89 insertions(+), 22 deletions(-) diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index 84b7d0a2cb7..2a7a2f92be7 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -108,6 +108,7 @@ Returns :toctree: generated/ :template: rl_template_noinherit.rst + ValueFunctionBase GAE TDLambdaEstimate TDEstimate diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 5ff7c171d7a..d3be7b75c3e 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -18,6 +18,7 @@ from torch.nn import Parameter from torchrl.modules.utils import Buffer +from torchrl.objectives.value import ValueFunctionBase _has_functorch = False try: @@ -353,3 +354,12 @@ def half(self) -> LossModule: def cpu(self) -> LossModule: return self.to(torch.device("cpu")) + + def _default_value_function(self) -> ValueFunctionBase: + """A value-function constructor when none is provided. + + No kwarg should be present as default parameters should be retrieved + from :obj:`torchrl.objectives.utils.DEFAULT_VALUE_FUN_PARAMS`. + + """ + raise NotImplementedError diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index d79f202fca4..05ec29fe492 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -3,11 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from typing import Union +from typing import Union, Optional import torch -from tensordict import TensorDict -from tensordict.tensordict import TensorDictBase +from tensordict import TensorDict, TensorDictBase +from tensordict.nn import make_functional from torch import nn from torchrl.envs.utils import step_mdp @@ -15,7 +15,8 @@ from torchrl.modules.tensordict_module.common import ensure_tensordict_compatible from .common import LossModule -from .utils import distance_loss, next_state_value +from .utils import distance_loss, next_state_value, DEFAULT_VALUE_FUN_PARAMS +from .value import ValueFunctionBase, TDLambdaEstimate class DQNLoss(LossModule): @@ -33,7 +34,7 @@ class DQNLoss(LossModule): def __init__( self, value_network: Union[QValueActor, nn.Module], - gamma: float, + value_function: Optional[ValueFunctionBase]=None, loss_function: str = "l2", priority_key: str = "td_error", delay_value: bool = False, @@ -41,10 +42,12 @@ def __init__( super().__init__() self.delay_value = delay_value - + if value_function is not None and value_function.value_network is not value_network: + raise RuntimeError("value_function.value_network and value_network must match.") value_network = ensure_tensordict_compatible( module=value_network, wrapper_type=QValueActor ) + self.value_function.value_key = "chosen_action_value" self.convert_to_functional( value_network, @@ -52,13 +55,30 @@ def __init__( create_target_params=self.delay_value, ) + make_functional(self.value_network) + + if value_function is None: + value_function = self._default_value_function() + self.value_function = value_function + self.value_network_in_keys = value_network.in_keys - self.register_buffer("gamma", torch.tensor(gamma)) self.loss_function = loss_function self.priority_key = priority_key self.action_space = self.value_network.action_space + def _default_value_function(self): + return TDLambdaEstimate(gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, + lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, + value_network=self.value_network, + average_rewards=True, + differentiable=False, + vectorized=True, + advantage_key="advantage", + value_target_key = "value_target", + value_key="chosen_action_value", + ) + def forward(self, input_tensordict: TensorDictBase) -> TensorDict: """Computes the DQN loss given a tensordict sampled from the replay buffer. 
@@ -106,14 +126,9 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: action = action.to(torch.float) pred_val_index = (pred_val * action).sum(-1) - with torch.no_grad(): - target_value = next_state_value( - tensordict, - self.value_network, - gamma=self.gamma, - params=self.target_value_network_params, - next_val_key="chosen_action_value", - ) + self.value_function(tensordict, self.value_network_parameters, self.target_value_network_parameters) + target_value = tensordict[self.value_function.value_target_key] + priority_tensor = (pred_val_index - target_value).pow(2) priority_tensor = priority_tensor.detach().unsqueeze(-1) if input_tensordict.device is not None: @@ -150,12 +165,14 @@ class DistributionalDQNLoss(LossModule): def __init__( self, value_network: Union[DistributionalQValueActor, nn.Module], - gamma: float, + value_function: ValueFunctionBase, priority_key: str = "td_error", delay_value: bool = False, ): super().__init__() - self.register_buffer("gamma", torch.tensor(gamma)) + self.value_function = value_function + if self.value_function.value_network is not value_network: + raise RuntimeError("value_function.value_network and value_network must match.") self.priority_key = priority_key self.delay_value = delay_value @@ -168,6 +185,9 @@ def __init__( "value_network", create_target_params=self.delay_value, ) + + make_functional(self.value_function.value_network) + self.action_space = self.value_network.action_space @staticmethod diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 5c30e7d2244..7c85075b19b 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -14,6 +14,9 @@ from torchrl.envs.utils import step_mdp +class DEFAULT_VALUE_FUN_PARAMS: + gamma: 0.99 + lmbda: 0.95 class _context_manager: def __init__(self, value=True): diff --git a/torchrl/objectives/value/__init__.py b/torchrl/objectives/value/__init__.py index 11e8f316f0b..6152732f411 100644 --- a/torchrl/objectives/value/__init__.py +++ b/torchrl/objectives/value/__init__.py @@ -3,4 +3,4 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .advantages import GAE, TDEstimate, TDLambdaEstimate +from .advantages import GAE, TDEstimate, TDLambdaEstimate, ValueFunctionBase diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index d2e9b05dcf2..6ed0188502e 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - +import abc from functools import wraps from typing import List, Optional, Tuple, Union @@ -30,8 +30,41 @@ def new_fun(self, *args, **kwargs): return new_fun +class ValueFunctionBase(nn.Module): + """An abstract parent class for value function modules.""" + + value_network: TensorDictModule + value_key: Union[Tuple[str], str] + + @abc.abstractmethod + def forward( + self, + tensordict: TensorDictBase, + params: Optional[TensorDictBase] = None, + target_params: Optional[TensorDictBase] = None, + ) -> TensorDictBase: + """Computes the a value estimate given the data in tensordict. + + If a functional module is provided, a nested TensorDict containing the parameters + (and if relevant the target parameters) can be passed to the module. 
+ + Args: + tensordict (TensorDictBase): A TensorDict containing the data + (an observation key, "action", ("next", "reward"), ("next", "done") and "next" tensordict state + as returned by the environment) necessary to compute the value estimates and the TDEstimate. + The data passed to this module should be structured as :obj:`[*B, T, F]` where :obj:`B` are + the batch size, :obj:`T` the time dimension and :obj:`F` the feature dimension(s). + params (TensorDictBase, optional): A nested TensorDict containing the params + to be passed to the functional value network module. + target_params (TensorDictBase, optional): A nested TensorDict containing the + target params to be passed to the functional value network module. + + Returns: + An updated TensorDict with an advantage and a value_error keys as defined in the constructor. + """ + raise NotImplementedError -class TDEstimate(nn.Module): +class TDEstimate(ValueFunctionBase): """Temporal Difference estimate of advantage function. Args: @@ -198,7 +231,7 @@ def forward( return tensordict -class TDLambdaEstimate(nn.Module): +class TDLambdaEstimate(ValueFunctionBase): """TD-Lambda estimate of advantage function. Args: @@ -384,7 +417,7 @@ def forward( return tensordict -class GAE(nn.Module): +class GAE(ValueFunctionBase): """A class wrapper around the generalized advantage estimate functional. Refer to "HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION" From e8bf4c1395c565acb3de051a43f28c58f167578f Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 24 Mar 2023 11:07:02 +0000 Subject: [PATCH 13/89] amend --- test/test_cost.py | 6 ++-- torchrl/objectives/common.py | 3 +- torchrl/objectives/dqn.py | 50 ++++++++++++++++---------- torchrl/objectives/utils.py | 6 ++-- torchrl/objectives/value/advantages.py | 2 ++ 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/test/test_cost.py b/test/test_cost.py index ccb6a798d6c..2359c34f6df 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -109,7 +109,7 @@ def __enter__(self): pass def __exit__(self, exc_type, exc_val, exc_tb): - assert (self.td.select(*self.td_clone.keys()) == self.td_clone).all() + assert (self.td.select(*self.td_clone.keys()) == self.td_clone).all(), "Some keys have been modified in the tensordict!" def get_devices(): @@ -301,7 +301,7 @@ def test_dqn(self, delay_value, device, action_spec_type): td = self._create_mock_data_dqn( action_spec_type=action_spec_type, device=device ) - loss_fn = DQNLoss(actor, gamma=0.9, loss_function="l2", delay_value=delay_value) + loss_fn = DQNLoss(actor, loss_function="l2", delay_value=delay_value) with _check_td_steady(td): loss = loss_fn(td) assert loss_fn.priority_key in td.keys() @@ -341,7 +341,7 @@ def test_dqn_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9): action_spec_type=action_spec_type, device=device ) loss_fn = DQNLoss( - actor, gamma=gamma, loss_function="l2", delay_value=delay_value + actor, loss_function="l2", delay_value=delay_value ) ms = MultiStep(gamma=gamma, n_steps=n).to(device) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index d3be7b75c3e..134da3b1e34 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -79,8 +79,7 @@ def convert_to_functional( compare_against: Optional[List[Parameter]] = None, funs_to_decorate=None, ) -> None: - """Converts a module to functional to be used in the loss. 
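The ``ValueFunctionBase`` interface introduced above only fixes the ``forward(tensordict, params=None, target_params=None)`` contract together with the ``value_network`` and ``value_key`` attributes. As a purely illustrative sketch (not part of the patch; the ``ZeroAdvantage`` name and its trivial target are invented for the example), a custom estimator can be as small as:

    # Illustrative only: a trivial estimator written against the
    # ValueFunctionBase contract. It writes a zero advantage and reuses the
    # one-step reward as the value target.
    from typing import Optional

    import torch
    from tensordict.nn import TensorDictModule
    from tensordict.tensordict import TensorDictBase
    from torchrl.objectives.value import ValueFunctionBase

    class ZeroAdvantage(ValueFunctionBase):
        def __init__(self, value_network: TensorDictModule, value_key: str = "state_value"):
            super().__init__()
            self.value_network = value_network
            self.value_key = value_key

        def forward(
            self,
            tensordict: TensorDictBase,
            params: Optional[TensorDictBase] = None,
            target_params: Optional[TensorDictBase] = None,
        ) -> TensorDictBase:
            reward = tensordict.get(("next", "reward"))
            tensordict.set("advantage", torch.zeros_like(reward))
            tensordict.set("value_target", reward)
            return tensordict
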
- """ + """Converts a module to functional to be used in the loss.""" if funs_to_decorate is None: funs_to_decorate = ["forward"] # To make it robust to device casting, we must register list of diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 05ec29fe492..1ffe6d7c4eb 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Union, Optional +from typing import Optional, Union import torch from tensordict import TensorDict, TensorDictBase @@ -15,8 +15,8 @@ from torchrl.modules.tensordict_module.common import ensure_tensordict_compatible from .common import LossModule -from .utils import distance_loss, next_state_value, DEFAULT_VALUE_FUN_PARAMS -from .value import ValueFunctionBase, TDLambdaEstimate +from .utils import DEFAULT_VALUE_FUN_PARAMS, distance_loss, next_state_value +from .value import TDLambdaEstimate, ValueFunctionBase class DQNLoss(LossModule): @@ -34,7 +34,7 @@ class DQNLoss(LossModule): def __init__( self, value_network: Union[QValueActor, nn.Module], - value_function: Optional[ValueFunctionBase]=None, + value_function: Optional[ValueFunctionBase] = None, loss_function: str = "l2", priority_key: str = "td_error", delay_value: bool = False, @@ -42,12 +42,16 @@ def __init__( super().__init__() self.delay_value = delay_value - if value_function is not None and value_function.value_network is not value_network: - raise RuntimeError("value_function.value_network and value_network must match.") + if ( + value_function is not None + and value_function.value_network is not value_network + ): + raise RuntimeError( + "value_function.value_network and value_network must match." 
+ ) value_network = ensure_tensordict_compatible( module=value_network, wrapper_type=QValueActor ) - self.value_function.value_key = "chosen_action_value" self.convert_to_functional( value_network, @@ -59,6 +63,8 @@ def __init__( if value_function is None: value_function = self._default_value_function() + else: + value_function.value_key = "chosen_action_value" self.value_function = value_function self.value_network_in_keys = value_network.in_keys @@ -68,15 +74,16 @@ def __init__( self.action_space = self.value_network.action_space def _default_value_function(self): - return TDLambdaEstimate(gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, - lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, - value_network=self.value_network, - average_rewards=True, - differentiable=False, - vectorized=True, - advantage_key="advantage", - value_target_key = "value_target", - value_key="chosen_action_value", + return TDLambdaEstimate( + gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, + lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, + value_network=self.value_network, + average_rewards=True, + differentiable=False, + vectorized=True, + advantage_key="advantage", + value_target_key="value_target", + value_key="chosen_action_value", ) def forward(self, input_tensordict: TensorDictBase) -> TensorDict: @@ -126,8 +133,11 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: action = action.to(torch.float) pred_val_index = (pred_val * action).sum(-1) - self.value_function(tensordict, self.value_network_parameters, self.target_value_network_parameters) - target_value = tensordict[self.value_function.value_target_key] + target_value = self.value_function( + tensordict.clone(False), + self.value_network_params, + self.target_value_network_params, + ).get(self.value_function.value_target_key).squeeze(-1) priority_tensor = (pred_val_index - target_value).pow(2) priority_tensor = priority_tensor.detach().unsqueeze(-1) @@ -172,7 +182,9 @@ def __init__( super().__init__() self.value_function = value_function if self.value_function.value_network is not value_network: - raise RuntimeError("value_function.value_network and value_network must match.") + raise RuntimeError( + "value_function.value_network and value_network must match." + ) self.priority_key = priority_key self.delay_value = delay_value diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 7c85075b19b..e18b6c841c4 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -14,9 +14,11 @@ from torchrl.envs.utils import step_mdp + class DEFAULT_VALUE_FUN_PARAMS: - gamma: 0.99 - lmbda: 0.95 + gamma = 0.99 + lmbda = 0.95 + class _context_manager: def __init__(self, value=True): diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 6ed0188502e..7f91bc349c3 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -30,6 +30,7 @@ def new_fun(self, *args, **kwargs): return new_fun + class ValueFunctionBase(nn.Module): """An abstract parent class for value function modules.""" @@ -64,6 +65,7 @@ def forward( """ raise NotImplementedError + class TDEstimate(ValueFunctionBase): """Temporal Difference estimate of advantage function. 
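With the fixes above (a default estimator when none is given, and explicit estimators accepted as long as they wrap the very same value network), the DQNLoss construction pattern these patches converge to can be sketched as follows. The ``QValueActor`` built from a toy linear head, the observation size and the hyperparameter values are assumptions for illustration, and intermediate commits may still shift details (the extra ``make_functional`` call, for instance, is dropped two patches later):

    # Hypothetical usage sketch of the reworked DQNLoss API.
    from torch import nn
    from torchrl.modules import QValueActor
    from torchrl.objectives import DQNLoss
    from torchrl.objectives.value import TDLambdaEstimate

    actor = QValueActor(nn.Linear(4, 3))  # assumed toy Q-network over 3 actions

    # default path: the loss builds a TD(lambda) estimator internally
    loss = DQNLoss(actor, loss_function="l2", delay_value=True)

    # explicit path: the estimator must wrap the same value network object
    td_lambda = TDLambdaEstimate(
        gamma=0.99,
        lmbda=0.95,
        value_network=actor,
        value_key="chosen_action_value",
    )
    loss = DQNLoss(actor, value_function=td_lambda, loss_function="l2")
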
From d178f93c391bf9227d2494eb4cb4141cd37f9e1c Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 24 Mar 2023 13:00:31 +0000 Subject: [PATCH 14/89] value_estimate and sac init --- torchrl/objectives/dqn.py | 23 +- torchrl/objectives/sac.py | 308 +++---------------------- torchrl/objectives/value/advantages.py | 35 ++- 3 files changed, 72 insertions(+), 294 deletions(-) diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 1ffe6d7c4eb..57329cb0404 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -24,7 +24,8 @@ class DQNLoss(LossModule): Args: value_network (QValueActor or nn.Module): a Q value operator. - gamma (scalar): a discount factor for return computation. + value_function (ValueFunctionBase, optional): the value function module + to be used. Defaults to :class:`torchrl.objectives.values.TDLambdaEstimate`. loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". delay_value (bool, optional): whether to duplicate the value network into a new target value network to create a double DQN. Default is :obj:`False`. @@ -59,8 +60,6 @@ def __init__( create_target_params=self.delay_value, ) - make_functional(self.value_network) - if value_function is None: value_function = self._default_value_function() else: @@ -169,22 +168,23 @@ class DistributionalDQNLoss(LossModule): value_network (DistributionalQValueActor or nn.Module): the distributional Q value operator. gamma (scalar): a discount factor for return computation. - delay_value (bool): whether to duplicate the value network into a new target value network to create double DQN + .. note:: + Unlike :class:`DQNLoss`, this class does not currently support + custom value functions. The next value estimation is not + bootstrapped. + delay_value (bool): whether to duplicate the value network into a new + target value network to create double DQN """ def __init__( self, value_network: Union[DistributionalQValueActor, nn.Module], - value_function: ValueFunctionBase, + gamma: float, priority_key: str = "td_error", delay_value: bool = False, ): super().__init__() - self.value_function = value_function - if self.value_function.value_network is not value_network: - raise RuntimeError( - "value_function.value_network and value_network must match." - ) + self.register_buffer("gamma", torch.tensor(gamma)) self.priority_key = priority_key self.delay_value = delay_value @@ -197,9 +197,6 @@ def __init__( "value_network", create_target_params=self.delay_value, ) - - make_functional(self.value_function.value_network) - self.action_space = self.value_network.action_space @staticmethod diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 48ca586245e..f5318957d1a 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -15,7 +15,9 @@ from torchrl.modules import ProbabilisticActor from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import distance_loss, next_state_value +from torchrl.objectives.utils import distance_loss, next_state_value, \ + DEFAULT_VALUE_FUN_PARAMS +from .value import ValueFunctionBase, TDLambdaEstimate from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule @@ -39,14 +41,18 @@ class SACLoss(LossModule): Args: actor_network (ProbabilisticActor): stochastic actor - qvalue_network (TensorDictModule): Q(s, a) parametric model - value_network (TensorDictModule, optional): V(s) parametric model. 
If not - provided, the second version of SAC is assumed. - gamma (number, optional): discount for return computation - Default is 0.99 + qvalue_network (TensorDictModule): Q(s, a) parametric model. + This module typically outputs a ``"state_action_value"`` entry. + value_network (TensorDictModule, optional): V(s) parametric model. + This module typically outputs a ``"state_value"`` entry. + .. note:: + If not provided, the second version of SAC is assumed, where + only the Q-Value network is needed. + value_function (ValueFunctionBase, optional): the value function module + to be used. Defaults to :class:`torchrl.objectives.values.TDLambdaEstimate`. priority_key (str, optional): tensordict key where to write the - priority (for prioritized replay buffer usage). Default is - `"td_error"`. + priority (for prioritized replay buffer usage). Defaults to + ``"td_error"``. loss_function (str, optional): loss function to be used with the value function loss. Default is `"smooth_l1"`. alpha_init (float, optional): initial entropy multiplier. @@ -78,8 +84,8 @@ def __init__( actor_network: ProbabilisticActor, qvalue_network: TensorDictModule, value_network: Optional[TensorDictModule] = None, + value_function: Optional[ValueFunctionBase] = None, num_qvalue_nets: int = 2, - gamma: Number = 0.99, priority_key: str = "td_error", loss_function: str = "smooth_l1", alpha_init: float = 1.0, @@ -132,7 +138,6 @@ def __init__( compare_against=list(actor_network.parameters()) + value_params, ) - self.register_buffer("gamma", torch.tensor(gamma)) self.priority_key = priority_key self.loss_function = loss_function try: @@ -174,6 +179,26 @@ def __init__( ) make_functional(self.actor_critic) + if value_function is None: + value_function = self._default_value_function() + else: + value_function.value_key = "chosen_action_value" + self.value_function = value_function + + + def _default_value_function(self): + return TDLambdaEstimate( + gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, + lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, + value_network=self.value_network if self._version == 1 else self.qvalue_network, + average_rewards=True, + differentiable=False, + vectorized=True, + advantage_key="advantage", + value_target_key="value_target", + value_key="state_action_value" if self._version == 2 else "state_value", + ) + @property def device(self) -> torch.device: for p in self.parameters(): @@ -409,266 +434,3 @@ def _alpha(self): with torch.no_grad(): alpha = self.log_alpha.exp() return alpha - - -class DiscreteSACLoss(LossModule): - """Discrete SAC Loss module. - - Args: - actor_network (ProbabilisticActor): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. - num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - gamma (Number, optional): gamma decay factor. Default is 0.99. - priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is - `"td_error"`. - loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", - "l1", Default is "smooth_l1". - alpha_init (float, optional): initial entropy multiplier. - Default is 1.0. - min_alpha (float, optional): min value of alpha. - Default is 0.1. - max_alpha (float, optional): max value of alpha. - Default is 10.0. - fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is :obj:`False`. 
- target_entropy_weight (float, optional): weight for the target entropy term. - target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". - delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used - for data collection. Default is :obj:`False`. - """ - - delay_actor: bool = False - - def __init__( - self, - actor_network: ProbabilisticActor, - qvalue_network: TensorDictModule, - num_actions: int, - num_qvalue_nets: int = 2, - gamma: Number = 0.99, - priotity_key: str = "td_error", - loss_function: str = "smooth_l1", - alpha_init: float = 1.0, - min_alpha: float = 0.1, - max_alpha: float = 10.0, - fixed_alpha: bool = False, - target_entropy_weight: float = 0.98, - target_entropy: Union[str, Number] = "auto", - delay_qvalue: bool = True, - ): - if not _has_functorch: - raise ImportError("Failed to import functorch.") from FUNCTORCH_ERROR - super().__init__() - self.convert_to_functional( - actor_network, - "actor_network", - create_target_params=self.delay_actor, - funs_to_decorate=["forward", "get_dist_params"], - ) - - self.delay_qvalue = delay_qvalue - self.convert_to_functional( - qvalue_network, - "qvalue_network", - num_qvalue_nets, - create_target_params=self.delay_qvalue, - compare_against=list(actor_network.parameters()), - ) - self.num_qvalue_nets = num_qvalue_nets - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key - self.loss_function = loss_function - - try: - device = next(self.parameters()).device - except AttributeError: - device = torch.device("cpu") - - self.register_buffer("alpha_init", torch.tensor(alpha_init, device=device)) - self.register_buffer( - "min_log_alpha", torch.tensor(min_alpha, device=device).log() - ) - self.register_buffer( - "max_log_alpha", torch.tensor(max_alpha, device=device).log() - ) - self.fixed_alpha = fixed_alpha - if fixed_alpha: - self.register_buffer( - "log_alpha", torch.tensor(math.log(alpha_init), device=device) - ) - else: - self.register_parameter( - "log_alpha", - torch.nn.Parameter(torch.tensor(math.log(alpha_init), device=device)), - ) - - if target_entropy == "auto": - target_entropy = -float(np.log(1.0 / num_actions) * target_entropy_weight) - self.register_buffer( - "target_entropy", torch.tensor(target_entropy, device=device) - ) - - @property - def alpha(self): - self.log_alpha.data.clamp_(self.min_log_alpha, self.max_log_alpha) - with torch.no_grad(): - alpha = self.log_alpha.exp() - return alpha - - def forward(self, tensordict: TensorDictBase) -> TensorDictBase: - obs_keys = self.actor_network.in_keys - tensordict_select = tensordict.select("next", *obs_keys, "action") - - actor_params = torch.stack( - [self.actor_network_params, self.target_actor_network_params], 0 - ) - - tensordict_actor_grad = tensordict_select.select( - *obs_keys - ) # to avoid overwriting keys - next_td_actor = step_mdp(tensordict_select).select( - *self.actor_network.in_keys - ) # next_observation -> - tensordict_actor = torch.stack([tensordict_actor_grad, next_td_actor], 0) - tensordict_actor = tensordict_actor.contiguous() - - with set_exploration_mode("random"): - # vmap doesn't support sampling, so we take it out from the vmap - td_params = vmap(self.actor_network.get_dist_params)( - tensordict_actor, - actor_params, - ) - if isinstance(self.actor_network, ProbabilisticActor): - tensordict_actor_dist = self.actor_network.build_dist_from_params( - td_params - ) - else: - tensordict_actor_dist = 
self.actor_network.build_dist_from_params( - td_params - ) - probs = tensordict_actor_dist.probs - z = (probs == 0.0).float() * 1e-8 - logp_pi = torch.log(probs + z) - logp_pi_pol = torch.sum(probs * logp_pi, dim=-1, keepdim=True) - - # repeat tensordict_actor to match the qvalue size - _actor_loss_td = ( - tensordict_actor[0] - .select(*self.qvalue_network.in_keys) - .expand(self.num_qvalue_nets, *tensordict_actor[0].batch_size) - ) # for actor loss - _qval_td = tensordict_select.select(*self.qvalue_network.in_keys).expand( - self.num_qvalue_nets, - *tensordict_select.select(*self.qvalue_network.in_keys).batch_size, - ) # for qvalue loss - _next_val_td = ( - tensordict_actor[1] - .select(*self.qvalue_network.in_keys) - .expand(self.num_qvalue_nets, *tensordict_actor[1].batch_size) - ) # for next value estimation - tensordict_qval = torch.cat( - [ - _actor_loss_td, - _next_val_td, - _qval_td, - ], - 0, - ) - - # cat params - q_params_detach = self.qvalue_network_params.detach() - qvalue_params = torch.cat( - [ - q_params_detach, - self.target_qvalue_network_params, - self.qvalue_network_params, - ], - 0, - ) - tensordict_qval = vmap(self.qvalue_network)( - tensordict_qval, - qvalue_params, - ) - - state_action_value = tensordict_qval.get("state_value").squeeze(-1) - ( - state_action_value_actor, - next_state_action_value_qvalue, - state_action_value_qvalue, - ) = state_action_value.split( - [self.num_qvalue_nets, self.num_qvalue_nets, self.num_qvalue_nets], - dim=0, - ) - - loss_actor = -( - (state_action_value_actor.min(0)[0] * probs[0]).sum(-1, keepdim=True) - - self.alpha * logp_pi_pol[0] - ).mean() - - pred_next_val = ( - probs[1] - * (next_state_action_value_qvalue.min(0)[0] - self.alpha * logp_pi[1]) - ).sum(dim=-1, keepdim=True) - - target_value = next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=pred_next_val, - ) - - actions = torch.argmax(tensordict_select["action"], dim=-1) - - pred_val_1 = ( - state_action_value_qvalue[0].gather(-1, actions.unsqueeze(-1)).unsqueeze(0) - ) - pred_val_2 = ( - state_action_value_qvalue[1].gather(-1, actions.unsqueeze(-1)).unsqueeze(0) - ) - pred_val = torch.cat([pred_val_1, pred_val_2], dim=0).squeeze() - td_error = (pred_val - target_value.expand_as(pred_val)).pow(2) - loss_qval = ( - distance_loss( - pred_val, - target_value.expand_as(pred_val), - loss_function=self.loss_function, - ) - .mean(-1) - .sum() - * 0.5 - ) - - tensordict.set("td_error", td_error.detach().max(0)[0]) - - loss_alpha = self._loss_alpha(logp_pi_pol) - if not loss_qval.shape == loss_actor.shape: - raise RuntimeError( - f"QVal and actor loss have different shape: {loss_qval.shape} and {loss_actor.shape}" - ) - td_out = TensorDict( - { - "loss_actor": loss_actor.mean(), - "loss_qvalue": loss_qval.mean(), - "loss_alpha": loss_alpha.mean(), - "alpha": self.alpha.detach(), - "entropy": -logp_pi.mean().detach(), - "state_action_value_actor": state_action_value_actor.mean().detach(), - "action_log_prob_actor": logp_pi.mean().detach(), - "next.state_value": pred_next_val.mean().detach(), - "target_value": target_value.mean().detach(), - }, - [], - ) - - return td_out - - def _loss_alpha(self, log_pi: Tensor) -> Tensor: - if torch.is_grad_enabled() and not log_pi.requires_grad: - raise RuntimeError( - "expected log_pi to require gradient for the alpha loss)" - ) - if self.target_entropy is not None: - # we can compute this loss even if log_alpha is not a parameter - alpha_loss = -self.log_alpha.exp() * (log_pi.detach() + self.target_entropy) - else: - # placeholder 
- alpha_loss = torch.zeros_like(log_pi) - return alpha_loss diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 7f91bc349c3..65a0f81c7c8 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -65,6 +65,20 @@ def forward( """ raise NotImplementedError + def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + """Gets a value estimate, usually used as a target value for the value network. + + Args: + tensordict (TensorDictBase): the tensordict containing the data to + read. + requires_grad (bool, optional): whether the estimate should be part + of a computational graph. + Defaults to ``False``. + target_params (TensorDictBase, optional): A nested TensorDict containing the + target params to be passed to the functional value network module. + + """ + raise NotImplementedError class TDEstimate(ValueFunctionBase): """Temporal Difference estimate of advantage function. @@ -202,7 +216,6 @@ def forward( ("next", "reward"), reward ) # we must update the rewards if they are used later in the code - gamma = self.gamma kwargs = {} if self.is_functional and params is None: raise RuntimeError( @@ -214,24 +227,30 @@ def forward( self.value_network(tensordict, **kwargs) value = tensordict.get(self.value_key) + if params is not None and target_params is None: + target_params = params.detach() + value_target = self.value_estimate(tensordict, target_params=target_params) + tensordict.set("advantage", value_target - value) + tensordict.set("value_target", value_target) + return tensordict + + def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + kwargs = {} + gamma = self.gamma # we may still need to pass gradient, but we don't want to assign grads to # value net params + reward = tensordict.get(("next", "reward")) step_td = step_mdp(tensordict) if target_params is not None: # we assume that target parameters are not differentiable kwargs["params"] = target_params - elif "params" in kwargs: - kwargs["params"] = kwargs["params"].detach() with hold_out_net(self.value_network): self.value_network(step_td, **kwargs) next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) - adv = td_advantage_estimate(gamma, value, next_value, reward, done) - tensordict.set("advantage", adv) - tensordict.set("value_target", adv + value) - return tensordict - + value_target = td_advantage_estimate(gamma, torch.zeros_like(next_value), next_value, reward, done) + return value_target class TDLambdaEstimate(ValueFunctionBase): """TD-Lambda estimate of advantage function. 
From efb57a85cf6ff851b2d9beb54501ed732b1fefca Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 24 Mar 2023 14:08:07 +0000 Subject: [PATCH 15/89] temp --- torchrl/objectives/value/advantages.py | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 65a0f81c7c8..d339ebf8c3b 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -208,13 +208,6 @@ def forward( "Expected input tensordict to have at least one dimensions, got" f"tensordict.batch_size = {tensordict.batch_size}" ) - reward = tensordict.get(("next", "reward")) - if self.average_rewards: - reward = reward - reward.mean() - reward = reward / reward.std().clamp_min(1e-4) - tensordict.set( - ("next", "reward"), reward - ) # we must update the rewards if they are used later in the code kwargs = {} if self.is_functional and params is None: @@ -240,6 +233,12 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona # we may still need to pass gradient, but we don't want to assign grads to # value net params reward = tensordict.get(("next", "reward")) + if self.average_rewards: + reward = reward - reward.mean() + reward = reward / reward.std().clamp_min(1e-4) + tensordict.set( + ("next", "reward"), reward + ) # we must update the rewards if they are used later in the code step_td = step_mdp(tensordict) if target_params is not None: # we assume that target parameters are not differentiable @@ -389,17 +388,6 @@ def forward( "Expected input tensordict to have at least one dimensions, got" f"tensordict.batch_size = {tensordict.batch_size}" ) - reward = tensordict.get(("next", "reward")) - if self.average_rewards: - reward = reward - reward.mean() - reward = reward / reward.std().clamp_min(1e-4) - tensordict.set( - ("next", "reward"), reward - ) # we must update the rewards if they are used later in the code - - gamma = self.gamma - lmbda = self.lmbda - kwargs = {} if self.is_functional and params is None: raise RuntimeError( @@ -410,13 +398,33 @@ def forward( with hold_out_net(self.value_network): self.value_network(tensordict, **kwargs) value = tensordict.get(self.value_key) + if params is not None and target_params is None: + target_params = params.detach() + value_target = self.value_estimate(tensordict, target_params=target_params) + + tensordict.set(self.advantage_key, value_target-value) + tensordict.set(self.value_target_key, value_target) + return tensordict + + def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + + gamma = self.gamma + lmbda = self.lmbda + reward = tensordict.get(("next", "reward")) + if self.average_rewards: + reward = reward - reward.mean() + reward = reward / reward.std().clamp_min(1e-4) + tensordict.set( + ("next", "reward"), reward + ) # we must update the rewards if they are used later in the code + + + kwargs = {} step_td = step_mdp(tensordict) if target_params is not None: # we assume that target parameters are not differentiable kwargs["params"] = target_params - elif "params" in kwargs: - kwargs["params"] = kwargs["params"].detach() with hold_out_net(self.value_network): # we may still need to pass gradient, but we don't want to assign grads to # value net params @@ -433,10 +441,6 @@ def forward( gamma, lmbda, value, next_value, reward, done ) - tensordict.set(self.advantage_key, adv) - tensordict.set(self.value_target_key, adv + value) - return tensordict - 
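The split between ``forward``, which writes ``"advantage"`` and ``"value_target"`` into the tensordict, and ``value_estimate``, which only returns the bootstrapped target, can be exercised with a toy value network. The sketch below mirrors the doctest added to these estimators later in the series (at this point the class is still named ``TDEstimate``; it is split into ``TD0Estimate`` and ``TD1Estimate`` two patches later). The ``nn.Linear(3, 1)`` head and the ``[1, 10]`` batch/time shape are illustrative assumptions:

    # Illustrative sketch of the forward / value_estimate split, using the
    # stateful (non-functional) path so that no params need to be passed.
    import torch
    from tensordict import TensorDict
    from tensordict.nn import TensorDictModule
    from torch import nn
    from torchrl.objectives.value import TDEstimate

    value_net = TensorDictModule(
        nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"]
    )
    estimator = TDEstimate(gamma=0.99, value_network=value_net, differentiable=False)

    obs, next_obs = torch.randn(2, 1, 10, 3)
    reward = torch.randn(1, 10, 1)
    done = torch.zeros(1, 10, 1, dtype=torch.bool)
    data = TensorDict(
        {"obs": obs, "next": {"obs": next_obs, "reward": reward, "done": done}},
        batch_size=[1, 10],
    )

    estimator(data)  # writes "advantage" and "value_target" in place
    target = estimator.value_estimate(data)  # reward + gamma * (1 - done) * V(next obs)
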
class GAE(ValueFunctionBase): """A class wrapper around the generalized advantage estimate functional. From 48c227ae6fa3fc306b959e41471d932e9386d563 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 27 Mar 2023 07:51:26 +0100 Subject: [PATCH 16/89] tmp --- torchrl/objectives/common.py | 40 ++++ torchrl/objectives/dqn.py | 32 ++- torchrl/objectives/sac.py | 300 +++++++++++++++++++++++-- torchrl/objectives/utils.py | 34 ++- torchrl/objectives/value/advantages.py | 217 +++++++++++++++--- 5 files changed, 568 insertions(+), 55 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 134da3b1e34..c53c9214de1 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -18,6 +18,7 @@ from torch.nn import Parameter from torchrl.modules.utils import Buffer +from torchrl.objectives.utils import ValueFunctions from torchrl.objectives.value import ValueFunctionBase _has_functorch = False @@ -362,3 +363,42 @@ def _default_value_function(self) -> ValueFunctionBase: """ raise NotImplementedError + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + """Value-function constructor. + + If the non-default value function is wanted, it must be built using + this method. + + Args: + value_type (ValueFunctions): A :class:`torchrl.objectives.utils.ValueFunctions` + enum type indicating the value function to use. + **hyperparams: hyperparameters to use for the value function. + If not provided, the value indicated by + :func:`torchrl.objectives.utils.default_value_kwargs` will be + used. + + Examples: + >>> # initialize the DQN loss + >>> dqn_loss = DQNLoss(actor) + >>> dqn_loss.make_value_function( + ... ValueFunctions.TD1, + ... gamma=0.9) + + """ + if value_type == ValueFunctions.TD1: + raise NotImplementedError(f"Value type {value_type} it not implemented for loss {type(self)}.") + elif value_type == ValueFunctions.TD0: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.TDLambda: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." 
+ ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 57329cb0404..41a6887a858 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -7,7 +7,6 @@ import torch from tensordict import TensorDict, TensorDictBase -from tensordict.nn import make_functional from torch import nn from torchrl.envs.utils import step_mdp @@ -15,7 +14,7 @@ from torchrl.modules.tensordict_module.common import ensure_tensordict_compatible from .common import LossModule -from .utils import DEFAULT_VALUE_FUN_PARAMS, distance_loss, next_state_value +from .utils import distance_loss, ValueFunctions, default_value_kwargs from .value import TDLambdaEstimate, ValueFunctionBase @@ -72,6 +71,35 @@ def __init__( self.priority_key = priority_key self.action_space = self.value_network.action_space + def make_value_function( + self, + value_type: ValueFunctions, + **hyperparams + ): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + if value_type == ValueFunctions.TD1: + raise NotImplementedError(f"Value type {value_type} it not implemented for loss {type(self)}.") + elif value_type == ValueFunctions.TD0: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.TDLambda: + return TDLambdaEstimate( + **hp, + value_network=self.value_network, + advantage_key="advantage", + value_target_key="value_target", + value_key="chosen_action_value", + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + + def _default_value_function(self): return TDLambdaEstimate( gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index f5318957d1a..424e2f28bfb 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -15,8 +15,7 @@ from torchrl.modules import ProbabilisticActor from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import distance_loss, next_state_value, \ - DEFAULT_VALUE_FUN_PARAMS +from torchrl.objectives.utils import distance_loss, next_state_value from .value import ValueFunctionBase, TDLambdaEstimate from ..envs.utils import set_exploration_mode, step_mdp @@ -181,8 +180,13 @@ def __init__( if value_function is None: value_function = self._default_value_function() + elif self._version == 1: + # in v1, the next value requires an action to be sampled + value_function.value_network = self.actor_critic else: - value_function.value_key = "chosen_action_value" + # TODO + pass + self.value_function = value_function @@ -190,7 +194,7 @@ def _default_value_function(self): return TDLambdaEstimate( gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, - value_network=self.value_network if self._version == 1 else self.qvalue_network, + value_network=self.actor_critic if self._version == 1 else self.qvalue_network, average_rewards=True, differentiable=False, vectorized=True, @@ -244,10 +248,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: } if self._version == 1: out["loss_value"] = loss_value.mean() - return TensorDict( - out, - [], - ) + return TensorDict(out,[]) def _loss_actor(self, tensordict: TensorDictBase) -> Tensor: # KL lossa @@ -278,8 +279,7 @@ def _loss_actor(self, tensordict: TensorDictBase) -> 
Tensor: return self._alpha * log_prob - min_q_logprob def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: - actor_critic = self.actor_critic - params = TensorDict( + target_params = TensorDict( { "module": { "0": self.target_actor_network_params, @@ -290,19 +290,16 @@ def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: _run_checks=False, ) with set_exploration_mode("mode"): - target_value = next_state_value( + target_value = self.value_function.value_estimate( tensordict, - actor_critic, - gamma=self.gamma, - next_val_key="state_value", - params=params, + target_params=target_params ) # value loss qvalue_network = self.qvalue_network - # Q-nets must be trained independently: as such, we split the data in 2 if required and train each q-net on - # one half of the data. + # Q-nets must be trained independently: as such, we split the data in 2 + # if required and train each q-net on one half of the data. shape = tensordict.shape if shape[0] % self.num_qvalue_nets != 0: raise RuntimeError( @@ -341,8 +338,8 @@ def _loss_qvalue_v2(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: next_td, params=self.target_actor_network_params, ) - next_td["action"] = dist.rsample() - next_td["sample_log_prob"] = dist.log_prob(next_td["action"]) + next_td.set("action", dist.rsample()) + next_td.set("sample_log_prob", dist.log_prob(next_td["action"])) sample_log_prob = next_td.get("sample_log_prob") # get q-values next_td = vmap(self.qvalue_network, (None, 0))( @@ -356,7 +353,7 @@ def _loss_qvalue_v2(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: ): sample_log_prob = sample_log_prob.unsqueeze(-1) state_value = ( - next_td.get("state_action_value") - self._alpha * sample_log_prob + state_action_value - self._alpha * sample_log_prob ) state_value = state_value.min(0)[0] @@ -434,3 +431,266 @@ def _alpha(self): with torch.no_grad(): alpha = self.log_alpha.exp() return alpha + + +class DiscreteSACLoss(LossModule): + """Discrete SAC Loss module. + + Args: + actor_network (ProbabilisticActor): the actor to be trained + qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. + num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. + gamma (Number, optional): gamma decay factor. Default is 0.99. + priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is + `"td_error"`. + loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", + "l1", Default is "smooth_l1". + alpha_init (float, optional): initial entropy multiplier. + Default is 1.0. + min_alpha (float, optional): min value of alpha. + Default is 0.1. + max_alpha (float, optional): max value of alpha. + Default is 10.0. + fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is :obj:`False`. + target_entropy_weight (float, optional): weight for the target entropy term. + target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". + delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used + for data collection. Default is :obj:`False`. 
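In the v1 branch above, the Q-value target is now obtained through ``value_function.value_estimate(...)`` rather than ``next_state_value(...)``. For a TD(0)-style estimator this reduces to the one-step bootstrapped target ``r + gamma * (1 - done) * V(s')``, while TD(lambda) blends such targets along the trajectory. A standalone numeric check of the one-step case, with arbitrary toy values:

    # Toy check of the one-step bootstrapped target; numbers are arbitrary.
    import torch

    gamma = 0.99
    reward = torch.tensor([[1.0], [0.5]])
    done = torch.tensor([[False], [True]])
    next_value = torch.tensor([[2.0], [3.0]])

    target = reward + gamma * (1 - done.float()) * next_value
    assert torch.allclose(target, torch.tensor([[2.98], [0.50]]))
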
+ """ + + delay_actor: bool = False + + def __init__( + self, + actor_network: ProbabilisticActor, + qvalue_network: TensorDictModule, + num_actions: int, + num_qvalue_nets: int = 2, + gamma: Number = 0.99, + priotity_key: str = "td_error", + loss_function: str = "smooth_l1", + alpha_init: float = 1.0, + min_alpha: float = 0.1, + max_alpha: float = 10.0, + fixed_alpha: bool = False, + target_entropy_weight: float = 0.98, + target_entropy: Union[str, Number] = "auto", + delay_qvalue: bool = True, + ): + if not _has_functorch: + raise ImportError("Failed to import functorch.") from FUNCTORCH_ERROR + super().__init__() + self.convert_to_functional( + actor_network, + "actor_network", + create_target_params=self.delay_actor, + funs_to_decorate=["forward", "get_dist_params"], + ) + + self.delay_qvalue = delay_qvalue + self.convert_to_functional( + qvalue_network, + "qvalue_network", + num_qvalue_nets, + create_target_params=self.delay_qvalue, + compare_against=list(actor_network.parameters()), + ) + self.num_qvalue_nets = num_qvalue_nets + self.register_buffer("gamma", torch.tensor(gamma)) + self.priority_key = priotity_key + self.loss_function = loss_function + + try: + device = next(self.parameters()).device + except AttributeError: + device = torch.device("cpu") + + self.register_buffer("alpha_init", torch.tensor(alpha_init, device=device)) + self.register_buffer( + "min_log_alpha", torch.tensor(min_alpha, device=device).log() + ) + self.register_buffer( + "max_log_alpha", torch.tensor(max_alpha, device=device).log() + ) + self.fixed_alpha = fixed_alpha + if fixed_alpha: + self.register_buffer( + "log_alpha", torch.tensor(math.log(alpha_init), device=device) + ) + else: + self.register_parameter( + "log_alpha", + torch.nn.Parameter(torch.tensor(math.log(alpha_init), device=device)), + ) + + if target_entropy == "auto": + target_entropy = -float(np.log(1.0 / num_actions) * target_entropy_weight) + self.register_buffer( + "target_entropy", torch.tensor(target_entropy, device=device) + ) + + @property + def alpha(self): + self.log_alpha.data.clamp_(self.min_log_alpha, self.max_log_alpha) + with torch.no_grad(): + alpha = self.log_alpha.exp() + return alpha + + def forward(self, tensordict: TensorDictBase) -> TensorDictBase: + obs_keys = self.actor_network.in_keys + tensordict_select = tensordict.select("next", *obs_keys, "action") + + actor_params = torch.stack( + [self.actor_network_params, self.target_actor_network_params], 0 + ) + + tensordict_actor_grad = tensordict_select.select( + *obs_keys + ) # to avoid overwriting keys + next_td_actor = step_mdp(tensordict_select).select( + *self.actor_network.in_keys + ) # next_observation -> + tensordict_actor = torch.stack([tensordict_actor_grad, next_td_actor], 0) + tensordict_actor = tensordict_actor.contiguous() + + with set_exploration_mode("random"): + # vmap doesn't support sampling, so we take it out from the vmap + td_params = vmap(self.actor_network.get_dist_params)( + tensordict_actor, + actor_params, + ) + if isinstance(self.actor_network, ProbabilisticActor): + tensordict_actor_dist = self.actor_network.build_dist_from_params( + td_params + ) + else: + tensordict_actor_dist = self.actor_network.build_dist_from_params( + td_params + ) + probs = tensordict_actor_dist.probs + z = (probs == 0.0).float() * 1e-8 + logp_pi = torch.log(probs + z) + logp_pi_pol = torch.sum(probs * logp_pi, dim=-1, keepdim=True) + + # repeat tensordict_actor to match the qvalue size + _actor_loss_td = ( + tensordict_actor[0] + .select(*self.qvalue_network.in_keys) 
+ .expand(self.num_qvalue_nets, *tensordict_actor[0].batch_size) + ) # for actor loss + _qval_td = tensordict_select.select(*self.qvalue_network.in_keys).expand( + self.num_qvalue_nets, + *tensordict_select.select(*self.qvalue_network.in_keys).batch_size, + ) # for qvalue loss + _next_val_td = ( + tensordict_actor[1] + .select(*self.qvalue_network.in_keys) + .expand(self.num_qvalue_nets, *tensordict_actor[1].batch_size) + ) # for next value estimation + tensordict_qval = torch.cat( + [ + _actor_loss_td, + _next_val_td, + _qval_td, + ], + 0, + ) + + # cat params + q_params_detach = self.qvalue_network_params.detach() + qvalue_params = torch.cat( + [ + q_params_detach, + self.target_qvalue_network_params, + self.qvalue_network_params, + ], + 0, + ) + tensordict_qval = vmap(self.qvalue_network)( + tensordict_qval, + qvalue_params, + ) + + state_action_value = tensordict_qval.get("state_value").squeeze(-1) + ( + state_action_value_actor, + next_state_action_value_qvalue, + state_action_value_qvalue, + ) = state_action_value.split( + [self.num_qvalue_nets, self.num_qvalue_nets, self.num_qvalue_nets], + dim=0, + ) + + loss_actor = -( + (state_action_value_actor.min(0)[0] * probs[0]).sum(-1, keepdim=True) + - self.alpha * logp_pi_pol[0] + ).mean() + + pred_next_val = ( + probs[1] + * (next_state_action_value_qvalue.min(0)[0] - self.alpha * logp_pi[1]) + ).sum(dim=-1, keepdim=True) + + target_value = next_state_value( + tensordict, + gamma=self.gamma, + pred_next_val=pred_next_val, + ) + + actions = torch.argmax(tensordict_select["action"], dim=-1) + + pred_val_1 = ( + state_action_value_qvalue[0].gather(-1, actions.unsqueeze(-1)).unsqueeze(0) + ) + pred_val_2 = ( + state_action_value_qvalue[1].gather(-1, actions.unsqueeze(-1)).unsqueeze(0) + ) + pred_val = torch.cat([pred_val_1, pred_val_2], dim=0).squeeze() + td_error = (pred_val - target_value.expand_as(pred_val)).pow(2) + loss_qval = ( + distance_loss( + pred_val, + target_value.expand_as(pred_val), + loss_function=self.loss_function, + ) + .mean(-1) + .sum() + * 0.5 + ) + + tensordict.set("td_error", td_error.detach().max(0)[0]) + + loss_alpha = self._loss_alpha(logp_pi_pol) + if not loss_qval.shape == loss_actor.shape: + raise RuntimeError( + f"QVal and actor loss have different shape: {loss_qval.shape} and {loss_actor.shape}" + ) + td_out = TensorDict( + { + "loss_actor": loss_actor.mean(), + "loss_qvalue": loss_qval.mean(), + "loss_alpha": loss_alpha.mean(), + "alpha": self.alpha.detach(), + "entropy": -logp_pi.mean().detach(), + "state_action_value_actor": state_action_value_actor.mean().detach(), + "action_log_prob_actor": logp_pi.mean().detach(), + "next.state_value": pred_next_val.mean().detach(), + "target_value": target_value.mean().detach(), + }, + [], + ) + + return td_out + + def _loss_alpha(self, log_pi: Tensor) -> Tensor: + if torch.is_grad_enabled() and not log_pi.requires_grad: + raise RuntimeError( + "expected log_pi to require gradient for the alpha loss)" + ) + if self.target_entropy is not None: + # we can compute this loss even if log_alpha is not a parameter + alpha_loss = -self.log_alpha.exp() * (log_pi.detach() + self.target_entropy) + else: + # placeholder + alpha_loss = torch.zeros_like(log_pi) + return alpha_loss diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index e18b6c841c4..fc164430cb5 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. 
import functools +from enum import Enum from typing import Iterable, Optional, Union import torch @@ -15,9 +16,36 @@ from torchrl.envs.utils import step_mdp -class DEFAULT_VALUE_FUN_PARAMS: - gamma = 0.99 - lmbda = 0.95 +class ValueFunctions(Enum): + TD0 = 1 + TD1 = 2 + TDLambda = 3 + GAE = 4 + +def default_value_kwargs(value_type: ValueFunctions): + """Default value function keyword argument generator. + + Args: + value_type (Enum.value): the value function type, from the + :class:`torchrl.objectives.utils.ValueFunctions` class. + + Examples: + >>> kwargs = default_value_kwargs(ValueFunctions.TDLambda) + {"gamma": 0.99, "lmbda": 0.95} + + """ + if value_type == ValueFunctions.TD1: + return {"gamma": 0.99} + elif value_type == ValueFunctions.TD0: + return {"gamma": 0.99} + elif value_type == ValueFunctions.GAE: + return {"gamma": 0.99, "lmbda": 0.95} + elif value_type == ValueFunctions.TDLambda: + return {"gamma": 0.99, "lmbda": 0.95} + else: + raise NotImplementedError(f"Unknown value type {value_type}.") + + class _context_manager: diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index d339ebf8c3b..7465206ce78 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -7,7 +7,7 @@ from typing import List, Optional, Tuple, Union import torch -from tensordict.nn import dispatch, TensorDictModule +from tensordict.nn import dispatch, TensorDictModule, is_functional from tensordict.tensordict import TensorDictBase from torch import nn, Tensor @@ -65,7 +65,7 @@ def forward( """ raise NotImplementedError - def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + def value_estimate(self, tensordict, requires_grad=True, target_params: Optional[TensorDictBase] = None): """Gets a value estimate, usually used as a target value for the value network. Args: @@ -73,15 +73,29 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona read. requires_grad (bool, optional): whether the estimate should be part of a computational graph. - Defaults to ``False``. + .. note:: + To avoid carrying gradient with respect to the parameters, + one can also use ``val_fun.value_estimate(tensordict, target_params=params.detach())`` + which allows gradients to pass through the value function + without including the parameters in the computational graph. + + Defaults to ``True``. target_params (TensorDictBase, optional): A nested TensorDict containing the target params to be passed to the functional value network module. """ raise NotImplementedError -class TDEstimate(ValueFunctionBase): - """Temporal Difference estimate of advantage function. + @property + def is_functional(self): + if isinstance(self.value_network, nn.Module): + return is_functional(self.value_network) + else: + raise RuntimeError("Cannot determine if value network is functional.") + + +class TD0Estimate(ValueFunctionBase): + """Myopic Temporal Difference (TD(0)) estimate of advantage function. Args: gamma (scalar): exponential mean discount. 
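The ``ValueFunctions`` enum and the ``default_value_kwargs`` helper added to ``torchrl/objectives/utils.py`` above are what ``make_value_function`` uses to seed hyperparameters before applying user overrides. A quick standalone illustration, importing from the module path used in this patch:

    # Standalone illustration of the enum and default hyperparameter helper.
    from torchrl.objectives.utils import default_value_kwargs, ValueFunctions

    hp = dict(default_value_kwargs(ValueFunctions.TDLambda))
    assert hp == {"gamma": 0.99, "lmbda": 0.95}
    hp.update({"gamma": 0.9})  # user overrides win, as in make_value_function
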
@@ -135,12 +149,169 @@ def __init__( ) self.out_keys = [self.advantage_key, self.value_target_key] - @property - def is_functional(self): - return ( - "_is_stateless" in self.value_network.__dict__ - and self.value_network.__dict__["_is_stateless"] + @_self_set_grad_enabled + @dispatch + def forward( + self, + tensordict: TensorDictBase, + params: Optional[TensorDictBase] = None, + target_params: Optional[TensorDictBase] = None, + ) -> TensorDictBase: + """Computes the TDEstimate given the data in tensordict. + + If a functional module is provided, a nested TensorDict containing the parameters + (and if relevant the target parameters) can be passed to the module. + + Args: + tensordict (TensorDictBase): A TensorDict containing the data + (an observation key, "action", ("next", "reward"), ("next", "done") and "next" tensordict state + as returned by the environment) necessary to compute the value estimates and the TDEstimate. + The data passed to this module should be structured as :obj:`[*B, T, F]` where :obj:`B` are + the batch size, :obj:`T` the time dimension and :obj:`F` the feature dimension(s). + params (TensorDictBase, optional): A nested TensorDict containing the params + to be passed to the functional value network module. + target_params (TensorDictBase, optional): A nested TensorDict containing the + target params to be passed to the functional value network module. + + Returns: + An updated TensorDict with an advantage and a value_error keys as defined in the constructor. + + Examples: + >>> from tensordict import TensorDict + >>> value_net = TensorDictModule( + ... nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"] + ... ) + >>> module = TDEstimate( + ... gamma=0.98, + ... value_network=value_net, + ... differentiable=False, + ... ) + >>> obs, next_obs = torch.randn(2, 1, 10, 3) + >>> reward = torch.randn(1, 10, 1) + >>> done = torch.zeros(1, 10, 1, dtype=torch.bool) + >>> tensordict = TensorDict({"obs": obs, "next": {"obs": next_obs, "done": done, "reward": reward}}, [1, 10]) + >>> _ = module(tensordict) + >>> assert "advantage" in tensordict.keys() + + The module supports non-tensordict (i.e. unpacked tensordict) inputs too: + + Examples: + >>> value_net = TensorDictModule( + ... nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"] + ... ) + >>> module = TDEstimate( + ... gamma=0.98, + ... value_network=value_net, + ... differentiable=False, + ... ) + >>> obs, next_obs = torch.randn(2, 1, 10, 3) + >>> reward = torch.randn(1, 10, 1) + >>> done = torch.zeros(1, 10, 1, dtype=torch.bool) + >>> advantage, value_target = module(obs=obs, reward=reward, done=done, next_obs=next_obs) + + """ + if tensordict.batch_dims < 1: + raise RuntimeError( + "Expected input tensordict to have at least one dimensions, got" + f"tensordict.batch_size = {tensordict.batch_size}" + ) + + kwargs = {} + if self.is_functional and params is None: + raise RuntimeError( + "Expected params to be passed to advantage module but got none." 
+ ) + if params is not None: + kwargs["params"] = params.detach() + with hold_out_net(self.value_network): + self.value_network(tensordict, **kwargs) + value = tensordict.get(self.value_key) + + if params is not None and target_params is None: + target_params = params.detach() + value_target = self.value_estimate(tensordict, target_params=target_params) + tensordict.set("advantage", value_target - value) + tensordict.set("value_target", value_target) + return tensordict + + def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + kwargs = {} + gamma = self.gamma + # we may still need to pass gradient, but we don't want to assign grads to + # value net params + reward = tensordict.get(("next", "reward")) + if self.average_rewards: + reward = reward - reward.mean() + reward = reward / reward.std().clamp_min(1e-4) + tensordict.set( + ("next", "reward"), reward + ) # we must update the rewards if they are used later in the code + step_td = step_mdp(tensordict) + if target_params is not None: + # we assume that target parameters are not differentiable + kwargs["params"] = target_params + with hold_out_net(self.value_network): + self.value_network(step_td, **kwargs) + next_value = step_td.get(self.value_key) + + done = tensordict.get(("next", "done")) + value_target = reward + gamma * (1 - done.to(reward.dtype)) * next_value + return value_target + +class TD1Estimate(ValueFunctionBase): + """Bootstrapped Temporal Difference (TD(1)) estimate of advantage function. + + Args: + gamma (scalar): exponential mean discount. + value_network (TensorDictModule): value operator used to retrieve the value estimates. + average_rewards (bool, optional): if True, rewards will be standardized + before the TD is computed. + differentiable (bool, optional): if True, gradients are propagated throught + the computation of the value function. Default is :obj:`False`. + advantage_key (str or tuple of str, optional): the key of the advantage entry. + Defaults to "advantage". + value_target_key (str or tuple of str, optional): the key of the advantage entry. + Defaults to "value_target". + value_key (str or tuple of str, optional): the value key to read from the input tensordict. + Defaults to "state_value". + + """ + + def __init__( + self, + gamma: Union[float, torch.Tensor], + value_network: TensorDictModule, + average_rewards: bool = False, + differentiable: bool = False, + advantage_key: Union[str, Tuple] = "advantage", + value_target_key: Union[str, Tuple] = "value_target", + value_key: Union[str, Tuple] = "state_value", + ): + super().__init__() + try: + device = next(value_network.parameters()).device + except StopIteration: + device = torch.device("cpu") + self.register_buffer("gamma", torch.tensor(gamma, device=device)) + self.value_network = value_network + + self.average_rewards = average_rewards + self.differentiable = differentiable + self.value_key = value_key + if value_key not in value_network.out_keys: + raise KeyError( + f"value key '{value_key}' not found in value network out_keys." 
+ ) + + self.advantage_key = advantage_key + self.value_target_key = value_target_key + + self.in_keys = ( + value_network.in_keys + + [("next", "reward"), ("next", "done")] + + [("next", in_key) for in_key in value_network.in_keys] ) + self.out_keys = [self.advantage_key, self.value_target_key] @_self_set_grad_enabled @dispatch @@ -252,7 +423,7 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona return value_target class TDLambdaEstimate(ValueFunctionBase): - """TD-Lambda estimate of advantage function. + """TD(:math:`\lambda`) estimate of advantage function. Args: gamma (scalar): exponential mean discount. @@ -313,13 +484,6 @@ def __init__( ) self.out_keys = [self.advantage_key, self.value_target_key] - @property - def is_functional(self): - return ( - "_is_stateless" in self.value_network.__dict__ - and self.value_network.__dict__["_is_stateless"] - ) - @_self_set_grad_enabled @dispatch def forward( @@ -433,14 +597,14 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona done = tensordict.get(("next", "done")) if self.vectorized: - adv = vec_td_lambda_advantage_estimate( - gamma, lmbda, value, next_value, reward, done + val = vec_td_lambda_advantage_estimate( + gamma, lmbda, torch.zeros_like(next_value), next_value, reward, done ) else: - adv = td_lambda_advantage_estimate( - gamma, lmbda, value, next_value, reward, done + val = td_lambda_advantage_estimate( + gamma, lmbda, torch.zeros_like(next_value), next_value, reward, done ) - + return val class GAE(ValueFunctionBase): """A class wrapper around the generalized advantage estimate functional. @@ -510,13 +674,6 @@ def __init__( ) self.out_keys = [self.advantage_key, self.value_target_key] - @property - def is_functional(self): - return ( - "_is_stateless" in self.value_network.__dict__ - and self.value_network.__dict__["_is_stateless"] - ) - @_self_set_grad_enabled @dispatch def forward( From 1e34ef9d320aeac147e178f9f3273c519decbeb4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 27 Mar 2023 11:46:05 +0100 Subject: [PATCH 17/89] SAC --- docs/source/reference/objectives.rst | 7 +- test/test_cost.py | 45 ++++--- torchrl/data/postprocs/postprocs.py | 2 +- torchrl/objectives/__init__.py | 2 + torchrl/objectives/common.py | 25 +++- torchrl/objectives/dqn.py | 111 +++++++-------- torchrl/objectives/sac.py | 179 +++++++++++++++---------- torchrl/objectives/utils.py | 14 +- torchrl/objectives/value/__init__.py | 8 +- torchrl/objectives/value/advantages.py | 162 +++++++++++++++++----- 10 files changed, 368 insertions(+), 187 deletions(-) diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index 2a7a2f92be7..ff178fba548 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -109,9 +109,10 @@ Returns :template: rl_template_noinherit.rst ValueFunctionBase - GAE + TD0Estimate + TD1Estimate TDLambdaEstimate - TDEstimate + GAE functional.generalized_advantage_estimate functional.vec_generalized_advantage_estimate functional.vec_td_lambda_return_estimate @@ -135,3 +136,5 @@ Utils next_state_value SoftUpdate HardUpdate + ValueFunctions + default_value_kwargs diff --git a/test/test_cost.py b/test/test_cost.py index 2359c34f6df..409bfa0e540 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -89,7 +89,7 @@ from torchrl.objectives.redq import REDQLoss from torchrl.objectives.reinforce import ReinforceLoss from torchrl.objectives.utils import HardUpdate, hold_out_net, SoftUpdate -from 
torchrl.objectives.value.advantages import GAE, TDEstimate, TDLambdaEstimate +from torchrl.objectives.value.advantages import GAE, TD1Estimate, TDLambdaEstimate from torchrl.objectives.value.functional import ( generalized_advantage_estimate, td_advantage_estimate, @@ -109,7 +109,9 @@ def __enter__(self): pass def __exit__(self, exc_type, exc_val, exc_tb): - assert (self.td.select(*self.td_clone.keys()) == self.td_clone).all(), "Some keys have been modified in the tensordict!" + assert ( + self.td.select(*self.td_clone.keys()) == self.td_clone + ).all(), "Some keys have been modified in the tensordict!" def get_devices(): @@ -340,9 +342,7 @@ def test_dqn_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9): td = self._create_seq_mock_data_dqn( action_spec_type=action_spec_type, device=device ) - loss_fn = DQNLoss( - actor, loss_function="l2", delay_value=delay_value - ) + loss_fn = DQNLoss(actor, loss_function="l2", delay_value=delay_value) ms = MultiStep(gamma=gamma, n_steps=n).to(device) ms_td = ms(td.clone()) @@ -1072,7 +1072,6 @@ def test_sac( qvalue_network=qvalue, value_network=value, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", **kwargs, ) @@ -1197,7 +1196,6 @@ def test_sac_batcher( num_qvalue, device, version, - gamma=0.9, ): if (delay_actor or delay_qvalue) and not delay_value: pytest.skip("incompatible config") @@ -1224,12 +1222,11 @@ def test_sac_batcher( qvalue_network=qvalue, value_network=value, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", **kwargs, ) - ms = MultiStep(gamma=gamma, n_steps=n).to(device) + ms = MultiStep(gamma=0.9, n_steps=n).to(device) td_clone = td.clone() ms_td = ms(td_clone) @@ -2256,7 +2253,7 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": @@ -2319,7 +2316,7 @@ def test_ppo_shared(self, loss_class, device, advantage): value_network=value, ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=0.9, value_network=value, ) @@ -2395,7 +2392,7 @@ def test_ppo_diff(self, loss_class, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": @@ -2531,7 +2528,7 @@ def test_a2c(self, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": @@ -2607,7 +2604,7 @@ def test_a2c_diff(self, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": @@ -2682,7 +2679,7 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): differentiable=gradient_mode, ) elif advantage == "td": - advantage = TDEstimate( + advantage = TD1Estimate( gamma=gamma, value_network=get_functional(value_net), differentiable=gradient_mode, @@ -4208,7 +4205,11 @@ def __init__(self, 
actor_network, qvalue_network): class TestAdv: @pytest.mark.parametrize( "adv,kwargs", - [[GAE, {"lmbda": 0.95}], [TDEstimate, {}], [TDLambdaEstimate, {"lmbda": 0.95}]], + [ + [GAE, {"lmbda": 0.95}], + [TD1Estimate, {}], + [TDLambdaEstimate, {"lmbda": 0.95}], + ], ) def test_dispatch( self, @@ -4236,7 +4237,11 @@ def test_dispatch( @pytest.mark.parametrize( "adv,kwargs", - [[GAE, {"lmbda": 0.95}], [TDEstimate, {}], [TDLambdaEstimate, {"lmbda": 0.95}]], + [ + [GAE, {"lmbda": 0.95}], + [TD1Estimate, {}], + [TDLambdaEstimate, {"lmbda": 0.95}], + ], ) def test_diff_reward( self, @@ -4273,7 +4278,11 @@ def test_diff_reward( @pytest.mark.parametrize( "adv,kwargs", - [[GAE, {"lmbda": 0.95}], [TDEstimate, {}], [TDLambdaEstimate, {"lmbda": 0.95}]], + [ + [GAE, {"lmbda": 0.95}], + [TD1Estimate, {}], + [TDLambdaEstimate, {"lmbda": 0.95}], + ], ) def test_non_differentiable(self, adv, kwargs): value_net = TensorDictModule( diff --git a/torchrl/data/postprocs/postprocs.py b/torchrl/data/postprocs/postprocs.py index 26cdc470824..2ec0bfb4d97 100644 --- a/torchrl/data/postprocs/postprocs.py +++ b/torchrl/data/postprocs/postprocs.py @@ -201,7 +201,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict.set("steps_to_next_obs", time_to_obs + 1) tensordict.rename_key_(("next", "reward"), ("next", "original_reward")) - tensordict["next"].update(tensordict_gather) + tensordict.get("next").update(tensordict_gather) tensordict.set(("next", "reward"), summed_rewards) tensordict.set("gamma", self.gamma ** (time_to_obs + 1)) nonterminal = time_to_obs != 0 diff --git a/torchrl/objectives/__init__.py b/torchrl/objectives/__init__.py index 73be0fe5e99..e74ccbac808 100644 --- a/torchrl/objectives/__init__.py +++ b/torchrl/objectives/__init__.py @@ -15,12 +15,14 @@ from .sac import DiscreteSACLoss, SACLoss from .td3 import TD3Loss from .utils import ( + default_value_kwargs, distance_loss, HardUpdate, hold_out_net, hold_out_params, next_state_value, SoftUpdate, + ValueFunctions, ) # from .value import bellman_max, c_val, dv_val, vtrace, GAE, TDLambdaEstimate, TDEstimate diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index c53c9214de1..a105b23da98 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -52,6 +52,7 @@ class LossModule(nn.Module): def __init__(self): super().__init__() self._param_maps = {} + self._value_function = None # self.register_forward_pre_hook(_parameters_to_tensordict) def forward(self, tensordict: TensorDictBase) -> TensorDictBase: @@ -355,7 +356,19 @@ def half(self) -> LossModule: def cpu(self) -> LossModule: return self.to(torch.device("cpu")) - def _default_value_function(self) -> ValueFunctionBase: + @property + def value_function(self) -> ValueFunctionBase: + out = self._value_function + if out is None: + self._default_value_function() + return self._value_function + return out + + @value_function.setter + def value_function(self, value): + self._value_function = value + + def _default_value_function(self): """A value-function constructor when none is provided. No kwarg should be present as default parameters should be retrieved @@ -387,18 +400,20 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): """ if value_type == ValueFunctions.TD1: - raise NotImplementedError(f"Value type {value_type} it not implemented for loss {type(self)}.") + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." 
+ ) elif value_type == ValueFunctions.TD0: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." - ) + ) elif value_type == ValueFunctions.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." - ) + ) elif value_type == ValueFunctions.TDLambda: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." - ) + ) else: raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 41a6887a858..0bb6a31a319 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Union +from typing import Union import torch from tensordict import TensorDict, TensorDictBase @@ -14,8 +14,9 @@ from torchrl.modules.tensordict_module.common import ensure_tensordict_compatible from .common import LossModule -from .utils import distance_loss, ValueFunctions, default_value_kwargs -from .value import TDLambdaEstimate, ValueFunctionBase +from .utils import default_value_kwargs, distance_loss, ValueFunctions +from .value import GAE, TDLambdaEstimate +from .value.advantages import TD0Estimate, TD1Estimate class DQNLoss(LossModule): @@ -23,8 +24,6 @@ class DQNLoss(LossModule): Args: value_network (QValueActor or nn.Module): a Q value operator. - value_function (ValueFunctionBase, optional): the value function module - to be used. Defaults to :class:`torchrl.objectives.values.TDLambdaEstimate`. loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". delay_value (bool, optional): whether to duplicate the value network into a new target value network to create a double DQN. Default is :obj:`False`. @@ -34,7 +33,6 @@ class DQNLoss(LossModule): def __init__( self, value_network: Union[QValueActor, nn.Module], - value_function: Optional[ValueFunctionBase] = None, loss_function: str = "l2", priority_key: str = "td_error", delay_value: bool = False, @@ -42,13 +40,6 @@ def __init__( super().__init__() self.delay_value = delay_value - if ( - value_function is not None - and value_function.value_network is not value_network - ): - raise RuntimeError( - "value_function.value_network and value_network must match." - ) value_network = ensure_tensordict_compatible( module=value_network, wrapper_type=QValueActor ) @@ -59,37 +50,41 @@ def __init__( create_target_params=self.delay_value, ) - if value_function is None: - value_function = self._default_value_function() - else: - value_function.value_key = "chosen_action_value" - self.value_function = value_function - self.value_network_in_keys = value_network.in_keys self.loss_function = loss_function self.priority_key = priority_key self.action_space = self.value_network.action_space - def make_value_function( - self, - value_type: ValueFunctions, - **hyperparams - ): + def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) - if value_type == ValueFunctions.TD1: - raise NotImplementedError(f"Value type {value_type} it not implemented for loss {type(self)}.") - elif value_type == ValueFunctions.TD0: - raise NotImplementedError( - f"Value type {value_type} it not implemented for loss {type(self)}." 
- ) - elif value_type == ValueFunctions.GAE: - raise NotImplementedError( - f"Value type {value_type} it not implemented for loss {type(self)}." - ) - elif value_type == ValueFunctions.TDLambda: - return TDLambdaEstimate( + if value_type is ValueFunctions.TD1: + self._value_function = TD1Estimate( + **hp, + value_network=self.value_network, + advantage_key="advantage", + value_target_key="value_target", + value_key="chosen_action_value", + ) + elif value_type is ValueFunctions.TD0: + self._value_function = TD0Estimate( + **hp, + value_network=self.value_network, + advantage_key="advantage", + value_target_key="value_target", + value_key="chosen_action_value", + ) + elif value_type is ValueFunctions.GAE: + self._value_function = GAE( + **hp, + value_network=self.value_network, + advantage_key="advantage", + value_target_key="value_target", + value_key="chosen_action_value", + ) + elif value_type is ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( **hp, value_network=self.value_network, advantage_key="advantage", @@ -99,19 +94,8 @@ def make_value_function( else: raise NotImplementedError(f"Unknown value type {value_type}") - def _default_value_function(self): - return TDLambdaEstimate( - gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, - lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, - value_network=self.value_network, - average_rewards=True, - differentiable=False, - vectorized=True, - advantage_key="advantage", - value_target_key="value_target", - value_key="chosen_action_value", - ) + self.make_value_function(ValueFunctions.TDLambda) def forward(self, input_tensordict: TensorDictBase) -> TensorDict: """Computes the DQN loss given a tensordict sampled from the replay buffer. @@ -160,11 +144,9 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: action = action.to(torch.float) pred_val_index = (pred_val * action).sum(-1) - target_value = self.value_function( - tensordict.clone(False), - self.value_network_params, - self.target_value_network_params, - ).get(self.value_function.value_target_key).squeeze(-1) + target_value = self.value_function.value_estimate( + tensordict.clone(False), target_params=self.target_value_network_params + ).squeeze(-1) priority_tensor = (pred_val_index - target_value).pow(2) priority_tensor = priority_tensor.detach().unsqueeze(-1) @@ -317,6 +299,7 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: reward = reward.to("cpu") support = support.to("cpu") pns_a = pns_a.to("cpu") + Tz = reward + (1 - done.to(reward.dtype)) * discount * support if Tz.shape != torch.Size([batch_size, atoms]): raise RuntimeError( @@ -363,3 +346,25 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: ) loss_td = TensorDict({"loss": loss.mean()}, []) return loss_td + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + if value_type is ValueFunctions.TD1: + raise NotImplementedError( + f"value type {value_type} is not implemented for {self.__class__.__name__}." + ) + elif value_type is ValueFunctions.TD0: + # see forward call + pass + elif value_type is ValueFunctions.GAE: + raise NotImplementedError( + f"value type {value_type} is not implemented for {self.__class__.__name__}." + ) + elif value_type is ValueFunctions.TDLambda: + raise NotImplementedError( + f"value type {value_type} is not implemented for {self.__class__.__name__}." 
+ ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + + def _default_value_function(self): + self.make_value_function(ValueFunctions.TD0) diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 424e2f28bfb..4045dcfc119 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -9,17 +9,22 @@ import numpy as np import torch -from tensordict.nn import make_functional, TensorDictModule +from tensordict.nn import make_functional, TensorDictModule, TensorDictSequential from tensordict.tensordict import TensorDict, TensorDictBase from torch import Tensor from torchrl.modules import ProbabilisticActor from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import distance_loss, next_state_value -from .value import ValueFunctionBase, TDLambdaEstimate +from torchrl.objectives.utils import ( + default_value_kwargs, + distance_loss, + next_state_value, + ValueFunctions, +) from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule +from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -31,6 +36,53 @@ FUNCTORCH_ERROR = err +class _SACValueNet(TensorDictSequential): + r"""Value network for SAC v2. + + SAC v2 is based on a value estimate of the form: + + .. math:: + + V = Q(s,a) - \alpha * \log p(a | s) + + This class computes this value given the actor and qvalue network + + """ + + def __init__(self, actor_network, qvalue_network): + super().__init__(actor_network, qvalue_network) + # we highjack the forward so the out_keys must be re-written + self.out_keys = ["state_value"] + + def forward(self, tensordict, _alpha, actor_params, qval_params): + """Computes the value as `val = qval - a * log_prob(a)`.""" + actor_network, qvalue_network = self + + obs_keys = actor_network.in_keys + data = tensordict.select(*obs_keys) + # get actions and log-probs + with torch.no_grad(): + with set_exploration_mode("random"): + dist = actor_network.get_dist(data, params=actor_params) + data.set("action", dist.rsample()) + log_prob = dist.log_prob(data.get("action")) + data.set("sample_log_prob", log_prob) + sample_log_prob = data.get("sample_log_prob") + + # get q-values + data = vmap(qvalue_network, (None, 0))(data, qval_params) + state_action_value = data.get("state_action_value") + if ( + state_action_value.shape[-len(sample_log_prob.shape) :] + != sample_log_prob.shape + ): + sample_log_prob = sample_log_prob.unsqueeze(-1) + state_value = state_action_value - _alpha * sample_log_prob + state_value = state_value.min(0)[0] + tensordict.set("state_value", state_value) + return tensordict + + class SACLoss(LossModule): """TorchRL implementation of the SAC loss. @@ -47,8 +99,6 @@ class SACLoss(LossModule): .. note:: If not provided, the second version of SAC is assumed, where only the Q-Value network is needed. - value_function (ValueFunctionBase, optional): the value function module - to be used. Defaults to :class:`torchrl.objectives.values.TDLambdaEstimate`. priority_key (str, optional): tensordict key where to write the priority (for prioritized replay buffer usage). Defaults to ``"td_error"``. 
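Note on the ``_SACValueNet`` module introduced above: in the second version of SAC the entropy bonus is folded into the state value before the TD target is built. The following is a minimal, standalone sketch of that computation on plain tensors; the tensor names and the final TD(0) line are illustrative assumptions, not part of the TorchRL API.

import torch


def sac_v2_state_value(q_values, log_prob, alpha):
    """V(s) = min_k Q_k(s, a) - alpha * log pi(a|s), for a ~ pi(.|s).

    q_values: [num_qvalue_nets, *batch, 1] ensemble of Q-estimates for the sampled action.
    log_prob: [*batch] log-probability of the sampled action.
    """
    if q_values.shape[-len(log_prob.shape) :] != log_prob.shape:
        # align trailing dims, mirroring the check in _SACValueNet.forward
        log_prob = log_prob.unsqueeze(-1)
    # entropy-regularized value, computed per ensemble member
    state_value = q_values - alpha * log_prob
    # pessimistic aggregation over the Q-value ensemble
    return state_value.min(dim=0)[0]


# A TD(0) target then bootstraps from this value (sketch, scalar gamma assumed):
# value_target = reward + gamma * (1 - done.float()) * sac_v2_state_value(next_q, next_log_prob, alpha)

Taking the minimum over the ensemble is the usual clipped double-Q device to curb value overestimation.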
@@ -83,7 +133,6 @@ def __init__( actor_network: ProbabilisticActor, qvalue_network: TensorDictModule, value_network: Optional[TensorDictModule] = None, - value_function: Optional[ValueFunctionBase] = None, num_qvalue_nets: int = 2, priority_key: str = "td_error", loss_function: str = "smooth_l1", @@ -178,30 +227,52 @@ def __init__( ) make_functional(self.actor_critic) - if value_function is None: - value_function = self._default_value_function() - elif self._version == 1: - # in v1, the next value requires an action to be sampled - value_function.value_network = self.actor_critic + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + if self._version == 1: + value_net = self.actor_critic + elif self._version == 2: + value_net = _SACValueNet(self.actor_network, self.qvalue_network) else: - # TODO - pass - - self.value_function = value_function - + # unreachable + raise NotImplementedError + + value_key = "state_value" + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + if value_type is ValueFunctions.TD1: + self._value_function = TD1Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TD0: + self._value_function = TD0Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.GAE: + self._value_function = GAE( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") def _default_value_function(self): - return TDLambdaEstimate( - gamma=DEFAULT_VALUE_FUN_PARAMS.gamma, - lmbda=DEFAULT_VALUE_FUN_PARAMS.lmbda, - value_network=self.actor_critic if self._version == 1 else self.qvalue_network, - average_rewards=True, - differentiable=False, - vectorized=True, - advantage_key="advantage", - value_target_key="value_target", - value_key="state_action_value" if self._version == 2 else "state_value", - ) + # TD0 by default, as in paper + self.make_value_function(ValueFunctions.TD0) @property def device(self) -> torch.device: @@ -248,7 +319,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: } if self._version == 1: out["loss_value"] = loss_value.mean() - return TensorDict(out,[]) + return TensorDict(out, []) def _loss_actor(self, tensordict: TensorDictBase) -> Tensor: # KL lossa @@ -291,9 +362,8 @@ def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: ) with set_exploration_mode("mode"): target_value = self.value_function.value_estimate( - tensordict, - target_params=target_params - ) + tensordict, target_params=target_params + ).squeeze(-1) # value loss qvalue_network = self.qvalue_network @@ -324,45 +394,14 @@ def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: return loss_value, priority_value def _loss_qvalue_v2(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: - obs_keys = self.actor_network.in_keys - tensordict = tensordict.select("next", *obs_keys, "action") - - with torch.no_grad(): - next_td = step_mdp(tensordict).select( - *self.actor_network.in_keys - ) # next_observation -> - # observation - # select pseudo-action - with set_exploration_mode("random"): - dist = self.actor_network.get_dist( - next_td, - 
params=self.target_actor_network_params, - ) - next_td.set("action", dist.rsample()) - next_td.set("sample_log_prob", dist.log_prob(next_td["action"])) - sample_log_prob = next_td.get("sample_log_prob") - # get q-values - next_td = vmap(self.qvalue_network, (None, 0))( - next_td, - self.target_qvalue_network_params, - ) - state_action_value = next_td.get("state_action_value") - if ( - state_action_value.shape[-len(sample_log_prob.shape) :] - != sample_log_prob.shape - ): - sample_log_prob = sample_log_prob.unsqueeze(-1) - state_value = ( - state_action_value - self._alpha * sample_log_prob - ) - state_value = state_value.min(0)[0] - - tensordict.set("next.state_value", state_value) - target_value = next_state_value( + # we pass the alpha value to the tensordict. Since it's a scalar, we must erase the batch-size first. + target_value = self.value_function.value_estimate( tensordict, - gamma=self.gamma, - pred_next_val=state_value, - ) + _alpha=self._alpha, + actor_params=self.target_actor_network_params, + qval_params=self.target_qvalue_network_params, + ).squeeze(-1) + tensordict_expand = vmap(self.qvalue_network, (None, 0))( tensordict.select(*self.qvalue_network.in_keys), self.qvalue_network_params, @@ -390,8 +429,6 @@ def _loss_value(self, tensordict: TensorDictBase) -> Tensor: params=self.target_actor_network_params, ) # resample an action action = action_dist.rsample() - # if not self.actor_network.spec.is_in(action): - # action.data.copy_(self.actor_network.spec.project(action.data)) td_copy.set("action", action, inplace=False) diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index fc164430cb5..6f0a4a65c73 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -17,11 +17,23 @@ class ValueFunctions(Enum): + """Value function enumerator for custom-built estimators. + + Allows for a flexible usage of various value functions when the loss module + allows it. + + Examples: + >>> dqn_loss = DQNLoss(actor) + >>> dqn_loss.make_value_function(ValueFunctions.TD0, gamma=0.9) + + """ + TD0 = 1 TD1 = 2 TDLambda = 3 GAE = 4 + def default_value_kwargs(value_type: ValueFunctions): """Default value function keyword argument generator. @@ -46,8 +58,6 @@ def default_value_kwargs(value_type: ValueFunctions): raise NotImplementedError(f"Unknown value type {value_type}.") - - class _context_manager: def __init__(self, value=True): self.value = value diff --git a/torchrl/objectives/value/__init__.py b/torchrl/objectives/value/__init__.py index 6152732f411..ef224940ddf 100644 --- a/torchrl/objectives/value/__init__.py +++ b/torchrl/objectives/value/__init__.py @@ -3,4 +3,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .advantages import GAE, TDEstimate, TDLambdaEstimate, ValueFunctionBase +from .advantages import ( + GAE, + TD0Estimate, + TD1Estimate, + TDLambdaEstimate, + ValueFunctionBase, +) diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 7465206ce78..17a9c35c90d 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -4,10 +4,10 @@ # LICENSE file in the root directory of this source tree. 
import abc from functools import wraps -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import torch -from tensordict.nn import dispatch, TensorDictModule, is_functional +from tensordict.nn import dispatch, is_functional, TensorDictModule from tensordict.tensordict import TensorDictBase from torch import nn, Tensor @@ -32,9 +32,18 @@ def new_fun(self, *args, **kwargs): class ValueFunctionBase(nn.Module): - """An abstract parent class for value function modules.""" + """An abstract parent class for value function modules. - value_network: TensorDictModule + Its :meth:`ValueFunctionBase.forward` method will compute the value (given + by the value network) and the value estimate (given by the value estimator) + as well as the advantage and write these values in the output tensordict. + + If only the value estimate is needed, the :meth:`ValueFunctionBase.value_estimate` + should be used instead. + + """ + + value_network: Union[TensorDictModule, Callable] value_key: Union[Tuple[str], str] @abc.abstractmethod @@ -65,23 +74,20 @@ def forward( """ raise NotImplementedError - def value_estimate(self, tensordict, requires_grad=True, target_params: Optional[TensorDictBase] = None): + def value_estimate( + self, + tensordict, + target_params: Optional[TensorDictBase] = None, + **kwargs, + ): """Gets a value estimate, usually used as a target value for the value network. Args: tensordict (TensorDictBase): the tensordict containing the data to read. - requires_grad (bool, optional): whether the estimate should be part - of a computational graph. - .. note:: - To avoid carrying gradient with respect to the parameters, - one can also use ``val_fun.value_estimate(tensordict, target_params=params.detach())`` - which allows gradients to pass through the value function - without including the parameters in the computational graph. - - Defaults to ``True``. target_params (TensorDictBase, optional): A nested TensorDict containing the target params to be passed to the functional value network module. + **kwargs: the keyword arguments to be passed to the value network. """ raise NotImplementedError @@ -134,7 +140,10 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable self.value_key = value_key - if value_key not in value_network.out_keys: + if ( + hasattr(value_network, "out_keys") + and value_key not in value_network.out_keys + ): raise KeyError( f"value key '{value_key}' not found in value network out_keys." 
) @@ -234,12 +243,18 @@ def forward( tensordict.set("value_target", value_target) return tensordict - def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): - kwargs = {} + def value_estimate( + self, + tensordict, + target_params: Optional[TensorDictBase] = None, + **kwargs, + ): gamma = self.gamma - # we may still need to pass gradient, but we don't want to assign grads to - # value net params reward = tensordict.get(("next", "reward")) + steps_to_next_obs = tensordict.get("steps_to_next_obs", None) + if steps_to_next_obs is not None: + gamma = gamma ** steps_to_next_obs.view_as(reward) + if self.average_rewards: reward = reward - reward.mean() reward = reward / reward.std().clamp_min(1e-4) @@ -258,6 +273,7 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona value_target = reward + gamma * (1 - done.to(reward.dtype)) * next_value return value_target + class TD1Estimate(ValueFunctionBase): """Bootstrapped Temporal Difference (TD(1)) estimate of advantage function. @@ -298,7 +314,10 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable self.value_key = value_key - if value_key not in value_network.out_keys: + if ( + hasattr(value_network, "out_keys") + and value_key not in value_network.out_keys + ): raise KeyError( f"value key '{value_key}' not found in value network out_keys." ) @@ -398,12 +417,18 @@ def forward( tensordict.set("value_target", value_target) return tensordict - def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): - kwargs = {} + def value_estimate( + self, + tensordict, + target_params: Optional[TensorDictBase] = None, + **kwargs, + ): gamma = self.gamma - # we may still need to pass gradient, but we don't want to assign grads to - # value net params reward = tensordict.get(("next", "reward")) + steps_to_next_obs = tensordict.get("steps_to_next_obs", None) + if steps_to_next_obs is not None: + gamma = gamma ** steps_to_next_obs.view_as(reward) + if self.average_rewards: reward = reward - reward.mean() reward = reward / reward.std().clamp_min(1e-4) @@ -419,11 +444,14 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) - value_target = td_advantage_estimate(gamma, torch.zeros_like(next_value), next_value, reward, done) + value_target = td_advantage_estimate( + gamma, torch.zeros_like(next_value), next_value, reward, done + ) return value_target + class TDLambdaEstimate(ValueFunctionBase): - """TD(:math:`\lambda`) estimate of advantage function. + r"""TD(:math:`\lambda`) estimate of advantage function. Args: gamma (scalar): exponential mean discount. @@ -469,7 +497,10 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable self.value_key = value_key - if value_key not in value_network.out_keys: + if ( + hasattr(value_network, "out_keys") + and value_key not in value_network.out_keys + ): raise KeyError( f"value key '{value_key}' not found in value network out_keys." 
) @@ -566,15 +597,24 @@ def forward( target_params = params.detach() value_target = self.value_estimate(tensordict, target_params=target_params) - tensordict.set(self.advantage_key, value_target-value) + tensordict.set(self.advantage_key, value_target - value) tensordict.set(self.value_target_key, value_target) return tensordict - def value_estimate(self, tensordict, requires_grad=False, target_params: Optional[TensorDictBase] = None): + def value_estimate( + self, + tensordict, + target_params: Optional[TensorDictBase] = None, + **kwargs, + ): gamma = self.gamma - lmbda = self.lmbda reward = tensordict.get(("next", "reward")) + steps_to_next_obs = tensordict.get("steps_to_next_obs", None) + if steps_to_next_obs is not None: + gamma = gamma ** steps_to_next_obs.view_as(reward) + + lmbda = self.lmbda if self.average_rewards: reward = reward - reward.mean() reward = reward / reward.std().clamp_min(1e-4) @@ -582,9 +622,6 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona ("next", "reward"), reward ) # we must update the rewards if they are used later in the code - - kwargs = {} - step_td = step_mdp(tensordict) if target_params is not None: # we assume that target parameters are not differentiable @@ -606,6 +643,7 @@ def value_estimate(self, tensordict, requires_grad=False, target_params: Optiona ) return val + class GAE(ValueFunctionBase): """A class wrapper around the generalized advantage estimate functional. @@ -656,7 +694,10 @@ def __init__( self.register_buffer("lmbda", torch.tensor(lmbda, device=device)) self.value_network = value_network self.value_key = value_key - if value_key not in value_network.out_keys: + if ( + hasattr(value_network, "out_keys") + and value_key not in value_network.out_keys + ): raise KeyError( f"value key '{value_key}' not found in value network out_keys." ) @@ -745,6 +786,11 @@ def forward( ) reward = tensordict.get(("next", "reward")) gamma, lmbda = self.gamma, self.lmbda + reward = tensordict.get(("next", "reward")) + steps_to_next_obs = tensordict.get("steps_to_next_obs", None) + if steps_to_next_obs is not None: + gamma = gamma ** steps_to_next_obs.view_as(reward) + kwargs = {} if self.is_functional and params is None: raise RuntimeError( @@ -785,3 +831,51 @@ def forward( tensordict.set(self.value_target_key, value_target) return tensordict + + def value_estimate( + self, + tensordict, + params: Optional[TensorDictBase] = None, + target_params: Optional[TensorDictBase] = None, + **kwargs, + ): + if tensordict.batch_dims < 1: + raise RuntimeError( + "Expected input tensordict to have at least one dimensions, got" + f"tensordict.batch_size = {tensordict.batch_size}" + ) + reward = tensordict.get(("next", "reward")) + gamma, lmbda = self.gamma, self.lmbda + steps_to_next_obs = tensordict.get("steps_to_next_obs", None) + if steps_to_next_obs is not None: + gamma = gamma ** steps_to_next_obs.view_as(reward) + + if self.is_functional and params is None: + raise RuntimeError( + "Expected params to be passed to advantage module but got none." 
+ ) + if params is not None: + kwargs["params"] = params + with hold_out_net(self.value_network): + # we may still need to pass gradient, but we don't want to assign grads to + # value net params + self.value_network(tensordict, **kwargs) + + value = tensordict.get(self.value_key) + + step_td = step_mdp(tensordict) + if target_params is not None: + # we assume that target parameters are not differentiable + kwargs["params"] = target_params + elif "params" in kwargs: + kwargs["params"] = kwargs["params"].detach() + with hold_out_net(self.value_network): + # we may still need to pass gradient, but we don't want to assign grads to + # value net params + self.value_network(step_td, **kwargs) + next_value = step_td.get(self.value_key) + done = tensordict.get(("next", "done")) + _, value_target = vec_generalized_advantage_estimate( + gamma, lmbda, value, next_value, reward, done + ) + return value_target From 01f1ae70fcf3b18f1396f34e8844654553a25fea Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 27 Mar 2023 18:28:34 +0100 Subject: [PATCH 18/89] amend --- test/test_cost.py | 158 ++++++---- torchrl/collectors/collectors.py | 6 +- torchrl/data/datasets/d4rl.py | 2 +- torchrl/data/replay_buffers/replay_buffers.py | 2 +- torchrl/data/replay_buffers/samplers.py | 2 +- torchrl/data/tensor_specs.py | 6 +- torchrl/envs/common.py | 10 +- torchrl/envs/libs/dm_control.py | 4 +- torchrl/envs/libs/utils.py | 2 +- torchrl/envs/transforms/r3m.py | 2 +- torchrl/envs/transforms/transforms.py | 22 +- torchrl/envs/transforms/vip.py | 2 +- torchrl/envs/utils.py | 10 +- torchrl/envs/vec_env.py | 6 +- torchrl/modules/models/models.py | 3 +- torchrl/modules/tensordict_module/actors.py | 8 +- torchrl/modules/tensordict_module/common.py | 2 +- .../tensordict_module/probabilistic.py | 2 +- torchrl/objectives/a2c.py | 73 ++++- torchrl/objectives/common.py | 10 +- torchrl/objectives/ddpg.py | 53 +++- torchrl/objectives/deprecated.py | 88 ++++-- torchrl/objectives/dqn.py | 16 +- torchrl/objectives/dreamer.py | 94 ++++-- torchrl/objectives/iql.py | 88 +++--- torchrl/objectives/ppo.py | 269 ++++++++++++++---- torchrl/objectives/redq.py | 90 +++--- torchrl/objectives/reinforce.py | 85 +++++- torchrl/objectives/sac.py | 197 +++++++------ torchrl/objectives/td3.py | 69 +++-- torchrl/objectives/value/advantages.py | 175 +++++++----- torchrl/objectives/value/functional.py | 36 ++- torchrl/objectives/value/utils.py | 2 +- torchrl/record/recorder.py | 4 +- torchrl/trainers/helpers/collectors.py | 2 +- torchrl/trainers/helpers/envs.py | 4 +- torchrl/trainers/trainers.py | 12 +- 37 files changed, 1084 insertions(+), 532 deletions(-) diff --git a/test/test_cost.py b/test/test_cost.py index 409bfa0e540..e09364ca69d 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. 
import argparse -import re from copy import deepcopy from packaging import version as pack_version @@ -88,7 +87,12 @@ from torchrl.objectives.deprecated import DoubleREDQLoss_deprecated, REDQLoss_deprecated from torchrl.objectives.redq import REDQLoss from torchrl.objectives.reinforce import ReinforceLoss -from torchrl.objectives.utils import HardUpdate, hold_out_net, SoftUpdate +from torchrl.objectives.utils import ( + HardUpdate, + hold_out_net, + SoftUpdate, + ValueFunctions, +) from torchrl.objectives.value.advantages import GAE, TD1Estimate, TDLambdaEstimate from torchrl.objectives.value.functional import ( generalized_advantage_estimate, @@ -295,7 +299,8 @@ def _create_seq_mock_data_dqn( @pytest.mark.parametrize( "action_spec_type", ("nd_bounded", "one_hot", "categorical") ) - def test_dqn(self, delay_value, device, action_spec_type): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_dqn(self, delay_value, device, action_spec_type, td_est): torch.manual_seed(self.seed) actor = self._create_mock_actor( action_spec_type=action_spec_type, device=device @@ -304,6 +309,12 @@ def test_dqn(self, delay_value, device, action_spec_type): action_spec_type=action_spec_type, device=device ) loss_fn = DQNLoss(actor, loss_function="l2", delay_value=delay_value) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) with _check_td_steady(td): loss = loss_fn(td) assert loss_fn.priority_key in td.keys() @@ -388,8 +399,9 @@ def test_dqn_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9): @pytest.mark.parametrize( "action_spec_type", ("mult_one_hot", "one_hot", "categorical") ) + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) def test_distributional_dqn( - self, atoms, delay_value, device, action_spec_type, gamma=0.9 + self, atoms, delay_value, device, action_spec_type, td_est, gamma=0.9 ): torch.manual_seed(self.seed) actor = self._create_mock_distributional_actor( @@ -401,6 +413,13 @@ def test_distributional_dqn( ).to(device) loss_fn = DistributionalDQNLoss(actor, gamma=gamma, delay_value=delay_value) + if td_est not in (None, ValueFunctions.TD0): + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + elif td_est is not None: + loss_fn.make_value_function(td_est) + with _check_td_steady(td): loss = loss_fn(td) assert loss_fn.priority_key in td.keys() @@ -529,7 +548,8 @@ def _create_seq_mock_data_ddpg( ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("delay_actor,delay_value", [(False, False), (True, True)]) - def test_ddpg(self, delay_actor, delay_value, device): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_ddpg(self, delay_actor, delay_value, device, td_est): torch.manual_seed(self.seed) actor = self._create_mock_actor(device=device) value = self._create_mock_value(device=device) @@ -537,11 +557,17 @@ def test_ddpg(self, delay_actor, delay_value, device): loss_fn = DDPGLoss( actor, value, - gamma=0.9, loss_function="l2", delay_actor=delay_actor, delay_value=delay_value, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) + with _check_td_steady(td): loss = loss_fn(td) @@ -632,7 +658,6 @@ def test_ddpg_batcher(self, n, delay_actor, delay_value, device, gamma=0.9): 
loss_fn = DDPGLoss( actor, value, - gamma=gamma, loss_function="l2", delay_actor=delay_actor, delay_value=delay_value, @@ -763,6 +788,7 @@ def _create_seq_mock_data_td3( ) @pytest.mark.parametrize("policy_noise", [0.1, 1.0]) @pytest.mark.parametrize("noise_clip", [0.1, 1.0]) + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) def test_td3( self, delay_actor, @@ -770,6 +796,7 @@ def test_td3( device, policy_noise, noise_clip, + td_est, ): torch.manual_seed(self.seed) actor = self._create_mock_actor(device=device) @@ -778,13 +805,18 @@ def test_td3( loss_fn = TD3Loss( actor, value, - gamma=0.9, loss_function="l2", policy_noise=policy_noise, noise_clip=noise_clip, delay_actor=delay_actor, delay_qvalue=delay_qvalue, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -849,7 +881,6 @@ def test_td3_batcher( loss_fn = TD3Loss( actor, value, - gamma=0.9, policy_noise=policy_noise, noise_clip=noise_clip, delay_qvalue=delay_qvalue, @@ -1043,8 +1074,16 @@ def _create_seq_mock_data_sac( @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) def test_sac( - self, delay_value, delay_actor, delay_qvalue, num_qvalue, device, version + self, + delay_value, + delay_actor, + delay_qvalue, + num_qvalue, + device, + version, + td_est, ): if (delay_actor or delay_qvalue) and not delay_value: pytest.skip("incompatible config") @@ -1076,6 +1115,13 @@ def test_sac( **kwargs, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) + with _check_td_steady(td): loss = loss_fn(td) assert loss_fn.priority_key in td.keys() @@ -1445,6 +1491,7 @@ def _create_seq_mock_data_sac( @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("target_entropy_weight", [0.01, 0.5, 0.99]) @pytest.mark.parametrize("target_entropy", ["auto", 1.0, 0.1, 0.0]) + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) def test_discrete_sac( self, delay_qvalue, @@ -1452,6 +1499,7 @@ def test_discrete_sac( device, target_entropy_weight, target_entropy, + td_est, ): torch.manual_seed(self.seed) @@ -1469,12 +1517,17 @@ def test_discrete_sac( qvalue_network=qvalue, num_actions=actor.spec["action"].space.n, num_qvalue_nets=num_qvalue, - gamma=0.9, target_entropy_weight=target_entropy_weight, target_entropy=target_entropy, loss_function="l2", **kwargs, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -1571,7 +1624,6 @@ def test_discrete_sac_batcher( qvalue_network=qvalue, num_actions=actor.spec["action"].space.n, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", target_entropy_weight=target_entropy_weight, target_entropy=target_entropy, @@ -1807,7 +1859,6 @@ def test_redq(self, delay_qvalue, num_qvalue, device): actor_network=actor, qvalue_network=qvalue, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", delay_qvalue=delay_qvalue, ) @@ -1892,7 +1943,6 @@ def test_redq_shared(self, 
delay_qvalue, num_qvalue, device): actor_network=actor, qvalue_network=qvalue, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", delay_qvalue=delay_qvalue, target_entropy=0.0, @@ -1997,7 +2047,6 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device): actor_network=deepcopy(actor), qvalue_network=deepcopy(qvalue), num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", delay_qvalue=delay_qvalue, ) @@ -2009,7 +2058,6 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device): actor_network=deepcopy(actor), qvalue_network=deepcopy(qvalue), num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", ) @@ -2041,7 +2089,6 @@ def test_redq_batcher(self, n, delay_qvalue, num_qvalue, device, gamma=0.9): actor_network=actor, qvalue_network=qvalue, num_qvalue_nets=num_qvalue, - gamma=0.9, loss_function="l2", delay_qvalue=delay_qvalue, ) @@ -2240,7 +2287,7 @@ def _create_seq_mock_data_ppo( @pytest.mark.parametrize("loss_class", (PPOLoss, ClipPPOLoss, KLPENPPOLoss)) @pytest.mark.parametrize("gradient_mode", (True, False)) - @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda")) + @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) def test_ppo(self, loss_class, device, gradient_mode, advantage): torch.manual_seed(self.seed) @@ -2260,15 +2307,14 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage): advantage = TDLambdaEstimate( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) + elif advantage is None: + pass else: raise NotImplementedError - loss_fn = loss_class(actor, value, gamma=0.9, loss_critic_type="l2") - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = loss_fn(td) - advantage(td) + loss_fn = loss_class(actor, value, loss_critic_type="l2") + if advantage is not None: + advantage(td) loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2302,7 +2348,7 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage): actor.zero_grad() @pytest.mark.parametrize("loss_class", (PPOLoss, ClipPPOLoss, KLPENPPOLoss)) - @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda")) + @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) def test_ppo_shared(self, loss_class, device, advantage): torch.manual_seed(self.seed) @@ -2326,20 +2372,18 @@ def test_ppo_shared(self, loss_class, device, advantage): lmbda=0.9, value_network=value, ) + elif advantage is None: + pass else: raise NotImplementedError loss_fn = loss_class( actor, value, - gamma=0.9, loss_critic_type="l2", ) - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = loss_fn(td) - advantage(td) + if advantage is not None: + advantage(td) loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2377,7 +2421,7 @@ def test_ppo_shared(self, loss_class, device, advantage): ) @pytest.mark.parametrize("loss_class", (PPOLoss, ClipPPOLoss, KLPENPPOLoss)) @pytest.mark.parametrize("gradient_mode", (True, False)) - @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda")) + @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) def test_ppo_diff(self, loss_class, device, 
gradient_mode, advantage): if pack_version.parse(torch.__version__) > pack_version.parse("1.14"): @@ -2399,6 +2443,8 @@ def test_ppo_diff(self, loss_class, device, gradient_mode, advantage): advantage = TDLambdaEstimate( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) + elif advantage is None: + pass else: raise NotImplementedError @@ -2409,11 +2455,8 @@ def test_ppo_diff(self, loss_class, device, gradient_mode, advantage): for p in params: p.data.zero_() # assert len(list(floss_fn.parameters())) == 0 - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = floss_fn(params, buffers, td) - advantage(td) + if advantage is not None: + advantage(td) loss = floss_fn(params, buffers, td) loss_critic = loss["loss_critic"] @@ -2515,7 +2558,7 @@ def _create_seq_mock_data_a2c( return td @pytest.mark.parametrize("gradient_mode", (True, False)) - @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda")) + @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) def test_a2c(self, device, gradient_mode, advantage): torch.manual_seed(self.seed) @@ -2535,10 +2578,12 @@ def test_a2c(self, device, gradient_mode, advantage): advantage = TDLambdaEstimate( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) + elif advantage is None: + pass else: raise NotImplementedError - loss_fn = A2CLoss(actor, value, gamma=0.9, loss_critic_type="l2") + loss_fn = A2CLoss(actor, value, loss_critic_type="l2") # Check error is raised when actions require grads td["action"].requires_grad = True @@ -2550,12 +2595,8 @@ def test_a2c(self, device, gradient_mode, advantage): td["action"].requires_grad = False td = td.exclude(loss_fn.value_target_key) - - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = loss_fn(td) - advantage(td) + if advantage is not None: + advantage(td) loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2589,7 +2630,7 @@ def test_a2c(self, device, gradient_mode, advantage): not _has_functorch, reason=f"functorch not found, {FUNCTORCH_ERR}" ) @pytest.mark.parametrize("gradient_mode", (True, False)) - @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda")) + @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) def test_a2c_diff(self, device, gradient_mode, advantage): if pack_version.parse(torch.__version__) > pack_version.parse("1.14"): @@ -2611,18 +2652,17 @@ def test_a2c_diff(self, device, gradient_mode, advantage): advantage = TDLambdaEstimate( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) + elif advantage is None: + pass else: raise NotImplementedError - loss_fn = A2CLoss(actor, value, gamma=0.9, loss_critic_type="l2") + loss_fn = A2CLoss(actor, value, loss_critic_type="l2") floss_fn, params, buffers = make_functional_with_buffers(loss_fn) - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = floss_fn(params, buffers, td) - advantage(td) + if advantage is not None: + advantage(td) loss = floss_fn(params, buffers, td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2655,7 +2695,7 @@ def test_a2c_diff(self, device, gradient_mode, advantage): class TestReinforce: 
@pytest.mark.parametrize("delay_value", [True, False]) @pytest.mark.parametrize("gradient_mode", [True, False]) - @pytest.mark.parametrize("advantage", ["gae", "td", "td_lambda"]) + @pytest.mark.parametrize("advantage", ["gae", "td", "td_lambda", None]) def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): n_obs = 3 n_act = 5 @@ -2691,13 +2731,14 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): value_network=get_functional(value_net), differentiable=gradient_mode, ) + elif advantage is None: + pass else: raise NotImplementedError loss_fn = ReinforceLoss( actor_net, critic=value_net, - gamma=gamma, delay_value=delay_value, ) @@ -2714,12 +2755,9 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): [batch], ) - with pytest.raises( - KeyError, match=re.escape('key "advantage" not found in TensorDict with') - ): - _ = loss_fn(td) params = TensorDict(value_net.state_dict(), []).unflatten_keys(".") - advantage(td, params=params) + if advantage is not None: + advantage(td, params=params) loss_td = loss_fn(td) autograd.grad( loss_td.get("loss_actor"), @@ -3270,7 +3308,6 @@ def test_iql( qvalue_network=qvalue, value_network=value, num_qvalue_nets=num_qvalue, - gamma=0.9, temperature=temperature, expectile=expectile, loss_function="l2", @@ -3385,7 +3422,6 @@ def test_iql_batcher( qvalue_network=qvalue, value_network=value, num_qvalue_nets=num_qvalue, - gamma=0.9, temperature=temperature, expectile=expectile, loss_function="l2", diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 853c6c8970e..7bd1f92a1d1 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -635,7 +635,7 @@ def set_seed(self, seed: int, static_seed: bool = False) -> int: Args: seed (int): integer representing the seed to be used for the environment. - static_seed(bool, optional): if True, the seed is not incremented. + static_seed(bool, optional): if ``True``, the seed is not incremented. Defaults to False Returns: @@ -1263,7 +1263,7 @@ def set_seed(self, seed: int, static_seed: bool = False) -> int: Args: seed: integer representing the seed to be used for the environment. - static_seed (bool, optional): if True, the seed is not incremented. + static_seed (bool, optional): if ``True``, the seed is not incremented. Defaults to False Returns: @@ -1840,7 +1840,7 @@ class aSyncDataCollector(MultiaSyncDataCollector): the output TensorDict will be stored. For long trajectories, it may be necessary to store the data on a different. device than the one where the policy is stored. Default is None. - update_at_each_batch (bool): if True, the policy weights will be updated every time a batch of trajectories + update_at_each_batch (bool): if ``True``, the policy weights will be updated every time a batch of trajectories is collected. default=False diff --git a/torchrl/data/datasets/d4rl.py b/torchrl/data/datasets/d4rl.py index 087793937f3..d80ecdf74fa 100644 --- a/torchrl/data/datasets/d4rl.py +++ b/torchrl/data/datasets/d4rl.py @@ -47,7 +47,7 @@ class D4RLExperienceReplay(TensorDictReplayBuffer): using multithreading. transform (Transform, optional): Transform to be executed when sample() is called. To chain transforms use the :obj:`Compose` class. - split_trajs (bool, optional): if True, the trajectories will be split + split_trajs (bool, optional): if ``True``, the trajectories will be split along the first dimension and padded to have a matching shape. 
To split the trajectories, the ``"done"`` signal will be used, which is recovered via ``done = timeout | terminal``. In other words, diff --git a/torchrl/data/replay_buffers/replay_buffers.py b/torchrl/data/replay_buffers/replay_buffers.py index 2ad9b3d65b9..0a20dc6dff7 100644 --- a/torchrl/data/replay_buffers/replay_buffers.py +++ b/torchrl/data/replay_buffers/replay_buffers.py @@ -561,7 +561,7 @@ class TensorDictPrioritizedReplayBuffer(TensorDictReplayBuffer): mini-batch of Tensor(s)/outputs. Used when using batched loading from a map-style dataset. pin_memory (bool, optional): whether pin_memory() should be called on - the rb samples. Default is :obj:`False`. + the rb samples. Default is ``False``. prefetch (int, optional): number of next batches to be prefetched using multithreading. transform (Transform, optional): Transform to be executed when sample() is called. diff --git a/torchrl/data/replay_buffers/samplers.py b/torchrl/data/replay_buffers/samplers.py index 4d180d35b49..9fd0fab8af4 100644 --- a/torchrl/data/replay_buffers/samplers.py +++ b/torchrl/data/replay_buffers/samplers.py @@ -76,7 +76,7 @@ class SamplerWithoutReplacement(Sampler): """A data-consuming sampler that ensures that the same sample is not present in consecutive batches. Args: - drop_last (bool, optional): if True, the last incomplete sample (if any) will be dropped. + drop_last (bool, optional): if ``True``, the last incomplete sample (if any) will be dropped. If False, this last sample will be kept and (unlike with torch dataloaders) completed with other samples from a fresh indices permutation. diff --git a/torchrl/data/tensor_specs.py b/torchrl/data/tensor_specs.py index 2614dddea4f..e07796028b7 100644 --- a/torchrl/data/tensor_specs.py +++ b/torchrl/data/tensor_specs.py @@ -2345,7 +2345,7 @@ def keys( :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next"]. Default is ``False``, i.e. nested keys will not be returned. - leaves_only (bool, optional): if :obj:`False`, the values returned + leaves_only (bool, optional): if ``False``, the values returned will contain every level of nesting, i.e. a :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next", ("next", "obs")]`. Default is ``False``. @@ -2367,7 +2367,7 @@ def items( :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next"]. Default is ``False``, i.e. nested keys will not be returned. - leaves_only (bool, optional): if :obj:`False`, the values returned + leaves_only (bool, optional): if ``False``, the values returned will contain every level of nesting, i.e. a :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next", ("next", "obs")]`. Default is ``False``. @@ -2395,7 +2395,7 @@ def values( :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next"]. Default is ``False``, i.e. nested keys will not be returned. - leaves_only (bool, optional): if :obj:`False`, the values returned + leaves_only (bool, optional): if ``False``, the values returned will contain every level of nesting, i.e. a :obj:`CompositeSpec(next=CompositeSpec(obs=None))` will lead to the keys :obj:`["next", ("next", "obs")]`. Default is ``False``. 
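The test updates above all exercise the new estimator-selection API through the same pattern: parametrize over ``ValueFunctions``, call ``make_value_function`` on the loss, and expect ``NotImplementedError`` for families the loss rejects (for instance GAE on the value-based losses). The helper below condenses that pattern as a sketch; ``check_estimators`` is a hypothetical name introduced here for illustration, not part of this patch.

import pytest

from torchrl.objectives.utils import ValueFunctions


def check_estimators(loss_fn, unsupported=(ValueFunctions.GAE,)):
    """Configure every estimator family on ``loss_fn``; unsupported ones must raise."""
    for td_est in list(ValueFunctions) + [None]:
        if td_est in unsupported:
            with pytest.raises(NotImplementedError):
                loss_fn.make_value_function(td_est)
            continue
        if td_est is not None:
            # hyperparameters default to default_value_kwargs(td_est) and can be
            # overridden, e.g. make_value_function(ValueFunctions.TDLambda, gamma=0.99, lmbda=0.95)
            loss_fn.make_value_function(td_est)
        # when td_est is None, the loss falls back to its _default_value_function()


# usage sketch: check_estimators(DQNLoss(actor, loss_function="l2", delay_value=True))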
diff --git a/torchrl/envs/common.py b/torchrl/envs/common.py index b6a66b44663..1672867e652 100644 --- a/torchrl/envs/common.py +++ b/torchrl/envs/common.py @@ -127,7 +127,7 @@ class EnvBase(nn.Module, metaclass=abc.ABCMeta): - reward_spec (TensorSpec): sampling spec of the rewards; - batch_size (torch.Size): number of environments contained in the instance; - device (torch.device): device where the env input and output are expected to live - - run_type_checks (bool): if True, the observation and reward dtypes + - run_type_checks (bool): if ``True``, the observation and reward dtypes will be compared against their respective spec and an exception will be raised if they don't match. Defaults to False. @@ -538,7 +538,7 @@ def set_seed( Args: seed (int): seed to be set - static_seed (bool, optional): if True, the seed is not incremented. + static_seed (bool, optional): if ``True``, the seed is not incremented. Defaults to False Returns: @@ -651,11 +651,11 @@ def rollout( actions will be called using :obj:`env.rand_step()` default = None callback (callable, optional): function to be called at each iteration with the given TensorDict. - auto_reset (bool, optional): if True, resets automatically the environment + auto_reset (bool, optional): if ``True``, resets automatically the environment if it is in a done state when the rollout is initiated. Default is :obj:`True`. - auto_cast_to_device (bool, optional): if True, the device of the tensordict is automatically cast to the - policy device before the policy is used. Default is :obj:`False`. + auto_cast_to_device (bool, optional): if ``True``, the device of the tensordict is automatically cast to the + policy device before the policy is used. Default is ``False``. break_when_any_done (bool): breaks if any of the done state is True. If False, a reset() is called on the sub-envs that are done. Default is True. return_contiguous (bool): if False, a LazyStackedTensorDict will be returned. Default is True. diff --git a/torchrl/envs/libs/dm_control.py b/torchrl/envs/libs/dm_control.py index 34bcfa2580a..9097c6ca1d2 100644 --- a/torchrl/envs/libs/dm_control.py +++ b/torchrl/envs/libs/dm_control.py @@ -116,7 +116,7 @@ class DMControlWrapper(GymLikeEnv): Args: env (dm_control.suite env): environment instance - from_pixels (bool): if True, the observation + from_pixels (bool): if ``True``, the observation Examples: >>> env = dm_control.suite.load("cheetah", "run") @@ -272,7 +272,7 @@ class DMControlEnv(DMControlWrapper): env_name (str): name of the environment task_name (str): name of the task seed (int, optional): seed to use for the environment - from_pixels (bool, optional): if True, the observation will be returned + from_pixels (bool, optional): if ``True``, the observation will be returned as an image. Default is False. diff --git a/torchrl/envs/libs/utils.py b/torchrl/envs/libs/utils.py index 84efb840db8..d157ee06d3c 100644 --- a/torchrl/envs/libs/utils.py +++ b/torchrl/envs/libs/utils.py @@ -37,7 +37,7 @@ class GymPixelObservationWrapper(ObservationWrapper): env: The environment to wrap. pixels_only: If :obj:`True` (default), the original observation returned by the wrapped environment will be discarded, and a dictionary - observation will only include pixels. If :obj:`False`, the + observation will only include pixels. If ``False``, the observation dictionary will contain both the original observations and the pixel observations. 
render_kwargs: Optional :obj:`dict` containing keyword arguments passed diff --git a/torchrl/envs/transforms/r3m.py b/torchrl/envs/transforms/r3m.py index 6f6db79edd9..938e67c08a9 100644 --- a/torchrl/envs/transforms/r3m.py +++ b/torchrl/envs/transforms/r3m.py @@ -215,7 +215,7 @@ class R3MTransform(Compose): argument will be treaded separetely and each will be given a single, separated entry in the output tensordict. Defaults to :obj:`True`. download (bool, torchvision Weights config or corresponding string): - if True, the weights will be downloaded using the torch.hub download + if ``True``, the weights will be downloaded using the torch.hub download API (i.e. weights will be cached for future use). These weights are the original weights from the R3M publication. If the torchvision weights are needed, there are two ways they can be diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py index a34d0bb8bed..6a0dd6be2b8 100644 --- a/torchrl/envs/transforms/transforms.py +++ b/torchrl/envs/transforms/transforms.py @@ -396,7 +396,7 @@ class TransformedEnv(EnvBase): transform (Transform, optional): transform to apply to the tensordict resulting from :obj:`env.step(td)`. If none is provided, an empty Compose placeholder in an eval mode is used. - cache_specs (bool, optional): if True, the specs will be cached once + cache_specs (bool, optional): if ``True``, the specs will be cached once and for all after the first call (i.e. the specs will be transformed_in only once). If the transform changes during training, the original spec transform may not be valid anymore, @@ -880,7 +880,7 @@ class ToTensorImage(ObservationTransform): with values between 0 and 1. Args: - unsqueeze (bool): if True, the observation tensor is unsqueezed + unsqueeze (bool): if ``True``, the observation tensor is unsqueezed along the first dimension. default=False. dtype (torch.dtype, optional): dtype to use for the resulting observations. @@ -1154,7 +1154,7 @@ class FlattenObservation(ObservationTransform): :obj:`["pixels"]` is assumed. out_keys (sequence of str, optional): the flatten observation keys. If none is provided, :obj:`in_keys` is assumed. - allow_positive_dim (bool, optional): if True, positive dimensions are accepted. + allow_positive_dim (bool, optional): if ``True``, positive dimensions are accepted. :obj:`FlattenObservation` will map these to the n^th feature dimension (ie n^th dimension after batch size of parent env) of the input tensor. Defaults to False, ie. non-negative dimensions are not permitted. @@ -1229,7 +1229,7 @@ class UnsqueezeTransform(Transform): Args: unsqueeze_dim (int): dimension to unsqueeze. Must be negative (or allow_positive_dim must be turned on). - allow_positive_dim (bool, optional): if True, positive dimensions are accepted. + allow_positive_dim (bool, optional): if ``True``, positive dimensions are accepted. :obj:`UnsqueezeTransform` will map these to the n^th feature dimension (ie n^th dimension after batch size of parent env) of the input tensor, independently from the tensordict batch size (ie positive dims may be @@ -1414,7 +1414,7 @@ class ObservationNorm(ObservationTransform): only the forward transform will be called. out_keys_inv (list of int, optional): output entries for the inverse transform. Defaults to the value of `in_keys_inv`. - standard_normal (bool, optional): if True, the transform will be + standard_normal (bool, optional): if ``True``, the transform will be .. 
math:: obs = (obs-loc)/scale @@ -1831,7 +1831,7 @@ class RewardScaling(Transform): Args: loc (number or torch.Tensor): location of the affine transform scale (number or torch.Tensor): scale of the affine transform - standard_normal (bool, optional): if True, the transform will be + standard_normal (bool, optional): if ``True``, the transform will be .. math:: reward = (reward-loc)/scale @@ -1993,9 +1993,9 @@ class CatTensors(Transform): out_key: key of the resulting tensor. dim (int, optional): dimension along which the concatenation will occur. Default is -1. - del_keys (bool, optional): if True, the input values will be deleted after + del_keys (bool, optional): if ``True``, the input values will be deleted after concatenation. Default is True. - unsqueeze_if_oor (bool, optional): if True, CatTensor will check that + unsqueeze_if_oor (bool, optional): if ``True``, CatTensor will check that the dimension indicated exist for the tensors to concatenate. If not, the tensors will be unsqueezed along that dimension. Default is False. @@ -2168,7 +2168,7 @@ class DiscreteActionProjection(Transform): num_actions_effective (int): max number of action considered. max_actions (int): maximum number of actions that this module can read. action_key (str, optional): key name of the action. Defaults to "action". - include_forward (bool, optional): if True, a call to forward will also + include_forward (bool, optional): if ``True``, a call to forward will also map the action from one domain to the other when the module is called by a replay buffer or an nn.Module chain. Defaults to True. @@ -2383,7 +2383,7 @@ class TensorDictPrimer(Transform): Args: primers (dict, optional): a dictionary containing key-spec pairs which will be used to populate the input tensordict. - random (bool, optional): if True, the values will be drawn randomly from + random (bool, optional): if ``True``, the values will be drawn randomly from the TensorSpec domain (or a unit Gaussian if unbounded). Otherwise a fixed value will be assumed. Defaults to `False`. default_value (float, optional): if non-random filling is chosen, this @@ -2771,7 +2771,7 @@ def build_td_for_shared_vecnorm( tensordict keys (iterable of str, optional): keys that have to be normalized. Default is `["next", "reward"]` - memmap (bool): if True, the resulting tensordict will be cast into + memmap (bool): if ``True``, the resulting tensordict will be cast into memmory map (using `memmap_()`). Otherwise, the tensordict will be placed in shared memory. diff --git a/torchrl/envs/transforms/vip.py b/torchrl/envs/transforms/vip.py index 4ba18177c56..2795439ef17 100644 --- a/torchrl/envs/transforms/vip.py +++ b/torchrl/envs/transforms/vip.py @@ -181,7 +181,7 @@ class VIPTransform(Compose): argument will be treaded separetely and each will be given a single, separated entry in the output tensordict. Defaults to :obj:`True`. download (bool, torchvision Weights config or corresponding string): - if True, the weights will be downloaded using the torch.hub download + if ``True``, the weights will be downloaded using the torch.hub download API (i.e. weights will be cached for future use). These weights are the original weights from the VIP publication. 
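# Sketch of the ``standard_normal`` convention documented above for the affine
# transforms: with ``standard_normal=True`` the map is
# ``value = (value - loc) / scale``. The numbers below are arbitrary.
from torchrl.envs.transforms import RewardScaling

scale_reward = RewardScaling(loc=0.0, scale=5.0, standard_normal=True)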
If the torchvision weights are needed, there are two ways they can be diff --git a/torchrl/envs/utils.py b/torchrl/envs/utils.py index f8ff52d6af2..c0bc3e2b6c4 100644 --- a/torchrl/envs/utils.py +++ b/torchrl/envs/utils.py @@ -39,17 +39,17 @@ def step_mdp( Args: tensordict (TensorDictBase): tensordict with keys to be renamed next_tensordict (TensorDictBase, optional): destination tensordict - keep_other (bool, optional): if True, all keys that do not start with :obj:`'next_'` will be kept. + keep_other (bool, optional): if ``True``, all keys that do not start with :obj:`'next_'` will be kept. Default is ``True``. - exclude_reward (bool, optional): if True, the :obj:`"reward"` key will be discarded + exclude_reward (bool, optional): if ``True``, the :obj:`"reward"` key will be discarded from the resulting tensordict. If ``False``, it will be copied (and replaced) from the ``"next"`` entry (if present). Default is ``False``. - exclude_done (bool, optional): if True, the :obj:`"done"` key will be discarded + exclude_done (bool, optional): if ``True``, the :obj:`"done"` key will be discarded from the resulting tensordict. If ``False``, it will be copied (and replaced) from the ``"next"`` entry (if present). Default is ``False``. - exclude_action (bool, optional): if True, the :obj:`"action"` key will + exclude_action (bool, optional): if ``True``, the :obj:`"action"` key will be discarded from the resulting tensordict. If ``False``, it will be kept in the root tensordict (since it should not be present in the ``"next"`` entry). @@ -232,7 +232,7 @@ def check_env_specs(env, return_contiguous=True, check_dtype=True, seed=0): Args: env (EnvBase): the env for which the specs have to be checked against data. - return_contiguous (bool, optional): if True, the random rollout will be called with + return_contiguous (bool, optional): if ``True``, the random rollout will be called with return_contiguous=True. This will fail in some cases (e.g. heterogeneous shapes of inputs/outputs). Defaults to True. check_dtype (bool, optional): if False, dtype checks will be skipped. diff --git a/torchrl/envs/vec_env.py b/torchrl/envs/vec_env.py index 5250b78dccf..c519b591728 100644 --- a/torchrl/envs/vec_env.py +++ b/torchrl/envs/vec_env.py @@ -107,7 +107,7 @@ class _BatchedEnv(EnvBase): needed, which comes with a slight compute overhead; create_env_kwargs (dict or list of dicts, optional): kwargs to be used with the environments being created; pin_memory (bool): if True and device is "cpu", calls :obj:`pin_memory` on the tensordicts when created. - share_individual_td (bool, optional): if True, a different tensordict is created for every process/worker and a lazy + share_individual_td (bool, optional): if ``True``, a different tensordict is created for every process/worker and a lazy stack is returned. default = None (False if single task); shared_memory (bool): whether or not the returned tensordict will be placed in shared memory; @@ -119,9 +119,9 @@ class _BatchedEnv(EnvBase): It is assumed that all environments will run on the same device as a common shared tensordict will be used to pass data from process to process. The device can be changed after instantiation using :obj:`env.to(device)`. - allow_step_when_done (bool, optional): if True, batched environments can + allow_step_when_done (bool, optional): if ``True``, batched environments can execute steps after a done state is encountered. - Defaults to :obj:`False`. + Defaults to ``False``. 
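# Illustrative sketch of ``step_mdp`` with the flags documented above (the
# tensordict contents are hypothetical): the entries stored under "next"
# become the new root entries, and ``exclude_action=True`` drops the action.
import torch
from tensordict.tensordict import TensorDict
from torchrl.envs.utils import step_mdp

data = TensorDict(
    {
        "observation": torch.zeros(3),
        "action": torch.zeros(1),
        "next": TensorDict(
            {
                "observation": torch.ones(3),
                "reward": torch.ones(1),
                "done": torch.zeros(1, dtype=torch.bool),
            },
            [],
        ),
    },
    [],
)
data_tp1 = step_mdp(data, exclude_action=True)  # data_tp1["observation"] is the "next" observation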
""" diff --git a/torchrl/modules/models/models.py b/torchrl/modules/models/models.py index cefca9b595c..575c12daa74 100644 --- a/torchrl/modules/models/models.py +++ b/torchrl/modules/models/models.py @@ -172,7 +172,6 @@ def __init__( _out_features_num = out_features if not isinstance(out_features, Number): - print(out_features, type(out_features)) _out_features_num = prod(out_features) self.out_features = out_features self._out_features_num = _out_features_num @@ -725,7 +724,7 @@ class DdpgCnnActor(nn.Module): 'bias_last_layer': True, } use_avg_pooling (bool, optional): if ``True``, a nn.AvgPooling layer is - used to aggregate the output. Default is :obj:`False`. + used to aggregate the output. Default is ``False``. device (Optional[DEVICE_TYPING]): device to create the module on. """ diff --git a/torchrl/modules/tensordict_module/actors.py b/torchrl/modules/tensordict_module/actors.py index ad191e7d14c..635fc90ca21 100644 --- a/torchrl/modules/tensordict_module/actors.py +++ b/torchrl/modules/tensordict_module/actors.py @@ -52,7 +52,7 @@ class Actor(SafeModule): occur because of exploration policies or numerical under/overflow issues. If this value is out of bounds, it is projected back onto the desired space using the :obj:`TensorSpec.project` - method. Default is :obj:`False`. + method. Default is ``False``. Examples: >>> import torch @@ -142,7 +142,7 @@ class ProbabilisticActor(SafeProbabilisticTensorDictSequential): occur because of exploration policies or numerical under/overflow issues. If this value is out of bounds, it is projected back onto the desired space using the :obj:`TensorSpec.project` - method. Default is :obj:`False`. + method. Default is ``False``. default_interaction_mode (str, optional): keyword-only argument. Default method to be used to retrieve the output value. Should be one of: 'mode', 'median', 'mean' or 'random' @@ -586,7 +586,7 @@ class QValueActor(Actor): occur because of exploration policies or numerical under/overflow issues. If this value is out of bounds, it is projected back onto the desired space using the :obj:`TensorSpec.project` - method. Default is :obj:`False`. + method. Default is ``False``. action_space (str, optional): The action space to be considered. Must be one of ``"one-hot"``, ``"mult_one_hot"``, ``"binary"`` or ``"categorical"``. @@ -659,7 +659,7 @@ class DistributionalQValueActor(QValueActor): occur because of exploration policies or numerical under/overflow issues. If this value is out of bounds, it is projected back onto the desired space using the :obj:`TensorSpec.project` - method. Default is :obj:`False`. + method. Default is ``False``. support (torch.Tensor): support of the action values. action_space (str, optional): The action space to be considered. Must be one of diff --git a/torchrl/modules/tensordict_module/common.py b/torchrl/modules/tensordict_module/common.py index 07486627f2b..0b12eaa2e82 100644 --- a/torchrl/modules/tensordict_module/common.py +++ b/torchrl/modules/tensordict_module/common.py @@ -119,7 +119,7 @@ class SafeModule(TensorDictModule): occur because of exploration policies or numerical under/overflow issues. If this value is out of bounds, it is projected back onto the desired space using the :obj:`TensorSpec.project` - method. Default is :obj:`False`. + method. Default is ``False``. Embedding a neural network in a TensorDictModule only requires to specify the input and output keys. The domain spec can be passed along if needed. TensorDictModule support functional and regular :obj:`nn.Module` objects. 
In the functional diff --git a/torchrl/modules/tensordict_module/probabilistic.py b/torchrl/modules/tensordict_module/probabilistic.py index f139f652031..299a8621c77 100644 --- a/torchrl/modules/tensordict_module/probabilistic.py +++ b/torchrl/modules/tensordict_module/probabilistic.py @@ -65,7 +65,7 @@ class SafeProbabilisticModule(ProbabilisticTensorDictModule): check will only occur for the distribution sample, but not the other tensors returned by the input module. If the sample is out of bounds, it is projected back onto the desired space using the `TensorSpec.project` method. - Default is :obj:`False`. + Default is ``False``. default_interaction_mode (str, optional): default method to be used to retrieve the output value. Should be one of: 'mode', 'median', 'mean' or 'random' (in which case the value is sampled randomly from the distribution). Default diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index a44557396ca..644324416f3 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -11,7 +11,8 @@ from torch import distributions as d from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import distance_loss +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate class A2CLoss(LossModule): @@ -29,26 +30,47 @@ class A2CLoss(LossModule): critic (ValueOperator): value operator. advantage_key (str): the input tensordict key where the advantage is expected to be written. default: "advantage" - advantage_diff_key (str): the input tensordict key where advantage_diff is expected to be written. - default: "value_error" + value_target_key (str): the input tensordict key where the target state + value is expected to be written. Defaults to ``"value_target"``. + entropy_bonus (bool): if ``True``, an entropy bonus will be added to the + loss to favour exploratory policies. + samples_mc_entropy (int): if the distribution retrieved from the policy + operator does not have a closed form + formula for the entropy, a Monte-Carlo estimate will be used. + ``samples_mc_entropy`` will control how many + samples will be used to compute this estimate. + Defaults to ``1``. entropy_coef (float): the weight of the entropy loss. critic_coef (float): the weight of the critic loss. - gamma (scalar): a discount factor for return computation. - loss_function_type (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". - advantage_module (nn.Module): TensorDictModule used to compute tha advantage function. + loss_critic_type (str): loss function for the value discrepancy. + Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. + + .. note: + The advantage (typically GAE) can be computed by the loss function or + in the training loop. The latter option is usually preferred, but this is + up to the user to choose which option is to be preferred. + If the advantage key (``"advantage`` by default) is not present in the + input tensordict, the advantage will be computed by the :meth:`~.forward` + method. + A custom advantage module can be built using :meth:`~.make_value_function`. + The default is :class:`torchrl.objectives.value.GAE` with hyperparameters + dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
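# Sketch (mirroring the PPO examples further down) of the behaviour described
# in the note above for A2CLoss: if the sampled batch has no "advantage"
# entry, forward() computes it with the default GAE estimator. ``actor``,
# ``critic`` and ``batch`` are assumed to exist.
from torchrl.objectives import A2CLoss

a2c_loss = A2CLoss(actor, critic)
losses = a2c_loss(batch)  # "advantage" missing from batch -> GAE is computed inside forward()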
+ """ + default_value_type: ValueFunctions = ValueFunctions.GAE + def __init__( self, actor: ProbabilisticTensorDictSequential, critic: TensorDictModule, + *, advantage_key: str = "advantage", value_target_key: str = "value_target", entropy_bonus: bool = True, samples_mc_entropy: int = 1, entropy_coef: float = 0.01, critic_coef: float = 1.0, - gamma: float = 0.99, loss_critic_type: str = "smooth_l1", ): super().__init__() @@ -66,7 +88,6 @@ def __init__( self.register_buffer( "critic_coef", torch.tensor(critic_coef, device=self.device) ) - self.register_buffer("gamma", torch.tensor(gamma, device=self.device)) self.loss_critic_type = loss_critic_type def reset(self) -> None: @@ -96,6 +117,8 @@ def _log_probs( def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: try: + # TODO: if the advantage is gathered by forward, this introduces an + # overhead that we could easily reduce. target_return = tensordict.get(self.value_target_key) tensordict_select = tensordict.select(*self.critic.in_keys) state_value = self.critic( @@ -118,8 +141,15 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: return self.critic_coef * loss_value def forward(self, tensordict: TensorDictBase) -> TensorDictBase: - tensordict = tensordict.clone() - advantage = tensordict.get(self.advantage_key) + tensordict = tensordict.clone(False) + advantage = tensordict.get(self.advantage_key, None) + if advantage is None: + self.value_function( + tensordict, + params=self.critic_params, + target_params=self.target_critic_params, + ) + advantage = tensordict.get(self.advantage_key) log_probs, dist = self._log_probs(tensordict) loss = -(log_probs * advantage) td_out = TensorDict({"loss_objective": loss.mean()}, []) @@ -131,3 +161,26 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: loss_critic = self.loss_critic(tensordict).mean() td_out.set("loss_critic", loss_critic.mean()) return td_out + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_value" + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + self._value_function = GAE( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=self.critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index a105b23da98..1cbddfb4deb 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -47,8 +47,15 @@ class LossModule(nn.Module): the various loss values throughout training. Other scalars present in the output tensordict will be logged too. + :cvar defaylt_value_type: The default value type of the class. + Losses that require a value estimation are equipped with a default value + pointer. This class attribute indicates which value estimator will be + used if none other is specified. + The value estimator can be changed using the :meth:`~.make_value_function` method. 
""" + default_value_type: ValueFunctions = None + def __init__(self): super().__init__() self._param_maps = {} @@ -358,6 +365,7 @@ def cpu(self) -> LossModule: @property def value_function(self) -> ValueFunctionBase: + """The value function blends in the reward and value estimate(s) from upcoming state(s)/state-action pair(s) into a target value estimate for the value network.""" out = self._value_function if out is None: self._default_value_function() @@ -375,7 +383,7 @@ def _default_value_function(self): from :obj:`torchrl.objectives.utils.DEFAULT_VALUE_FUN_PARAMS`. """ - raise NotImplementedError + self.make_value_function(self.default_value_type) def make_value_function(self, value_type: ValueFunctions, **hyperparams): """Value-function constructor. diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index 42dea6244fa..980c0e3b8c9 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -14,10 +14,16 @@ from tensordict.tensordict import TensorDict, TensorDictBase from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import distance_loss, hold_out_params, next_state_value +from torchrl.objectives.utils import ( + default_value_kwargs, + distance_loss, + hold_out_params, + ValueFunctions, +) from ..envs.utils import set_exploration_mode from .common import LossModule +from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate class DDPGLoss(LossModule): @@ -26,21 +32,20 @@ class DDPGLoss(LossModule): Args: actor_network (TensorDictModule): a policy operator. value_network (TensorDictModule): a Q value operator. - gamma (scalar): a discount factor for return computation. - device (str, int or torch.device, optional): a device where the losses will be computed, if it can't be found - via the value operator. loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". delay_actor (bool, optional): whether to separate the target actor networks from the actor networks used for - data collection. Default is :obj:`False`. + data collection. Default is ``False``. delay_value (bool, optional): whether to separate the target value networks from the value networks used for - data collection. Default is :obj:`False`. + data collection. Default is ``False``. 
""" + default_value_type: ValueFunctions = ValueFunctions.TD0 + def __init__( self, actor_network: TensorDictModule, value_network: TensorDictModule, - gamma: float, + *, loss_function: str = "l2", delay_actor: bool = False, delay_value: bool = False, @@ -71,7 +76,6 @@ def __init__( self.actor_in_keys = actor_network.in_keys - self.register_buffer("gamma", torch.tensor(gamma)) self.loss_funtion = loss_function def forward(self, input_tensordict: TensorDictBase) -> TensorDict: @@ -147,7 +151,6 @@ def _loss_value( ) pred_val = td_copy.get("state_action_value").squeeze(-1) - actor_critic = self.actor_critic target_params = TensorDict( { "module": { @@ -159,12 +162,9 @@ def _loss_value( device=self.target_actor_network_params.device, ) with set_exploration_mode("mode"): - target_value = next_state_value( - tensordict, - actor_critic, - gamma=self.gamma, - params=target_params, - ) + target_value = self.value_function.value_estimate( + tensordict, target_params=target_params + ).squeeze(-1) # td_error = pred_val - target_value loss_value = distance_loss( @@ -172,3 +172,26 @@ def _loss_value( ) return loss_value, (pred_val - target_value).pow(2), pred_val, target_value + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_action_value" + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=self.actor_critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 6fd5d01ab97..97b7aab7e5e 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -16,11 +16,13 @@ from torch import Tensor from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives import ( + default_value_kwargs, distance_loss, hold_out_params, - next_state_value as get_next_state_value, + ValueFunctions, ) from torchrl.objectives.common import LossModule +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -41,36 +43,49 @@ class REDQLoss_deprecated(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. - num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - sub_sample_len (int, optional): number of Q-value networks to be subsampled to evaluate the next state value - Default is 2. - gamma (Number, optional): gamma decay factor. Default is 0.99. - priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is - `"td_error"`. - loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", - "l1", Default is "smooth_l1". + qvalue_network (TensorDictModule): a single Q-value network that will + be multiplicated as many times as needed. 
+ num_qvalue_nets (int, optional): Number of Q-value networks to be trained. + Default is ``10``. + sub_sample_len (int, optional): number of Q-value networks to be + subsampled to evaluate the next state value + Default is ``2``. + priority_key (str, optional): Key where to write the priority value + for prioritized replay buffers. Default is + ``"td_error"``. + loss_function (str, optional): loss function to be used for the Q-value. + Can be one of ``"smooth_l1"``, ``"l2"``, + ``"l1"``, Default is ``"smooth_l1"``. alpha_init (float, optional): initial entropy multiplier. - Default is 1.0. + Default is ``1.0``. min_alpha (float, optional): min value of alpha. - Default is 0.1. + Default is ``0.1``. max_alpha (float, optional): max value of alpha. - Default is 10.0. - fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is :obj:`False`. - target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". + Default is ``10.0``. + fixed_alpha (bool, optional): whether alpha should be trained to match + a target entropy. Default is ``False``. + target_entropy (Union[str, Number], optional): Target entropy for the + stochastic policy. Default is "auto". + delay_qvalue (bool, optional): Whether to separate the target Q value + networks from the Q value networks used + for data collection. Default is ``False``. + gSDE (bool, optional): Knowing if gSDE is used is necessary to create + random noise variables. + Default is ``False``. """ delay_actor: bool = False + default_value_type = ValueFunctions.TD0 def __init__( self, actor_network: TensorDictModule, qvalue_network: TensorDictModule, + *, num_qvalue_nets: int = 10, sub_sample_len: int = 2, - gamma: Number = 0.99, - priotity_key: str = "td_error", + priority_key: str = "td_error", loss_function: str = "smooth_l1", alpha_init: float = 1.0, min_alpha: float = 0.1, @@ -102,8 +117,7 @@ def __init__( ) self.num_qvalue_nets = num_qvalue_nets self.sub_sample_len = max(1, min(sub_sample_len, num_qvalue_nets - 1)) - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key + self.priority_key = priority_key self.loss_function = loss_function try: @@ -197,7 +211,7 @@ def _qvalue_loss(self, tensordict: TensorDictBase) -> Tensor: tensordict_save = tensordict obs_keys = self.actor_network.in_keys - tensordict = tensordict.select("next", *obs_keys, "action") + tensordict = tensordict.clone(False).select("next", *obs_keys, "action") selected_models_idx = torch.randperm(self.num_qvalue_nets)[ : self.sub_sample_len @@ -227,17 +241,13 @@ def _qvalue_loss(self, tensordict: TensorDictBase) -> Tensor: != sample_log_prob.shape ): sample_log_prob = sample_log_prob.unsqueeze(-1) - state_value = ( + next_state_value = ( next_td.get("state_action_value") - self.alpha * sample_log_prob ) - state_value = state_value.min(0)[0] + next_state_value = next_state_value.min(0)[0] - tensordict.set("next.state_value", state_value) - target_value = get_next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=state_value, - ) + tensordict.set(("next", "state_value"), next_state_value) + target_value = self.value_function.value_estimate(tensordict).squeeze(-1) tensordict_expand = vmap(self.qvalue_network, (None, 0))( tensordict.select(*self.qvalue_network.in_keys), self.qvalue_network_params, @@ -265,6 +275,28 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: alpha_loss = torch.zeros_like(log_pi) return alpha_loss + def make_value_function(self, value_type: 
ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_value" + # we do not need a value network bc the next state value is already passed + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=None, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=None, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + self._value_function = GAE(value_network=None, value_key=value_key, **hp) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=None, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + class DoubleREDQLoss_deprecated(REDQLoss_deprecated): """[Deprecated] Class for delayed target-REDQ (which should be the default behaviour).""" diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 9792c782b0d..66ab0aba5c4 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -26,13 +26,16 @@ class DQNLoss(LossModule): value_network (QValueActor or nn.Module): a Q value operator. loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". delay_value (bool, optional): whether to duplicate the value network into a new target value network to - create a double DQN. Default is :obj:`False`. + create a double DQN. Default is ``False``. """ + default_value_type = ValueFunctions.TDLambda + def __init__( self, value_network: Union[QValueActor, nn.Module], + *, loss_function: str = "l2", priority_key: str = "td_error", delay_value: bool = False, @@ -76,12 +79,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key="chosen_action_value", ) elif value_type is ValueFunctions.GAE: - self._value_function = GAE( - **hp, - value_network=self.value_network, - advantage_key="advantage", - value_target_key="value_target", - value_key="chosen_action_value", + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type is ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( @@ -94,9 +93,6 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): else: raise NotImplementedError(f"Unknown value type {value_type}") - def _default_value_function(self): - self.make_value_function(ValueFunctions.TDLambda) - def forward(self, input_tensordict: TensorDictBase) -> TensorDict: """Computes the DQN loss given a tensordict sampled from the replay buffer. diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 885558390c3..47f13e4ae90 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -11,15 +11,22 @@ from torchrl.envs.model_based.dreamer import DreamerEnv from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import distance_loss, hold_out_net -from torchrl.objectives.value.functional import vec_td_lambda_return_estimate +from torchrl.objectives.utils import ( + default_value_kwargs, + distance_loss, + hold_out_net, + ValueFunctions, +) +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate class DreamerModelLoss(LossModule): """Dreamer Model Loss. - Computes the loss of the dreamer world model. 
The loss is composed of the kl divergence between the prior and posterior of the RSSM, - the reconstruction loss over the reconstructed observation and the reward loss over the predicted reward. + Computes the loss of the dreamer world model. The loss is composed of the + kl divergence between the prior and posterior of the RSSM, + the reconstruction loss over the reconstructed observation and the reward + loss over the predicted reward. Reference: https://arxiv.org/abs/1912.01603. @@ -31,10 +38,10 @@ class DreamerModelLoss(LossModule): reco_loss (str, optional): the reconstruction loss. Default: "l2". reward_loss (str, optional): the reward loss. Default: "l2". free_nats (int, optional): the free nats. Default: 3. - delayed_clamp (bool, optional): if True, the KL clamping occurs after + delayed_clamp (bool, optional): if ``True``, the KL clamping occurs after averaging. If False (default), the kl divergence is clamped to the free nats value first and then averaged. - global_average (bool, optional): if True, the losses will be averaged + global_average (bool, optional): if ``True``, the losses will be averaged over all dimensions. Otherwise, a sum will be performed over all non-batch/time dimensions and an average over batch and time. Default: False. @@ -43,6 +50,7 @@ class DreamerModelLoss(LossModule): def __init__( self, world_model: TensorDictModule, + *, lambda_kl: float = 1.0, lambda_reco: float = 1.0, lambda_reward: float = 1.0, @@ -129,7 +137,8 @@ def kl_loss( class DreamerActorLoss(LossModule): """Dreamer Actor Loss. - Computes the loss of the dreamer actor. The actor loss is computed as the negative average lambda return. + Computes the loss of the dreamer actor. The actor loss is computed as the + negative average lambda return. Reference: https://arxiv.org/abs/1912.01603. @@ -138,22 +147,21 @@ class DreamerActorLoss(LossModule): value_model (TensorDictModule): the value model. model_based_env (DreamerEnv): the model based environment. imagination_horizon (int, optional): The number of steps to unroll the - model. Default: 15. - gamma (float, optional): the gamma discount factor. Default: 0.99. - lmbda (float, optional): the lambda discount factor factor. Default: 0.95. - discount_loss (bool, optional): if True, the loss is discounted with a - gamma discount factor. Default: False. + model. Defaults to ``15``. + discount_loss (bool, optional): if ``True``, the loss is discounted with a + gamma discount factor. Default to ``False``. 
""" + default_value_type = ValueFunctions.TDLambda + def __init__( self, actor_model: TensorDictModule, value_model: TensorDictModule, model_based_env: DreamerEnv, + *, imagination_horizon: int = 15, - gamma: int = 0.99, - lmbda: int = 0.95, discount_loss: bool = False, # for consistency with paper ): super().__init__() @@ -161,8 +169,6 @@ def __init__( self.value_model = value_model self.model_based_env = model_based_env self.imagination_horizon = imagination_horizon - self.gamma = gamma - self.lmbda = lmbda self.discount_loss = discount_loss def forward(self, tensordict: TensorDict) -> Tuple[TensorDict, TensorDict]: @@ -192,9 +198,8 @@ def forward(self, tensordict: TensorDict) -> Tuple[TensorDict, TensorDict]: fake_data.set("lambda_target", lambda_target) if self.discount_loss: - discount = self.gamma * torch.ones_like( - lambda_target, device=tensordict.device - ) + gamma = self.value_function.gamma.to(tensordict.device) + discount = gamma.expand(lambda_target.shape) discount[..., 0, :] = 1 discount = discount.cumprod(dim=-2) actor_loss = -(lambda_target * discount).sum((-2, -1)).mean() @@ -205,15 +210,58 @@ def forward(self, tensordict: TensorDict) -> Tuple[TensorDict, TensorDict]: def lambda_target(self, reward: torch.Tensor, value: torch.Tensor) -> torch.Tensor: done = torch.zeros(reward.shape, dtype=torch.bool, device=reward.device) - return vec_td_lambda_return_estimate( - self.gamma, self.lmbda, value, reward, done + input_tensordict = TensorDict( + { + ("next", "reward"): reward, + ("next", "state_value"): value, + ("next", "done"): done, + }, + [], ) + return self.value_function.value_estimate(input_tensordict) + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + value_net = None + value_key = "state_value" + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + if value_type is ValueFunctions.TD1: + self._value_function = TD1Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TD0: + self._value_function = TD0Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.GAE: + self._value_function = GAE( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") class DreamerValueLoss(LossModule): """Dreamer Value Loss. - Computes the loss of the dreamer value model. The value loss is computed between the predicted value and the lambda target. + Computes the loss of the dreamer value model. The value loss is computed + between the predicted value and the lambda target. Reference: https://arxiv.org/abs/1912.01603. @@ -221,7 +269,7 @@ class DreamerValueLoss(LossModule): value_model (TensorDictModule): the value model. value_loss (str, optional): the loss to use for the value loss. Default: "l2". gamma (float, optional): the gamma discount factor. Default: 0.99. - discount_loss (bool, optional): if True, the loss is discounted with a + discount_loss (bool, optional): if ``True``, the loss is discounted with a gamma discount factor. Default: False. 
""" diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index 83408fbd170..f817af1b904 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from numbers import Number from typing import Optional, Tuple import torch @@ -12,10 +11,11 @@ from torch import Tensor from torchrl.modules import ProbabilisticActor -from torchrl.objectives.utils import distance_loss, next_state_value +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions -from ..envs.utils import set_exploration_mode, step_mdp +from ..envs.utils import set_exploration_mode from .common import LossModule +from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -35,14 +35,9 @@ class IQLLoss(LossModule): Args: actor_network (ProbabilisticActor): stochastic actor qvalue_network (TensorDictModule): Q(s, a) parametric model - value_network (TensorDictModule, optional): V(s) parametric model. If not - provided, the second version of SAC is assumed. - qvalue_network_bis (ProbabilisticTDModule, optional): if required, the - Q-value can be computed twice independently using two separate - networks. The minimum predicted value will then be used for - inference. - gamma (number, optional): discount for return computation - Default is 0.99 + value_network (TensorDictModule, optional): V(s) parametric model. + num_qvalue_nets (integer, optional): number of Q-Value networks used. + Defaults to ``2``. priority_key (str, optional): tensordict key where to write the priority (for prioritized replay buffer usage). Default is `"td_error"`. @@ -57,14 +52,16 @@ class IQLLoss(LossModule): """ + default_value_type = ValueFunctions.TD0 + def __init__( self, actor_network: ProbabilisticActor, qvalue_network: TensorDictModule, - value_network: Optional[TensorDictModule] = None, + value_network: Optional[TensorDictModule], + *, num_qvalue_nets: int = 2, - gamma: Number = 0.99, - priotity_key: str = "td_error", + priority_key: str = "td_error", loss_function: str = "smooth_l1", temperature: float = 1.0, expectile: float = 0.5, @@ -106,8 +103,7 @@ def __init__( + list(value_network.parameters()), ) - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key + self.priority_key = priority_key self.loss_function = loss_function @property @@ -218,26 +214,9 @@ def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: obs_keys = self.actor_network.in_keys tensordict = tensordict.select("next", *obs_keys, "action") - with torch.no_grad(): - next_td = step_mdp(tensordict).select( - *self.actor_network.in_keys - ) # next_observation -> - # observation - # select pseudo-action - # get state values - next_td = self.value_network( - next_td, - params=self.value_network_params, - ) - - state_value = next_td.get("state_value") - - tensordict.set("next.state_value", state_value) - target_value = next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=state_value, - ) + target_value = self.value_function.value_estimate( + tensordict, target_params=self.target_value_network_params + ).squeeze(-1) tensordict_expand = vmap(self.qvalue_network, (None, 0))( tensordict.select(*self.qvalue_network.in_keys), self.qvalue_network_params, @@ -254,3 +233,40 @@ def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: .mean() ) return loss_qval, 
td_error.detach().max(0)[0] + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + value_net = self.value_network + + value_key = "state_value" + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + if value_type is ValueFunctions.TD1: + self._value_function = TD1Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TD0: + self._value_function = TD0Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.GAE: + self._value_function = GAE( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index 0ac84fd5daa..477cf3f5765 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -11,21 +11,25 @@ from tensordict.tensordict import TensorDict, TensorDictBase from torch import distributions as d -from torchrl.objectives.utils import distance_loss +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions from .common import LossModule +from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate class PPOLoss(LossModule): """A parent PPO loss class. - PPO (Proximal Policy Optimisation) is a model-free, online RL algorithm that makes use of a recorded (batch of) - trajectories to perform several optimization steps, while actively preventing the updated policy to deviate too + PPO (Proximal Policy Optimisation) is a model-free, online RL algorithm + that makes use of a recorded (batch of) + trajectories to perform several optimization steps, while actively + preventing the updated policy to deviate too much from its original parameter configuration. - PPO loss can be found in different flavours, depending on the way the constrained optimisation is implemented: - ClipPPOLoss and KLPENPPOLoss. - Unlike its subclasses, this class does not implement any regularisation and should therefore be used cautiously. + PPO loss can be found in different flavours, depending on the way the + constrained optimisation is implemented: ClipPPOLoss and KLPENPPOLoss. + Unlike its subclasses, this class does not implement any regularisation + and should therefore be used cautiously. For more details regarding PPO, refer to: "Proximal Policy Optimization Algorithms", https://arxiv.org/abs/1707.06347 @@ -33,35 +37,68 @@ class PPOLoss(LossModule): Args: actor (ProbabilisticTensorDictSequential): policy operator. critic (ValueOperator): value operator. - advantage_key (str): the input tensordict key where the advantage is expected to be written. - default: "advantage" - entropy_bonus (bool): if True, an entropy bonus will be added to the loss to favour exploratory policies. - samples_mc_entropy (int): if the distribution retrieved from the policy operator does not have a closed form - formula for the entropy, a Monte-Carlo estimate will be used. samples_mc_entropy will control how many + advantage_key (str): the input tensordict key where the advantage is + expected to be written. + Defaults to ``"advantage"``. 
+ value_target_key (str): the input tensordict key where the target state + value is expected to be written. Defaults to ``"value_target"``. + entropy_bonus (bool): if ``True``, an entropy bonus will be added to the + loss to favour exploratory policies. + samples_mc_entropy (int): if the distribution retrieved from the policy + operator does not have a closed form + formula for the entropy, a Monte-Carlo estimate will be used. + ``samples_mc_entropy`` will control how many samples will be used to compute this estimate. - default: 1 + Defaults to ``1``. entropy_coef (scalar): entropy multiplier when computing the total loss. - default: 0.01 - critic_coef (scalar): critic loss multiplier when computing the total loss. - default: 1.0 - gamma (scalar): a discount factor for return computation. - loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". - normalize_advantage (bool): if True, the advantage will be normalized before being used. - Defaults to False. + Defaults to ``0.01``. + critic_coef (scalar): critic loss multiplier when computing the total + loss. Defaults to ``1.0``. + loss_critic_type (str): loss function for the value discrepancy. + Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. + normalize_advantage (bool): if ``True``, the advantage will be normalized + before being used. Defaults to ``False``. + + .. note: + The advantage (typically GAE) can be computed by the loss function or + in the training loop. The latter option is usually preferred, but this is + up to the user to choose which option is to be preferred. + If the advantage key (``"advantage`` by default) is not present in the + input tensordict, the advantage will be computed by the :meth:`~.forward` + method. + + >>> ppo_loss = PPOLoss(actor, critic) + >>> advantage = GAE(critic) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) + >>> # equivalent + >>> advantage(data) + >>> losses = ppo_loss(data) + + A custom advantage module can be built using :meth:`~.make_value_function`. + The default is :class:`torchrl.objectives.value.GAE` with hyperparameters + dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. + + >>> ppo_loss = PPOLoss(actor, critic) + >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) """ + default_value_type = ValueFunctions.GAE + def __init__( self, actor: ProbabilisticTensorDictSequential, critic: TensorDictModule, + *, advantage_key: str = "advantage", value_target_key: str = "value_target", entropy_bonus: bool = True, samples_mc_entropy: int = 1, entropy_coef: float = 0.01, critic_coef: float = 1.0, - gamma: float = 0.99, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = False, ): @@ -82,7 +119,6 @@ def __init__( self.register_buffer( "critic_coef", torch.tensor(critic_coef, device=self.device) ) - self.register_buffer("gamma", torch.tensor(gamma, device=self.device)) self.loss_critic_type = loss_critic_type self.normalize_advantage = normalize_advantage @@ -117,6 +153,8 @@ def _log_weight( return log_weight, dist def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: + # TODO: if the advantage is gathered by forward, this introduces an + # overhead that we could easily reduce. 
try: target_return = tensordict.get(self.value_target_key) tensordict_select = tensordict.select(*self.critic.in_keys) @@ -141,7 +179,14 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = tensordict.clone(False) - advantage = tensordict.get(self.advantage_key) + advantage = tensordict.get(self.advantage_key, None) + if advantage is None: + self.value_function( + tensordict, + params=self.critic_params, + target_params=self.target_critic_params, + ) + advantage = tensordict.get(self.advantage_key) if self.normalize_advantage and advantage.numel() > 1: loc = advantage.mean().item() scale = advantage.std().clamp_min(1e-6).item() @@ -159,6 +204,29 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: td_out.set("loss_critic", loss_critic.mean()) return td_out + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_value" + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + self._value_function = GAE( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=self.critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + class ClipPPOLoss(PPOLoss): """Clipped PPO loss. @@ -170,22 +238,52 @@ class ClipPPOLoss(PPOLoss): actor (ProbabilisticTensorDictSequential): policy operator. critic (ValueOperator): value operator. advantage_key (str): the input tensordict key where the advantage is expected to be written. - default: "advantage" + Defaults to ``"advantage"``. + value_target_key (str): the input tensordict key where the target state + value is expected to be written. Defaults to ``"value_target"``. clip_epsilon (scalar): weight clipping threshold in the clipped PPO loss equation. default: 0.2 - entropy_bonus (bool): if True, an entropy bonus will be added to the loss to favour exploratory policies. - samples_mc_entropy (int): if the distribution retrieved from the policy operator does not have a closed form - formula for the entropy, a Monte-Carlo estimate will be used. samples_mc_entropy will control how many + entropy_bonus (bool): if ``True``, an entropy bonus will be added to the + loss to favour exploratory policies. + samples_mc_entropy (int): if the distribution retrieved from the policy + operator does not have a closed form + formula for the entropy, a Monte-Carlo estimate will be used. + ``samples_mc_entropy`` will control how many samples will be used to compute this estimate. - default: 1 + Defaults to ``1``. entropy_coef (scalar): entropy multiplier when computing the total loss. - default: 0.01 - critic_coef (scalar): critic loss multiplier when computing the total loss. - default: 1.0 - gamma (scalar): a discount factor for return computation. - loss_function (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". - normalize_advantage (bool): if True, the advantage will be normalized before being used. - Defaults to True. + Defaults to ``0.01``. 
+ critic_coef (scalar): critic loss multiplier when computing the total + loss. Defaults to ``1.0``. + loss_critic_type (str): loss function for the value discrepancy. + Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. + normalize_advantage (bool): if ``True``, the advantage will be normalized + before being used. Defaults to ``False``. + + .. note: + The advantage (typically GAE) can be computed by the loss function or + in the training loop. The latter option is usually preferred, but this is + up to the user to choose which option is to be preferred. + If the advantage key (``"advantage`` by default) is not present in the + input tensordict, the advantage will be computed by the :meth:`~.forward` + method. + + >>> ppo_loss = ClipPPOLoss(actor, critic) + >>> advantage = GAE(critic) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) + >>> # equivalent + >>> advantage(data) + >>> losses = ppo_loss(data) + + A custom advantage module can be built using :meth:`~.make_value_function`. + The default is :class:`torchrl.objectives.value.GAE` with hyperparameters + dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. + + >>> ppo_loss = ClipPPOLoss(actor, critic) + >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) """ @@ -193,13 +291,13 @@ def __init__( self, actor: ProbabilisticTensorDictSequential, critic: TensorDictModule, + *, advantage_key: str = "advantage", clip_epsilon: float = 0.2, entropy_bonus: bool = True, samples_mc_entropy: int = 1, entropy_coef: float = 0.01, critic_coef: float = 1.0, - gamma: float = 0.99, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = True, **kwargs, @@ -207,12 +305,11 @@ def __init__( super(ClipPPOLoss, self).__init__( actor, critic, - advantage_key, + advantage_key=advantage_key, entropy_bonus=entropy_bonus, samples_mc_entropy=samples_mc_entropy, entropy_coef=entropy_coef, critic_coef=critic_coef, - gamma=gamma, loss_critic_type=loss_critic_type, normalize_advantage=normalize_advantage, **kwargs, @@ -228,7 +325,14 @@ def _clip_bounds(self): def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = tensordict.clone(False) - advantage = tensordict.get(self.advantage_key) + advantage = tensordict.get(self.advantage_key, None) + if advantage is None: + self.value_function( + tensordict, + params=self.critic_params, + target_params=self.target_critic_params, + ) + advantage = tensordict.get(self.advantage_key) log_weight, dist = self._log_weight(tensordict) # ESS for logging with torch.no_grad(): @@ -278,28 +382,61 @@ class KLPENPPOLoss(PPOLoss): Args: actor (ProbabilisticTensorDictSequential): policy operator. critic (ValueOperator): value operator. - advantage_key (str): the input tensordict key where the advantage is expected to be written. - default: "advantage" - dtarg (scalar): target KL divergence. - beta (scalar): initial KL divergence multiplier. - default: 1.0 - increment (scalar): how much beta should be incremented if KL > dtarg. Valid range: increment >= 1.0 - default: 2.0 - decrement (scalar): how much beta should be decremented if KL < dtarg. Valid range: decrement <= 1.0 - default: 0.5 - entropy_bonus (bool): if True, an entropy bonus will be added to the loss to favour exploratory policies. - samples_mc_entropy (int): if the distribution retrieved from the policy operator does not have a closed form - formula for the entropy, a Monte-Carlo estimate will be used. 
samples_mc_entropy will control how many + advantage_key (str, optional): the input tensordict key where the advantage is expected to be written. + Defaults to ``"advantage"``. + value_target_key (str, optional): the input tensordict key where the target state + value is expected to be written. Defaults to ``"value_target"``. + dtarg (scalar, optional): target KL divergence. Defaults to ``0.01``. + samples_mc_kl (int, optional): number of samples used to compute the KL divergence + if no analytical formula can be found. Defaults to ``1``. + beta (scalar, optional): initial KL divergence multiplier. + Defaults to ``1.0``. + decrement (scalar, optional): how much beta should be decremented if KL < dtarg. Valid range: decrement <= 1.0 + default: ``0.5``. + increment (scalar, optional): how much beta should be incremented if KL > dtarg. Valid range: increment >= 1.0 + default: ``2.0``. + entropy_bonus (bool, optional): if ``True``, an entropy bonus will be added to the + loss to favour exploratory policies. Defaults to ``True``. + samples_mc_entropy (int, optional): if the distribution retrieved from the policy + operator does not have a closed form + formula for the entropy, a Monte-Carlo estimate will be used. + ``samples_mc_entropy`` will control how many samples will be used to compute this estimate. - default: 1 - entropy_coef (scalar): entropy multiplier when computing the total loss. - default: 0.01 - critic_coef (scalar): critic loss multiplier when computing the total loss. - default: 1.0 - gamma (scalar): a discount factor for return computation. - loss_critic_type (str): loss function for the value discrepancy. Can be one of "l1", "l2" or "smooth_l1". - normalize_advantage (bool): if True, the advantage will be normalized before being used. - Defaults to True. + Defaults to ``1``. + entropy_coef (scalar, optional): entropy multiplier when computing the total loss. + Defaults to ``0.01``. + critic_coef (scalar, optional): critic loss multiplier when computing the total + loss. Defaults to ``1.0``. + loss_critic_type (str, optional): loss function for the value discrepancy. + Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. + normalize_advantage (bool, optional): if ``True``, the advantage will be normalized + before being used. Defaults to ``False``. + + + .. note: + The advantage (typically GAE) can be computed by the loss function or + in the training loop. The latter option is usually preferred, but this is + up to the user to choose which option is to be preferred. + If the advantage key (``"advantage`` by default) is not present in the + input tensordict, the advantage will be computed by the :meth:`~.forward` + method. + + >>> ppo_loss = KLPENPPOLoss(actor, critic) + >>> advantage = GAE(critic) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) + >>> # equivalent + >>> advantage(data) + >>> losses = ppo_loss(data) + + A custom advantage module can be built using :meth:`~.make_value_function`. + The default is :class:`torchrl.objectives.value.GAE` with hyperparameters + dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
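# Plain-Python sketch of the beta adaptation rule described by the ``dtarg``,
# ``increment`` and ``decrement`` arguments above (a schematic of the
# documented behaviour, not the class's exact code).
beta, dtarg, increment, decrement = 1.0, 0.01, 2.0, 0.5
kl = 0.05  # hypothetical measured KL divergence between old and new policy
if kl > dtarg:
    beta *= increment  # penalise the policy more when it moved too far
elif kl < dtarg:
    beta *= decrement  # relax the penalty when the policy barely moved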
+ + >>> ppo_loss = KLPENPPOLoss(actor, critic) + >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> data = next(datacollector) + >>> losses = ppo_loss(data) """ @@ -307,6 +444,7 @@ def __init__( self, actor: ProbabilisticTensorDictSequential, critic: TensorDictModule, + *, advantage_key="advantage", dtarg: float = 0.01, beta: float = 1.0, @@ -317,7 +455,6 @@ def __init__( samples_mc_entropy: int = 1, entropy_coef: float = 0.01, critic_coef: float = 1.0, - gamma: float = 0.99, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = True, **kwargs, @@ -325,12 +462,11 @@ def __init__( super(KLPENPPOLoss, self).__init__( actor, critic, - advantage_key, + advantage_key=advantage_key, entropy_bonus=entropy_bonus, samples_mc_entropy=samples_mc_entropy, entropy_coef=entropy_coef, critic_coef=critic_coef, - gamma=gamma, loss_critic_type=loss_critic_type, normalize_advantage=normalize_advantage, **kwargs, @@ -354,7 +490,14 @@ def __init__( def forward(self, tensordict: TensorDictBase) -> TensorDict: tensordict = tensordict.clone(False) - advantage = tensordict.get(self.advantage_key) + advantage = tensordict.get(self.advantage_key, None) + if advantage is None: + self.value_function( + tensordict, + params=self.critic_params, + target_params=self.target_critic_params, + ) + advantage = tensordict.get(self.advantage_key) if self.normalize_advantage and advantage.numel() > 1: loc = advantage.mean().item() scale = advantage.std().clamp_min(1e-6).item() diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index fd039b555c4..a48e0d78580 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -16,10 +16,8 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import ( - distance_loss, - next_state_value as get_next_state_value, -) +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -40,40 +38,49 @@ class REDQLoss(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. - num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - sub_sample_len (int, optional): number of Q-value networks to be subsampled to evaluate the next state value - Default is 2. - gamma (Number, optional): gamma decay factor. Default is 0.99. - priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is - `"td_error"`. - loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", - "l1", Default is "smooth_l1". + qvalue_network (TensorDictModule): a single Q-value network that will + be multiplicated as many times as needed. + num_qvalue_nets (int, optional): Number of Q-value networks to be trained. + Default is ``10``. + sub_sample_len (int, optional): number of Q-value networks to be + subsampled to evaluate the next state value + Default is ``2``. + priority_key (str, optional): Key where to write the priority value + for prioritized replay buffers. Default is + ``"td_error"``. + loss_function (str, optional): loss function to be used for the Q-value. + Can be one of ``"smooth_l1"``, ``"l2"``, + ``"l1"``, Default is ``"smooth_l1"``. 
alpha_init (float, optional): initial entropy multiplier. - Default is 1.0. + Default is ``1.0``. min_alpha (float, optional): min value of alpha. - Default is 0.1. + Default is ``0.1``. max_alpha (float, optional): max value of alpha. - Default is 10.0. - fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is :obj:`False`. - target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". - delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used - for data collection. Default is :obj:`False`. - gSDE (bool, optional): Knowing if gSDE is used is necessary to create random noise variables. - Default is False + Default is ``10.0``. + fixed_alpha (bool, optional): whether alpha should be trained to match + a target entropy. Default is ``False``. + target_entropy (Union[str, Number], optional): Target entropy for the + stochastic policy. Default is "auto". + delay_qvalue (bool, optional): Whether to separate the target Q value + networks from the Q value networks used + for data collection. Default is ``False``. + gSDE (bool, optional): Knowing if gSDE is used is necessary to create + random noise variables. + Default is ``False``. """ delay_actor: bool = False + default_value_type = ValueFunctions.TD0 def __init__( self, actor_network: TensorDictModule, qvalue_network: TensorDictModule, + *, num_qvalue_nets: int = 10, sub_sample_len: int = 2, - gamma: Number = 0.99, - priotity_key: str = "td_error", + priority_key: str = "td_error", loss_function: str = "smooth_l1", alpha_init: float = 1.0, min_alpha: float = 0.1, @@ -107,8 +114,7 @@ def __init__( ) self.num_qvalue_nets = num_qvalue_nets self.sub_sample_len = max(1, min(sub_sample_len, num_qvalue_nets - 1)) - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key + self.priority_key = priority_key self.loss_function = loss_function try: @@ -156,7 +162,7 @@ def alpha(self): def forward(self, tensordict: TensorDictBase) -> TensorDictBase: obs_keys = self.actor_network.in_keys - tensordict_select = tensordict.select("next", *obs_keys, "action") + tensordict_select = tensordict.clone(False).select("next", *obs_keys, "action") selected_models_idx = torch.randperm(self.num_qvalue_nets)[ : self.sub_sample_len ].sort()[0] @@ -259,11 +265,9 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: ) next_state_value = next_state_value.min(0)[0] - target_value = get_next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=next_state_value, - ) + tensordict_select.set(("next", "state_value"), next_state_value.unsqueeze(-1)) + target_value = self.value_function.value_estimate(tensordict_select).squeeze(-1) + pred_val = state_action_value_qvalue td_error = (pred_val - target_value).pow(2) loss_qval = distance_loss( @@ -308,3 +312,25 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: # placeholder alpha_loss = torch.zeros_like(log_pi) return alpha_loss + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_value" + # we do not need a value network bc the next state value is already passed + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=None, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=None, value_key=value_key, **hp + ) + elif 
value_type == ValueFunctions.GAE: + self._value_function = GAE(value_network=None, value_key=value_key, **hp) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=None, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index c5b265d4e77..1079c4eb2be 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -10,7 +10,8 @@ from tensordict.nn import ProbabilisticTensorDictSequential, TensorDictModule from tensordict.tensordict import TensorDict, TensorDictBase from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import distance_loss +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate class ReinforceLoss(LossModule): @@ -19,14 +20,55 @@ class ReinforceLoss(LossModule): Presented in "Simple statistical gradient-following algorithms for connectionist reinforcement learning", Williams, 1992 https://doi.org/10.1007/BF00992696 + + Args: + actor (ProbabilisticTensorDictSequential): policy operator. + critic (ValueOperator): value operator. + delay_value (bool, optional): if ``True``, a target network is needed + for the critic. Defaults to ``False``. + advantage_key (str): the input tensordict key where the advantage is + expected to be written. + Defaults to ``"advantage"``. + value_target_key (str): the input tensordict key where the target state + value is expected to be written. Defaults to ``"value_target"``. + loss_critic_type (str): loss function for the value discrepancy. + Can be one of "l1", "l2" or "smooth_l1". Defaults to ``"smooth_l1"``. + + .. note: + The advantage (typically GAE) can be computed by the loss function or + in the training loop. The latter option is usually preferred, but this is + up to the user to choose which option is to be preferred. + If the advantage key (``"advantage`` by default) is not present in the + input tensordict, the advantage will be computed by the :meth:`~.forward` + method. + + >>> reinforce_loss = ReinforceLoss(actor, critic) + >>> advantage = GAE(critic) + >>> data = next(datacollector) + >>> losses = reinforce_loss(data) + >>> # equivalent + >>> advantage(data) + >>> losses = reinforce_loss(data) + + A custom advantage module can be built using :meth:`~.make_value_function`. + The default is :class:`torchrl.objectives.value.GAE` with hyperparameters + dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
+ + >>> reinforce_loss = ReinforceLoss(actor, critic) + >>> reinforce_loss.make_value_function(ValueFunctions.TDLambda) + >>> data = next(datacollector) + >>> losses = reinforce_loss(data) + """ + default_value_type = ValueFunctions.GAE + def __init__( self, - actor_network: ProbabilisticTensorDictSequential, + actor: ProbabilisticTensorDictSequential, critic: Optional[TensorDictModule] = None, + *, delay_value: bool = False, - gamma: float = 0.99, advantage_key: str = "advantage", value_target_key: str = "value_target", loss_critic_type: str = "smooth_l1", @@ -37,11 +79,10 @@ def __init__( self.advantage_key = advantage_key self.value_target_key = value_target_key self.loss_critic_type = loss_critic_type - self.register_buffer("gamma", torch.tensor(gamma)) # Actor self.convert_to_functional( - actor_network, + actor, "actor_network", create_target_params=False, ) @@ -52,11 +93,18 @@ def __init__( critic, "critic", create_target_params=self.delay_value, - compare_against=list(actor_network.parameters()), + compare_against=list(actor.parameters()), ) def forward(self, tensordict: TensorDictBase) -> TensorDictBase: - advantage = tensordict.get(self.advantage_key) + advantage = tensordict.get(self.advantage_key, None) + if advantage is None: + self.value_function( + tensordict, + params=self.critic_params, + target_params=self.target_critic_params, + ) + advantage = tensordict.get(self.advantage_key) # compute log-prob tensordict = self.actor_network( @@ -95,3 +143,26 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: f"can be used for the value loss." ) return loss_value + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_value" + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + self._value_function = GAE( + value_network=self.critic, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=self.critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 4045dcfc119..5a8bdb25aef 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -9,18 +9,13 @@ import numpy as np import torch -from tensordict.nn import make_functional, TensorDictModule, TensorDictSequential +from tensordict.nn import make_functional, TensorDictModule from tensordict.tensordict import TensorDict, TensorDictBase from torch import Tensor from torchrl.modules import ProbabilisticActor from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import ( - default_value_kwargs, - distance_loss, - next_state_value, - ValueFunctions, -) +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule @@ -36,53 +31,6 @@ FUNCTORCH_ERROR = err -class _SACValueNet(TensorDictSequential): - r"""Value network for SAC v2. - - SAC v2 is based on a value estimate of the form: - - .. 
math:: - - V = Q(s,a) - \alpha * \log p(a | s) - - This class computes this value given the actor and qvalue network - - """ - - def __init__(self, actor_network, qvalue_network): - super().__init__(actor_network, qvalue_network) - # we highjack the forward so the out_keys must be re-written - self.out_keys = ["state_value"] - - def forward(self, tensordict, _alpha, actor_params, qval_params): - """Computes the value as `val = qval - a * log_prob(a)`.""" - actor_network, qvalue_network = self - - obs_keys = actor_network.in_keys - data = tensordict.select(*obs_keys) - # get actions and log-probs - with torch.no_grad(): - with set_exploration_mode("random"): - dist = actor_network.get_dist(data, params=actor_params) - data.set("action", dist.rsample()) - log_prob = dist.log_prob(data.get("action")) - data.set("sample_log_prob", log_prob) - sample_log_prob = data.get("sample_log_prob") - - # get q-values - data = vmap(qvalue_network, (None, 0))(data, qval_params) - state_action_value = data.get("state_action_value") - if ( - state_action_value.shape[-len(sample_log_prob.shape) :] - != sample_log_prob.shape - ): - sample_log_prob = sample_log_prob.unsqueeze(-1) - state_value = state_action_value - _alpha * sample_log_prob - state_value = state_value.min(0)[0] - tensordict.set("state_value", state_value) - return tensordict - - class SACLoss(LossModule): """TorchRL implementation of the SAC loss. @@ -99,6 +47,8 @@ class SACLoss(LossModule): .. note:: If not provided, the second version of SAC is assumed, where only the Q-Value network is needed. + num_qvalue_nets (integer, optional): number of Q-Value networks used. + Defaults to ``2``. priority_key (str, optional): tensordict key where to write the priority (for prioritized replay buffer usage). Defaults to ``"td_error"``. @@ -110,29 +60,32 @@ class SACLoss(LossModule): Default is 0.1. max_alpha (float, optional): max value of alpha. Default is 10.0. - fixed_alpha (bool, optional): if True, alpha will be fixed to its + fixed_alpha (bool, optional): if ``True``, alpha will be fixed to its initial value. Otherwise, alpha will be optimized to match the 'target_entropy' value. - Default is :obj:`False`. + Default is ``False``. target_entropy (float or str, optional): Target entropy for the stochastic policy. Default is "auto", where target entropy is computed as :obj:`-prod(n_actions)`. delay_actor (bool, optional): Whether to separate the target actor networks from the actor networks used for data collection. - Default is :obj:`False`. + Default is ``False``. delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used for data collection. - Default is :obj:`False`. + Default is ``False``. delay_value (bool, optional): Whether to separate the target value networks from the value networks used for data collection. - Default is :obj:`False`. + Default is ``False``. 
""" + default_value_type = ValueFunctions.TD0 + def __init__( self, actor_network: ProbabilisticActor, qvalue_network: TensorDictModule, value_network: Optional[TensorDictModule] = None, + *, num_qvalue_nets: int = 2, priority_key: str = "td_error", loss_function: str = "smooth_l1", @@ -231,7 +184,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): if self._version == 1: value_net = self.actor_critic elif self._version == 2: - value_net = _SACValueNet(self.actor_network, self.qvalue_network) + # we will take care of computing the next value inside this module + value_net = None else: # unreachable raise NotImplementedError @@ -254,11 +208,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key=value_key, ) elif value_type is ValueFunctions.GAE: - self._value_function = GAE( - **hp, - value_network=value_net, - value_target_key="value_target", - value_key=value_key, + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type is ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( @@ -270,10 +221,6 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): else: raise NotImplementedError(f"Unknown value type {value_type}") - def _default_value_function(self): - # TD0 by default, as in paper - self.make_value_function(ValueFunctions.TD0) - @property def device(self) -> torch.device: for p in self.parameters(): @@ -393,14 +340,57 @@ def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: return loss_value, priority_value + def _get_value_v2(self, tensordict, _alpha, actor_params, qval_params): + r"""Value network for SAC v2. + + SAC v2 is based on a value estimate of the form: + + .. math:: + + V = Q(s,a) - \alpha * \log p(a | s) + + This class computes this value given the actor and qvalue network + + """ + tensordict = tensordict.clone(False) + # get actions and log-probs + with torch.no_grad(): + with set_exploration_mode("random"): + dist = self.actor_network.get_dist(tensordict, params=actor_params) + tensordict.set("action", dist.rsample()) + log_prob = dist.log_prob(tensordict.get("action")) + tensordict.set("sample_log_prob", log_prob) + sample_log_prob = tensordict.get("sample_log_prob") + + # get q-values + tensordict_expand = vmap(self.qvalue_network, (None, 0))( + tensordict, qval_params + ) + state_action_value = tensordict_expand.get("state_action_value") + if ( + state_action_value.shape[-len(sample_log_prob.shape) :] + != sample_log_prob.shape + ): + sample_log_prob = sample_log_prob.unsqueeze(-1) + state_value = state_action_value - _alpha * sample_log_prob + state_value = state_value.min(0)[0] + tensordict.set(("next", self.value_function.value_key), state_value) + target_value = self.value_function.value_estimate( + tensordict, + _alpha=self._alpha, + actor_params=self.target_actor_network_params, + qval_params=self.target_qvalue_network_params, + ).squeeze(-1) + return target_value + def _loss_qvalue_v2(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: # we pass the alpha value to the tensordict. Since it's a scalar, we must erase the batch-size first. 
- target_value = self.value_function.value_estimate( + target_value = self._get_value_v2( tensordict, - _alpha=self._alpha, - actor_params=self.target_actor_network_params, - qval_params=self.target_qvalue_network_params, - ).squeeze(-1) + self._alpha, + self.target_actor_network_params, + self.target_qvalue_network_params, + ) tensordict_expand = vmap(self.qvalue_network, (None, 0))( tensordict.select(*self.qvalue_network.in_keys), @@ -476,9 +466,9 @@ class DiscreteSACLoss(LossModule): Args: actor_network (ProbabilisticActor): the actor to be trained qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. + num_actions (int): number of actions in the action space. num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - gamma (Number, optional): gamma decay factor. Default is 0.99. - priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is + priority_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is `"td_error"`. loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", "l1", Default is "smooth_l1". @@ -488,13 +478,15 @@ class DiscreteSACLoss(LossModule): Default is 0.1. max_alpha (float, optional): max value of alpha. Default is 10.0. - fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is :obj:`False`. + fixed_alpha (bool, optional): whether alpha should be trained to match a target entropy. Default is ``False``. target_entropy_weight (float, optional): weight for the target entropy term. target_entropy (Union[str, Number], optional): Target entropy for the stochastic policy. Default is "auto". delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used - for data collection. Default is :obj:`False`. + for data collection. Default is ``False``. 
+ """ + default_value_type = ValueFunctions.TD0 delay_actor: bool = False def __init__( @@ -502,9 +494,9 @@ def __init__( actor_network: ProbabilisticActor, qvalue_network: TensorDictModule, num_actions: int, + *, num_qvalue_nets: int = 2, - gamma: Number = 0.99, - priotity_key: str = "td_error", + priority_key: str = "td_error", loss_function: str = "smooth_l1", alpha_init: float = 1.0, min_alpha: float = 0.1, @@ -533,8 +525,7 @@ def __init__( compare_against=list(actor_network.parameters()), ) self.num_qvalue_nets = num_qvalue_nets - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key + self.priority_key = priority_key self.loss_function = loss_function try: @@ -575,7 +566,7 @@ def alpha(self): def forward(self, tensordict: TensorDictBase) -> TensorDictBase: obs_keys = self.actor_network.in_keys - tensordict_select = tensordict.select("next", *obs_keys, "action") + tensordict_select = tensordict.clone(False).select("next", *obs_keys, "action") actor_params = torch.stack( [self.actor_network_params, self.target_actor_network_params], 0 @@ -668,11 +659,8 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: * (next_state_action_value_qvalue.min(0)[0] - self.alpha * logp_pi[1]) ).sum(dim=-1, keepdim=True) - target_value = next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=pred_next_val, - ) + tensordict_select.set(("next", self.value_function.value_key), pred_next_val) + target_value = self.value_function.value_estimate(tensordict_select).squeeze(-1) actions = torch.argmax(tensordict_select["action"], dim=-1) @@ -731,3 +719,36 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: # placeholder alpha_loss = torch.zeros_like(log_pi) return alpha_loss + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + value_net = None + value_key = "state_value" + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + if value_type is ValueFunctions.TD1: + self._value_function = TD1Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.TD0: + self._value_function = TD0Estimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + elif value_type is ValueFunctions.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type is ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + **hp, + value_network=value_net, + value_target_key="value_target", + value_key=value_key, + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index 562e2f711ba..61dcdbbfcb1 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from numbers import Number import torch from tensordict.nn import TensorDictModule @@ -12,10 +11,8 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import ( - distance_loss, - next_state_value as get_next_state_value, -) +from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -34,29 +31,29 @@ class TD3Loss(LossModule): actor_network (TensorDictModule): the actor to be trained qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - gamma (Number, optional): gamma decay factor. Default is 0.99. - max_action (float, optional): Maximum action, in MuJoCo environments typically 1.0. policy_noise (float, optional): Standard deviation for the target policy action noise. Default is 0.2. noise_clip (float, optional): Clipping range value for the sampled target policy action noise. Default is 0.5. - priotity_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is + priority_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is `"td_error"`. loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", "l1", Default is "smooth_l1". delay_actor (bool, optional): whether to separate the target actor networks from the actor networks used for - data collection. Default is :obj:`False`. + data collection. Default is ``False``. delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used - for data collection. Default is :obj:`False`. + for data collection. Default is ``False``. 
""" + default_value_type = ValueFunctions.TD0 + def __init__( self, actor_network: TensorDictModule, qvalue_network: TensorDictModule, + *, num_qvalue_nets: int = 2, - gamma: Number = 0.99, policy_noise: float = 0.2, noise_clip: float = 0.5, - priotity_key: str = "td_error", + priority_key: str = "td_error", loss_function: str = "smooth_l1", delay_actor: bool = False, delay_qvalue: bool = False, @@ -86,8 +83,7 @@ def __init__( ) self.num_qvalue_nets = num_qvalue_nets - self.register_buffer("gamma", torch.tensor(gamma)) - self.priority_key = priotity_key + self.priority_key = priority_key self.loss_function = loss_function self.policy_noise = policy_noise self.noise_clip = noise_clip @@ -95,16 +91,17 @@ def __init__( def forward(self, tensordict: TensorDictBase) -> TensorDictBase: obs_keys = self.actor_network.in_keys - tensordict_select = tensordict.select("next", *obs_keys, "action") + tensordict_save = tensordict + tensordict = tensordict.clone(False) actor_params = torch.stack( [self.actor_network_params, self.target_actor_network_params], 0 ) - tensordict_actor_grad = tensordict_select.select( + tensordict_actor_grad = tensordict.select( *obs_keys ) # to avoid overwriting keys - next_td_actor = step_mdp(tensordict_select).select( + next_td_actor = step_mdp(tensordict).select( *self.actor_network.in_keys ) # next_observation -> tensordict_actor = torch.stack([tensordict_actor_grad, next_td_actor], 0) @@ -134,9 +131,9 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: .select(*self.qvalue_network.in_keys) .expand(self.num_qvalue_nets, *tensordict_actor[0].batch_size) ) # for actor loss - _qval_td = tensordict_select.select(*self.qvalue_network.in_keys).expand( + _qval_td = tensordict.select(*self.qvalue_network.in_keys).expand( self.num_qvalue_nets, - *tensordict_select.select(*self.qvalue_network.in_keys).batch_size, + *tensordict.select(*self.qvalue_network.in_keys).batch_size, ) # for qvalue loss _next_val_td = ( tensordict_actor[1] @@ -180,12 +177,8 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: loss_actor = -(state_action_value_actor.min(0)[0]).mean() next_state_value = next_state_action_value_qvalue.min(0)[0] - - target_value = get_next_state_value( - tensordict, - gamma=self.gamma, - pred_next_val=next_state_value, - ) + tensordict.set(("next", "state_action_value"), next_state_value.unsqueeze(-1)) + target_value = self.value_function.value_estimate(tensordict).squeeze(-1) pred_val = state_action_value_qvalue td_error = (pred_val - target_value).pow(2) loss_qval = ( @@ -199,7 +192,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: * 0.5 ) - tensordict.set("td_error", td_error.detach().max(0)[0]) + tensordict_save.set("td_error", td_error.detach().max(0)[0]) if not loss_qval.shape == loss_actor.shape: raise RuntimeError( @@ -218,3 +211,27 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: ) return td_out + + def make_value_function(self, value_type: ValueFunctions, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + hp.update(hyperparams) + value_key = "state_action_value" + # we do not need a value network bc the next state value is already passed + if value_type == ValueFunctions.TD1: + self._value_function = TD1Estimate( + value_network=None, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.TD0: + self._value_function = TD0Estimate( + value_network=None, value_key=value_key, **hp + ) + elif value_type == ValueFunctions.GAE: + raise NotImplementedError( + f"Value type 
{value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueFunctions.TDLambda: + self._value_function = TDLambdaEstimate( + value_network=None, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 17a9c35c90d..6695fbc2488 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -82,6 +82,9 @@ def value_estimate( ): """Gets a value estimate, usually used as a target value for the value network. + If the state value key is present under ``tensordict.get(("next", self.value_key))`` + then this value will be used without recurring to the value network. + Args: tensordict (TensorDictBase): the tensordict containing the data to read. @@ -89,6 +92,8 @@ def value_estimate( target params to be passed to the functional value network module. **kwargs: the keyword arguments to be passed to the value network. + Returns: a tensor corresponding to the state value. + """ raise NotImplementedError @@ -99,17 +104,24 @@ def is_functional(self): else: raise RuntimeError("Cannot determine if value network is functional.") + @property + def is_stateless(self): + if not self.is_functional: + return False + return self.value_network._is_stateless + class TD0Estimate(ValueFunctionBase): """Myopic Temporal Difference (TD(0)) estimate of advantage function. Args: gamma (scalar): exponential mean discount. - value_network (TensorDictModule): value operator used to retrieve the value estimates. - average_rewards (bool, optional): if True, rewards will be standardized + value_network (TensorDictModule): value operator used to retrieve + the value estimates. + average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if True, gradients are propagated throught - the computation of the value function. Default is :obj:`False`. + differentiable (bool, optional): if ``True``, gradients are propagated throught + the computation of the value function. Default is ``False``. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. 
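The ``value_estimate`` docstring added above changes how the refactored losses build their targets: the REDQ, TD3 and discrete SAC hunks earlier in this patch no longer call ``next_state_value`` but write their own bootstrap value under ``("next", value_key)`` and let the estimator assemble the target. The fragment below is a minimal sketch of that pattern; it mirrors the ``make_value_function`` calls in the patch, while the batch size and the random reward/done/value tensors are illustrative assumptions.

>>> import torch
>>> from tensordict.tensordict import TensorDict
>>> from torchrl.objectives.utils import default_value_kwargs, ValueFunctions
>>> from torchrl.objectives.value import TD0Estimate
>>> # no value network: the next-state value is supplied by the loss itself
>>> hp = dict(default_value_kwargs(ValueFunctions.TD0))
>>> value_est = TD0Estimate(value_network=None, value_key="state_value", **hp)
>>> data = TensorDict(
...     {
...         "next": TensorDict(
...             {
...                 "reward": torch.randn(4, 1),
...                 "done": torch.zeros(4, 1, dtype=torch.bool),
...                 # e.g. the minimum over an ensemble of Q-value networks
...                 "state_value": torch.randn(4, 1),
...             },
...             [4],
...         )
...     },
...     [4],
... )
>>> # the pre-filled ("next", "state_value") entry is used directly,
>>> # so the (absent) value network is never queried
>>> target = value_est.value_estimate(data)  # expected shape: [4, 1]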
@@ -121,6 +133,7 @@ class TD0Estimate(ValueFunctionBase): def __init__( self, + *, gamma: Union[float, torch.Tensor], value_network: TensorDictModule, average_rewards: bool = False, @@ -132,7 +145,7 @@ def __init__( super().__init__() try: device = next(value_network.parameters()).device - except StopIteration: + except (AttributeError, StopIteration): device = torch.device("cpu") self.register_buffer("gamma", torch.tensor(gamma, device=device)) self.value_network = value_network @@ -151,11 +164,16 @@ def __init__( self.advantage_key = advantage_key self.value_target_key = value_target_key - self.in_keys = ( - value_network.in_keys - + [("next", "reward"), ("next", "done")] - + [("next", in_key) for in_key in value_network.in_keys] - ) + try: + self.in_keys = ( + value_network.in_keys + + [("next", "reward"), ("next", "done")] + + [("next", in_key) for in_key in value_network.in_keys] + ) + except AttributeError: + # value network does not have an `in_keys` attribute + pass + self.out_keys = [self.advantage_key, self.value_target_key] @_self_set_grad_enabled @@ -226,7 +244,7 @@ def forward( ) kwargs = {} - if self.is_functional and params is None: + if self.is_stateless and params is None: raise RuntimeError( "Expected params to be passed to advantage module but got none." ) @@ -249,8 +267,9 @@ def value_estimate( target_params: Optional[TensorDictBase] = None, **kwargs, ): - gamma = self.gamma reward = tensordict.get(("next", "reward")) + device = reward.device + gamma = self.gamma.to(device) steps_to_next_obs = tensordict.get("steps_to_next_obs", None) if steps_to_next_obs is not None: gamma = gamma ** steps_to_next_obs.view_as(reward) @@ -262,12 +281,12 @@ def value_estimate( ("next", "reward"), reward ) # we must update the rewards if they are used later in the code step_td = step_mdp(tensordict) - if target_params is not None: - # we assume that target parameters are not differentiable - kwargs["params"] = target_params - with hold_out_net(self.value_network): - self.value_network(step_td, **kwargs) - next_value = step_td.get(self.value_key) + if self.value_key not in step_td.keys(): + if target_params is not None: + kwargs["params"] = target_params + with hold_out_net(self.value_network): + self.value_network(step_td, **kwargs) + next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) value_target = reward + gamma * (1 - done.to(reward.dtype)) * next_value @@ -280,10 +299,10 @@ class TD1Estimate(ValueFunctionBase): Args: gamma (scalar): exponential mean discount. value_network (TensorDictModule): value operator used to retrieve the value estimates. - average_rewards (bool, optional): if True, rewards will be standardized + average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if True, gradients are propagated throught - the computation of the value function. Default is :obj:`False`. + differentiable (bool, optional): if ``True``, gradients are propagated throught + the computation of the value function. Default is ``False``. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. 
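With the ``*`` added to the signatures above, the estimator constructors now only accept keyword arguments, and the broadened ``except (AttributeError, StopIteration)`` clauses let them be built without a value network at all. A short construction sketch follows; the ``observation``/``state_value`` keys and the linear layer are illustrative assumptions, not part of the patch.

>>> from tensordict.nn import TensorDictModule
>>> from torch import nn
>>> from torchrl.objectives.value import TD1Estimate
>>> value_net = TensorDictModule(
...     nn.Linear(3, 1), in_keys=["observation"], out_keys=["state_value"]
... )
>>> # arguments are keyword-only: TD1Estimate(0.99, value_net) now raises a TypeError
>>> td1 = TD1Estimate(gamma=0.99, value_network=value_net, differentiable=False)
>>> # the value network can also be omitted when the next value is pre-computed,
>>> # thanks to the AttributeError handling introduced above
>>> td1_no_net = TD1Estimate(gamma=0.99, value_network=None)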
@@ -295,6 +314,7 @@ class TD1Estimate(ValueFunctionBase): def __init__( self, + *, gamma: Union[float, torch.Tensor], value_network: TensorDictModule, average_rewards: bool = False, @@ -306,7 +326,7 @@ def __init__( super().__init__() try: device = next(value_network.parameters()).device - except StopIteration: + except (AttributeError, StopIteration): device = torch.device("cpu") self.register_buffer("gamma", torch.tensor(gamma, device=device)) self.value_network = value_network @@ -325,11 +345,15 @@ def __init__( self.advantage_key = advantage_key self.value_target_key = value_target_key - self.in_keys = ( - value_network.in_keys - + [("next", "reward"), ("next", "done")] - + [("next", in_key) for in_key in value_network.in_keys] - ) + try: + self.in_keys = ( + value_network.in_keys + + [("next", "reward"), ("next", "done")] + + [("next", in_key) for in_key in value_network.in_keys] + ) + except AttributeError: + # value network does not have an `in_keys` attribute + pass self.out_keys = [self.advantage_key, self.value_target_key] @_self_set_grad_enabled @@ -400,7 +424,7 @@ def forward( ) kwargs = {} - if self.is_functional and params is None: + if self.is_stateless and params is None: raise RuntimeError( "Expected params to be passed to advantage module but got none." ) @@ -423,8 +447,9 @@ def value_estimate( target_params: Optional[TensorDictBase] = None, **kwargs, ): - gamma = self.gamma reward = tensordict.get(("next", "reward")) + device = reward.device + gamma = self.gamma.to(device) steps_to_next_obs = tensordict.get("steps_to_next_obs", None) if steps_to_next_obs is not None: gamma = gamma ** steps_to_next_obs.view_as(reward) @@ -436,12 +461,12 @@ def value_estimate( ("next", "reward"), reward ) # we must update the rewards if they are used later in the code step_td = step_mdp(tensordict) - if target_params is not None: - # we assume that target parameters are not differentiable - kwargs["params"] = target_params - with hold_out_net(self.value_network): - self.value_network(step_td, **kwargs) - next_value = step_td.get(self.value_key) + if self.value_key not in step_td.keys(): + if target_params is not None: + kwargs["params"] = target_params + with hold_out_net(self.value_network): + self.value_network(step_td, **kwargs) + next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) value_target = td_advantage_estimate( @@ -457,10 +482,10 @@ class TDLambdaEstimate(ValueFunctionBase): gamma (scalar): exponential mean discount. lmbda (scalar): trajectory discount. value_network (TensorDictModule): value operator used to retrieve the value estimates. - average_rewards (bool, optional): if True, rewards will be standardized + average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if True, gradients are propagated throught - the computation of the value function. Default is :obj:`False`. + differentiable (bool, optional): if ``True``, gradients are propagated throught + the computation of the value function. Default is ``False``. vectorized (bool, optional): whether to use the vectorized version of the lambda return. Default is `True`. advantage_key (str or tuple of str, optional): the key of the advantage entry. 
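The ``forward`` methods above now only require ``params`` when the value network is stateless; a regular stateful module can be called directly, which is what the updated PPO and A2C tests later in this patch series do with ``advantage(td)``. Below is a rough sketch of computing a TD(lambda) advantage over a short trajectory; the two-by-ten batch shape, the ``observation`` key and the linear value network are assumptions made for illustration.

>>> import torch
>>> from tensordict.nn import TensorDictModule
>>> from tensordict.tensordict import TensorDict
>>> from torch import nn
>>> from torchrl.objectives.value import TDLambdaEstimate
>>> value_net = TensorDictModule(
...     nn.Linear(3, 1), in_keys=["observation"], out_keys=["state_value"]
... )
>>> tdlambda = TDLambdaEstimate(gamma=0.99, lmbda=0.95, value_network=value_net)
>>> data = TensorDict(
...     {
...         "observation": torch.randn(2, 10, 3),
...         "next": TensorDict(
...             {
...                 "observation": torch.randn(2, 10, 3),
...                 "reward": torch.randn(2, 10, 1),
...                 "done": torch.zeros(2, 10, 1, dtype=torch.bool),
...             },
...             [2, 10],
...         ),
...     },
...     [2, 10],
... )
>>> # no params needed here: the value network is a plain stateful module
>>> data = tdlambda(data)  # writes "advantage" and "value_target" in place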
@@ -474,6 +499,7 @@ class TDLambdaEstimate(ValueFunctionBase): def __init__( self, + *, gamma: Union[float, torch.Tensor], lmbda: Union[float, torch.Tensor], value_network: TensorDictModule, @@ -487,7 +513,7 @@ def __init__( super().__init__() try: device = next(value_network.parameters()).device - except StopIteration: + except (AttributeError, StopIteration): device = torch.device("cpu") self.register_buffer("gamma", torch.tensor(gamma, device=device)) self.register_buffer("lmbda", torch.tensor(lmbda, device=device)) @@ -508,11 +534,15 @@ def __init__( self.advantage_key = advantage_key self.value_target_key = value_target_key - self.in_keys = ( - value_network.in_keys - + [("next", "reward"), ("next", "done")] - + [("next", in_key) for in_key in value_network.in_keys] - ) + try: + self.in_keys = ( + value_network.in_keys + + [("next", "reward"), ("next", "done")] + + [("next", in_key) for in_key in value_network.in_keys] + ) + except AttributeError: + # value network does not have an `in_keys` attribute + pass self.out_keys = [self.advantage_key, self.value_target_key] @_self_set_grad_enabled @@ -584,7 +614,7 @@ def forward( f"tensordict.batch_size = {tensordict.batch_size}" ) kwargs = {} - if self.is_functional and params is None: + if self.is_stateless and params is None: raise RuntimeError( "Expected params to be passed to advantage module but got none." ) @@ -607,9 +637,9 @@ def value_estimate( target_params: Optional[TensorDictBase] = None, **kwargs, ): - - gamma = self.gamma reward = tensordict.get(("next", "reward")) + device = reward.device + gamma = self.gamma.to(device) steps_to_next_obs = tensordict.get("steps_to_next_obs", None) if steps_to_next_obs is not None: gamma = gamma ** steps_to_next_obs.view_as(reward) @@ -623,14 +653,12 @@ def value_estimate( ) # we must update the rewards if they are used later in the code step_td = step_mdp(tensordict) - if target_params is not None: - # we assume that target parameters are not differentiable - kwargs["params"] = target_params - with hold_out_net(self.value_network): - # we may still need to pass gradient, but we don't want to assign grads to - # value net params - self.value_network(step_td, **kwargs) - next_value = step_td.get(self.value_key) + if self.value_key not in step_td.keys(): + if target_params is not None: + kwargs["params"] = target_params + with hold_out_net(self.value_network): + self.value_network(step_td, **kwargs) + next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) if self.vectorized: @@ -654,10 +682,10 @@ class GAE(ValueFunctionBase): gamma (scalar): exponential mean discount. lmbda (scalar): trajectory discount. value_network (TensorDictModule): value operator used to retrieve the value estimates. - average_gae (bool): if True, the resulting GAE values will be standardized. - Default is :obj:`False`. - differentiable (bool, optional): if True, gradients are propagated throught - the computation of the value function. Default is :obj:`False`. + average_gae (bool): if ``True``, the resulting GAE values will be standardized. + Default is ``False``. + differentiable (bool, optional): if ``True``, gradients are propagated throught + the computation of the value function. Default is ``False``. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. 
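All of the ``make_value_function`` overrides added in this patch follow the same recipe: merge ``default_value_kwargs(value_type)`` with any user-supplied hyperparameters and instantiate the matching estimator, while losses whose bootstrap value comes from Q-networks (SAC, discrete SAC and TD3 above) reject ``ValueFunctions.GAE`` with a ``NotImplementedError``. A hedged usage fragment follows; ``loss_fn`` stands for one of those losses (for instance a ``TD3Loss`` built elsewhere) and the hyperparameter values are arbitrary.

>>> from torchrl.objectives.utils import ValueFunctions
>>> # keyword hyperparameters override the defaults from default_value_kwargs
>>> loss_fn.make_value_function(ValueFunctions.TDLambda, gamma=0.9, lmbda=0.95)
>>> # GAE needs a state-value network that these losses do not carry,
>>> # so the call below raises NotImplementedError
>>> loss_fn.make_value_function(ValueFunctions.GAE)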
@@ -676,6 +704,7 @@ class GAE(ValueFunctionBase): def __init__( self, + *, gamma: Union[float, torch.Tensor], lmbda: float, value_network: TensorDictModule, @@ -688,7 +717,7 @@ def __init__( super().__init__() try: device = next(value_network.parameters()).device - except StopIteration: + except (AttributeError, StopIteration): device = torch.device("cpu") self.register_buffer("gamma", torch.tensor(gamma, device=device)) self.register_buffer("lmbda", torch.tensor(lmbda, device=device)) @@ -708,11 +737,16 @@ def __init__( self.advantage_key = advantage_key self.value_target_key = value_target_key - self.in_keys = ( - value_network.in_keys - + [("next", "reward"), ("next", "done")] - + [("next", in_key) for in_key in value_network.in_keys] - ) + try: + self.in_keys = ( + value_network.in_keys + + [("next", "reward"), ("next", "done")] + + [("next", in_key) for in_key in value_network.in_keys] + ) + except AttributeError: + # value network does not have an `in_keys` attribute + pass + self.out_keys = [self.advantage_key, self.value_target_key] @_self_set_grad_enabled @@ -785,14 +819,14 @@ def forward( f"tensordict.batch_size = {tensordict.batch_size}" ) reward = tensordict.get(("next", "reward")) - gamma, lmbda = self.gamma, self.lmbda - reward = tensordict.get(("next", "reward")) + device = reward.device + gamma, lmbda = self.gamma.to(device), self.lmbda.to(device) steps_to_next_obs = tensordict.get("steps_to_next_obs", None) if steps_to_next_obs is not None: gamma = gamma ** steps_to_next_obs.view_as(reward) kwargs = {} - if self.is_functional and params is None: + if self.is_stateless and params is None: raise RuntimeError( "Expected params to be passed to advantage module but got none." ) @@ -845,12 +879,13 @@ def value_estimate( f"tensordict.batch_size = {tensordict.batch_size}" ) reward = tensordict.get(("next", "reward")) - gamma, lmbda = self.gamma, self.lmbda + device = reward.device + gamma, lmbda = self.gamma.to(device), self.lmbda.to(device) steps_to_next_obs = tensordict.get("steps_to_next_obs", None) if steps_to_next_obs is not None: gamma = gamma ** steps_to_next_obs.view_as(reward) - if self.is_functional and params is None: + if self.is_stateless and params is None: raise RuntimeError( "Expected params to be passed to advantage module but got none." ) diff --git a/torchrl/objectives/value/functional.py b/torchrl/objectives/value/functional.py index d4e0a4f0c77..534eb47306d 100644 --- a/torchrl/objectives/value/functional.py +++ b/torchrl/objectives/value/functional.py @@ -45,6 +45,10 @@ def generalized_advantage_estimate( done (Tensor): boolean flag for end of episode. """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) for tensor in (next_state_value, state_value, reward, done): if tensor.shape[-1] != 1: raise RuntimeError( @@ -97,6 +101,10 @@ def vec_generalized_advantage_estimate( done (Tensor): boolean flag for end of episode. """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) for tensor in (next_state_value, state_value, reward, done): if tensor.shape[-1] != 1: raise RuntimeError( @@ -163,6 +171,10 @@ def td_advantage_estimate( done (Tensor): boolean flag for end of episode. 
""" + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) for tensor in (next_state_value, state_value, reward, done): if tensor.shape[-1] != 1: raise RuntimeError( @@ -191,7 +203,7 @@ def td_lambda_return_estimate( reward (Tensor): reward of taking actions in the environment. must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. - rolling_gamma (bool, optional): if True, it is assumed that each gamma + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: gamma = [g1, g2, g3, g4] value = [v1, v2, v3, v4] @@ -214,6 +226,10 @@ def td_lambda_return_estimate( Default is True. """ + if not (next_state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) for tensor in (next_state_value, reward, done): if tensor.shape[-1] != 1: raise RuntimeError( @@ -288,7 +304,7 @@ def td_lambda_advantage_estimate( reward (Tensor): reward of taking actions in the environment. must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. - rolling_gamma (bool, optional): if True, it is assumed that each gamma + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: gamma = [g1, g2, g3, g4] value = [v1, v2, v3, v4] @@ -311,6 +327,10 @@ def td_lambda_advantage_estimate( Default is True. """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) if not state_value.shape == next_state_value.shape: raise RuntimeError("shape of state_value and next_state_value must match") returns = td_lambda_return_estimate( @@ -342,7 +362,7 @@ def vec_td_lambda_advantage_estimate( reward (Tensor): reward of taking actions in the environment. must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. - rolling_gamma (bool, optional): if True, it is assumed that each gamma + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: gamma = [g1, g2, g3, g4] value = [v1, v2, v3, v4] @@ -365,6 +385,10 @@ def vec_td_lambda_advantage_estimate( Default is True. """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) return ( vec_td_lambda_return_estimate( gamma, lmbda, next_state_value, reward, done, rolling_gamma @@ -387,7 +411,7 @@ def vec_td_lambda_return_estimate( reward (Tensor): reward of taking actions in the environment. must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. - rolling_gamma (bool, optional): if True, it is assumed that each gamma + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: gamma = [g1, g2, g3, g4] value = [v1, v2, v3, v4] @@ -410,6 +434,10 @@ def vec_td_lambda_return_estimate( Default is True. 
""" + if not (next_state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) shape = next_state_value.shape if not shape[-1] == 1: raise RuntimeError("last dimension of inputs shape must be singleton") diff --git a/torchrl/objectives/value/utils.py b/torchrl/objectives/value/utils.py index beaeacf4bf8..d5091fcd67f 100644 --- a/torchrl/objectives/value/utils.py +++ b/torchrl/objectives/value/utils.py @@ -145,7 +145,7 @@ def _make_gammas_tensor(gamma: torch.Tensor, T: int, rolling_gamma: bool): Args: gamma (torch.tensor): the gamma tensor to be prepared. T (int): the time length - rolling_gamma (bool): if True, the gamma value is set for each step + rolling_gamma (bool): if ``True``, the gamma value is set for each step independently. If False, the gamma value at (i, t) will be used for the trajectory following (i, t). diff --git a/torchrl/record/recorder.py b/torchrl/record/recorder.py index 529e64927d5..56c68e065ca 100644 --- a/torchrl/record/recorder.py +++ b/torchrl/record/recorder.py @@ -34,7 +34,7 @@ class VideoRecorder(ObservationTransform): skip (int): frame interval in the output video. Default is 2. center_crop (int, optional): value of square center crop. - make_grid (bool, optional): if True, a grid is created assuming that a + make_grid (bool, optional): if ``True``, a grid is created assuming that a tensor of shape [B x W x H x 3] is provided, with B being the batch size. Default is True. @@ -138,7 +138,7 @@ class TensorDictRecorder(Transform): Args: out_file_base (str): a string defining the prefix of the file where the tensordict will be written. - skip_reset (bool): if True, the first TensorDict of the list will be discarded (usually the tensordict + skip_reset (bool): if ``True``, the first TensorDict of the list will be discarded (usually the tensordict resulting from the call to :obj:`env.reset()`) default: True skip (int): frame interval for the saved tensordict. diff --git a/torchrl/trainers/helpers/collectors.py b/torchrl/trainers/helpers/collectors.py index e60e943bf9d..4ec6056363d 100644 --- a/torchrl/trainers/helpers/collectors.py +++ b/torchrl/trainers/helpers/collectors.py @@ -373,7 +373,7 @@ class OnPolicyCollectorConfig: # If the collector device differs from the policy device (cuda:0 if available), then the # weights of the collector policy are synchronized with collector.update_policy_weights_(). pin_memory: bool = False - # if True, the data collector will call pin_memory before dispatching tensordicts onto the passing device + # if ``True``, the data collector will call pin_memory before dispatching tensordicts onto the passing device frames_per_batch: int = 1000 # number of steps executed in the environment per collection. # This value represents how many steps will the data collector execute and return in *each* diff --git a/torchrl/trainers/helpers/envs.py b/torchrl/trainers/helpers/envs.py index 3babbb3d5f7..14036cd355b 100644 --- a/torchrl/trainers/helpers/envs.py +++ b/torchrl/trainers/helpers/envs.py @@ -244,7 +244,7 @@ def transformed_env_constructor( custom_env (EnvBase, optional): if an existing environment needs to be transformed_in, it can be passed directly to this helper. `custom_env_maker` and `custom_env` are exclusive features. - return_transformed_envs (bool, optional): if True, a transformed_in environment + return_transformed_envs (bool, optional): if ``True``, a transformed_in environment is returned. 
action_dim_gsde (int, Optional): if gSDE is used, this can present the action dim to initialize the noise. Make sure this is indicated in environment executed in parallel. @@ -556,7 +556,7 @@ class EnvConfig: max_frames_per_traj: int = 1000 # Number of steps before a reset of the environment is called (if it has not been flagged as done before). batch_transform: bool = False - # if True, the transforms will be applied to the parallel env, and not to each individual env.\ + # if ``True``, the transforms will be applied to the parallel env, and not to each individual env.\ image_size: int = 84 # if True and environment has discrete action space, then it is encoded as categorical values rather than one-hot. categorical_action_encoding: bool = False diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 06338d9d41c..cbd1a66cb77 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -603,11 +603,11 @@ class ReplayBufferTrainer(TrainerHookBase): replay_buffer (TensorDictReplayBuffer): replay buffer to be used. batch_size (int): batch size when sampling data from the latest collection or from the replay buffer. - memmap (bool, optional): if True, a memmap tensordict is created. + memmap (bool, optional): if ``True``, a memmap tensordict is created. Default is False. device (device, optional): device where the samples must be placed. Default is cpu. - flatten_tensordicts (bool, optional): if True, the tensordicts will be + flatten_tensordicts (bool, optional): if ``True``, the tensordicts will be flattened (or equivalently masked with the valid mask obtained from the collector) before being passed to the replay buffer. Otherwise, no transform will be achieved other than padding (see :obj:`max_dims` arg below). @@ -792,8 +792,8 @@ class LogReward(TrainerHookBase): Args: logname (str, optional): name of the rewards to be logged. Default is :obj:`"r_training"`. - log_pbar (bool, optional): if True, the reward value will be logged on - the progression bar. Default is :obj:`False`. + log_pbar (bool, optional): if ``True``, the reward value will be logged on + the progression bar. Default is ``False``. reward_key (str or tuple, optional): the key where to find the reward in the input batch. Defaults to ``("next", "reward")`` @@ -1125,7 +1125,7 @@ class Recorder(TrainerHookBase): out_key (str, optional): reward key to set to the logger. Default is `"reward_evaluation"`. suffix (str, optional): suffix of the video to be recorded. - log_pbar (bool, optional): if True, the reward value will be logged on + log_pbar (bool, optional): if ``True``, the reward value will be logged on the progression bar. Default is `False`. """ @@ -1265,7 +1265,7 @@ class CountFramesLog(TrainerHookBase): frame_skip (int): frame skip of the environment. This argument is important to keep track of the total number of frames, not the apparent one. - log_pbar (bool, optional): if True, the reward value will be logged on + log_pbar (bool, optional): if ``True``, the reward value will be logged on the progression bar. Default is `False`. 
Examples: From acd0ec1152c740218742c5904d2e6c0fca7ba76c Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 27 Mar 2023 21:19:50 +0100 Subject: [PATCH 19/89] test --- test/test_cost.py | 58 ++++++++++++++++++++++++++++---- torchrl/objectives/ddpg.py | 2 +- torchrl/objectives/deprecated.py | 6 ++-- torchrl/objectives/dqn.py | 2 +- torchrl/objectives/dreamer.py | 9 ++--- torchrl/objectives/iql.py | 9 ++--- torchrl/objectives/redq.py | 6 ++-- torchrl/objectives/sac.py | 2 +- torchrl/objectives/td3.py | 2 +- 9 files changed, 70 insertions(+), 26 deletions(-) diff --git a/test/test_cost.py b/test/test_cost.py index e09364ca69d..4477feee0da 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -1847,7 +1847,8 @@ def _create_seq_mock_data_redq( @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) - def test_redq(self, delay_qvalue, num_qvalue, device): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_redq(self, delay_qvalue, num_qvalue, device, td_est): torch.manual_seed(self.seed) td = self._create_mock_data_redq(device=device) @@ -1862,6 +1863,12 @@ def test_redq(self, delay_qvalue, num_qvalue, device): loss_function="l2", delay_qvalue=delay_qvalue, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -2035,7 +2042,8 @@ def test_redq_shared(self, delay_qvalue, num_qvalue, device): @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) - def test_redq_batched(self, delay_qvalue, num_qvalue, device): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_redq_batched(self, delay_qvalue, num_qvalue, device, td_est): torch.manual_seed(self.seed) td = self._create_mock_data_redq(device=device) @@ -2050,6 +2058,12 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device): loss_function="l2", delay_qvalue=delay_qvalue, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) loss_class_deprec = ( REDQLoss_deprecated if not delay_qvalue else DoubleREDQLoss_deprecated @@ -2060,6 +2074,12 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device): num_qvalue_nets=num_qvalue, loss_function="l2", ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn_deprec.make_value_function(td_est) + return + if td_est is not None: + loss_fn_deprec.make_value_function(td_est) td_clone1 = td.clone() td_clone2 = td.clone() @@ -2289,7 +2309,8 @@ def _create_seq_mock_data_ppo( @pytest.mark.parametrize("gradient_mode", (True, False)) @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) - def test_ppo(self, loss_class, device, gradient_mode, advantage): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_ppo(self, loss_class, device, gradient_mode, advantage, td_est): torch.manual_seed(self.seed) td = self._create_seq_mock_data_ppo(device=device) @@ -2315,6 +2336,10 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage): loss_fn = loss_class(actor, value, 
loss_critic_type="l2") if advantage is not None: advantage(td) + else: + if td_est is not None: + loss_fn.make_value_function(td_est) + loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2560,7 +2585,8 @@ def _create_seq_mock_data_a2c( @pytest.mark.parametrize("gradient_mode", (True, False)) @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) - def test_a2c(self, device, gradient_mode, advantage): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_a2c(self, device, gradient_mode, advantage, td_est): torch.manual_seed(self.seed) td = self._create_seq_mock_data_a2c(device=device) @@ -2597,6 +2623,8 @@ def test_a2c(self, device, gradient_mode, advantage): td = td.exclude(loss_fn.value_target_key) if advantage is not None: advantage(td) + else: + loss_fn.make_value_function(td_est) loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2696,7 +2724,8 @@ class TestReinforce: @pytest.mark.parametrize("delay_value", [True, False]) @pytest.mark.parametrize("gradient_mode", [True, False]) @pytest.mark.parametrize("advantage", ["gae", "td", "td_lambda", None]) - def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_reinforce_value_net(self, advantage, gradient_mode, delay_value, td_est): n_obs = 3 n_act = 5 batch = 4 @@ -2758,6 +2787,8 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value): params = TensorDict(value_net.state_dict(), []).unflatten_keys(".") if advantage is not None: advantage(td, params=params) + else: + loss_fn.make_value_function(td_est) loss_td = loss_fn(td) autograd.grad( loss_td.get("loss_actor"), @@ -3113,7 +3144,8 @@ def test_dreamer_env(self, device, imagination_horizon, discount_loss): @pytest.mark.parametrize("imagination_horizon", [3, 5]) @pytest.mark.parametrize("discount_loss", [True, False]) - def test_dreamer_actor(self, device, imagination_horizon, discount_loss): + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + def test_dreamer_actor(self, device, imagination_horizon, discount_loss, td_est): tensordict = self._create_actor_data(2, 3, 10, 5).to(device) mb_env = self._create_mb_env(10, 5).to(device) actor_model = self._create_actor_model(10, 5).to(device) @@ -3125,6 +3157,12 @@ def test_dreamer_actor(self, device, imagination_horizon, discount_loss): imagination_horizon=imagination_horizon, discount_loss=discount_loss, ) + if td_est is ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_module.make_value_function(td_est) + return + if td_est is not None: + loss_module.make_value_function(td_est) loss_td, fake_data = loss_module(tensordict) assert not fake_data.requires_grad assert fake_data.shape == torch.Size([tensordict.numel(), imagination_horizon]) @@ -3288,12 +3326,14 @@ def _create_seq_mock_data_iql( @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("temperature", [0.0, 0.1, 1.0, 10.0]) @pytest.mark.parametrize("expectile", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) def test_iql( self, num_qvalue, device, temperature, expectile, + td_est, ): torch.manual_seed(self.seed) @@ -3312,6 +3352,12 @@ def test_iql( expectile=expectile, loss_function="l2", ) + if td_est is 
ValueFunctions.GAE: + with pytest.raises(NotImplementedError): + loss_fn.make_value_function(td_est) + return + if td_est is not None: + loss_fn.make_value_function(td_est) with _check_td_steady(td): loss = loss_fn(td) diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index 980c0e3b8c9..6bb15af4052 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -23,7 +23,7 @@ from ..envs.utils import set_exploration_mode from .common import LossModule -from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate class DDPGLoss(LossModule): diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 97b7aab7e5e..40cdfe1d687 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -22,7 +22,7 @@ ValueFunctions, ) from torchrl.objectives.common import LossModule -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -289,7 +289,9 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_network=None, value_key=value_key, **hp ) elif value_type == ValueFunctions.GAE: - self._value_function = GAE(value_network=None, value_key=value_key, **hp) + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) elif value_type == ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( value_network=None, value_key=value_key, **hp diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 66ab0aba5c4..b8d385f2b8e 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -15,7 +15,7 @@ from .common import LossModule from .utils import default_value_kwargs, distance_loss, ValueFunctions -from .value import GAE, TDLambdaEstimate +from .value import TDLambdaEstimate from .value.advantages import TD0Estimate, TD1Estimate diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 47f13e4ae90..67c66b93fa8 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -17,7 +17,7 @@ hold_out_net, ValueFunctions, ) -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate class DreamerModelLoss(LossModule): @@ -240,11 +240,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key=value_key, ) elif value_type is ValueFunctions.GAE: - self._value_function = GAE( - **hp, - value_network=value_net, - value_target_key="value_target", - value_key=value_key, + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type is ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index f817af1b904..4da4027d5b8 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -15,7 +15,7 @@ from ..envs.utils import set_exploration_mode from .common import LossModule -from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -255,11 +255,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key=value_key, ) elif value_type is ValueFunctions.GAE: - self._value_function = GAE( - **hp, - value_network=value_net, - value_target_key="value_target", - value_key=value_key, + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type is ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index a48e0d78580..1a9ecb45955 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -17,7 +17,7 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap @@ -327,7 +327,9 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_network=None, value_key=value_key, **hp ) elif value_type == ValueFunctions.GAE: - self._value_function = GAE(value_network=None, value_key=value_key, **hp) + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." 
+ ) elif value_type == ValueFunctions.TDLambda: self._value_function = TDLambdaEstimate( value_network=None, value_key=value_key, **hp diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 5a8bdb25aef..ad7263c2fda 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -19,7 +19,7 @@ from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule -from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index 61dcdbbfcb1..b6c96a253e4 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -12,7 +12,7 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: from functorch import vmap From 7855ef6ab73f3784de68aeb998f995e1af22056c Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 09:05:57 +0100 Subject: [PATCH 20/89] smooth deprecation --- test/test_cost.py | 4 ++-- torchrl/objectives/a2c.py | 15 +++++++++++++-- torchrl/objectives/ddpg.py | 9 +++++++++ torchrl/objectives/deprecated.py | 10 +++++++++- torchrl/objectives/dqn.py | 16 ++++++++++++++-- torchrl/objectives/dreamer.py | 23 ++++++++++++++++++++--- torchrl/objectives/iql.py | 15 +++++++++++++-- torchrl/objectives/ppo.py | 19 +++++++++++++++++-- torchrl/objectives/redq.py | 15 +++++++++++++-- torchrl/objectives/reinforce.py | 15 +++++++++++++-- torchrl/objectives/sac.py | 15 +++++++++++++-- torchrl/objectives/td3.py | 15 +++++++++++++-- torchrl/objectives/utils.py | 6 ++++++ 13 files changed, 155 insertions(+), 22 deletions(-) diff --git a/test/test_cost.py b/test/test_cost.py index 4477feee0da..dc763f37f66 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -2623,7 +2623,7 @@ def test_a2c(self, device, gradient_mode, advantage, td_est): td = td.exclude(loss_fn.value_target_key) if advantage is not None: advantage(td) - else: + elif td_est is not None: loss_fn.make_value_function(td_est) loss = loss_fn(td) loss_critic = loss["loss_critic"] @@ -2787,7 +2787,7 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value, td_est params = TensorDict(value_net.state_dict(), []).unflatten_keys(".") if advantage is not None: advantage(td, params=params) - else: + elif td_est is not None: loss_fn.make_value_function(td_est) loss_td = loss_fn(td) autograd.grad( diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index 644324416f3..a8bdade55b6 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
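The user-facing effect of the estimator hunks above can be sketched as follows; ``actor`` and ``qvalue`` stand for already-built modules, as in the tests, and the names are the ones used at this point of the series (they are renamed to ``ValueEstimators``/``make_value_estimator`` a few patches further down):

    >>> from torchrl.objectives import REDQLoss
    >>> from torchrl.objectives.utils import ValueFunctions
    >>> loss_fn = REDQLoss(actor_network=actor, qvalue_network=qvalue, loss_function="l2")
    >>> loss_fn.make_value_function(ValueFunctions.TDLambda, gamma=0.99, lmbda=0.95)  # supported
    >>> loss_fn.make_value_function(ValueFunctions.GAE)  # now raises NotImplementedError for this loss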
- +import warnings from typing import Tuple import torch @@ -11,7 +11,12 @@ from torch import distributions as d from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate @@ -72,6 +77,7 @@ def __init__( entropy_coef: float = 0.01, critic_coef: float = 1.0, loss_critic_type: str = "smooth_l1", + gamma: float = None, ): super().__init__() self.convert_to_functional( @@ -88,6 +94,9 @@ def __init__( self.register_buffer( "critic_coef", torch.tensor(critic_coef, device=self.device) ) + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma self.loss_critic_type = loss_critic_type def reset(self) -> None: @@ -165,6 +174,8 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma value_key = "state_value" if value_type == ValueFunctions.TD1: self._value_function = TD1Estimate( diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index 6bb15af4052..f8c6e83a621 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -5,6 +5,7 @@ from __future__ import annotations +import warnings from copy import deepcopy from typing import Tuple @@ -15,6 +16,7 @@ from torchrl.modules.tensordict_module.actors import ActorCriticWrapper from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, hold_out_params, @@ -49,6 +51,7 @@ def __init__( loss_function: str = "l2", delay_actor: bool = False, delay_value: bool = False, + gamma: float = None, ) -> None: super().__init__() self.delay_actor = delay_actor @@ -78,6 +81,10 @@ def __init__( self.loss_funtion = loss_function + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma + def forward(self, input_tensordict: TensorDictBase) -> TensorDict: """Computes the DDPG losses given a tensordict sampled from the replay buffer. @@ -175,6 +182,8 @@ def _loss_value( def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_action_value" if value_type == ValueFunctions.TD1: diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 40cdfe1d687..0a518bff304 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -2,8 +2,8 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
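A minimal sketch of the deprecation path introduced above, with hypothetical ``actor`` and ``value`` modules:

    >>> from torchrl.objectives import DDPGLoss
    >>> from torchrl.objectives.utils import ValueFunctions
    >>> loss = DDPGLoss(actor, value, gamma=0.99)     # triggers _GAMMA_LMBDA_DEPREC_WARNING
    >>> loss.make_value_function(ValueFunctions.TD0)  # the stored gamma is re-injected here
    >>> # non-deprecated form: pass gamma to the value-function constructor instead
    >>> loss = DDPGLoss(actor, value)
    >>> loss.make_value_function(ValueFunctions.TD0, gamma=0.99)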
- import math +import warnings from numbers import Number from typing import Tuple, Union @@ -22,6 +22,7 @@ ValueFunctions, ) from torchrl.objectives.common import LossModule +from torchrl.objectives.utils import _GAMMA_LMBDA_DEPREC_WARNING from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: @@ -94,6 +95,7 @@ def __init__( target_entropy: Union[str, Number] = "auto", delay_qvalue: bool = True, gSDE: bool = False, + gamma: float = None, ): if not _has_functorch: raise ImportError("Failed to import functorch.") from FUNCTORCH_ERR @@ -156,6 +158,10 @@ def __init__( ) self.gSDE = gSDE + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma + @property def alpha(self): # keep alpha is a reasonable range @@ -277,6 +283,8 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" # we do not need a value network bc the next state value is already passed diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index b8d385f2b8e..01dbcc98182 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - +import warnings from typing import Union import torch @@ -14,7 +14,12 @@ from torchrl.modules.tensordict_module.common import ensure_tensordict_compatible from .common import LossModule -from .utils import default_value_kwargs, distance_loss, ValueFunctions +from .utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from .value import TDLambdaEstimate from .value.advantages import TD0Estimate, TD1Estimate @@ -39,6 +44,7 @@ def __init__( loss_function: str = "l2", priority_key: str = "td_error", delay_value: bool = False, + gamma: float = None, ) -> None: super().__init__() @@ -59,8 +65,14 @@ def __init__( self.priority_key = priority_key self.action_space = self.value_network.action_space + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma + def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueFunctions.TD1: self._value_function = TD1Estimate( diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 67c66b93fa8..146686ed03c 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
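The DQN counterpart follows the same recipe; the sketch below (with an assumed Q-value ``actor``) also shows that hyperparameters passed to ``make_value_function`` are merged with the stored ``gamma``:

    >>> from torchrl.objectives import DQNLoss
    >>> from torchrl.objectives.utils import ValueFunctions
    >>> loss_fn = DQNLoss(actor, loss_function="l2", gamma=0.99)  # deprecation warning
    >>> loss_fn.make_value_function(ValueFunctions.TDLambda, lmbda=0.95)
    >>> # gamma comes from the (deprecated) constructor argument, lmbda from the call above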
+import warnings from typing import Optional, Tuple import torch @@ -12,6 +13,7 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, hold_out_net, @@ -163,6 +165,8 @@ def __init__( *, imagination_horizon: int = 15, discount_loss: bool = False, # for consistency with paper + gamma: int = None, + lmbda: int = None, ): super().__init__() self.actor_model = actor_model @@ -170,6 +174,12 @@ def __init__( self.model_based_env = model_based_env self.imagination_horizon = imagination_horizon self.discount_loss = discount_loss + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma + if lmbda is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.lmbda = lmbda def forward(self, tensordict: TensorDict) -> Tuple[TensorDict, TensorDict]: with torch.no_grad(): @@ -224,6 +234,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_net = None value_key = "state_value" hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueFunctions.TD1: self._value_function = TD1Estimate( @@ -240,10 +252,14 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key=value_key, ) elif value_type is ValueFunctions.GAE: + if hasattr(self, "lmbda"): + hp["lmbda"] = self.lmbda raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type is ValueFunctions.TDLambda: + if hasattr(self, "lmbda"): + hp["lmbda"] = self.lmbda self._value_function = TDLambdaEstimate( **hp, value_network=value_net, @@ -264,10 +280,11 @@ class DreamerValueLoss(LossModule): Args: value_model (TensorDictModule): the value model. - value_loss (str, optional): the loss to use for the value loss. Default: "l2". - gamma (float, optional): the gamma discount factor. Default: 0.99. + value_loss (str, optional): the loss to use for the value loss. + Default: ``"l2"``. discount_loss (bool, optional): if ``True``, the loss is discounted with a gamma discount factor. Default: False. + gamma (float, optional): the gamma discount factor. Default: ``0.99``. """ @@ -275,8 +292,8 @@ def __init__( self, value_model: TensorDictModule, value_loss: Optional[str] = None, - gamma: int = 0.99, discount_loss: bool = False, # for consistency with paper + gamma: int = 0.99, ): super().__init__() self.value_model = value_model diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index 4da4027d5b8..26d7c1a8c2f 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
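For the Dreamer actor loss both ``gamma`` and ``lmbda`` get the same treatment; a sketch, assuming ``actor_model``, ``value_model`` and ``model_based_env`` are built as in the tests and that the class is the existing ``DreamerActorLoss``:

    >>> from torchrl.objectives.dreamer import DreamerActorLoss
    >>> from torchrl.objectives.utils import ValueFunctions
    >>> loss = DreamerActorLoss(
    ...     actor_model, value_model, model_based_env, gamma=0.99, lmbda=0.95
    ... )  # warns once per deprecated argument
    >>> loss.make_value_function(ValueFunctions.TDLambda)  # stored gamma and lmbda are re-used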
- +import warnings from typing import Optional, Tuple import torch @@ -11,7 +11,12 @@ from torch import Tensor from torchrl.modules import ProbabilisticActor -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from ..envs.utils import set_exploration_mode from .common import LossModule @@ -65,6 +70,7 @@ def __init__( loss_function: str = "smooth_l1", temperature: float = 1.0, expectile: float = 0.5, + gamma: float = None, ) -> None: if not _has_functorch: raise ImportError("Failed to import functorch.") from FUNCTORCH_ERROR @@ -105,6 +111,9 @@ def __init__( self.priority_key = priority_key self.loss_function = loss_function + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma @property def device(self) -> torch.device: @@ -239,6 +248,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key = "state_value" hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueFunctions.TD1: self._value_function = TD1Estimate( diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index 477cf3f5765..3bb97d9a371 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -2,8 +2,8 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - import math +import warnings from typing import Tuple import torch @@ -11,7 +11,12 @@ from tensordict.tensordict import TensorDict, TensorDictBase from torch import distributions as d -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from .common import LossModule from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate @@ -101,6 +106,7 @@ def __init__( critic_coef: float = 1.0, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = False, + gamma: float = None, ): super().__init__() self.convert_to_functional( @@ -121,6 +127,9 @@ def __init__( ) self.loss_critic_type = loss_critic_type self.normalize_advantage = normalize_advantage + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma def reset(self) -> None: pass @@ -206,6 +215,8 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" if value_type == ValueFunctions.TD1: @@ -300,6 +311,7 @@ def __init__( critic_coef: float = 1.0, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = True, + gamma: float = None, **kwargs, ): super(ClipPPOLoss, self).__init__( @@ -312,6 +324,7 @@ def __init__( critic_coef=critic_coef, loss_critic_type=loss_critic_type, normalize_advantage=normalize_advantage, + gamma=gamma, **kwargs, ) self.register_buffer("clip_epsilon", torch.tensor(clip_epsilon)) @@ -457,6 +470,7 @@ def __init__( critic_coef: float = 1.0, loss_critic_type: str = "smooth_l1", normalize_advantage: bool = True, + gamma: float = None, **kwargs, ): super(KLPENPPOLoss, self).__init__( @@ -469,6 +483,7 @@ def __init__( 
critic_coef=critic_coef, loss_critic_type=loss_critic_type, normalize_advantage=normalize_advantage, + gamma=gamma, **kwargs, ) diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index 1a9ecb45955..fe717c77cd7 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -2,8 +2,8 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - import math +import warnings from numbers import Number from typing import Union @@ -16,7 +16,12 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: @@ -89,6 +94,7 @@ def __init__( target_entropy: Union[str, Number] = "auto", delay_qvalue: bool = True, gSDE: bool = False, + gamma: float = None, ): if not _has_functorch: raise ImportError("Failed to import functorch.") from FUNCTORCH_ERR @@ -152,6 +158,9 @@ def __init__( "target_entropy", torch.tensor(target_entropy, device=device) ) self.gSDE = gSDE + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma @property def alpha(self): @@ -315,6 +324,8 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" # we do not need a value network bc the next state value is already passed diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index 1079c4eb2be..2c972291455 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
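The PPO family differs from the Q-value losses above in that GAE remains available; the constructor ``gamma`` is only kept as a deprecated convenience (sketch with assumed ``actor`` and ``critic`` modules):

    >>> from torchrl.objectives import ClipPPOLoss
    >>> from torchrl.objectives.utils import ValueFunctions
    >>> loss = ClipPPOLoss(actor, critic, gamma=0.99)             # deprecation warning
    >>> loss.make_value_function(ValueFunctions.GAE, lmbda=0.95)  # GAE is still supported here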
- +import warnings from typing import Optional import torch @@ -10,7 +10,12 @@ from tensordict.nn import ProbabilisticTensorDictSequential, TensorDictModule from tensordict.tensordict import TensorDict, TensorDictBase from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate @@ -72,6 +77,7 @@ def __init__( advantage_key: str = "advantage", value_target_key: str = "value_target", loss_critic_type: str = "smooth_l1", + gamma: float = None, ) -> None: super().__init__() @@ -95,6 +101,9 @@ def __init__( create_target_params=self.delay_value, compare_against=list(actor.parameters()), ) + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma def forward(self, tensordict: TensorDictBase) -> TensorDictBase: advantage = tensordict.get(self.advantage_key, None) @@ -146,6 +155,8 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" if value_type == ValueFunctions.TD1: diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index ad7263c2fda..f24d1f13b3c 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -2,8 +2,8 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - import math +import warnings from numbers import Number from typing import Optional, Tuple, Union @@ -15,7 +15,12 @@ from torchrl.modules import ProbabilisticActor from torchrl.modules.tensordict_module.actors import ActorCriticWrapper -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule @@ -97,6 +102,7 @@ def __init__( delay_actor: bool = False, delay_qvalue: bool = False, delay_value: bool = False, + gamma: float = None, ) -> None: if not _has_functorch: raise ImportError("Failed to import functorch.") from FUNCTORCH_ERROR @@ -179,6 +185,9 @@ def __init__( self.actor_network, self.value_network ) make_functional(self.actor_critic) + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma def make_value_function(self, value_type: ValueFunctions, **hyperparams): if self._version == 1: @@ -725,6 +734,8 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key = "state_value" hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma if value_type is ValueFunctions.TD1: self._value_function = TD1Estimate( **hp, diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index b6c96a253e4..e3c56dbb41a 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
- +import warnings import torch from tensordict.nn import TensorDictModule @@ -11,7 +11,12 @@ from torchrl.envs.utils import set_exploration_mode, step_mdp from torchrl.objectives.common import LossModule -from torchrl.objectives.utils import default_value_kwargs, distance_loss, ValueFunctions +from torchrl.objectives.utils import ( + _GAMMA_LMBDA_DEPREC_WARNING, + default_value_kwargs, + distance_loss, + ValueFunctions, +) from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate try: @@ -57,6 +62,7 @@ def __init__( loss_function: str = "smooth_l1", delay_actor: bool = False, delay_qvalue: bool = False, + gamma: float = None, ) -> None: if not _has_functorch: raise ImportError( @@ -88,6 +94,9 @@ def __init__( self.policy_noise = policy_noise self.noise_clip = noise_clip self.max_action = actor_network.spec["action"].space.maximum.max().item() + if gamma is not None: + warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) + self.gamma = gamma def forward(self, tensordict: TensorDictBase) -> TensorDictBase: obs_keys = self.actor_network.in_keys @@ -214,6 +223,8 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: def make_value_function(self, value_type: ValueFunctions, **hyperparams): hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_action_value" # we do not need a value network bc the next state value is already passed diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index bcdc38434ab..09edf449e08 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -15,6 +15,12 @@ from torchrl.envs.utils import step_mdp +_GAMMA_LMBDA_DEPREC_WARNING = ( + "Passing gamma / lambda parameters through the loss constructor " + "is deprecated and will be removed soon. To customize your value function, " + "run `loss_module.make_value_function(ValueFunctions., gamma=val)`." +) + class ValueFunctions(Enum): """Value function enumerator for custom-built estimators. From 55361cc26681d0e76ae0661b458df9929b5f8b29 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 09:16:06 +0100 Subject: [PATCH 21/89] amend --- torchrl/objectives/common.py | 4 ++-- torchrl/objectives/td3.py | 28 ++++++++++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 1cbddfb4deb..9f1605ebb4d 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -54,7 +54,7 @@ class LossModule(nn.Module): The value estimator can be changed using the :meth:`~.make_value_function` method. """ - default_value_type: ValueFunctions = None + default_value_function: ValueFunctions = None def __init__(self): super().__init__() @@ -383,7 +383,7 @@ def _default_value_function(self): from :obj:`torchrl.objectives.utils.DEFAULT_VALUE_FUN_PARAMS`. """ - self.make_value_function(self.default_value_type) + self.make_value_function(self.default_value_function) def make_value_function(self, value_type: ValueFunctions, **hyperparams): """Value-function constructor. diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index e3c56dbb41a..1ef911a8891 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -34,21 +34,29 @@ class TD3Loss(LossModule): Args: actor_network (TensorDictModule): the actor to be trained - qvalue_network (TensorDictModule): a single Q-value network that will be multiplicated as many times as needed. 
- num_qvalue_nets (int, optional): Number of Q-value networks to be trained. Default is 10. - policy_noise (float, optional): Standard deviation for the target policy action noise. Default is 0.2. - noise_clip (float, optional): Clipping range value for the sampled target policy action noise. Default is 0.5. - priority_key (str, optional): Key where to write the priority value for prioritized replay buffers. Default is + qvalue_network (TensorDictModule): a single Q-value network that will + be multiplicated as many times as needed. + num_qvalue_nets (int, optional): Number of Q-value networks to be + trained. Default is ``10``. + policy_noise (float, optional): Standard deviation for the target + policy action noise. Default is ``0.2``. + noise_clip (float, optional): Clipping range value for the sampled + target policy action noise. Default is ``0.5``. + priority_key (str, optional): Key where to write the priority value + for prioritized replay buffers. Default is `"td_error"`. - loss_function (str, optional): loss function to be used for the Q-value. Can be one of `"smooth_l1"`, "l2", - "l1", Default is "smooth_l1". - delay_actor (bool, optional): whether to separate the target actor networks from the actor networks used for + loss_function (str, optional): loss function to be used for the Q-value. + Can be one of ``"smooth_l1"``, ``"l2"``, + ``"l1"``, Default is ``"smooth_l1"``. + delay_actor (bool, optional): whether to separate the target actor + networks from the actor networks used for data collection. Default is ``False``. - delay_qvalue (bool, optional): Whether to separate the target Q value networks from the Q value networks used + delay_qvalue (bool, optional): Whether to separate the target Q value + networks from the Q value networks used for data collection. Default is ``False``. """ - default_value_type = ValueFunctions.TD0 + default_value_function = ValueFunctions.TD0 def __init__( self, From fd874576798ea91a6fdb6aef73da2e5f97ceb01a Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 11:51:25 +0100 Subject: [PATCH 22/89] amend --- docs/source/index.rst | 2 +- docs/source/reference/objectives.rst | 83 ++++++++++++++- test/test_cost.py | 138 ++++++++++++------------- test/test_modules.py | 4 +- torchrl/modules/planners/mppi.py | 4 +- torchrl/objectives/__init__.py | 2 +- torchrl/objectives/a2c.py | 24 ++--- torchrl/objectives/common.py | 32 +++--- torchrl/objectives/ddpg.py | 22 ++-- torchrl/objectives/deprecated.py | 22 ++-- torchrl/objectives/dqn.py | 38 +++---- torchrl/objectives/dreamer.py | 22 ++-- torchrl/objectives/iql.py | 22 ++-- torchrl/objectives/ppo.py | 34 +++--- torchrl/objectives/redq.py | 22 ++-- torchrl/objectives/reinforce.py | 26 ++--- torchrl/objectives/sac.py | 40 +++---- torchrl/objectives/td3.py | 22 ++-- torchrl/objectives/utils.py | 18 ++-- torchrl/objectives/value/__init__.py | 5 +- torchrl/objectives/value/advantages.py | 33 ++++-- 21 files changed, 356 insertions(+), 259 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index c8463c53909..d379b418298 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -73,7 +73,7 @@ Knowledge Base ============== .. 
toctree:: - :maxdepth: 1 + :maxdepth: 2 reference/knowledge_base diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index ff178fba548..01a7379e298 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -3,6 +3,81 @@ torchrl.objectives package ========================== +TorchRL provides a series of losses to use in your training scripts. +The aim is to have losses that are easily reusable/swappable and that have +a simple signature. + +The main characteristics of TorchRL losses are: + +- they are stateful objects: they contain a copy of the trainable parameters + such that ``loss_module.parameters()`` gives whatever is needed to train the + algorithm. +- They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` + method will receive a tensordict as input that contains all the necessary + information to return a loss value. +- They output a :class:`tensordict.TensorDict` instance with the loss values + written under a ``"loss_`` where ``smth`` is a string describing the + loss. Additional keys in the tensordict may be useful metrics to log during + training time. + .. note:: + The reason we return independent losses is to let the user use a different + optimizer for different sets of parameters for instance. Summing the losses + can be simply done via ``sum(loss for key, loss in loss_vals.items() if key.startswith("loss_")``. + +Training value functions +------------------------ + +TorchRL provides a range of **value estimators** such as TD(0), TD(1), TD(:math:`\lambda`) +and GAE. +In a nutshell, a value estimator is a function of data (mostly +rewards and done states) and a state value (ie. the value +returned by a function that is fit to estimate state-values). +To learn more about value estimators, check the introduction to RL from `Sutton +and Barto `_, +in particular the chapters about value iteration and TD learning. +It gives a somewhat biased estimation of the discounted return following a state +or a state-action pair based on data and proxy maps. These estimators are +used in two contexts: + +- To train the value network to learn the "true" state value (or state-action + value) map, one needs a target value to fit it to. The better (less bias, + less variance) the estimator, the better the value network will be, which in + turn can speed up the policy training significantly. Typically, the value + network loss will look like: + + >>> value = value_network(states) + >>> target_value = value_estimator(rewards, done, value_network(next_state)) + >>> value_net_loss = (value - target_value).pow(2).mean() + +- Computing an "advantage" signal for policy-optimization. The advantage is + the delta between the value estimate (from the estimator, ie from "real" data) + and the output of the value network (ie the proxy to this value). A positive + advantage can be seen as a signal that the policy actually performed better + than expected, thereby signaling that there is room for improvement if that + trajectory is to be taken as example. Conversely, a negative advantage signifies + that the policy underperformed compared to what was to be expected. + +Thins are not always as easy as in the example above and the formula to compute +the value estimator or the advantage may be slightly more intricate than this. +To help users flexibly use one or another value estimator, we provide a simple +API to change it on-the-fly. 
Here is an example with DQN, but all modules will +follow a similar structure: + + >>> from torchrl.objectives import DQNLoss, ValueEstimators + >>> loss_module = DQNLoss(actor) + >>> kwargs = {"gamma": 0.9, "lmbda": 0.9} + >>> loss_module.make_value_estimator(ValueEstimators.TDLambda, **kwargs) + +The :class:`torchrl.objectives.ValueEstimators` class enumerates the value +estimators to choose from. This makes it easy for the users to rely on +auto-completion to make their choice. + +.. autosummary:: + :toctree: generated/ + :template: rl_template_noinherit.rst + + LossModule + DQN --- @@ -108,10 +183,10 @@ Returns :toctree: generated/ :template: rl_template_noinherit.rst - ValueFunctionBase - TD0Estimate - TD1Estimate - TDLambdaEstimate + ValueEstimatorBase + TD0Estimator + TD1Estimator + TDLambdaEstimator GAE functional.generalized_advantage_estimate functional.vec_generalized_advantage_estimate diff --git a/test/test_cost.py b/test/test_cost.py index dc763f37f66..0aa87bcb2c4 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -91,9 +91,9 @@ HardUpdate, hold_out_net, SoftUpdate, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value.advantages import GAE, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value.advantages import GAE, TD1Estimator, TDLambdaEstimator from torchrl.objectives.value.functional import ( generalized_advantage_estimate, td_advantage_estimate, @@ -299,7 +299,7 @@ def _create_seq_mock_data_dqn( @pytest.mark.parametrize( "action_spec_type", ("nd_bounded", "one_hot", "categorical") ) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_dqn(self, delay_value, device, action_spec_type, td_est): torch.manual_seed(self.seed) actor = self._create_mock_actor( @@ -309,12 +309,12 @@ def test_dqn(self, delay_value, device, action_spec_type, td_est): action_spec_type=action_spec_type, device=device ) loss_fn = DQNLoss(actor, loss_function="l2", delay_value=delay_value) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) assert loss_fn.priority_key in td.keys() @@ -399,7 +399,7 @@ def test_dqn_batcher(self, n, delay_value, device, action_spec_type, gamma=0.9): @pytest.mark.parametrize( "action_spec_type", ("mult_one_hot", "one_hot", "categorical") ) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_distributional_dqn( self, atoms, delay_value, device, action_spec_type, td_est, gamma=0.9 ): @@ -413,12 +413,12 @@ def test_distributional_dqn( ).to(device) loss_fn = DistributionalDQNLoss(actor, gamma=gamma, delay_value=delay_value) - if td_est not in (None, ValueFunctions.TD0): + if td_est not in (None, ValueEstimators.TD0): with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return elif td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -548,7 +548,7 @@ def _create_seq_mock_data_ddpg( ) @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("delay_actor,delay_value", [(False, False), (True, True)]) 
- @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_ddpg(self, delay_actor, delay_value, device, td_est): torch.manual_seed(self.seed) actor = self._create_mock_actor(device=device) @@ -561,12 +561,12 @@ def test_ddpg(self, delay_actor, delay_value, device, td_est): delay_actor=delay_actor, delay_value=delay_value, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -788,7 +788,7 @@ def _create_seq_mock_data_td3( ) @pytest.mark.parametrize("policy_noise", [0.1, 1.0]) @pytest.mark.parametrize("noise_clip", [0.1, 1.0]) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_td3( self, delay_actor, @@ -811,12 +811,12 @@ def test_td3( delay_actor=delay_actor, delay_qvalue=delay_qvalue, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -1074,7 +1074,7 @@ def _create_seq_mock_data_sac( @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_sac( self, delay_value, @@ -1115,12 +1115,12 @@ def test_sac( **kwargs, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -1491,7 +1491,7 @@ def _create_seq_mock_data_sac( @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("target_entropy_weight", [0.01, 0.5, 0.99]) @pytest.mark.parametrize("target_entropy", ["auto", 1.0, 0.1, 0.0]) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_discrete_sac( self, delay_qvalue, @@ -1522,12 +1522,12 @@ def test_discrete_sac( loss_function="l2", **kwargs, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -1847,7 +1847,7 @@ def _create_seq_mock_data_redq( @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_redq(self, delay_qvalue, num_qvalue, 
device, td_est): torch.manual_seed(self.seed) @@ -1863,12 +1863,12 @@ def test_redq(self, delay_qvalue, num_qvalue, device, td_est): loss_function="l2", delay_qvalue=delay_qvalue, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -2042,7 +2042,7 @@ def test_redq_shared(self, delay_qvalue, num_qvalue, device): @pytest.mark.parametrize("delay_qvalue", (True, False)) @pytest.mark.parametrize("num_qvalue", [1, 2, 4, 8]) @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_redq_batched(self, delay_qvalue, num_qvalue, device, td_est): torch.manual_seed(self.seed) @@ -2058,12 +2058,12 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device, td_est): loss_function="l2", delay_qvalue=delay_qvalue, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) loss_class_deprec = ( REDQLoss_deprecated if not delay_qvalue else DoubleREDQLoss_deprecated @@ -2074,12 +2074,12 @@ def test_redq_batched(self, delay_qvalue, num_qvalue, device, td_est): num_qvalue_nets=num_qvalue, loss_function="l2", ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn_deprec.make_value_function(td_est) + loss_fn_deprec.make_value_estimator(td_est) return if td_est is not None: - loss_fn_deprec.make_value_function(td_est) + loss_fn_deprec.make_value_estimator(td_est) td_clone1 = td.clone() td_clone2 = td.clone() @@ -2309,7 +2309,7 @@ def _create_seq_mock_data_ppo( @pytest.mark.parametrize("gradient_mode", (True, False)) @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_ppo(self, loss_class, device, gradient_mode, advantage, td_est): torch.manual_seed(self.seed) td = self._create_seq_mock_data_ppo(device=device) @@ -2321,11 +2321,11 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage, td_est): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TD1Estimate( + advantage = TD1Estimator( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage is None: @@ -2338,7 +2338,7 @@ def test_ppo(self, loss_class, device, gradient_mode, advantage, td_est): advantage(td) else: if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) loss = loss_fn(td) loss_critic = loss["loss_critic"] @@ -2387,12 +2387,12 @@ def test_ppo_shared(self, loss_class, device, advantage): value_network=value, ) elif advantage == "td": - advantage = TD1Estimate( + advantage = 
TD1Estimator( gamma=0.9, value_network=value, ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=value, @@ -2461,11 +2461,11 @@ def test_ppo_diff(self, loss_class, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TD1Estimate( + advantage = TD1Estimator( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage is None: @@ -2585,7 +2585,7 @@ def _create_seq_mock_data_a2c( @pytest.mark.parametrize("gradient_mode", (True, False)) @pytest.mark.parametrize("advantage", ("gae", "td", "td_lambda", None)) @pytest.mark.parametrize("device", get_available_devices()) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_a2c(self, device, gradient_mode, advantage, td_est): torch.manual_seed(self.seed) td = self._create_seq_mock_data_a2c(device=device) @@ -2597,11 +2597,11 @@ def test_a2c(self, device, gradient_mode, advantage, td_est): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TD1Estimate( + advantage = TD1Estimator( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage is None: @@ -2624,7 +2624,7 @@ def test_a2c(self, device, gradient_mode, advantage, td_est): if advantage is not None: advantage(td) elif td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) loss = loss_fn(td) loss_critic = loss["loss_critic"] loss_objective = loss["loss_objective"] + loss.get("loss_entropy", 0.0) @@ -2673,11 +2673,11 @@ def test_a2c_diff(self, device, gradient_mode, advantage): gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td": - advantage = TD1Estimate( + advantage = TD1Estimator( gamma=0.9, value_network=value, differentiable=gradient_mode ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=value, differentiable=gradient_mode ) elif advantage is None: @@ -2724,7 +2724,7 @@ class TestReinforce: @pytest.mark.parametrize("delay_value", [True, False]) @pytest.mark.parametrize("gradient_mode", [True, False]) @pytest.mark.parametrize("advantage", ["gae", "td", "td_lambda", None]) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_reinforce_value_net(self, advantage, gradient_mode, delay_value, td_est): n_obs = 3 n_act = 5 @@ -2748,13 +2748,13 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value, td_est differentiable=gradient_mode, ) elif advantage == "td": - advantage = TD1Estimate( + advantage = TD1Estimator( gamma=gamma, value_network=get_functional(value_net), differentiable=gradient_mode, ) elif advantage == "td_lambda": - advantage = TDLambdaEstimate( + advantage = TDLambdaEstimator( gamma=0.9, lmbda=0.9, value_network=get_functional(value_net), @@ -2788,7 +2788,7 @@ def test_reinforce_value_net(self, advantage, 
gradient_mode, delay_value, td_est if advantage is not None: advantage(td, params=params) elif td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) loss_td = loss_fn(td) autograd.grad( loss_td.get("loss_actor"), @@ -3144,7 +3144,7 @@ def test_dreamer_env(self, device, imagination_horizon, discount_loss): @pytest.mark.parametrize("imagination_horizon", [3, 5]) @pytest.mark.parametrize("discount_loss", [True, False]) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_dreamer_actor(self, device, imagination_horizon, discount_loss, td_est): tensordict = self._create_actor_data(2, 3, 10, 5).to(device) mb_env = self._create_mb_env(10, 5).to(device) @@ -3157,12 +3157,12 @@ def test_dreamer_actor(self, device, imagination_horizon, discount_loss, td_est) imagination_horizon=imagination_horizon, discount_loss=discount_loss, ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_module.make_value_function(td_est) + loss_module.make_value_estimator(td_est) return if td_est is not None: - loss_module.make_value_function(td_est) + loss_module.make_value_estimator(td_est) loss_td, fake_data = loss_module(tensordict) assert not fake_data.requires_grad assert fake_data.shape == torch.Size([tensordict.numel(), imagination_horizon]) @@ -3326,7 +3326,7 @@ def _create_seq_mock_data_iql( @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("temperature", [0.0, 0.1, 1.0, 10.0]) @pytest.mark.parametrize("expectile", [0.1, 0.5, 1.0]) - @pytest.mark.parametrize("td_est", list(ValueFunctions) + [None]) + @pytest.mark.parametrize("td_est", list(ValueEstimators) + [None]) def test_iql( self, num_qvalue, @@ -3352,12 +3352,12 @@ def test_iql( expectile=expectile, loss_function="l2", ) - if td_est is ValueFunctions.GAE: + if td_est is ValueEstimators.GAE: with pytest.raises(NotImplementedError): - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) return if td_est is not None: - loss_fn.make_value_function(td_est) + loss_fn.make_value_estimator(td_est) with _check_td_steady(td): loss = loss_fn(td) @@ -4289,8 +4289,8 @@ class TestAdv: "adv,kwargs", [ [GAE, {"lmbda": 0.95}], - [TD1Estimate, {}], - [TDLambdaEstimate, {"lmbda": 0.95}], + [TD1Estimator, {}], + [TDLambdaEstimator, {"lmbda": 0.95}], ], ) def test_dispatch( @@ -4321,8 +4321,8 @@ def test_dispatch( "adv,kwargs", [ [GAE, {"lmbda": 0.95}], - [TD1Estimate, {}], - [TDLambdaEstimate, {"lmbda": 0.95}], + [TD1Estimator, {}], + [TDLambdaEstimator, {"lmbda": 0.95}], ], ) def test_diff_reward( @@ -4362,8 +4362,8 @@ def test_diff_reward( "adv,kwargs", [ [GAE, {"lmbda": 0.95}], - [TD1Estimate, {}], - [TDLambdaEstimate, {"lmbda": 0.95}], + [TD1Estimator, {}], + [TDLambdaEstimator, {"lmbda": 0.95}], ], ) def test_non_differentiable(self, adv, kwargs): diff --git a/test/test_modules.py b/test/test_modules.py index 78f4f61c39b..ab3ee0303c6 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -37,7 +37,7 @@ ) from torchrl.modules.models.utils import SquashDims from torchrl.modules.planners.mppi import MPPIPlanner -from torchrl.objectives.value import TDLambdaEstimate +from torchrl.objectives.value import TDLambdaEstimator @pytest.fixture @@ -477,7 +477,7 @@ def test_MPPI(self, device, batch_size, seed=1): env = MockBatchedUnLockedEnv(device=device) value_net = nn.LazyLinear(1, device=device) value_net = 
ValueOperator(value_net, in_keys=["observation"]) - advantage_module = TDLambdaEstimate( + advantage_module = TDLambdaEstimator( 0.99, 0.95, value_net, diff --git a/torchrl/modules/planners/mppi.py b/torchrl/modules/planners/mppi.py index 21fb53fae00..bd95e2de5c8 100644 --- a/torchrl/modules/planners/mppi.py +++ b/torchrl/modules/planners/mppi.py @@ -46,7 +46,7 @@ class MPPIPlanner(MPCPlannerBase): >>> from torchrl.data import CompositeSpec, NdUnboundedContinuousTensorSpec >>> from torchrl.envs.model_based import ModelBasedEnvBase >>> from torchrl.modules import TensorDictModule, ValueOperator - >>> from torchrl.objectives.value import TDLambdaEstimate + >>> from torchrl.objectives.value import TDLambdaEstimator >>> class MyMBEnv(ModelBasedEnvBase): ... def __init__(self, world_model, device="cpu", dtype=None, batch_size=None): ... super().__init__(world_model, device=device, dtype=dtype, batch_size=batch_size) @@ -87,7 +87,7 @@ class MPPIPlanner(MPCPlannerBase): >>> env = MyMBEnv(world_model) >>> value_net = nn.Linear(4, 1) >>> value_net = ValueOperator(value_net, in_keys=["hidden_observation"]) - >>> adv = TDLambdaEstimate( + >>> adv = TDLambdaEstimator( ... 0.99, ... 0.95, ... value_net, diff --git a/torchrl/objectives/__init__.py b/torchrl/objectives/__init__.py index e74ccbac808..70d794e6495 100644 --- a/torchrl/objectives/__init__.py +++ b/torchrl/objectives/__init__.py @@ -22,7 +22,7 @@ hold_out_params, next_state_value, SoftUpdate, - ValueFunctions, + ValueEstimators, ) # from .value import bellman_max, c_val, dv_val, vtrace, GAE, TDLambdaEstimate, TDEstimate diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index a8bdade55b6..af91ce86385 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -15,9 +15,9 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import GAE, TD0Estimator, TD1Estimator, TDLambdaEstimator class A2CLoss(LossModule): @@ -57,13 +57,13 @@ class A2CLoss(LossModule): If the advantage key (``"advantage`` by default) is not present in the input tensordict, the advantage will be computed by the :meth:`~.forward` method. - A custom advantage module can be built using :meth:`~.make_value_function`. + A custom advantage module can be built using :meth:`~.make_value_estimator`. The default is :class:`torchrl.objectives.value.GAE` with hyperparameters dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
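Editor's note: the make_value_estimator methods introduced throughout this patch all follow the same shape: merge default_value_kwargs(value_type) with the caller's overrides, then dispatch on the ValueEstimators enum member. A standalone sketch of that pattern follows; the helper name build_estimator is ours, not part of torchrl, and the toy value network shapes are illustrative only.

from torch import nn
from torchrl.modules import ValueOperator
from torchrl.objectives.utils import default_value_kwargs, ValueEstimators
from torchrl.objectives.value import GAE, TD0Estimator, TD1Estimator, TDLambdaEstimator

def build_estimator(value_type, value_network, **hyperparams):
    # defaults first (e.g. {"gamma": 0.99, "lmbda": 0.95} for TDLambda), then user overrides
    hp = dict(default_value_kwargs(value_type))
    hp.update(hyperparams)
    if value_type is ValueEstimators.TD1:
        return TD1Estimator(value_network=value_network, **hp)
    if value_type is ValueEstimators.TD0:
        return TD0Estimator(value_network=value_network, **hp)
    if value_type is ValueEstimators.GAE:
        return GAE(value_network=value_network, **hp)
    if value_type is ValueEstimators.TDLambda:
        return TDLambdaEstimator(value_network=value_network, **hp)
    raise NotImplementedError(f"Unknown value type {value_type}")

value_net = ValueOperator(nn.Linear(3, 1), in_keys=["observation"])
estimator = build_estimator(ValueEstimators.TDLambda, value_net, gamma=0.9)  # lmbda keeps its default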
""" - default_value_type: ValueFunctions = ValueFunctions.GAE + default_value_estimator: ValueEstimators = ValueEstimators.GAE def __init__( self, @@ -171,26 +171,26 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: td_out.set("loss_critic", loss_critic.mean()) return td_out - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) if hasattr(self, "gamma"): hp["gamma"] = self.gamma value_key = "state_value" - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: self._value_function = GAE( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 9f1605ebb4d..b6dd85119e9 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -18,8 +18,8 @@ from torch.nn import Parameter from torchrl.modules.utils import Buffer -from torchrl.objectives.utils import ValueFunctions -from torchrl.objectives.value import ValueFunctionBase +from torchrl.objectives.utils import ValueEstimators +from torchrl.objectives.value import ValueEstimatorBase _has_functorch = False try: @@ -51,10 +51,10 @@ class LossModule(nn.Module): Losses that require a value estimation are equipped with a default value pointer. This class attribute indicates which value estimator will be used if none other is specified. - The value estimator can be changed using the :meth:`~.make_value_function` method. + The value estimator can be changed using the :meth:`~.make_value_estimator` method. """ - default_value_function: ValueFunctions = None + default_value_estimator: ValueEstimators = None def __init__(self): super().__init__() @@ -364,11 +364,11 @@ def cpu(self) -> LossModule: return self.to(torch.device("cpu")) @property - def value_function(self) -> ValueFunctionBase: + def value_function(self) -> ValueEstimatorBase: """The value function blends in the reward and value estimate(s) from upcoming state(s)/state-action pair(s) into a target value estimate for the value network.""" out = self._value_function if out is None: - self._default_value_function() + self._default_value_estimator() return self._value_function return out @@ -376,23 +376,23 @@ def value_function(self) -> ValueFunctionBase: def value_function(self, value): self._value_function = value - def _default_value_function(self): + def _default_value_estimator(self): """A value-function constructor when none is provided. No kwarg should be present as default parameters should be retrieved from :obj:`torchrl.objectives.utils.DEFAULT_VALUE_FUN_PARAMS`. 
""" - self.make_value_function(self.default_value_function) + self.make_value_estimator(self.default_value_estimator) - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): """Value-function constructor. If the non-default value function is wanted, it must be built using this method. Args: - value_type (ValueFunctions): A :class:`torchrl.objectives.utils.ValueFunctions` + value_type (ValueEstimators): A :class:`torchrl.objectives.utils.ValueFunctions` enum type indicating the value function to use. **hyperparams: hyperparameters to use for the value function. If not provided, the value indicated by @@ -402,24 +402,24 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): Examples: >>> # initialize the DQN loss >>> dqn_loss = DQNLoss(actor) - >>> dqn_loss.make_value_function( - ... ValueFunctions.TD1, + >>> dqn_loss.make_value_estimator( + ... ValueEstimators.TD1, ... gamma=0.9) """ - if value_type == ValueFunctions.TD1: + if value_type == ValueEstimators.TD1: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type == ValueFunctions.TD0: + elif value_type == ValueEstimators.TD0: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type == ValueFunctions.TDLambda: + elif value_type == ValueEstimators.TDLambda: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index f8c6e83a621..f87a5cc5423 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -20,12 +20,12 @@ default_value_kwargs, distance_loss, hold_out_params, - ValueFunctions, + ValueEstimators, ) from ..envs.utils import set_exploration_mode from .common import LossModule -from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimator, TD1Estimator, TDLambdaEstimator class DDPGLoss(LossModule): @@ -41,7 +41,7 @@ class DDPGLoss(LossModule): data collection. Default is ``False``. """ - default_value_type: ValueFunctions = ValueFunctions.TD0 + default_value_estimator: ValueEstimators = ValueEstimators.TD0 def __init__( self, @@ -180,26 +180,26 @@ def _loss_value( return loss_value, (pred_val - target_value).pow(2), pred_val, target_value - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_action_value" - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=self.actor_critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=self.actor_critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." 
) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=self.actor_critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 0a518bff304..17e7e346808 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -19,11 +19,11 @@ default_value_kwargs, distance_loss, hold_out_params, - ValueFunctions, + ValueEstimators, ) from torchrl.objectives.common import LossModule from torchrl.objectives.utils import _GAMMA_LMBDA_DEPREC_WARNING -from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator try: from functorch import vmap @@ -77,7 +77,7 @@ class REDQLoss_deprecated(LossModule): """ delay_actor: bool = False - default_value_type = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 def __init__( self, @@ -281,27 +281,27 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: alpha_loss = torch.zeros_like(log_pi) return alpha_loss - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" # we do not need a value network bc the next state value is already passed - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." 
) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 01dbcc98182..77017064509 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -18,10 +18,10 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) -from .value import TDLambdaEstimate -from .value.advantages import TD0Estimate, TD1Estimate +from .value import TDLambdaEstimator +from .value.advantages import TD0Estimator, TD1Estimator class DQNLoss(LossModule): @@ -35,7 +35,7 @@ class DQNLoss(LossModule): """ - default_value_type = ValueFunctions.TDLambda + default_value_estimator = ValueEstimators.TDLambda def __init__( self, @@ -69,33 +69,33 @@ def __init__( warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) self.gamma = gamma - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) - if value_type is ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type is ValueEstimators.TD1: + self._value_function = TD1Estimator( **hp, value_network=self.value_network, advantage_key="advantage", value_target_key="value_target", value_key="chosen_action_value", ) - elif value_type is ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type is ValueEstimators.TD0: + self._value_function = TD0Estimator( **hp, value_network=self.value_network, advantage_key="advantage", value_target_key="value_target", value_key="chosen_action_value", ) - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type is ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type is ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( **hp, value_network=self.value_network, advantage_key="advantage", @@ -359,24 +359,24 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: loss_td = TensorDict({"loss": loss.mean()}, []) return loss_td - def make_value_function(self, value_type: ValueFunctions, **hyperparams): - if value_type is ValueFunctions.TD1: + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): + if value_type is ValueEstimators.TD1: raise NotImplementedError( f"value type {value_type} is not implemented for {self.__class__.__name__}." ) - elif value_type is ValueFunctions.TD0: + elif value_type is ValueEstimators.TD0: # see forward call pass - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: raise NotImplementedError( f"value type {value_type} is not implemented for {self.__class__.__name__}." ) - elif value_type is ValueFunctions.TDLambda: + elif value_type is ValueEstimators.TDLambda: raise NotImplementedError( f"value type {value_type} is not implemented for {self.__class__.__name__}." 
) else: raise NotImplementedError(f"Unknown value type {value_type}") - def _default_value_function(self): - self.make_value_function(ValueFunctions.TD0) + def _default_value_estimator(self): + self.make_value_estimator(ValueEstimators.TD0) diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 146686ed03c..03f215d0953 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -17,9 +17,9 @@ default_value_kwargs, distance_loss, hold_out_net, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator class DreamerModelLoss(LossModule): @@ -155,7 +155,7 @@ class DreamerActorLoss(LossModule): """ - default_value_type = ValueFunctions.TDLambda + default_value_estimator = ValueEstimators.TDLambda def __init__( self, @@ -230,37 +230,37 @@ def lambda_target(self, reward: torch.Tensor, value: torch.Tensor) -> torch.Tens ) return self.value_function.value_estimate(input_tensordict) - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_net = None value_key = "state_value" hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) - if value_type is ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type is ValueEstimators.TD1: + self._value_function = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type is ValueEstimators.TD0: + self._value_function = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: if hasattr(self, "lmbda"): hp["lmbda"] = self.lmbda raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." 
) - elif value_type is ValueFunctions.TDLambda: + elif value_type is ValueEstimators.TDLambda: if hasattr(self, "lmbda"): hp["lmbda"] = self.lmbda - self._value_function = TDLambdaEstimate( + self._value_function = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index 26d7c1a8c2f..ca1ea2a01a0 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -15,12 +15,12 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) from ..envs.utils import set_exploration_mode from .common import LossModule -from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimator, TD1Estimator, TDLambdaEstimator try: from functorch import vmap @@ -57,7 +57,7 @@ class IQLLoss(LossModule): """ - default_value_type = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 def __init__( self, @@ -243,7 +243,7 @@ def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: ) return loss_qval, td_error.detach().max(0)[0] - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_net = self.value_network value_key = "state_value" @@ -251,26 +251,26 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) - if value_type is ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type is ValueEstimators.TD1: + self._value_function = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type is ValueEstimators.TD0: + self._value_function = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type is ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type is ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index 3bb97d9a371..c3b89b45bb6 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -15,11 +15,11 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) from .common import LossModule -from .value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import GAE, TD0Estimator, TD1Estimator, TDLambdaEstimator class PPOLoss(LossModule): @@ -80,18 +80,18 @@ class PPOLoss(LossModule): >>> advantage(data) >>> losses = ppo_loss(data) - A custom advantage module can be built using :meth:`~.make_value_function`. + A custom advantage module can be built using :meth:`~.make_value_estimator`. The default is :class:`torchrl.objectives.value.GAE` with hyperparameters dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
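Editor's note: as the docstring above describes, the advantage can be computed ahead of the loss call, in which case forward() reuses it. A toy sketch of that pre-computation; the observation size, the flat batch layout and the ("next", "reward") / ("next", "done") key convention are assumptions mirroring the rest of this patch.

import torch
from tensordict.tensordict import TensorDict
from torch import nn
from torchrl.modules import ValueOperator
from torchrl.objectives.value import GAE

value_net = ValueOperator(nn.Linear(3, 1), in_keys=["observation"])
adv_module = GAE(gamma=0.99, lmbda=0.95, value_network=value_net)

T = 10
data = TensorDict(
    {
        "observation": torch.randn(T, 3),
        "next": TensorDict(
            {
                "observation": torch.randn(T, 3),
                "reward": torch.randn(T, 1),
                "done": torch.zeros(T, 1, dtype=torch.bool),
            },
            [T],
        ),
    },
    [T],
)
adv_module(data)  # writes "advantage" and "value_target" into the tensordict
# the loss would now skip its own estimator call since "advantage" is already present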
>>> ppo_loss = PPOLoss(actor, critic) - >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> ppo_loss.make_value_estimator(ValueEstimators.TDLambda) >>> data = next(datacollector) >>> losses = ppo_loss(data) """ - default_value_type = ValueFunctions.GAE + default_value_estimator = ValueEstimators.GAE def __init__( self, @@ -213,26 +213,26 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: td_out.set("loss_critic", loss_critic.mean()) return td_out - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: self._value_function = GAE( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: @@ -287,12 +287,12 @@ class ClipPPOLoss(PPOLoss): >>> advantage(data) >>> losses = ppo_loss(data) - A custom advantage module can be built using :meth:`~.make_value_function`. + A custom advantage module can be built using :meth:`~.make_value_estimator`. The default is :class:`torchrl.objectives.value.GAE` with hyperparameters dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. >>> ppo_loss = ClipPPOLoss(actor, critic) - >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> ppo_loss.make_value_estimator(ValueEstimators.TDLambda) >>> data = next(datacollector) >>> losses = ppo_loss(data) @@ -442,12 +442,12 @@ class KLPENPPOLoss(PPOLoss): >>> advantage(data) >>> losses = ppo_loss(data) - A custom advantage module can be built using :meth:`~.make_value_function`. + A custom advantage module can be built using :meth:`~.make_value_estimator`. The default is :class:`torchrl.objectives.value.GAE` with hyperparameters dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
>>> ppo_loss = KLPENPPOLoss(actor, critic) - >>> ppo_loss.make_value_function(ValueFunctions.TDLambda) + >>> ppo_loss.make_value_estimator(ValueEstimators.TDLambda) >>> data = next(datacollector) >>> losses = ppo_loss(data) diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index fe717c77cd7..dc7b146b142 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -20,9 +20,9 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator try: from functorch import vmap @@ -76,7 +76,7 @@ class REDQLoss(LossModule): """ delay_actor: bool = False - default_value_type = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 def __init__( self, @@ -322,27 +322,27 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: alpha_loss = torch.zeros_like(log_pi) return alpha_loss - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" # we do not need a value network bc the next state value is already passed - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index 2c972291455..98384444298 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -14,9 +14,9 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value import GAE, TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import GAE, TD0Estimator, TD1Estimator, TDLambdaEstimator class ReinforceLoss(LossModule): @@ -55,18 +55,18 @@ class ReinforceLoss(LossModule): >>> advantage(data) >>> losses = reinforce_loss(data) - A custom advantage module can be built using :meth:`~.make_value_function`. + A custom advantage module can be built using :meth:`~.make_value_estimator`. The default is :class:`torchrl.objectives.value.GAE` with hyperparameters dictated by :func:`torchrl.objectives.utils.default_value_kwargs`. 
>>> reinforce_loss = ReinforceLoss(actor, critic) - >>> reinforce_loss.make_value_function(ValueFunctions.TDLambda) + >>> reinforce_loss.make_value_estimator(ValueEstimators.TDLambda) >>> data = next(datacollector) >>> losses = reinforce_loss(data) """ - default_value_type = ValueFunctions.GAE + default_value_estimator = ValueEstimators.GAE def __init__( self, @@ -153,26 +153,26 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor: ) return loss_value - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_value" - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: self._value_function = GAE( value_network=self.critic, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index f24d1f13b3c..3d82d141af6 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -19,12 +19,12 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) from ..envs.utils import set_exploration_mode, step_mdp from .common import LossModule -from .value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from .value import TD0Estimator, TD1Estimator, TDLambdaEstimator try: from functorch import vmap @@ -83,7 +83,7 @@ class SACLoss(LossModule): Default is ``False``. """ - default_value_type = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 def __init__( self, @@ -189,7 +189,7 @@ def __init__( warnings.warn(_GAMMA_LMBDA_DEPREC_WARNING) self.gamma = gamma - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): if self._version == 1: value_net = self.actor_critic elif self._version == 2: @@ -202,26 +202,26 @@ def make_value_function(self, value_type: ValueFunctions, **hyperparams): value_key = "state_value" hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) - if value_type is ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type is ValueEstimators.TD1: + self._value_function = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type is ValueEstimators.TD0: + self._value_function = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." 
) - elif value_type is ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type is ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", @@ -495,7 +495,7 @@ class DiscreteSACLoss(LossModule): """ - default_value_type = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 delay_actor: bool = False def __init__( @@ -729,33 +729,33 @@ def _loss_alpha(self, log_pi: Tensor) -> Tensor: alpha_loss = torch.zeros_like(log_pi) return alpha_loss - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_net = None value_key = "state_value" hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) if hasattr(self, "gamma"): hp["gamma"] = self.gamma - if value_type is ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type is ValueEstimators.TD1: + self._value_function = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type is ValueEstimators.TD0: + self._value_function = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) - elif value_type is ValueFunctions.GAE: + elif value_type is ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type is ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type is ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index 1ef911a8891..1f8ed97e37f 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -15,9 +15,9 @@ _GAMMA_LMBDA_DEPREC_WARNING, default_value_kwargs, distance_loss, - ValueFunctions, + ValueEstimators, ) -from torchrl.objectives.value import TD0Estimate, TD1Estimate, TDLambdaEstimate +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator try: from functorch import vmap @@ -56,7 +56,7 @@ class TD3Loss(LossModule): for data collection. Default is ``False``. 
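Editor's note: SAC, REDQ and TD3 above pass ``value_network=None`` because they compute the next-state value themselves and only need the estimator to fold reward, done and gamma into a target. A sketch of that flow; the key names and shapes are assumptions taken from the surrounding diffs rather than a verified API contract.

import torch
from tensordict.tensordict import TensorDict
from torchrl.objectives.value import TD0Estimator

estimator = TD0Estimator(gamma=0.99, value_network=None, value_key="state_value")

T = 4
data = TensorDict(
    {
        "next": TensorDict(
            {
                "state_value": torch.randn(T, 1),  # pre-computed by the loss (e.g. a minimum over Q-networks)
                "reward": torch.randn(T, 1),
                "done": torch.zeros(T, 1, dtype=torch.bool),
            },
            [T],
        )
    },
    [T],
)
target = estimator.value_estimate(data)  # TD(0) target: reward + gamma * (1 - done) * next state value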
""" - default_value_function = ValueFunctions.TD0 + default_value_estimator = ValueEstimators.TD0 def __init__( self, @@ -229,27 +229,27 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: return td_out - def make_value_function(self, value_type: ValueFunctions, **hyperparams): + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) if hasattr(self, "gamma"): hp["gamma"] = self.gamma hp.update(hyperparams) value_key = "state_action_value" # we do not need a value network bc the next state value is already passed - if value_type == ValueFunctions.TD1: - self._value_function = TD1Estimate( + if value_type == ValueEstimators.TD1: + self._value_function = TD1Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.TD0: - self._value_function = TD0Estimate( + elif value_type == ValueEstimators.TD0: + self._value_function = TD0Estimator( value_network=None, value_key=value_key, **hp ) - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: raise NotImplementedError( f"Value type {value_type} it not implemented for loss {type(self)}." ) - elif value_type == ValueFunctions.TDLambda: - self._value_function = TDLambdaEstimate( + elif value_type == ValueEstimators.TDLambda: + self._value_function = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 09edf449e08..63891c0fe27 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -18,11 +18,11 @@ _GAMMA_LMBDA_DEPREC_WARNING = ( "Passing gamma / lambda parameters through the loss constructor " "is deprecated and will be removed soon. To customize your value function, " - "run `loss_module.make_value_function(ValueFunctions., gamma=val)`." + "run `loss_module.make_value_estimator(ValueFunctions., gamma=val)`." ) -class ValueFunctions(Enum): +class ValueEstimators(Enum): """Value function enumerator for custom-built estimators. Allows for a flexible usage of various value functions when the loss module @@ -30,7 +30,7 @@ class ValueFunctions(Enum): Examples: >>> dqn_loss = DQNLoss(actor) - >>> dqn_loss.make_value_function(ValueFunctions.TD0, gamma=0.9) + >>> dqn_loss.make_value_estimator(ValueEstimators.TD0, gamma=0.9) """ @@ -40,7 +40,7 @@ class ValueFunctions(Enum): GAE = 4 -def default_value_kwargs(value_type: ValueFunctions): +def default_value_kwargs(value_type: ValueEstimators): """Default value function keyword argument generator. Args: @@ -48,17 +48,17 @@ def default_value_kwargs(value_type: ValueFunctions): :class:`torchrl.objectives.utils.ValueFunctions` class. 
Examples: - >>> kwargs = default_value_kwargs(ValueFunctions.TDLambda) + >>> kwargs = default_value_kwargs(ValueEstimators.TDLambda) {"gamma": 0.99, "lmbda": 0.95} """ - if value_type == ValueFunctions.TD1: + if value_type == ValueEstimators.TD1: return {"gamma": 0.99} - elif value_type == ValueFunctions.TD0: + elif value_type == ValueEstimators.TD0: return {"gamma": 0.99} - elif value_type == ValueFunctions.GAE: + elif value_type == ValueEstimators.GAE: return {"gamma": 0.99, "lmbda": 0.95} - elif value_type == ValueFunctions.TDLambda: + elif value_type == ValueEstimators.TDLambda: return {"gamma": 0.99, "lmbda": 0.95} else: raise NotImplementedError(f"Unknown value type {value_type}.") diff --git a/torchrl/objectives/value/__init__.py b/torchrl/objectives/value/__init__.py index ef224940ddf..11ae2e6d9e2 100644 --- a/torchrl/objectives/value/__init__.py +++ b/torchrl/objectives/value/__init__.py @@ -6,7 +6,10 @@ from .advantages import ( GAE, TD0Estimate, + TD0Estimator, TD1Estimate, + TD1Estimator, TDLambdaEstimate, - ValueFunctionBase, + TDLambdaEstimator, + ValueEstimatorBase, ) diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 6695fbc2488..a8f681c2163 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import abc +import warnings +from copy import deepcopy from functools import wraps from typing import Callable, List, Optional, Tuple, Union @@ -31,7 +33,7 @@ def new_fun(self, *args, **kwargs): return new_fun -class ValueFunctionBase(nn.Module): +class ValueEstimatorBase(nn.Module): """An abstract parent class for value function modules. Its :meth:`ValueFunctionBase.forward` method will compute the value (given @@ -111,7 +113,7 @@ def is_stateless(self): return self.value_network._is_stateless -class TD0Estimate(ValueFunctionBase): +class TD0Estimator(ValueEstimatorBase): """Myopic Temporal Difference (TD(0)) estimate of advantage function. Args: @@ -293,7 +295,7 @@ def value_estimate( return value_target -class TD1Estimate(ValueFunctionBase): +class TD1Estimator(ValueEstimatorBase): """Bootstrapped Temporal Difference (TD(1)) estimate of advantage function. Args: @@ -475,7 +477,7 @@ def value_estimate( return value_target -class TDLambdaEstimate(ValueFunctionBase): +class TDLambdaEstimator(ValueEstimatorBase): r"""TD(:math:`\lambda`) estimate of advantage function. Args: @@ -577,7 +579,7 @@ def forward( >>> value_net = TensorDictModule( ... nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"] ... ) - >>> module = TDLambdaEstimate( + >>> module = TDLambdaEstimator( ... gamma=0.98, ... lmbda=0.94, ... value_network=value_net, @@ -596,7 +598,7 @@ def forward( >>> value_net = TensorDictModule( ... nn.Linear(3, 1), in_keys=["obs"], out_keys=["state_value"] ... ) - >>> module = TDLambdaEstimate( + >>> module = TDLambdaEstimator( ... gamma=0.98, ... lmbda=0.94, ... value_network=value_net, @@ -672,7 +674,7 @@ def value_estimate( return val -class GAE(ValueFunctionBase): +class GAE(ValueEstimatorBase): """A class wrapper around the generalized advantage estimate functional. 
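Editor's note: the recursion behind this wrapper and its functional counterparts can be written as a plain-PyTorch loop. This is only a readability aid (torchrl's implementations also handle tensor-valued gammas, rolling_gamma and vectorized execution), not the library code itself.

import torch

def gae_reference(gamma, lmbda, state_value, next_state_value, reward, done):
    # delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
    # A_t     = delta_t + gamma * lmbda * (1 - done_t) * A_{t+1}
    not_done = (~done).float()
    delta = reward + gamma * not_done * next_state_value - state_value
    advantage = torch.zeros_like(delta)
    running = torch.zeros_like(delta[..., -1, :])
    for t in reversed(range(delta.shape[-2])):
        running = delta[..., t, :] + gamma * lmbda * not_done[..., t, :] * running
        advantage[..., t, :] = running
    value_target = advantage + state_value
    return advantage, value_target

T = 5
adv, target = gae_reference(
    0.99,
    0.95,
    torch.randn(T, 1),                    # state_value
    torch.randn(T, 1),                    # next_state_value
    torch.randn(T, 1),                    # reward
    torch.zeros(T, 1, dtype=torch.bool),  # done
)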
Refer to "HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION" @@ -914,3 +916,20 @@ def value_estimate( gamma, lmbda, value, next_value, reward, done ) return value_target + + +def _deprecate_class(cls, new_cls): + @wraps(cls.__init__) + def new_init(self, *args, **kwargs): + warnings.warn(f"class {cls} is deprecated, please use {new_cls} instead.") + cls.__init__(self, *args, **kwargs) + + cls.__init__ = new_init + + +TD0Estimate = deepcopy(TD0Estimator) +_deprecate_class(TD0Estimate, TD0Estimator) +TD1Estimate = deepcopy(TD1Estimator) +_deprecate_class(TD1Estimate, TD1Estimator) +TDLambdaEstimate = deepcopy(TDLambdaEstimator) +_deprecate_class(TDLambdaEstimate, TDLambdaEstimator) From f00803445385dc200ad3f237bf427df6022eb1fd Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 13:57:05 +0100 Subject: [PATCH 23/89] amend --- docs/source/reference/objectives.rst | 15 +- test/test_cost.py | 176 +++++++++- test/test_modules.py | 6 +- torchrl/objectives/a2c.py | 10 +- torchrl/objectives/common.py | 14 +- torchrl/objectives/ddpg.py | 8 +- torchrl/objectives/deprecated.py | 8 +- torchrl/objectives/dqn.py | 10 +- torchrl/objectives/dreamer.py | 10 +- torchrl/objectives/iql.py | 8 +- torchrl/objectives/ppo.py | 14 +- torchrl/objectives/redq.py | 10 +- torchrl/objectives/reinforce.py | 10 +- torchrl/objectives/sac.py | 24 +- torchrl/objectives/td3.py | 8 +- torchrl/objectives/utils.py | 8 +- torchrl/objectives/value/advantages.py | 19 +- torchrl/objectives/value/functional.py | 436 ++++++++++++++++++++----- 18 files changed, 632 insertions(+), 162 deletions(-) diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index 01a7379e298..ba91adc2f5e 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -188,13 +188,18 @@ Returns TD1Estimator TDLambdaEstimator GAE - functional.generalized_advantage_estimate - functional.vec_generalized_advantage_estimate - functional.vec_td_lambda_return_estimate - functional.vec_td_lambda_advantage_estimate + functional.td0_return_estimate + functional.td0_advantage_estimate + functional.td1_return_estimate + functional.vec_td1_return_estimate + functional.td1_advantage_estimate + functional.vec_td1_advantage_estimate functional.td_lambda_return_estimate + functional.vec_td_lambda_return_estimate functional.td_lambda_advantage_estimate - functional.td_advantage_estimate + functional.vec_td_lambda_advantage_estimate + functional.generalized_advantage_estimate + functional.vec_generalized_advantage_estimate Utils diff --git a/test/test_cost.py b/test/test_cost.py index 0aa87bcb2c4..7ebb33a6771 100644 --- a/test/test_cost.py +++ b/test/test_cost.py @@ -96,9 +96,11 @@ from torchrl.objectives.value.advantages import GAE, TD1Estimator, TDLambdaEstimator from torchrl.objectives.value.functional import ( generalized_advantage_estimate, - td_advantage_estimate, + td0_advantage_estimate, + td1_advantage_estimate, td_lambda_advantage_estimate, vec_generalized_advantage_estimate, + vec_td1_advantage_estimate, vec_td_lambda_advantage_estimate, ) from torchrl.objectives.value.utils import _custom_conv1d, _make_gammas_tensor @@ -3727,6 +3729,30 @@ def test_tdlambda(self, device, gamma, lmbda, N, T, random_gamma, rolling_gamma) ) torch.testing.assert_close(r1, r2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("gamma", [0.1, 0.5, 0.99]) + @pytest.mark.parametrize("N", [(3,), (7, 3)]) + @pytest.mark.parametrize("T", 
[3, 5, 200]) + # @pytest.mark.parametrize("random_gamma,rolling_gamma", [[True, False], [True, True], [False, None]]) + @pytest.mark.parametrize("random_gamma,rolling_gamma", [[False, None]]) + def test_td1(self, device, gamma, N, T, random_gamma, rolling_gamma): + torch.manual_seed(0) + + done = torch.zeros(*N, T, 1, device=device, dtype=torch.bool).bernoulli_(0.1) + reward = torch.randn(*N, T, 1, device=device) + state_value = torch.randn(*N, T, 1, device=device) + next_state_value = torch.randn(*N, T, 1, device=device) + if random_gamma: + gamma = torch.rand_like(reward) * gamma + + r1 = vec_td1_advantage_estimate( + gamma, state_value, next_state_value, reward, done, rolling_gamma + ) + r2 = td1_advantage_estimate( + gamma, state_value, next_state_value, reward, done, rolling_gamma + ) + torch.testing.assert_close(r1, r2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("gamma", [0.99, 0.5, 0.1]) @pytest.mark.parametrize("lmbda", [0.99, 0.5, 0.1]) @@ -3796,6 +3822,49 @@ def test_tdlambda_tensor_gamma(self, device, gamma, lmbda, N, T, has_done): torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("gamma", [0.5, 0.99, 0.1]) + @pytest.mark.parametrize("N", [(3,), (7, 3)]) + @pytest.mark.parametrize("T", [3, 5, 200]) + @pytest.mark.parametrize("has_done", [True, False]) + def test_td1_tensor_gamma(self, device, gamma, N, T, has_done): + """Tests vec_td_lambda_advantage_estimate against itself with + gamma being a tensor or a scalar + + """ + torch.manual_seed(0) + + done = torch.zeros(*N, T, 1, device=device, dtype=torch.bool) + if has_done: + done = done.bernoulli_(0.1) + reward = torch.randn(*N, T, 1, device=device) + state_value = torch.randn(*N, T, 1, device=device) + next_state_value = torch.randn(*N, T, 1, device=device) + + gamma_tensor = torch.full((*N, T, 1), gamma, device=device) + + v1 = vec_td1_advantage_estimate( + gamma, state_value, next_state_value, reward, done + ) + v2 = vec_td1_advantage_estimate( + gamma_tensor, state_value, next_state_value, reward, done + ) + + torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + + # # same with last done being true + done[..., -1, :] = True # terminating trajectory + gamma_tensor[..., -1, :] = 0.0 + + v1 = vec_td1_advantage_estimate( + gamma, state_value, next_state_value, reward, done + ) + v2 = vec_td1_advantage_estimate( + gamma_tensor, state_value, next_state_value, reward, done + ) + + torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("gamma", [0.5, 0.99, 0.1]) @pytest.mark.parametrize("lmbda", [0.1, 0.5, 0.99]) @@ -3843,6 +3912,48 @@ def test_vectdlambda_tensor_gamma( torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("gamma", [0.5, 0.99, 0.1]) + @pytest.mark.parametrize("N", [(3,), (7, 3)]) + @pytest.mark.parametrize("T", [3, 5, 50]) + @pytest.mark.parametrize("has_done", [True, False]) + def test_vectd1_tensor_gamma( + self, device, gamma, N, T, dtype_fixture, has_done # noqa + ): + """Tests td_lambda_advantage_estimate against vec_td_lambda_advantage_estimate + with gamma being a tensor or a scalar + + """ + + torch.manual_seed(0) + + done = torch.zeros(*N, T, 1, device=device, dtype=torch.bool) + if has_done: + done = done.bernoulli_(0.1) + reward = torch.randn(*N, T, 1, 
device=device) + state_value = torch.randn(*N, T, 1, device=device) + next_state_value = torch.randn(*N, T, 1, device=device) + + gamma_tensor = torch.full((*N, T, 1), gamma, device=device) + + v1 = td1_advantage_estimate(gamma, state_value, next_state_value, reward, done) + v2 = vec_td1_advantage_estimate( + gamma_tensor, state_value, next_state_value, reward, done + ) + + torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + + # same with last done being true + done[..., -1, :] = True # terminating trajectory + gamma_tensor[..., -1, :] = 0.0 + + v1 = td1_advantage_estimate(gamma, state_value, next_state_value, reward, done) + v2 = vec_td1_advantage_estimate( + gamma_tensor, state_value, next_state_value, reward, done + ) + + torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("lmbda", [0.1, 0.5, 0.99]) @pytest.mark.parametrize("N", [(3,), (7, 3)]) @@ -3904,6 +4015,63 @@ def test_vectdlambda_rand_gamma( ) torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) + @pytest.mark.parametrize("N", [(3,), (7, 3)]) + @pytest.mark.parametrize("T", [50, 3]) + @pytest.mark.parametrize("rolling_gamma", [True, False, None]) + @pytest.mark.parametrize("has_done", [True, False]) + @pytest.mark.parametrize("seed", range(1)) + def test_vectd1_rand_gamma( + self, device, N, T, rolling_gamma, dtype_fixture, has_done, seed # noqa + ): + """Tests td_lambda_advantage_estimate against vec_td_lambda_advantage_estimate + with gamma being a random tensor + + """ + torch.manual_seed(seed) + + done = torch.zeros(*N, T, 1, device=device, dtype=torch.bool) + if has_done: + done = done.bernoulli_(0.1) + reward = torch.randn(*N, T, 1, device=device) + state_value = torch.randn(*N, T, 1, device=device) + next_state_value = torch.randn(*N, T, 1, device=device) + + # avoid low values of gamma + gamma_tensor = 0.5 + torch.rand_like(next_state_value) / 2 + + v1 = td1_advantage_estimate( + gamma_tensor, + state_value, + next_state_value, + reward, + done, + rolling_gamma, + ) + if rolling_gamma is False and not done[..., 1:, :][done[..., :-1, :]].all(): + # if a not-done follows a done, then rolling_gamma=False cannot be used + with pytest.raises( + NotImplementedError, match="When using rolling_gamma=False" + ): + vec_td1_advantage_estimate( + gamma_tensor, + state_value, + next_state_value, + reward, + done, + rolling_gamma, + ) + return + v2 = vec_td1_advantage_estimate( + gamma_tensor, + state_value, + next_state_value, + reward, + done, + rolling_gamma, + ) + torch.testing.assert_close(v1, v2, rtol=1e-4, atol=1e-4) + @pytest.mark.parametrize("device", get_available_devices()) @pytest.mark.parametrize("gamma", [0.99, "rand"]) @pytest.mark.parametrize("N", [(3,), (3, 7)]) @@ -4066,21 +4234,21 @@ def test_successive_traj_tdadv( # avoid low values of gamma gamma_tensor = 0.5 + torch.rand_like(next_state_value) / 2 - v1 = td_advantage_estimate( + v1 = td0_advantage_estimate( gamma_tensor, state_value, next_state_value, reward, done, ) - v1a = td_advantage_estimate( + v1a = td0_advantage_estimate( gamma_tensor[..., : T // 2, :], state_value[..., : T // 2, :], next_state_value[..., : T // 2, :], reward[..., : T // 2, :], done[..., : T // 2, :], ) - v1b = td_advantage_estimate( + v1b = td0_advantage_estimate( gamma_tensor[..., T // 2 :, :], state_value[..., T // 2 :, :], next_state_value[..., T // 2 :, :], diff --git a/test/test_modules.py b/test/test_modules.py index 
ab3ee0303c6..bd16a1097aa 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -478,9 +478,9 @@ def test_MPPI(self, device, batch_size, seed=1): value_net = nn.LazyLinear(1, device=device) value_net = ValueOperator(value_net, in_keys=["observation"]) advantage_module = TDLambdaEstimator( - 0.99, - 0.95, - value_net, + gamma=0.99, + lmbda=0.95, + value_network=value_net, ) value_net(env.reset()) planner = MPPIPlanner( diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index af91ce86385..4b7c40c56c2 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -153,7 +153,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = tensordict.clone(False) advantage = tensordict.get(self.advantage_key, None) if advantage is None: - self.value_function( + self.value_estimator( tensordict, params=self.critic_params, target_params=self.target_critic_params, @@ -178,19 +178,19 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp["gamma"] = self.gamma value_key = "state_value" if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: - self._value_function = GAE( + self._value_estimator = GAE( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index b6dd85119e9..770d3f3e406 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -59,7 +59,7 @@ class LossModule(nn.Module): def __init__(self): super().__init__() self._param_maps = {} - self._value_function = None + self._value_estimator = None # self.register_forward_pre_hook(_parameters_to_tensordict) def forward(self, tensordict: TensorDictBase) -> TensorDictBase: @@ -364,17 +364,17 @@ def cpu(self) -> LossModule: return self.to(torch.device("cpu")) @property - def value_function(self) -> ValueEstimatorBase: + def value_estimator(self) -> ValueEstimatorBase: """The value function blends in the reward and value estimate(s) from upcoming state(s)/state-action pair(s) into a target value estimate for the value network.""" - out = self._value_function + out = self._value_estimator if out is None: self._default_value_estimator() - return self._value_function + return self._value_estimator return out - @value_function.setter - def value_function(self, value): - self._value_function = value + @value_estimator.setter + def value_estimator(self, value): + self._value_estimator = value def _default_value_estimator(self): """A value-function constructor when none is provided. 
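Editor's note: the property/constructor pair added to LossModule here boils down to a lazy default. Below is a stripped-down, torchrl-free sketch of that mechanism; MiniLoss and the string value types are stand-ins for the real classes and enum members.

class MiniLoss:
    default_value_estimator = "TD0"  # stands in for ValueEstimators.TD0

    def __init__(self):
        self._value_estimator = None

    @property
    def value_estimator(self):
        # first access builds the class-level default
        if self._value_estimator is None:
            self._default_value_estimator()
        return self._value_estimator

    @value_estimator.setter
    def value_estimator(self, value):
        self._value_estimator = value

    def _default_value_estimator(self):
        self.make_value_estimator(self.default_value_estimator)

    def make_value_estimator(self, value_type, **hyperparams):
        # a real loss builds a TD0Estimator / TDLambdaEstimator / GAE instance here
        self._value_estimator = (value_type, hyperparams)

loss = MiniLoss()
assert loss.value_estimator == ("TD0", {})  # lazily built default
loss.make_value_estimator("TDLambda", gamma=0.9, lmbda=0.95)
assert loss.value_estimator == ("TDLambda", {"gamma": 0.9, "lmbda": 0.95})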
diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index f87a5cc5423..c1cacd7349e 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -169,7 +169,7 @@ def _loss_value( device=self.target_actor_network_params.device, ) with set_exploration_mode("mode"): - target_value = self.value_function.value_estimate( + target_value = self.value_estimator.value_estimate( tensordict, target_params=target_params ).squeeze(-1) @@ -187,11 +187,11 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp.update(hyperparams) value_key = "state_action_value" if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=self.actor_critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=self.actor_critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: @@ -199,7 +199,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=self.actor_critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/deprecated.py b/torchrl/objectives/deprecated.py index 17e7e346808..9116a29e59c 100644 --- a/torchrl/objectives/deprecated.py +++ b/torchrl/objectives/deprecated.py @@ -253,7 +253,7 @@ def _qvalue_loss(self, tensordict: TensorDictBase) -> Tensor: next_state_value = next_state_value.min(0)[0] tensordict.set(("next", "state_value"), next_state_value) - target_value = self.value_function.value_estimate(tensordict).squeeze(-1) + target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1) tensordict_expand = vmap(self.qvalue_network, (None, 0))( tensordict.select(*self.qvalue_network.in_keys), self.qvalue_network_params, @@ -289,11 +289,11 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_key = "state_value" # we do not need a value network bc the next state value is already passed if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: @@ -301,7 +301,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index 77017064509..e584b894ed7 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -75,7 +75,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( **hp, value_network=self.value_network, advantage_key="advantage", @@ -83,7 +83,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_key="chosen_action_value", ) elif value_type is ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( **hp, value_network=self.value_network, advantage_key="advantage", @@ -95,7 +95,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type is ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( **hp, value_network=self.value_network, advantage_key="advantage", @@ -155,7 +155,7 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: action = action.to(torch.float) pred_val_index = (pred_val * action).sum(-1) - target_value = self.value_function.value_estimate( + target_value = self.value_estimator.value_estimate( tensordict.clone(False), target_params=self.target_value_network_params ).squeeze(-1) @@ -191,7 +191,7 @@ class DistributionalDQNLoss(LossModule): gamma (scalar): a discount factor for return computation. .. note:: Unlike :class:`DQNLoss`, this class does not currently support - custom value functions. The next value estimation is not + custom value functions. The next value estimation is always bootstrapped. 
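Editor's note: the bootstrapping mentioned in the note above is the one-step TD(0) target; with toy numbers (shapes illustrative only):

import torch

gamma = 0.99
reward = torch.tensor([[1.0], [0.0]])
done = torch.tensor([[False], [True]])
next_value = torch.tensor([[2.0], [5.0]])
# terminal transitions (done=True) drop the bootstrap term entirely
target = reward + gamma * (~done).float() * next_value  # -> [[2.9800], [0.0000]]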
delay_value (bool): whether to duplicate the value network into a new target value network to create double DQN diff --git a/torchrl/objectives/dreamer.py b/torchrl/objectives/dreamer.py index 03f215d0953..cfad2825a4e 100644 --- a/torchrl/objectives/dreamer.py +++ b/torchrl/objectives/dreamer.py @@ -208,7 +208,7 @@ def forward(self, tensordict: TensorDict) -> Tuple[TensorDict, TensorDict]: fake_data.set("lambda_target", lambda_target) if self.discount_loss: - gamma = self.value_function.gamma.to(tensordict.device) + gamma = self.value_estimator.gamma.to(tensordict.device) discount = gamma.expand(lambda_target.shape) discount[..., 0, :] = 1 discount = discount.cumprod(dim=-2) @@ -228,7 +228,7 @@ def lambda_target(self, reward: torch.Tensor, value: torch.Tensor) -> torch.Tens }, [], ) - return self.value_function.value_estimate(input_tensordict) + return self.value_estimator.value_estimate(input_tensordict) def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_net = None @@ -238,14 +238,14 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) elif value_type is ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", @@ -260,7 +260,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): elif value_type is ValueEstimators.TDLambda: if hasattr(self, "lmbda"): hp["lmbda"] = self.lmbda - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/iql.py b/torchrl/objectives/iql.py index ca1ea2a01a0..4e993ef5579 100644 --- a/torchrl/objectives/iql.py +++ b/torchrl/objectives/iql.py @@ -223,7 +223,7 @@ def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: obs_keys = self.actor_network.in_keys tensordict = tensordict.select("next", *obs_keys, "action") - target_value = self.value_function.value_estimate( + target_value = self.value_estimator.value_estimate( tensordict, target_params=self.target_value_network_params ).squeeze(-1) tensordict_expand = vmap(self.qvalue_network, (None, 0))( @@ -252,14 +252,14 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp["gamma"] = self.gamma hp.update(hyperparams) if value_type is ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) elif value_type is ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", @@ -270,7 +270,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type is ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index c3b89b45bb6..638174d21a6 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -190,7 +190,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = tensordict.clone(False) advantage = tensordict.get(self.advantage_key, None) if advantage is None: - self.value_function( + self.value_estimator( tensordict, params=self.critic_params, target_params=self.target_critic_params, @@ -220,19 +220,19 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp.update(hyperparams) value_key = "state_value" if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: - self._value_function = GAE( + self._value_estimator = GAE( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: @@ -340,7 +340,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = tensordict.clone(False) advantage = tensordict.get(self.advantage_key, None) if advantage is None: - self.value_function( + self.value_estimator( tensordict, params=self.critic_params, target_params=self.target_critic_params, @@ -507,7 +507,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDict: tensordict = tensordict.clone(False) advantage = tensordict.get(self.advantage_key, None) if advantage is None: - self.value_function( + self.value_estimator( tensordict, params=self.critic_params, target_params=self.target_critic_params, diff --git a/torchrl/objectives/redq.py b/torchrl/objectives/redq.py index dc7b146b142..417b6d90fcf 100644 --- a/torchrl/objectives/redq.py +++ b/torchrl/objectives/redq.py @@ -275,7 +275,9 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: next_state_value = next_state_value.min(0)[0] tensordict_select.set(("next", "state_value"), next_state_value.unsqueeze(-1)) - target_value = self.value_function.value_estimate(tensordict_select).squeeze(-1) + target_value = self.value_estimator.value_estimate(tensordict_select).squeeze( + -1 + ) pred_val = state_action_value_qvalue td_error = (pred_val - target_value).pow(2) @@ -330,11 +332,11 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_key = "state_value" # we do not need a value network bc the next state value is already passed if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: @@ -342,7 +344,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index 98384444298..baa0a4c2ae8 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -108,7 +108,7 @@ def __init__( def forward(self, tensordict: TensorDictBase) -> TensorDictBase: advantage = tensordict.get(self.advantage_key, None) if advantage is None: - self.value_function( + self.value_estimator( tensordict, params=self.critic_params, target_params=self.target_critic_params, @@ -160,19 +160,19 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp.update(hyperparams) value_key = "state_value" if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: - self._value_function = GAE( + self._value_estimator = GAE( value_network=self.critic, value_key=value_key, **hp ) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=self.critic, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 3d82d141af6..8177c2f393c 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -203,14 +203,14 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): hp = dict(default_value_kwargs(value_type)) hp.update(hyperparams) if value_type is ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) elif value_type is ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", @@ -221,7 +221,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type is ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", @@ -317,7 +317,7 @@ def _loss_qvalue_v1(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]: _run_checks=False, ) with set_exploration_mode("mode"): - target_value = self.value_function.value_estimate( + target_value = self.value_estimator.value_estimate( tensordict, target_params=target_params ).squeeze(-1) @@ -383,8 +383,8 @@ def _get_value_v2(self, tensordict, _alpha, actor_params, qval_params): sample_log_prob = sample_log_prob.unsqueeze(-1) state_value = state_action_value - _alpha * sample_log_prob state_value = state_value.min(0)[0] - tensordict.set(("next", self.value_function.value_key), state_value) - target_value = self.value_function.value_estimate( + tensordict.set(("next", self.value_estimator.value_key), state_value) + target_value = self.value_estimator.value_estimate( tensordict, _alpha=self._alpha, actor_params=self.target_actor_network_params, @@ -668,8 +668,10 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: * (next_state_action_value_qvalue.min(0)[0] - self.alpha * logp_pi[1]) ).sum(dim=-1, keepdim=True) - tensordict_select.set(("next", self.value_function.value_key), pred_next_val) - target_value = self.value_function.value_estimate(tensordict_select).squeeze(-1) + tensordict_select.set(("next", self.value_estimator.value_key), pred_next_val) + target_value = self.value_estimator.value_estimate(tensordict_select).squeeze( + -1 + ) actions = torch.argmax(tensordict_select["action"], dim=-1) @@ -737,14 +739,14 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): if hasattr(self, "gamma"): hp["gamma"] = self.gamma if value_type is ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( **hp, value_network=value_net, value_target_key="value_target", value_key=value_key, ) elif value_type is ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( **hp, value_network=value_net, value_target_key="value_target", @@ -755,7 +757,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." 
) elif value_type is ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( **hp, value_network=value_net, value_target_key="value_target", diff --git a/torchrl/objectives/td3.py b/torchrl/objectives/td3.py index 1f8ed97e37f..41043852ada 100644 --- a/torchrl/objectives/td3.py +++ b/torchrl/objectives/td3.py @@ -195,7 +195,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: next_state_value = next_state_action_value_qvalue.min(0)[0] tensordict.set(("next", "state_action_value"), next_state_value.unsqueeze(-1)) - target_value = self.value_function.value_estimate(tensordict).squeeze(-1) + target_value = self.value_estimator.value_estimate(tensordict).squeeze(-1) pred_val = state_action_value_qvalue td_error = (pred_val - target_value).pow(2) loss_qval = ( @@ -237,11 +237,11 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): value_key = "state_action_value" # we do not need a value network bc the next state value is already passed if value_type == ValueEstimators.TD1: - self._value_function = TD1Estimator( + self._value_estimator = TD1Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.TD0: - self._value_function = TD0Estimator( + self._value_estimator = TD0Estimator( value_network=None, value_key=value_key, **hp ) elif value_type == ValueEstimators.GAE: @@ -249,7 +249,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): f"Value type {value_type} it not implemented for loss {type(self)}." ) elif value_type == ValueEstimators.TDLambda: - self._value_function = TDLambdaEstimator( + self._value_estimator = TDLambdaEstimator( value_network=None, value_key=value_key, **hp ) else: diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 63891c0fe27..250087a34a4 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -34,10 +34,10 @@ class ValueEstimators(Enum): """ - TD0 = 1 - TD1 = 2 - TDLambda = 3 - GAE = 4 + TD0 = "Bootstrapped TD (1-step return)" + TD1 = "TD(1) (infinity-step return)" + TDLambda = "TD(lambda)" + GAE = "Generalized advantage estimate" def default_value_kwargs(value_type: ValueEstimators): diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index a8f681c2163..e3dada133c2 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. import abc import warnings -from copy import deepcopy from functools import wraps from typing import Callable, List, Optional, Tuple, Union @@ -17,9 +16,9 @@ from torchrl.objectives.utils import hold_out_net from torchrl.objectives.value.functional import ( - td_advantage_estimate, td_lambda_advantage_estimate, vec_generalized_advantage_estimate, + vec_td1_advantage_estimate, vec_td_lambda_advantage_estimate, ) @@ -114,7 +113,9 @@ def is_stateless(self): class TD0Estimator(ValueEstimatorBase): - """Myopic Temporal Difference (TD(0)) estimate of advantage function. + """Temporal Difference (TD(0)) estimate of advantage function. + + AKA bootstrapped temporal difference or 1-step return. Args: gamma (scalar): exponential mean discount. @@ -296,7 +297,7 @@ def value_estimate( class TD1Estimator(ValueEstimatorBase): - """Bootstrapped Temporal Difference (TD(1)) estimate of advantage function. + r""":math:`\infty`-Temporal Difference (TD(1)) estimate of advantage function. 
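With the enum values above now documenting what each estimator computes, switching the estimator of any loss that implements ``make_value_estimator(value_type, **hyperparams)`` reduces to a single call. The helper below is an illustrative sketch and not part of the patch; ``configure_value_estimator`` and ``loss_module`` are placeholder names for any loss following the signature shown in the hunks above.

from torchrl.objectives.utils import ValueEstimators

def configure_value_estimator(loss_module, kind: str = "td_lambda"):
    # ``loss_module`` is assumed to expose make_value_estimator(value_type, **hyperparams)
    if kind == "td0":
        # bootstrapped 1-step return
        loss_module.make_value_estimator(ValueEstimators.TD0, gamma=0.99)
    elif kind == "td1":
        # infinity-step (Monte-Carlo-like) return
        loss_module.make_value_estimator(ValueEstimators.TD1, gamma=0.99)
    elif kind == "td_lambda":
        # exponentially weighted mixture of n-step returns
        loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=0.99, lmbda=0.95)
    elif kind == "gae":
        loss_module.make_value_estimator(ValueEstimators.GAE, gamma=0.99, lmbda=0.95)
    else:
        raise ValueError(f"Unknown estimator kind: {kind}")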
Args: gamma (scalar): exponential mean discount. @@ -471,7 +472,7 @@ def value_estimate( next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) - value_target = td_advantage_estimate( + value_target = vec_td1_advantage_estimate( gamma, torch.zeros_like(next_value), next_value, reward, done ) return value_target @@ -927,9 +928,11 @@ def new_init(self, *args, **kwargs): cls.__init__ = new_init -TD0Estimate = deepcopy(TD0Estimator) +TD0Estimate = type("TD0Estimate", TD0Estimator.__bases__, dict(TD0Estimator.__dict__)) _deprecate_class(TD0Estimate, TD0Estimator) -TD1Estimate = deepcopy(TD1Estimator) +TD1Estimate = type("TD1Estimate", TD1Estimator.__bases__, dict(TD1Estimator.__dict__)) _deprecate_class(TD1Estimate, TD1Estimator) -TDLambdaEstimate = deepcopy(TDLambdaEstimator) +TDLambdaEstimate = type( + "TDLambdaEstimate", TDLambdaEstimator.__bases__, dict(TDLambdaEstimator.__dict__) +) _deprecate_class(TDLambdaEstimate, TDLambdaEstimator) diff --git a/torchrl/objectives/value/functional.py b/torchrl/objectives/value/functional.py index 534eb47306d..b7afeea8664 100644 --- a/torchrl/objectives/value/functional.py +++ b/torchrl/objectives/value/functional.py @@ -10,15 +10,24 @@ __all__ = [ "generalized_advantage_estimate", "vec_generalized_advantage_estimate", - "vec_td_lambda_return_estimate", - "vec_td_lambda_advantage_estimate", + "td0_advantage_estimate", + "td0_return_estimate", + "td1_return_estimate", + "vec_td1_return_estimate", + "td1_advantage_estimate", + "vec_td1_advantage_estimate", "td_lambda_return_estimate", + "vec_td_lambda_return_estimate", "td_lambda_advantage_estimate", - "td_advantage_estimate", + "vec_td_lambda_advantage_estimate", ] from torchrl.objectives.value.utils import _custom_conv1d, _make_gammas_tensor +######################################################################## +# GAE +# --- + def generalized_advantage_estimate( gamma: float, @@ -28,7 +37,7 @@ def generalized_advantage_estimate( reward: torch.Tensor, done: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: - """Get generalized advantage estimate of a trajectory. + """Generalized advantage estimate of a trajectory. Refer to "HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION" https://arxiv.org/pdf/1506.02438.pdf for more context. @@ -37,13 +46,14 @@ def generalized_advantage_estimate( gamma (scalar): exponential mean discount. lmbda (scalar): trajectory discount. state_value (Tensor): value function result with old_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor next_state_value (Tensor): value function result with new_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): raise RuntimeError( @@ -84,7 +94,7 @@ def vec_generalized_advantage_estimate( reward: torch.Tensor, done: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: - """Get generalized advantage estimate of a trajectory. + """Vectorized Generalized advantage estimate of a trajectory. 
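For reference, the recursion these two functions implement (Schulman et al., 2015) can be reproduced in a few lines of plain PyTorch under the ``[*Batch x TimeSteps x F]`` convention documented above. The helper below is an illustration only and is not part of the patch or of the library.

import torch

def gae_reference(gamma, lmbda, state_value, next_state_value, reward, done):
    # All tensors are expected to have shape [*B, T, 1].
    not_done = 1.0 - done.to(state_value.dtype)
    # one-step temporal-difference error
    delta = reward + gamma * not_done * next_state_value - state_value
    advantage = torch.zeros_like(state_value)
    running = torch.zeros_like(state_value[..., -1, :])
    T = state_value.shape[-2]
    for t in reversed(range(T)):
        # A_t = delta_t + gamma * lambda * A_{t+1}, reset at episode ends
        running = delta[..., t, :] + gamma * lmbda * not_done[..., t, :] * running
        advantage[..., t, :] = running
    value_target = advantage + state_value
    return advantage, value_target

# toy check on a [1, 5, 1] trajectory
reward = torch.randn(1, 5, 1)
done = torch.zeros(1, 5, 1, dtype=torch.bool)
state_value = torch.randn(1, 5, 1)
next_state_value = torch.randn(1, 5, 1)
adv, target = gae_reference(0.99, 0.95, state_value, next_state_value, reward, done)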
Refer to "HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION" https://arxiv.org/pdf/1506.02438.pdf for more context. @@ -93,13 +103,14 @@ def vec_generalized_advantage_estimate( gamma (scalar): exponential mean discount. lmbda (scalar): trajectory discount. state_value (Tensor): value function result with old_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor next_state_value (Tensor): value function result with new_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): raise RuntimeError( @@ -148,61 +159,90 @@ def vec_generalized_advantage_estimate( return advantage, value_target -def td_advantage_estimate( +######################################################################## +# TD(0) +# ----- + + +def td0_advantage_estimate( gamma: float, state_value: torch.Tensor, next_state_value: torch.Tensor, reward: torch.Tensor, done: torch.Tensor, ) -> Tuple[torch.Tensor, torch.Tensor]: - """Get generalized advantage estimate of a trajectory. + """TD(0) advantage estimate of a trajectory. - Refer to "HIGH-DIMENSIONAL CONTINUOUS CONTROL USING GENERALIZED ADVANTAGE ESTIMATION" - https://arxiv.org/pdf/1506.02438.pdf for more context. + Also known as bootstrapped Temporal Difference or one-step return. Args: gamma (scalar): exponential mean discount. state_value (Tensor): value function result with old_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor next_state_value (Tensor): value function result with new_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): raise RuntimeError( "All input tensors (value, reward and done states) must share a unique shape." ) - for tensor in (next_state_value, state_value, reward, done): - if tensor.shape[-1] != 1: - raise RuntimeError( - "Last dimension of generalized_advantage_estimate inputs must be a singleton dimension." - ) not_done = 1 - done.to(next_state_value.dtype) advantage = reward + gamma * not_done * next_state_value - state_value return advantage -def td_lambda_return_estimate( +def td0_return_estimate( gamma: float, - lmbda: float, next_state_value: torch.Tensor, reward: torch.Tensor, done: torch.Tensor, - rolling_gamma: bool = None, -) -> torch.Tensor: - """TD(lambda) return estimate. +) -> Tuple[torch.Tensor, torch.Tensor]: + """TD(0) discounted return estimate of a trajectory. + + Also known as bootstrapped Temporal Difference or one-step return. Args: gamma (scalar): exponential mean discount. - lmbda (scalar): trajectory discount. next_state_value (Tensor): value function result with new_state input. 
must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. + + """ + if not (next_state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) + not_done = 1 - done.to(next_state_value.dtype) + advantage = reward + gamma * not_done * next_state_value + return advantage + + +######################################################################## +# TD(1) +# ---------- + + +def td1_return_estimate( + gamma: float, + next_state_value: torch.Tensor, + reward: torch.Tensor, + done: torch.Tensor, + rolling_gamma: bool = None, +) -> torch.Tensor: + r"""TD(1) return estimate. + + Args: + gamma (scalar): exponential mean discount. + next_state_value (Tensor): value function result with new_state input. + reward (Tensor): reward of taking actions in the environment. + done (Tensor): boolean flag for end of episode. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: gamma = [g1, g2, g3, g4] @@ -225,84 +265,65 @@ def td_lambda_return_estimate( ] Default is True. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == reward.shape == done.shape): raise RuntimeError( "All input tensors (value, reward and done states) must share a unique shape." ) - for tensor in (next_state_value, reward, done): - if tensor.shape[-1] != 1: - raise RuntimeError( - "Last dimension of generalized_advantage_estimate inputs must be a singleton dimension." 
- ) not_done = 1 - done.to(next_state_value.dtype) returns = torch.empty_like(next_state_value) T = returns.shape[-2] - # if gamma is not a tensor of the same shape as other inputs, we use rolling_gamma = True single_gamma = False if not (isinstance(gamma, torch.Tensor) and gamma.shape == not_done.shape): single_gamma = True gamma = torch.full_like(next_state_value, gamma) - single_lambda = False - if not (isinstance(lmbda, torch.Tensor) and lmbda.shape == not_done.shape): - single_lambda = True - lmbda = torch.full_like(next_state_value, lmbda) - if rolling_gamma is None: rolling_gamma = True - elif not rolling_gamma and single_gamma and single_lambda: + elif not rolling_gamma and single_gamma: raise RuntimeError( - "rolling_gamma=False is expected only with time-sensitive gamma or lambda values" + "rolling_gamma=False is expected only with time-sensitive gamma values" ) if rolling_gamma: gamma = gamma * not_done g = next_state_value[..., -1, :] for i in reversed(range(T)): - g = returns[..., i, :] = reward[..., i, :] + gamma[..., i, :] * ( - (1 - lmbda[..., i, :]) * next_state_value[..., i, :] - + lmbda[..., i, :] * g - ) + g = returns[..., i, :] = reward[..., i, :] + gamma[..., i, :] * g else: for k in range(T): g = next_state_value[..., -1, :] _gamma = gamma[..., k, :] - _lambda = lmbda[..., k, :] nd = not_done _gamma = _gamma.unsqueeze(-2) * nd for i in reversed(range(k, T)): - g = reward[..., i, :] + _gamma[..., i, :] * ( - (1 - _lambda) * next_state_value[..., i, :] + _lambda * g - ) + g = reward[..., i, :] + _gamma[..., i, :] * g returns[..., k, :] = g - return returns -def td_lambda_advantage_estimate( +def td1_advantage_estimate( gamma: float, - lmbda: float, state_value: torch.Tensor, next_state_value: torch.Tensor, reward: torch.Tensor, done: torch.Tensor, rolling_gamma: bool = None, ) -> torch.Tensor: - """TD(lambda) advantage estimate. + """TD(1) advantage estimate. Args: gamma (scalar): exponential mean discount. - lmbda (scalar): trajectory discount. state_value (Tensor): value function result with old_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor next_state_value (Tensor): value function result with new_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: @@ -326,6 +347,10 @@ def td_lambda_advantage_estimate( ] Default is True. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): raise RuntimeError( @@ -333,34 +358,73 @@ def td_lambda_advantage_estimate( ) if not state_value.shape == next_state_value.shape: raise RuntimeError("shape of state_value and next_state_value must match") - returns = td_lambda_return_estimate( - gamma, lmbda, next_state_value, reward, done, rolling_gamma - ) + returns = td1_return_estimate(gamma, next_state_value, reward, done, rolling_gamma) advantage = returns - state_value return advantage -def vec_td_lambda_advantage_estimate( +def vec_td1_return_estimate( + gamma, next_state_value, reward, done, rolling_gamma: Optional[bool] = None +): + """Vectorized TD(1) return estimate. 
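The TD(1) return introduced above is the discounted reward-to-go, bootstrapped with the value of the last next-state and cut at episode boundaries. A minimal plain-PyTorch sketch, with an illustrative helper name that is not part of the patch, reads as follows.

import torch

def td1_return_reference(gamma, next_state_value, reward, done):
    # Shapes follow the [*B, T, F] convention used in this file.
    not_done = 1.0 - done.to(next_state_value.dtype)
    returns = torch.empty_like(next_state_value)
    # bootstrap with the value of the very last "next" observation
    g = next_state_value[..., -1, :]
    for t in reversed(range(reward.shape[-2])):
        g = reward[..., t, :] + gamma * not_done[..., t, :] * g
        returns[..., t, :] = g
    return returns

# the TD(1) advantage is then simply the excess return over the current value:
# advantage = td1_return_reference(gamma, next_state_value, reward, done) - state_value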
+ + Args: + gamma (scalar, Tensor): exponential mean discount. If tensor-valued, + next_state_value (Tensor): value function result with new_state input. + reward (Tensor): reward of taking actions in the environment. + done (Tensor): boolean flag for end of episode. + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma + if a gamma tensor is tied to a single event: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + v2 + g2 v3 + g2 g3 v4, + v3 + g3 v4, + v4, + ] + if False, it is assumed that each gamma is tied to the upcoming + trajectory: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1**2 v3 + g**3 v4, + v2 + g2 v3 + g2**2 v4, + v3 + g3 v4, + v4, + ] + Default is True. + + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + + """ + return vec_td_lambda_return_estimate( + gamma=gamma, + next_state_value=next_state_value, + reward=reward, + done=done, + rolling_gamma=rolling_gamma, + lmbda=1, + ) + + +def vec_td1_advantage_estimate( gamma, - lmbda, state_value, next_state_value, reward, done, rolling_gamma: bool = None, ): - """Vectorized TD(lambda) advantage estimate. + """Vectorized TD(1) advantage estimate. Args: gamma (scalar, Tensor): exponential mean discount. If tensor-valued, - must be a [Batch x TimeSteps x 1] tensor. - lmbda (scalar): trajectory discount. state_value (Tensor): value function result with old_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor next_state_value (Tensor): value function result with new_state input. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor reward (Tensor): reward of taking actions in the environment. - must be a [Batch x TimeSteps x 1] or [Batch x TimeSteps] tensor done (Tensor): boolean flag for end of episode. rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma if a gamma tensor is tied to a single event: @@ -384,23 +448,188 @@ def vec_td_lambda_advantage_estimate( ] Default is True. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): raise RuntimeError( "All input tensors (value, reward and done states) must share a unique shape." ) return ( - vec_td_lambda_return_estimate( - gamma, lmbda, next_state_value, reward, done, rolling_gamma - ) + vec_td1_return_estimate(gamma, next_state_value, reward, done, rolling_gamma) - state_value ) +######################################################################## +# TD(lambda) +# ---------- + + +def td_lambda_return_estimate( + gamma: float, + lmbda: float, + next_state_value: torch.Tensor, + reward: torch.Tensor, + done: torch.Tensor, + rolling_gamma: bool = None, +) -> torch.Tensor: + r"""TD(:math:`\lambda`) return estimate. + + Args: + gamma (scalar): exponential mean discount. + lmbda (scalar): trajectory discount. + next_state_value (Tensor): value function result with new_state input. + reward (Tensor): reward of taking actions in the environment. + done (Tensor): boolean flag for end of episode. 
+ rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma + if a gamma tensor is tied to a single event: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + v2 + g2 v3 + g2 g3 v4, + v3 + g3 v4, + v4, + ] + if False, it is assumed that each gamma is tied to the upcoming + trajectory: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1**2 v3 + g**3 v4, + v2 + g2 v3 + g2**2 v4, + v3 + g3 v4, + v4, + ] + Default is True. + + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + + """ + if not (next_state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) + for tensor in (next_state_value, reward, done): + if tensor.shape[-1] != 1: + raise RuntimeError( + "Last dimension of generalized_advantage_estimate inputs must be a singleton dimension." + ) + not_done = 1 - done.to(next_state_value.dtype) + + returns = torch.empty_like(next_state_value) + + T = returns.shape[-2] + + # if gamma is not a tensor of the same shape as other inputs, we use rolling_gamma = True + single_gamma = False + if not (isinstance(gamma, torch.Tensor) and gamma.shape == not_done.shape): + single_gamma = True + gamma = torch.full_like(next_state_value, gamma) + + single_lambda = False + if not (isinstance(lmbda, torch.Tensor) and lmbda.shape == not_done.shape): + single_lambda = True + lmbda = torch.full_like(next_state_value, lmbda) + + if rolling_gamma is None: + rolling_gamma = True + elif not rolling_gamma and single_gamma and single_lambda: + raise RuntimeError( + "rolling_gamma=False is expected only with time-sensitive gamma or lambda values" + ) + + if rolling_gamma: + gamma = gamma * not_done + g = next_state_value[..., -1, :] + for i in reversed(range(T)): + g = returns[..., i, :] = reward[..., i, :] + gamma[..., i, :] * ( + (1 - lmbda[..., i, :]) * next_state_value[..., i, :] + + lmbda[..., i, :] * g + ) + else: + for k in range(T): + g = next_state_value[..., -1, :] + _gamma = gamma[..., k, :] + _lambda = lmbda[..., k, :] + nd = not_done + _gamma = _gamma.unsqueeze(-2) * nd + for i in reversed(range(k, T)): + g = reward[..., i, :] + _gamma[..., i, :] * ( + (1 - _lambda) * next_state_value[..., i, :] + _lambda * g + ) + returns[..., k, :] = g + + return returns + + +def td_lambda_advantage_estimate( + gamma: float, + lmbda: float, + state_value: torch.Tensor, + next_state_value: torch.Tensor, + reward: torch.Tensor, + done: torch.Tensor, + rolling_gamma: bool = None, +) -> torch.Tensor: + r"""TD(:math:`\lambda`) advantage estimate. + + Args: + gamma (scalar): exponential mean discount. + lmbda (scalar): trajectory discount. + state_value (Tensor): value function result with old_state input. + next_state_value (Tensor): value function result with new_state input. + reward (Tensor): reward of taking actions in the environment. + done (Tensor): boolean flag for end of episode. 
+ rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma + if a gamma tensor is tied to a single event: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + v2 + g2 v3 + g2 g3 v4, + v3 + g3 v4, + v4, + ] + if False, it is assumed that each gamma is tied to the upcoming + trajectory: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1**2 v3 + g**3 v4, + v2 + g2 v3 + g2**2 v4, + v3 + g3 v4, + v4, + ] + Default is True. + + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + + """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." + ) + if not state_value.shape == next_state_value.shape: + raise RuntimeError("shape of state_value and next_state_value must match") + returns = td_lambda_return_estimate( + gamma, lmbda, next_state_value, reward, done, rolling_gamma + ) + advantage = returns - state_value + return advantage + + def vec_td_lambda_return_estimate( gamma, lmbda, next_state_value, reward, done, rolling_gamma: Optional[bool] = None ): - """Vectorized TD(lambda) return estimate. + r"""Vectorized TD(:math:`\lambda`) return estimate. Args: gamma (scalar, Tensor): exponential mean discount. If tensor-valued, @@ -433,6 +662,10 @@ def vec_td_lambda_return_estimate( ] Default is True. + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + """ if not (next_state_value.shape == reward.shape == done.shape): raise RuntimeError( @@ -538,3 +771,60 @@ def vec_td_lambda_return_estimate( v3[..., :-1] = 0 v3 = _custom_conv1d(v3, dec * (gammas * lambdas).transpose(1, 2)) return (v1 + v2 + v3).view(shape) + + +def vec_td_lambda_advantage_estimate( + gamma, + lmbda, + state_value, + next_state_value, + reward, + done, + rolling_gamma: bool = None, +): + r"""Vectorized TD(:math:`\lambda`) advantage estimate. + + Args: + gamma (scalar, Tensor): exponential mean discount. If tensor-valued, + lmbda (scalar): trajectory discount. + state_value (Tensor): value function result with old_state input. + next_state_value (Tensor): value function result with new_state input. + reward (Tensor): reward of taking actions in the environment. + done (Tensor): boolean flag for end of episode. + rolling_gamma (bool, optional): if ``True``, it is assumed that each gamma + if a gamma tensor is tied to a single event: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1 g2 v3 + g1 g2 g3 v4, + v2 + g2 v3 + g2 g3 v4, + v3 + g3 v4, + v4, + ] + if False, it is assumed that each gamma is tied to the upcoming + trajectory: + gamma = [g1, g2, g3, g4] + value = [v1, v2, v3, v4] + return = [ + v1 + g1 v2 + g1**2 v3 + g**3 v4, + v2 + g2 v3 + g2**2 v4, + v3 + g3 v4, + v4, + ] + Default is True. + + All tensors (values, reward and done) must have shape + ``[*Batch x TimeSteps x F]``, with ``F`` features (for single agent, + single task, single objective F=1). + + """ + if not (next_state_value.shape == state_value.shape == reward.shape == done.shape): + raise RuntimeError( + "All input tensors (value, reward and done states) must share a unique shape." 
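The TD(lambda) recursion above interpolates between the bootstrapped TD(0) target (lambda=0) and the TD(1) return (lambda=1). A small plain-PyTorch sketch, with an illustrative helper name and not part of the patch, makes the lambda=0 limit explicit.

import torch

def td_lambda_return_reference(gamma, lmbda, next_state_value, reward, done):
    # Recursive form of the TD(lambda) return:
    #     G_t = r_t + gamma * [(1 - lambda) * V(s_{t+1}) + lambda * G_{t+1}]
    not_done = 1.0 - done.to(next_state_value.dtype)
    returns = torch.empty_like(next_state_value)
    g = next_state_value[..., -1, :]
    for t in reversed(range(reward.shape[-2])):
        g = reward[..., t, :] + gamma * not_done[..., t, :] * (
            (1 - lmbda) * next_state_value[..., t, :] + lmbda * g
        )
        returns[..., t, :] = g
    return returns

# lambda = 0 collapses to the bootstrapped TD(0) target r + gamma * V(s'):
reward = torch.randn(1, 4, 1)
done = torch.zeros(1, 4, 1, dtype=torch.bool)
next_value = torch.randn(1, 4, 1)
assert torch.allclose(
    td_lambda_return_reference(0.99, 0.0, next_value, reward, done),
    reward + 0.99 * next_value,
)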
+ ) + return ( + vec_td_lambda_return_estimate( + gamma, lmbda, next_state_value, reward, done, rolling_gamma + ) + - state_value + ) From 400cfd13850ae455f42d10d5ec2bce6342ceb98f Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 14:56:33 +0100 Subject: [PATCH 24/89] amend --- examples/a2c/a2c.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/a2c/a2c.py b/examples/a2c/a2c.py index 2ce1a32336b..7d821abea04 100644 --- a/examples/a2c/a2c.py +++ b/examples/a2c/a2c.py @@ -10,7 +10,7 @@ from hydra.core.config_store import ConfigStore from torchrl.envs.transforms import RewardScaling from torchrl.envs.utils import set_exploration_mode -from torchrl.objectives.value import TDEstimate +from torchrl.objectives.value import TD0Estimate from torchrl.record.loggers import generate_exp_name, get_logger from torchrl.trainers.helpers.collectors import ( make_collector_onpolicy, @@ -144,7 +144,7 @@ def main(cfg: "DictConfig"): # noqa: F821 ) critic_model = model.get_value_operator() - advantage = TDEstimate( + advantage = TD0Estimate( cfg.gamma, value_network=critic_model, average_rewards=True, From 64768b07ad0a7a35aa5c2b2c304aa7d4dc7121e4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 15:27:59 +0100 Subject: [PATCH 25/89] amend --- torchrl/objectives/value/advantages.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index e3dada133c2..6fe2ed7b0ca 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -16,10 +16,10 @@ from torchrl.objectives.utils import hold_out_net from torchrl.objectives.value.functional import ( - td_lambda_advantage_estimate, + td_lambda_return_estimate, vec_generalized_advantage_estimate, - vec_td1_advantage_estimate, - vec_td_lambda_advantage_estimate, + vec_td1_return_estimate, + vec_td_lambda_return_estimate, ) @@ -472,9 +472,7 @@ def value_estimate( next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) - value_target = vec_td1_advantage_estimate( - gamma, torch.zeros_like(next_value), next_value, reward, done - ) + value_target = vec_td1_return_estimate(gamma, next_value, reward, done) return value_target @@ -665,13 +663,9 @@ def value_estimate( done = tensordict.get(("next", "done")) if self.vectorized: - val = vec_td_lambda_advantage_estimate( - gamma, lmbda, torch.zeros_like(next_value), next_value, reward, done - ) + val = vec_td_lambda_return_estimate(gamma, lmbda, next_value, reward, done) else: - val = td_lambda_advantage_estimate( - gamma, lmbda, torch.zeros_like(next_value), next_value, reward, done - ) + val = td_lambda_return_estimate(gamma, lmbda, next_value, reward, done) return val From f5550df8b2ca5594bca996e4c5189119bb341eee Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 16:14:29 +0100 Subject: [PATCH 26/89] amend --- examples/a2c/a2c.py | 7 ++-- examples/ppo/ppo.py | 3 +- torchrl/objectives/value/advantages.py | 51 ++++++++++++++++++-------- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/examples/a2c/a2c.py b/examples/a2c/a2c.py index 7d821abea04..0a2783127cf 100644 --- a/examples/a2c/a2c.py +++ b/examples/a2c/a2c.py @@ -10,7 +10,7 @@ from hydra.core.config_store import ConfigStore from torchrl.envs.transforms import RewardScaling from torchrl.envs.utils import set_exploration_mode -from torchrl.objectives.value import TD0Estimate +from torchrl.objectives.value import TD0Estimator from 
torchrl.record.loggers import generate_exp_name, get_logger from torchrl.trainers.helpers.collectors import ( make_collector_onpolicy, @@ -144,14 +144,15 @@ def main(cfg: "DictConfig"): # noqa: F821 ) critic_model = model.get_value_operator() - advantage = TD0Estimate( + advantage = TD0Estimator( cfg.gamma, value_network=critic_model, average_rewards=True, + differentiable=True, ) trainer.register_op( "process_optim_batch", - advantage, + torch.no_grad()(advantage), ) final_seed = collector.set_seed(cfg.seed) diff --git a/examples/ppo/ppo.py b/examples/ppo/ppo.py index 3c9bb8e4a1e..e7ce860f173 100644 --- a/examples/ppo/ppo.py +++ b/examples/ppo/ppo.py @@ -168,10 +168,11 @@ def main(cfg: "DictConfig"): # noqa: F821 cfg.lmbda, value_network=critic_model, average_gae=True, + differentiable=True, ) trainer.register_op( "process_optim_batch", - lambda tensordict: advantage(tensordict.to(device)), + lambda tensordict: torch.no_grad()(advantage(tensordict.to(device))), ) trainer._process_optim_batch_ops = [ trainer._process_optim_batch_ops[-1], diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 6fe2ed7b0ca..a6081a71b8c 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -46,6 +46,10 @@ class ValueEstimatorBase(nn.Module): value_network: Union[TensorDictModule, Callable] value_key: Union[Tuple[str], str] + DIFF_DEPREC_MSG = "differentiable=False will soon be deprecated and all value computations will be made" \ + "differentiable. " \ + "Consider using differentiable=True and " \ + "decorate your function with `torch.no_grad()` or pass detached functional parameters." @abc.abstractmethod def forward( @@ -54,7 +58,7 @@ def forward( params: Optional[TensorDictBase] = None, target_params: Optional[TensorDictBase] = None, ) -> TensorDictBase: - """Computes the a value estimate given the data in tensordict. + """Computes the advantage estimate given the data in tensordict. If a functional module is provided, a nested TensorDict containing the parameters (and if relevant the target parameters) can be passed to the module. @@ -123,8 +127,12 @@ class TD0Estimator(ValueEstimatorBase): the value estimates. average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if ``True``, gradients are propagated throught + differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: + The proper way to make the function call non-differentiable is to + decorate it in a `torch.no_grad()` context manager/decorator or + pass detached parameters for functional modules. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. @@ -155,6 +163,8 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable + if not differentiable: + warnings.warn(self.DIFF_DEPREC_MSG) self.value_key = value_key if ( hasattr(value_network, "out_keys") @@ -187,7 +197,7 @@ def forward( params: Optional[TensorDictBase] = None, target_params: Optional[TensorDictBase] = None, ) -> TensorDictBase: - """Computes the TDEstimate given the data in tensordict. + """Computes the TD(0) advantage given the data in tensordict. 
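The deprecation message introduced here points to two equivalent patterns: disabling gradients at call time, or passing detached functional parameters. A minimal sketch of both follows, where ``compute_advantage_no_grad`` and ``advantage_module`` are illustrative names.

import torch

def compute_advantage_no_grad(advantage_module, tensordict):
    # ``advantage_module`` stands for any value estimator built with
    # ``differentiable=True`` (e.g. GAE or TD0Estimator); running it under
    # torch.no_grad() keeps the estimate out of the autograd graph, as the
    # deprecation message above recommends.
    with torch.no_grad():
        return advantage_module(tensordict)

# For functional modules, the alternative is to pass detached parameters instead:
#     advantage_module(tensordict, params=params.detach(), target_params=target_params)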
If a functional module is provided, a nested TensorDict containing the parameters (and if relevant the target parameters) can be passed to the module. @@ -214,7 +224,6 @@ def forward( >>> module = TDEstimate( ... gamma=0.98, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -232,7 +241,6 @@ def forward( >>> module = TDEstimate( ... gamma=0.98, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -304,8 +312,12 @@ class TD1Estimator(ValueEstimatorBase): value_network (TensorDictModule): value operator used to retrieve the value estimates. average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if ``True``, gradients are propagated throught + differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: + The proper way to make the function call non-differentiable is to + decorate it in a `torch.no_grad()` context manager/decorator or + pass detached parameters for functional modules. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. @@ -336,6 +348,8 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable + if not differentiable: + warnings.warn(self.DIFF_DEPREC_MSG) self.value_key = value_key if ( hasattr(value_network, "out_keys") @@ -367,7 +381,7 @@ def forward( params: Optional[TensorDictBase] = None, target_params: Optional[TensorDictBase] = None, ) -> TensorDictBase: - """Computes the TDEstimate given the data in tensordict. + """Computes the TD(1) advantage given the data in tensordict. If a functional module is provided, a nested TensorDict containing the parameters (and if relevant the target parameters) can be passed to the module. @@ -394,7 +408,6 @@ def forward( >>> module = TDEstimate( ... gamma=0.98, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -412,7 +425,6 @@ def forward( >>> module = TDEstimate( ... gamma=0.98, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -485,8 +497,12 @@ class TDLambdaEstimator(ValueEstimatorBase): value_network (TensorDictModule): value operator used to retrieve the value estimates. average_rewards (bool, optional): if ``True``, rewards will be standardized before the TD is computed. - differentiable (bool, optional): if ``True``, gradients are propagated throught + differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: + The proper way to make the function call non-differentiable is to + decorate it in a `torch.no_grad()` context manager/decorator or + pass detached parameters for functional modules. vectorized (bool, optional): whether to use the vectorized version of the lambda return. Default is `True`. advantage_key (str or tuple of str, optional): the key of the advantage entry. 
@@ -523,6 +539,8 @@ def __init__( self.average_rewards = average_rewards self.differentiable = differentiable + if not differentiable: + warnings.warn(self.DIFF_DEPREC_MSG) self.value_key = value_key if ( hasattr(value_network, "out_keys") @@ -554,7 +572,7 @@ def forward( params: Optional[List[Tensor]] = None, target_params: Optional[List[Tensor]] = None, ) -> TensorDictBase: - """Computes the TDLambdaEstimate given the data in tensordict. + r"""Computes the TD(:math:`\lambda`) advantage given the data in tensordict. If a functional module is provided, a nested TensorDict containing the parameters (and if relevant the target parameters) can be passed to the module. @@ -582,7 +600,6 @@ def forward( ... gamma=0.98, ... lmbda=0.94, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -601,7 +618,6 @@ def forward( ... gamma=0.98, ... lmbda=0.94, ... value_network=value_net, - ... differentiable=False, ... ) >>> obs, next_obs = torch.randn(2, 1, 10, 3) >>> reward = torch.randn(1, 10, 1) @@ -681,8 +697,12 @@ class GAE(ValueEstimatorBase): value_network (TensorDictModule): value operator used to retrieve the value estimates. average_gae (bool): if ``True``, the resulting GAE values will be standardized. Default is ``False``. - differentiable (bool, optional): if ``True``, gradients are propagated throught + differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: + The proper way to make the function call non-differentiable is to + decorate it in a `torch.no_grad()` context manager/decorator or + pass detached parameters for functional modules. advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. @@ -730,7 +750,8 @@ def __init__( self.average_gae = average_gae self.differentiable = differentiable - + if not differentiable: + warnings.warn(self.DIFF_DEPREC_MSG) self.advantage_key = advantage_key self.value_target_key = value_target_key From 218ab1ae63fc4d488787e15cc267255488b09683 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 16:19:59 +0100 Subject: [PATCH 27/89] amend --- examples/a2c/a2c.py | 2 +- torchrl/objectives/value/advantages.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/a2c/a2c.py b/examples/a2c/a2c.py index 0a2783127cf..f6d3de8b29d 100644 --- a/examples/a2c/a2c.py +++ b/examples/a2c/a2c.py @@ -145,7 +145,7 @@ def main(cfg: "DictConfig"): # noqa: F821 critic_model = model.get_value_operator() advantage = TD0Estimator( - cfg.gamma, + gamma=cfg.gamma, value_network=critic_model, average_rewards=True, differentiable=True, diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index a6081a71b8c..f3354d59a34 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -46,10 +46,12 @@ class ValueEstimatorBase(nn.Module): value_network: Union[TensorDictModule, Callable] value_key: Union[Tuple[str], str] - DIFF_DEPREC_MSG = "differentiable=False will soon be deprecated and all value computations will be made" \ - "differentiable. " \ - "Consider using differentiable=True and " \ - "decorate your function with `torch.no_grad()` or pass detached functional parameters." 
+ DIFF_DEPREC_MSG = ( + "differentiable=False will soon be deprecated and all value computations will be made" + "differentiable. " + "Consider using differentiable=True and " + "decorate your function with `torch.no_grad()` or pass detached functional parameters." + ) @abc.abstractmethod def forward( From 6cdcc8e24b50673583488d5971b56c26625bff52 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 16:20:18 +0100 Subject: [PATCH 28/89] amend --- examples/ppo/ppo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ppo/ppo.py b/examples/ppo/ppo.py index e7ce860f173..0a7d8b60315 100644 --- a/examples/ppo/ppo.py +++ b/examples/ppo/ppo.py @@ -164,8 +164,8 @@ def main(cfg: "DictConfig"): # noqa: F821 critic_model = model.get_value_operator() advantage = GAE( - cfg.gamma, - cfg.lmbda, + gamma=cfg.gamma, + lmbda=cfg.lmbda, value_network=critic_model, average_gae=True, differentiable=True, From b47dee21b1206e74178e5b642c7bd1bae9747e1f Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 16:53:41 +0100 Subject: [PATCH 29/89] amend --- torchrl/objectives/value/advantages.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index f3354d59a34..0f7c0a2feea 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -16,6 +16,7 @@ from torchrl.objectives.utils import hold_out_net from torchrl.objectives.value.functional import ( + td0_return_estimate, td_lambda_return_estimate, vec_generalized_advantage_estimate, vec_td1_return_estimate, @@ -302,7 +303,9 @@ def value_estimate( next_value = step_td.get(self.value_key) done = tensordict.get(("next", "done")) - value_target = reward + gamma * (1 - done.to(reward.dtype)) * next_value + value_target = td0_return_estimate( + gamma=gamma, next_state_value=next_value, reward=reward, done=done + ) return value_target From e9bb239a33c8ad7a254e1c5d9078525d86bd4b25 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 17:31:13 +0100 Subject: [PATCH 30/89] amend --- docs/source/reference/objectives.rst | 6 +- tutorials/sphinx-tutorials/coding_ddpg.py | 149 ++++++++++++++-------- 2 files changed, 98 insertions(+), 57 deletions(-) diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index ba91adc2f5e..1eb9d17bb16 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -16,13 +16,15 @@ The main characteristics of TorchRL losses are: method will receive a tensordict as input that contains all the necessary information to return a loss value. - They output a :class:`tensordict.TensorDict` instance with the loss values - written under a ``"loss_`` where ``smth`` is a string describing the + written under a ``"loss_"`` where ``smth`` is a string describing the loss. Additional keys in the tensordict may be useful metrics to log during training time. .. note:: The reason we return independent losses is to let the user use a different optimizer for different sets of parameters for instance. Summing the losses - can be simply done via ``sum(loss for key, loss in loss_vals.items() if key.startswith("loss_")``. 
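Putting the conventions above together, a generic training step only needs the ``loss_*`` entries of the output tensordict. The function below is a sketch under the assumption that ``loss_module``, ``replay_buffer`` and ``optimizer`` are any TorchRL loss, replay buffer and torch optimizer wired as described in the documentation text above.

def training_step(loss_module, replay_buffer, optimizer):
    data = replay_buffer.sample()          # a TensorDict of transitions
    loss_dict = loss_module(data)          # TensorDict with "loss_*" entries
    # sum only the loss values; other keys are metrics to log
    loss_val = sum(
        value for key, value in loss_dict.items() if key.startswith("loss_")
    )
    loss_val.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss_dict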
+ can be simply done via + + >>> loss_val = sum(loss for key, loss in loss_vals.items() if key.startswith("loss_")) Training value functions ------------------------ diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index c35bd87c41e..b39596914e7 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -25,7 +25,8 @@ # # Key learnings: # -# - how to build an environment in TorchRL, including transforms +# - how to write a loss module and customize its value estimator; +# - how to build an environment in torchrl, including transforms # (e.g. data normalization) and parallel execution; # - how to design a policy and value network; # - how to collect data from your environment efficiently and store them @@ -34,18 +35,17 @@ # - and finally how to evaluate your model. # # This tutorial assumes that you have completed the PPO tutorial which gives -# an overview of the TorchRL components. -# -# -# This tutorial assumes the reader is familiar with some of TorchRL primitives, -# such as :class:`tensordict.TensorDict` and -# :class:`tensordict.nn.TensorDictModules`, although it should be +# an overview of the torchrl components and dependencies, such as +# :class:`tensordict.TensorDict` and :class:`tensordict.nn.TensorDictModules`, +# although it should be # sufficiently transparent to be understood without a deep understanding of # these classes. # -# We do not aim at giving a SOTA implementation of the algorithm, but rather -# to provide a high-level illustration of TorchRL features in the context of -# this algorithm. +# .. note:: +# We do not aim at giving a SOTA implementation of the algorithm, but rather +# to provide a high-level illustration of torchrl's loss implementations +# and the library features that are to be used in the context of +# this algorithm. # # Imports # ------- @@ -100,56 +100,90 @@ from torchrl.trainers import Recorder ############################################################################### -# TorchRL LossModule -# ------------------ +# torchrl :class:`torchrl.objectives.LossModule` +# ---------------------------------------------- +# +# TorchRL provides a series of losses to use in your training scripts. +# The aim is to have losses that are easily reusable/swappable and that have +# a simple signature. +# +# The main characteristics of TorchRL losses are: +# +# - they are stateful objects: they contain a copy of the trainable parameters +# such that ``loss_module.parameters()`` gives whatever is needed to train the +# algorithm. +# - They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` +# method will receive a tensordict as input that contains all the necessary +# information to return a loss value. +# +# >>> data = replay_buffer.sample() +# >>> loss_dict = loss_module(data) +# +# - They output a :class:`tensordict.TensorDict` instance with the loss values +# written under a ``"loss_"`` where ``smth`` is a string describing the +# loss. Additional keys in the tensordict may be useful metrics to log during +# training time. +# .. note:: +# The reason we return independent losses is to let the user use a different +# optimizer for different sets of parameters for instance. Summing the losses +# can be simply done via +# +# >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) # # The ``__init__`` method # ~~~~~~~~~~~~~~~~~~~~~~~ # # The parent class of all losses is :class:`torchrl.objectives.LossModule`. 
-# As many other components of the library, its :meth:`__call__` method expects -# as input a :class:`tensordict.TensorDict` instance sampled from an expenrience -# replay buffer. Using this format makes it possible to re-use the module across +# As many other components of the library, its :meth:`torchrl.objectives.LossModule.forward` method expects +# as input a :class:`tensordict.TensorDict` instance sampled from an experience +# replay buffer, or any similar data structure. Using this format makes it +# possible to re-use the module across # modalities, or in complex settings where the model needs to read multiple -# entries for instance. +# entries for instance. In other words, it allows us to code a loss module that +# is oblivious to the data type that is being given to is and that focuses on +# running the elementary steps of the loss function and only those. # # To keep the tutorial as didactic as we can, we'll be displaying each method -# of the class independently and we'll be populating the class at a later stage. +# of the class independently and we'll be populating the class at a later +# stage. # -# Let us start with the :meth:`__init__` method. DDPG aims at a simple goal: +# Let us start with the :meth:`torchrl.objectives.LossModule.__init__` +# method. DDPG aims at solving a control task with a simple strategy: # training a policy to output actions that maximise the value predicted by # a value network. Hence, our loss module needs to receive two networks in its # constructor: an actor and a value networks. We expect both of these to be -# tensordict-compatible objects, such as :class:`tensordict.nn.TensorDictModule`. +# tensordict-compatible objects, such as +# :class:`tensordict.nn.TensorDictModule`. +# Our loss function will need to compute a target value and fit the value +# network to this, and generate an action and fit the policy such that its +# value estimate is maximised. # # The crucial step of the :meth:`LossModule.__init__` method is the call to -# :meth:`LossModule.convert_to_functional`. This method will extract the -# parameters from the module and convert it to a functional module. +# :meth:`torchrl.LossModule.convert_to_functional`. This method will extract +# the parameters from the module and convert it to a functional module. +# Strictly speaking, this is not necessary and one may perfectly code all +# the losses without it. However, we encourage its usage for the following +# reason. +# # The reason TorchRL does this is that RL algorithms often execute the same -# model with different sets of parameters, called "trainable" and "target" parameters. +# model with different sets of parameters, called "trainable" and "target" +# parameters. # The "trainable" parameters are those that the optimizer needs to fit. The # "target" parameters are usually a copy of the formers with some time lag -# (absolute or diluted through a moving average). These target parameters -# are used to compute the value associated with the next observation. -# One the advantages of using a set of target parameters for the value model -# that do not match exactly the current configuration is that they provide -# a pessimistic bound on the value function being computed. +# (absolute or diluted through a moving average). +# These target parameters are used to compute the value associated with the +# next observation. 
One the advantages of using a set of target parameters +# for the value model that do not match exactly the current configuration is +# that they provide a pessimistic bound on the value function being computed. # Pay attention to the ``create_target_params`` keyword argument below: this # argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` # method to create a set of target parameters in the loss module to be used # for target value computation. If this is set to ``False`` (see the actor network # for instance) the ``target_actor_network_params`` attribute will still be -# accessible but this will just return a detached version of the actor parameters. +# accessible but this will just return a **detached** version of the +# actor parameters. # -# Later, we will see how the target parameters should be updated in TorchRL. -# -# We also incorporate an advantage module. This will be used to compute the -# next state value using our value network. We'll see later in this tutorial -# how various advantage modules can be used. If none is provided, we'll -# be using the TD(lambda) method, which is usually preferable to TD(0). -# Notice that this choice makes it necessary that the tensordict provided -# has its last dimension representing the time span of the experiment (ie -# our replay buffer must be populated using non-flatten data). +# Later, we will see how the target parameters should be updated in torchrl. # @@ -157,7 +191,6 @@ def _init( self, actor_network: TensorDictModule, value_network: TensorDictModule, - advantage="td(lambda)", ) -> None: super(type(self), self).__init__() @@ -178,24 +211,29 @@ def _init( # Since the value we'll be using is based on the actor and value network, # we put them together in a single actor-critic container. actor_critic = ActorCriticWrapper(actor_network, value_network) - if advantage == "td(lambda)": - advantage_module = TDLambdaEstimate( - gamma=0.99, - lmbda=0.95, - value_network=actor_critic, - value_key="state_action_value", - ) - elif advantage == "td(0)": - advantage_module = TDEstimate( - gamma=0.99, value_network=actor_critic, value_key="state_action_value" - ) - else: - raise NotImplementedError("advantage must be one of 'td(lambda)' or 'td(0)'.") - self.advantage = advantage - self.advantage_module = advantage_module - self.loss_funtion = "l2" +############################################################################### +# The value estimator loss method +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In many RL algorithm, the value network (or Q-value network) is trained based +# on an empirical value estimate. This can be bootstrapped (TD(0), low +# variance, high bias), meaning +# that the target value is obtained using the next reward and nothing else, or +# a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole +# sequence of upcoming rewards will be used (high variance, low bias). An +# intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise +# bias and variance. +# TorchRL makes it easy to use one or the other estimator via the +# :class:`torchrl.objectives.utils.ValueEstimators` Enum class, which contains +# pointers to all the value estimators implemented. Let us define the default +# value function here. We will take the simplest version (TD(0)), and show later +# on how this can be changed. 
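Before moving to the estimator classes, the bootstrapped TD(0) target described above can be written down with plain tensors in a couple of lines; ``gamma``, ``reward``, ``next_done`` and ``next_value`` are stand-ins for the discount factor, the reward, the termination flag and the (target) value of the next state:

    import torch

    gamma = 0.99
    reward = torch.randn(10, 1)                      # r_t
    next_done = torch.zeros(10, 1, dtype=torch.bool)
    next_value = torch.randn(10, 1)                  # V(s_{t+1}) from the target network
    # TD(0): bootstrap from the next state only, zeroing out terminal states
    td0_target = reward + gamma * next_value * (~next_done).float()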
+ +from torchrl.objectives.utils import ValueEstimators + +default_value_estimator = ValueEstimators.TD0 ############################################################################### # The actor loss method @@ -321,6 +359,7 @@ def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: class DDPGLoss(LossModule): + default_value_estimator = default_value_estimator __init__ = _init forward = _forward loss_value = _loss_value @@ -563,7 +602,7 @@ def make_t_env(): # value network, trained to estimate the value of a state-action pair, and a # parametric actor that learns how to select actions that maximize this value. # -# Recall that building a torchrl module requires two steps: +# Recall that building a TorchRL module requires two steps: # # - writing the :class:`torch.nn.Module` that will be used as network, # - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the From 34469f2f45ab1b3ec47aeb4e51a46b7fbfa066a5 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 17:34:10 +0100 Subject: [PATCH 31/89] differentiable=True --- torchrl/objectives/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 250087a34a4..3daf5e70876 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -53,13 +53,13 @@ def default_value_kwargs(value_type: ValueEstimators): """ if value_type == ValueEstimators.TD1: - return {"gamma": 0.99} + return {"gamma": 0.99, "differentiable": True} elif value_type == ValueEstimators.TD0: - return {"gamma": 0.99} + return {"gamma": 0.99, "differentiable": True} elif value_type == ValueEstimators.GAE: - return {"gamma": 0.99, "lmbda": 0.95} + return {"gamma": 0.99, "lmbda": 0.95, "differentiable": True} elif value_type == ValueEstimators.TDLambda: - return {"gamma": 0.99, "lmbda": 0.95} + return {"gamma": 0.99, "lmbda": 0.95, "differentiable": True} else: raise NotImplementedError(f"Unknown value type {value_type}.") From aae2bbe6002dde8034d3378c4a344490978bdf58 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 17:37:20 +0100 Subject: [PATCH 32/89] differentiable=True --- torchrl/objectives/a2c.py | 2 +- torchrl/objectives/ppo.py | 6 +++--- torchrl/objectives/reinforce.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torchrl/objectives/a2c.py b/torchrl/objectives/a2c.py index 4b7c40c56c2..2ac6ba7a4ec 100644 --- a/torchrl/objectives/a2c.py +++ b/torchrl/objectives/a2c.py @@ -155,7 +155,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: if advantage is None: self.value_estimator( tensordict, - params=self.critic_params, + params=self.critic_params.detach(), target_params=self.target_critic_params, ) advantage = tensordict.get(self.advantage_key) diff --git a/torchrl/objectives/ppo.py b/torchrl/objectives/ppo.py index 638174d21a6..2b4e115d35b 100644 --- a/torchrl/objectives/ppo.py +++ b/torchrl/objectives/ppo.py @@ -192,7 +192,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: if advantage is None: self.value_estimator( tensordict, - params=self.critic_params, + params=self.critic_params.detach(), target_params=self.target_critic_params, ) advantage = tensordict.get(self.advantage_key) @@ -342,7 +342,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: if advantage is None: self.value_estimator( tensordict, - params=self.critic_params, + params=self.critic_params.detach(), target_params=self.target_critic_params, ) advantage = 
tensordict.get(self.advantage_key) @@ -509,7 +509,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDict: if advantage is None: self.value_estimator( tensordict, - params=self.critic_params, + params=self.critic_params.detach(), target_params=self.target_critic_params, ) advantage = tensordict.get(self.advantage_key) diff --git a/torchrl/objectives/reinforce.py b/torchrl/objectives/reinforce.py index baa0a4c2ae8..21f8e3c40db 100644 --- a/torchrl/objectives/reinforce.py +++ b/torchrl/objectives/reinforce.py @@ -110,7 +110,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase: if advantage is None: self.value_estimator( tensordict, - params=self.critic_params, + params=self.critic_params.detach(), target_params=self.target_critic_params, ) advantage = tensordict.get(self.advantage_key) From c9c106baf162f1f48518d1046dfeb5eeca4a4d16 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 21:28:10 +0100 Subject: [PATCH 33/89] amend --- torchrl/data/__init__.py | 2 +- torchrl/trainers/trainers.py | 44 +- tutorials/sphinx-tutorials/coding_ddpg.py | 82 ++- tutorials/sphinx-tutorials/coding_dqn.py | 703 ++++------------------ 4 files changed, 218 insertions(+), 613 deletions(-) diff --git a/torchrl/data/__init__.py b/torchrl/data/__init__.py index 6608b49cade..fa26ce0c6a9 100644 --- a/torchrl/data/__init__.py +++ b/torchrl/data/__init__.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from . import datasets from .postprocs import MultiStep from .replay_buffers import ( LazyMemmapStorage, @@ -30,4 +31,3 @@ UnboundedContinuousTensorSpec, UnboundedDiscreteTensorSpec, ) -from . import datasets diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 4f040696271..1608f853ad4 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -601,8 +601,10 @@ class ReplayBufferTrainer(TrainerHookBase): Args: replay_buffer (TensorDictReplayBuffer): replay buffer to be used. - batch_size (int): batch size when sampling data from the - latest collection or from the replay buffer. + batch_size (int, optional): batch size when sampling data from the + latest collection or from the replay buffer. If none is provided, + the replay buffer batch-size will be used (preferred option for + unchanged batch-sizes). memmap (bool, optional): if ``True``, a memmap tensordict is created. Default is False. device (device, optional): device where the samples must be placed. @@ -630,7 +632,7 @@ class ReplayBufferTrainer(TrainerHookBase): def __init__( self, replay_buffer: TensorDictReplayBuffer, - batch_size: int, + batch_size: Optional[int] = None, memmap: bool = False, device: DEVICE_TYPING = "cpu", flatten_tensordicts: bool = True, @@ -640,6 +642,12 @@ def __init__( self.batch_size = batch_size self.memmap = memmap self.device = device + if flatten_tensordicts: + warnings.warn( + "flatten_tensordicts default value will soon be changed " + "to False for a faster execution. Make sure your " + "code is robust to this change." 
+ ) self.flatten_tensordicts = flatten_tensordicts self.max_dims = max_dims @@ -668,7 +676,7 @@ def extend(self, batch: TensorDictBase) -> TensorDictBase: self.replay_buffer.extend(batch) def sample(self, batch: TensorDictBase) -> TensorDictBase: - sample = self.replay_buffer.sample(self.batch_size) + sample = self.replay_buffer.sample(batch_size=self.batch_size) return sample.to(self.device, non_blocking=True) def update_priority(self, batch: TensorDictBase) -> None: @@ -1094,7 +1102,7 @@ def register(self, trainer: Trainer, name: str = "batch_subsampler"): class Recorder(TrainerHookBase): - """Recorder hook for Trainer. + """Recorder hook for :class:`torchrl.trainers.Trainer`. Args: record_interval (int): total number of optimisation steps @@ -1118,33 +1126,45 @@ class Recorder(TrainerHookBase): the performance of the policy, it should be possible to turn off the explorative behaviour by calling the `set_exploration_mode('mode')` context manager. - recorder (EnvBase): An environment instance to be used + environment (EnvBase): An environment instance to be used for testing. exploration_mode (str, optional): exploration mode to use for the policy. By default, no exploration is used and the value used is "mode". Set to "random" to enable exploration - out_key (str, optional): reward key to set to the logger. Default is - `"reward_evaluation"`. + log_keys (sequence of str or tuples or str, optional): keys to read in the tensordict + for logging. Defaults to ``[("next", "reward")]``. + out_keys (Dict[str, str], optional): a dictionary mapping the ``log_keys`` + to their name in the logs. Defaults to ``{("next", "reward"): "r_evaluation"}``. suffix (str, optional): suffix of the video to be recorded. log_pbar (bool, optional): if ``True``, the reward value will be logged on the progression bar. Default is `False`. """ + ENV_DEPREC = ( + "the environment should be passed under the 'environment' key" + " and not the 'recorder' key." 
+ ) + def __init__( self, + *, record_interval: int, record_frames: int, frame_skip: int, policy_exploration: TensorDictModule, - recorder: EnvBase, + environment: EnvBase = None, exploration_mode: str = "random", - log_keys: Optional[List[str]] = None, - out_keys: Optional[Dict[str, str]] = None, + log_keys: Optional[List[Union[str, Tuple[str]]]] = None, + out_keys: Optional[Dict[Union[str, Tuple[str]], str]] = None, suffix: Optional[str] = None, log_pbar: bool = False, + recorder: EnvBase = None, ) -> None: - + if environment is None and recorder is not None: + warnings.warn(self.ENV_DEPREC) + elif environment is not None and recorder is not None: + raise ValueError("environment and recorder conflict.") self.policy_exploration = policy_exploration self.recorder = recorder self.record_frames = record_frames diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index b39596914e7..0d1c353d472 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -56,7 +56,6 @@ from typing import Tuple from torchrl.objectives import LossModule -from torchrl.objectives.value import TDEstimate, TDLambdaEstimate warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore @@ -92,11 +91,7 @@ OrnsteinUhlenbeckProcessWrapper, ValueOperator, ) -from torchrl.objectives.utils import ( - distance_loss, - hold_out_params, - SoftUpdate, -) +from torchrl.objectives.utils import distance_loss, SoftUpdate from torchrl.trainers import Recorder ############################################################################### @@ -211,8 +206,10 @@ def _init( # Since the value we'll be using is based on the actor and value network, # we put them together in a single actor-critic container. actor_critic = ActorCriticWrapper(actor_network, value_network) + self.actor_critic = actor_critic self.loss_funtion = "l2" + ############################################################################### # The value estimator loss method # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -236,6 +233,45 @@ def _init( default_value_estimator = ValueEstimators.TD0 ############################################################################### +# We also need to give some instructions to DDPG on how to build the value +# estimator, depending on the user query. Depending on the estimator provided, +# we will build the corresponding module to be used at train time: + +from torchrl.objectives.utils import default_value_kwargs +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator + + +def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma + hp.update(hyperparams) + value_key = "state_action_value" + if value_type == ValueEstimators.TD1: + self._value_estimator = TD1Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.TD0: + self._value_estimator = TD0Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." 
+ ) + elif value_type == ValueEstimators.TDLambda: + self._value_estimator = TDLambdaEstimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + + +############################################################################### +# The ``make_value_estimator`` method can but does not need to be called: if +# not, the :class:`torchrl.objectives.LossModule` will query this method with +# its default estimator. +# # The actor loss method # ~~~~~~~~~~~~~~~~~~~~~ # @@ -249,6 +285,8 @@ def _init( # For this, the :func:`torchrl.objectives.utils.hold_out_params` function # can be used. +from torchrl.objectives.utils import hold_out_params + def _loss_actor( self, @@ -274,8 +312,8 @@ def _loss_actor( # ~~~~~~~~~~~~~~~~~~~~~ # # We now need to optimize our value network parameters. -# To do this, we will rely on the advantage module provided during -# the loss construction. +# To do this, we will rely on the value estimator of our class: +# def _loss_value( @@ -284,18 +322,12 @@ def _loss_value( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: td_copy = tensordict.detach() + # V(s, a) + self.value_network(td_copy, params=self.value_network_params) + pred_val = td_copy.get("state_action_value").squeeze(-1) + # we manually reconstruct the parameters of the actor-critic, where the first # set of parameters belongs to the actor and the second to the value function. - params = TensorDict( - { - "module": { - "0": self.actor_network_params.detach(), - "1": self.value_network_params, - } - }, - batch_size=self.target_actor_network_params.batch_size, - device=self.target_actor_network_params.device, - ) target_params = TensorDict( { "module": { @@ -306,14 +338,16 @@ def _loss_value( batch_size=self.target_actor_network_params.batch_size, device=self.target_actor_network_params.device, ) - with set_exploration_mode("mode"): - self.advantage_module(td_copy, params=params, target_params=target_params) - target_value = td_copy.get(self.advantage_module.value_target_key) - pred_val = td_copy.get("state_action_value") + with set_exploration_mode("mode"): # we make sure that no exploration is performed + target_value = self.value_estimator.value_estimate( + tensordict, target_params=target_params + ).squeeze(-1) + # td_error = pred_val - target_value loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) + td_error = (pred_val - target_value).pow(2) - return loss_value, (pred_val - target_value).pow(2), pred_val, target_value + return loss_value, td_error, pred_val, target_value ############################################################################### @@ -874,6 +908,8 @@ def make_replay_buffer(buffer_size, prefetch=3): # updater. # loss_module = DDPGLoss(actor, qnet) +# let's use the TD(lambda) estimator! +loss_module.make_value_estimator(ValueEstimators.TDLambda) target_net_updater = SoftUpdate(loss_module, eps=0.98) target_net_updater.init_() diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 4124d87a492..9313e6e8b08 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -26,7 +26,8 @@ # The trainer is fully customisable and offers a large set of functionalities. # The tutorial is organised around its construction. # We will be detailing how to build each of the components of the library first, -# and then put the pieces together using the `torchrl.trainers.Trainer` class. 
+# and then put the pieces together using the :class:`torchrl.trainers.Trainer` +# class. # # Along the road, we will also focus on some other aspects of the library: # @@ -86,7 +87,8 @@ import warnings from torchrl.objectives import DQNLoss, SoftUpdate -from torchrl.trainers import Trainer, ReplayBufferTrainer, UpdateWeights +from torchrl.record.loggers.csv import CSVLogger +from torchrl.trainers import Recorder, ReplayBufferTrainer, Trainer, UpdateWeights warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore @@ -177,7 +179,10 @@ def is_notebook() -> bool: # -def make_env(parallel=False, obs_norm_sd=None, ): +def make_env( + parallel=False, + obs_norm_sd=None, +): if obs_norm_sd is None: obs_norm_sd = {"standard_normal": True} if parallel: @@ -185,13 +190,19 @@ def make_env(parallel=False, obs_norm_sd=None, ): num_workers, EnvCreator( lambda: GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, ) ), ) else: base_env = GymEnv( - "CartPole-v1", from_pixels=True, pixels_only=True, device=device, + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, ) env = TransformedEnv( @@ -222,17 +233,19 @@ def make_env(parallel=False, obs_norm_sd=None, ): # not all dimensions disappear in the process: # + def get_norm_stats(): test_env = make_env() test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) -) + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) + ) obs_norm_sd = test_env.transform[-1].state_dict() # let's check that normalizing constants have a size of ``[C, 1, 1]`` where # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). print(obs_norm_sd) return obs_norm_sd + ############################################################################### # Building the model (Deep Q-network) # ----------------------------------- @@ -255,6 +268,7 @@ def get_norm_stats(): # in the input :class:`tensordict.TensorDict`. # + def make_model(dummy_env): cnn_kwargs = { "num_cells": [32, 64, 64], @@ -317,13 +331,16 @@ def make_model(dummy_env): # The only requirement of this storage is that the data passed to it at write # time must always have the same shape. -def get_replay_buffer(buffer_size, n_optim): + +def get_replay_buffer(buffer_size, n_optim, batch_size): replay_buffer = TensorDictReplayBuffer( + batch_size=batch_size, storage=LazyMemmapStorage(buffer_size), prefetch=n_optim, ) return replay_buffer + ############################################################################### # Data collector # ~~~~~~~~~~~~~~ @@ -354,12 +371,13 @@ def get_replay_buffer(buffer_size, n_optim): # out training loop must account for. For simplicity, we set the devices to # the same value for all sub-collectors. 
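Outside of the :class:`torchrl.trainers.Trainer` used further down, such a collector is usually consumed as a plain iterable; a rough sketch of that pattern, assuming ``collector`` and ``replay_buffer`` are built as in this tutorial:

    for batch in collector:
        replay_buffer.extend(batch.cpu())    # store the collected transitions
        # ... sample from the buffer and run a few optimization steps here ...
        collector.update_policy_weights_()   # sync the workers with the latest policy
    collector.shutdown()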
-def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device): + +def get_collector( + obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device +): data_collector = MultiaSyncDataCollector( [ - make_env( - parallel=True, obs_norm_sd=obs_norm_sd - ), + make_env(parallel=True, obs_norm_sd=obs_norm_sd), ] * num_collectors, policy=actor_explore, @@ -375,6 +393,7 @@ def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, ) return data_collector + ############################################################################### # Loss function # ------------- @@ -394,11 +413,13 @@ def get_collector(obs_norm_sd, num_collectors, actor_explore, frames_per_batch, # in similar algorithms. # + def get_loss_module(actor, gamma): loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) target_updater = SoftUpdate(loss_module) return loss_module, target_updater + ############################################################################### # Hyperparameters # --------------- @@ -475,7 +496,6 @@ def get_loss_module(actor, gamma): num_workers = 2 # 8 num_collectors = 2 # 4 - ############################################################################### # Environment and exploration # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -524,582 +544,111 @@ def get_loss_module(actor, gamma): # Get model actor, actor_explore = make_model(test_env) loss_module, target_net_updater = get_loss_module(actor, gamma) -collector = get_collector(stats, num_collectors, actor_explore, frames_per_batch, total_frames, device) -optimizer = torch.optim.Adam(loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas) +collector = get_collector( + stats, num_collectors, actor_explore, frames_per_batch, total_frames, device +) +optimizer = torch.optim.Adam( + loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas +) +logger = CSVLogger(exp_name="dqn_exp", log_dir="./") + trainer = Trainer( - collector=collector, + collector=collector, total_frames=total_frames, frame_skip=1, loss_module=loss_module, optimizer=optimizer, - logger=None, - optim_steps_per_batch = n_optim, + logger=logger, + optim_steps_per_batch=n_optim, ) -buffer_hook = ReplayBufferTrainer(get_replay_buffer(buffer_size, n_optim)) +############################################################################### +# Registering hooks +# ~~~~~~~~~~~~~~~~~ +# +# Registering hooks can be achieved in two separate ways: +# +# - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` +# method is the first choice. One just needs to provide the trainer as input +# and the hook will be registered with a default name at a default location. +# For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` +# requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which +# can be cumbersome to implement. 
+buffer_hook = ReplayBufferTrainer( + get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), + flatten_tensordicts=False, +) buffer_hook.register(trainer) weight_updater = UpdateWeights(collector, update_weights_interval=1) weight_updater.register(trainer) +recorder = Recorder( + record_interval=100, # log every 100 optimization steps + record_frames=10_000, # maximum number of frames in the record + frame_skip=1, + policy_exploration=actor_explore, + environment=test_env, + exploration_mode="mode", + log_keys=[("next", "reward")], + out_keys={("next", "reward"): "rewards"}, +) +recorder.register(trainer) +############################################################################### +# - Any callable (including :class:`torchrl.trainers.TrainerHookBase` +# subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. +# In this case, a location must be explicitely passed (). This method gives +# more control over the location of the hook but it also requires more +# understanding of the Trainer mechanism. +# Check the `trainer documentation `_ +# for a detailed description of the trainer hooks. +# trainer.register_op("post_optim", target_net_updater.step) +############################################################################### +# Here we are, ready to train our algorithm! A simple call to +# ``trainer.train()`` and we'll be getting our results logged in. +# trainer.train() -# ############################################################################### -# # We represent the parameters and targets as flat structures, but unflattening -# # them is quite easy: -# -# params_flat = params.flatten_keys(".") -# -# ############################################################################### -# # We will be using the adam optimizer: -# -# optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) -# -# ############################################################################### -# # We create a test environment for evaluation of the policy: -# -# test_env = make_env( -# parallel=False, obs_norm_sd=obs_norm_sd -# ) -# # sanity check: -# print(actor_explore(test_env.reset())) -# -# ############################################################################### -# # Training loop of a regular DQN -# # ------------------------------ -# # -# # We'll start with a simple implementation of DQN where the returns are -# # computed without bootstrapping, i.e. -# # -# # .. math:: -# # -# # Q_{t}(s, a) = R(s, a) + \gamma * V_{t+1}(s) -# # -# # where :math:`Q(s, a)` is the Q-value of the current state-action pair, -# # :math:`R(s, a)` is the result of the reward function, and :math:`V(s)` is a -# # value function that returns 0 for terminating states. -# # -# # We store the logs in a defaultdict: -# -# logs_exp1 = defaultdict(list) -# prev_traj_count = 0 -# -# pbar = tqdm.tqdm(total=total_frames) -# for j, data in enumerate(data_collector): -# current_frames = data.numel() -# pbar.update(current_frames) -# data = data.view(-1) -# -# # We store the values on the replay buffer, after placing them on CPU. -# # When called for the first time, this will instantiate our storage -# # object which will print its content. 
-# replay_buffer.extend(data.cpu()) -# -# # some logging -# if len(logs_exp1["frames"]): -# logs_exp1["frames"].append(current_frames + logs_exp1["frames"][-1]) -# else: -# logs_exp1["frames"].append(current_frames) -# -# if data["next", "done"].any(): -# done = data["next", "done"].squeeze(-1) -# logs_exp1["traj_lengths"].append( -# data["next", "step_count"][done].float().mean().item() -# ) -# -# # check that we have enough data to start training -# if sum(logs_exp1["frames"]) > init_random_frames: -# for _ in range(n_optim): -# # sample from the RB and send to device -# sampled_data = replay_buffer.sample(batch_size) -# sampled_data = sampled_data.to(device, non_blocking=True) -# -# # collect data from RB -# reward = sampled_data["next", "reward"].squeeze(-1) -# done = sampled_data["next", "done"].squeeze(-1).to(reward.dtype) -# action = sampled_data["action"].clone() -# -# # Compute action value (of the action actually taken) at time t -# # By default, TorchRL uses one-hot encodings for discrete actions -# sampled_data_out = sampled_data.select(*actor.in_keys) -# sampled_data_out = factor(sampled_data_out, params=params) -# action_value = sampled_data_out["action_value"] -# action_value = (action_value * action.to(action_value.dtype)).sum(-1) -# with torch.no_grad(): -# # compute best action value for the next step, using target parameters -# tdstep = step_mdp(sampled_data) -# next_value = factor( -# tdstep.select(*actor.in_keys), -# params=params_target, -# )["chosen_action_value"].squeeze(-1) -# exp_value = reward + gamma * next_value * (1 - done) -# assert exp_value.shape == action_value.shape -# # we use MSE loss but L1 or smooth L1 should also work -# error = nn.functional.mse_loss(exp_value, action_value).mean() -# error.backward() -# -# gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) -# -# optim.step() -# optim.zero_grad() -# -# # update of the target parameters -# params_target.apply( -# lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), -# params.detach(), -# inplace=True, -# ) -# -# actor_explore.step(current_frames) -# -# # Logging -# logs_exp1["grad_vals"].append(float(gv)) -# logs_exp1["losses"].append(error.item()) -# logs_exp1["values"].append(action_value.mean().item()) -# logs_exp1["traj_count"].append( -# prev_traj_count + data["next", "done"].sum().item() -# ) -# prev_traj_count = logs_exp1["traj_count"][-1] -# -# if j % 10 == 0: -# with set_exploration_mode("mode"), torch.no_grad(): -# # execute a rollout. 
The `set_exploration_mode("mode")` has no effect here since the policy is deterministic, but we add it for completeness -# eval_rollout = test_env.rollout( -# max_steps=10000, -# policy=actor, -# ).cpu() -# logs_exp1["traj_lengths_eval"].append(eval_rollout.shape[-1]) -# logs_exp1["evals"].append(eval_rollout["next", "reward"].sum().item()) -# if len(logs_exp1["mavgs"]): -# logs_exp1["mavgs"].append( -# logs_exp1["evals"][-1] * 0.05 + logs_exp1["mavgs"][-1] * 0.95 -# ) -# else: -# logs_exp1["mavgs"].append(logs_exp1["evals"][-1]) -# logs_exp1["traj_count_eval"].append(logs_exp1["traj_count"][-1]) -# pbar.set_description( -# f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp1['evals'][-1]: 4.4f}" -# ) -# -# # update policy weights -# data_collector.update_policy_weights_() -# -# ############################################################################### -# # We write a custom plot function to display the performance of our algorithm -# # -# -# -# def plot(logs, name): -# plt.figure(figsize=(15, 10)) -# plt.subplot(2, 3, 1) -# plt.plot( -# logs["frames"][-len(logs["evals"]) :], -# logs["evals"], -# label="return (eval)", -# ) -# plt.plot( -# logs["frames"][-len(logs["mavgs"]) :], -# logs["mavgs"], -# label="mavg of returns (eval)", -# ) -# plt.xlabel("frames collected") -# plt.ylabel("trajectory length (= return)") -# plt.subplot(2, 3, 2) -# plt.plot( -# logs["traj_count"][-len(logs["evals"]) :], -# logs["evals"], -# label="return", -# ) -# plt.plot( -# logs["traj_count"][-len(logs["mavgs"]) :], -# logs["mavgs"], -# label="mavg", -# ) -# plt.xlabel("trajectories collected") -# plt.legend() -# plt.subplot(2, 3, 3) -# plt.plot(logs["frames"][-len(logs["losses"]) :], logs["losses"]) -# plt.xlabel("frames collected") -# plt.title("loss") -# plt.subplot(2, 3, 4) -# plt.plot(logs["frames"][-len(logs["values"]) :], logs["values"]) -# plt.xlabel("frames collected") -# plt.title("value") -# plt.subplot(2, 3, 5) -# plt.plot( -# logs["frames"][-len(logs["grad_vals"]) :], -# logs["grad_vals"], -# ) -# plt.xlabel("frames collected") -# plt.title("grad norm") -# if len(logs["traj_lengths"]): -# plt.subplot(2, 3, 6) -# plt.plot(logs["traj_lengths"]) -# plt.xlabel("batches") -# plt.title("traj length (training)") -# plt.savefig(name) -# if is_notebook(): -# plt.show() -# -# -# ############################################################################### -# # The performance of the policy can be measured as the length of trajectories. -# # As we can see on the results of the :func:`plot` function, the performance -# # of the policy increases, albeit slowly. -# # -# # .. code-block:: python -# # -# # plot(logs_exp1, "dqn_td0.png") -# # -# # .. figure:: /_static/img/dqn_td0.png -# # :alt: Cart Pole results with TD(0) -# # -# -# print("shutting down") -# data_collector.shutdown() -# del data_collector -# -# ############################################################################### -# # DQN with TD(:math:`\lambda`) -# # ---------------------------- -# # -# # We can improve the above algorithm by getting a better estimate of the -# # return, using not only the next state value but the whole sequence of rewards -# # and values that follow a particular step. -# # -# # TorchRL provides a vectorized version of TD(lambda) named -# # :func:`torchrl.objectives.value.functional.vec_td_lambda_advantage_estimate`. -# # We'll use this to obtain a target value that the value network will be -# # trained to match. 
-# # -# # The big difference in this implementation is that we'll store entire -# # trajectories and not single steps in the replay buffer. This will be done -# # automatically as long as we're not "flattening" the tensordict collected: -# # by keeping a shape ``[Batch x timesteps]`` and giving this -# # to the RB, we'll be creating a replay buffer of size -# # ``[Capacity x timesteps]``. -# -# -# from torchrl.objectives.value.functional import vec_td_lambda_advantage_estimate -# -# ############################################################################### -# # We reset the actor parameters: -# # -# -# ( -# factor, -# actor, -# actor_explore, -# params, -# params_target, -# ) = make_model(test_env) -# params_flat = params.flatten_keys(".") -# -# optim = torch.optim.Adam(list(params_flat.values()), lr, betas=betas) -# test_env = make_env( -# parallel=False, obs_norm_sd=obs_norm_sd -# ) -# print(actor_explore(test_env.reset())) -# -# ############################################################################### -# # Data: Replay buffer and collector -# # --------------------------------- -# # -# # We need to build a new replay buffer of the appropriate size: -# # -# -# max_size = frames_per_batch // num_workers -# -# replay_buffer = TensorDictReplayBuffer( -# storage=LazyMemmapStorage(-(-buffer_size // max_size)), -# prefetch=n_optim, -# ) -# -# data_collector = MultiaSyncDataCollector( -# [ -# make_env( -# parallel=True, obs_norm_sd=obs_norm_sd -# ), -# ] -# * num_collectors, -# policy=actor_explore, -# frames_per_batch=frames_per_batch, -# total_frames=total_frames, -# exploration_mode="random", -# devices=[device] * num_collectors, -# storing_devices=[device] * num_collectors, -# # devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], -# # storing_devices=[f"cuda:{i}" for i in range(1, 1 + num_collectors)], -# split_trajs=False, -# ) -# -# -# logs_exp2 = defaultdict(list) -# prev_traj_count = 0 -# -# ############################################################################### -# # Training loop -# # ------------- -# # -# # There are very few differences with the training loop above: -# # -# # - The tensordict received by the collector is used as-is, without being -# # flattened (recall the ``data.view(-1)`` above), to keep the temporal -# # relation between consecutive steps. -# # - We use :func:`vec_td_lambda_advantage_estimate` to compute the target -# # value. 
-# -# pbar = tqdm.tqdm(total=total_frames) -# for j, data in enumerate(data_collector): -# current_frames = data.numel() -# pbar.update(current_frames) -# -# replay_buffer.extend(data.cpu()) -# if len(logs_exp2["frames"]): -# logs_exp2["frames"].append(current_frames + logs_exp2["frames"][-1]) -# else: -# logs_exp2["frames"].append(current_frames) -# -# if data["next", "done"].any(): -# done = data["next", "done"].squeeze(-1) -# logs_exp2["traj_lengths"].append( -# data["next", "step_count"][done].float().mean().item() -# ) -# -# if sum(logs_exp2["frames"]) > init_random_frames: -# for _ in range(n_optim): -# sampled_data = replay_buffer.sample(batch_size // max_size) -# sampled_data = sampled_data.clone().to(device, non_blocking=True) -# -# reward = sampled_data["next", "reward"] -# done = sampled_data["next", "done"].to(reward.dtype) -# action = sampled_data["action"].clone() -# -# sampled_data_out = sampled_data.select(*actor.in_keys) -# sampled_data_out = vmap(factor, (0, None))(sampled_data_out, params) -# action_value = sampled_data_out["action_value"] -# action_value = (action_value * action.to(action_value.dtype)).sum(-1, True) -# with torch.no_grad(): -# tdstep = step_mdp(sampled_data) -# next_value = vmap(factor, (0, None))( -# tdstep.select(*actor.in_keys), params -# ) -# next_value = next_value["chosen_action_value"] -# error = vec_td_lambda_advantage_estimate( -# gamma, -# lmbda, -# action_value, -# next_value, -# reward, -# done, -# ).pow(2) -# error = error.mean() -# error.backward() -# -# gv = nn.utils.clip_grad_norm_(list(params_flat.values()), 1) -# -# optim.step() -# optim.zero_grad() -# -# # update of the target parameters -# params_target.apply( -# lambda p_target, p_orig: p_orig * tau + p_target * (1 - tau), -# params.detach(), -# inplace=True, -# ) -# -# actor_explore.step(current_frames) -# -# # Logging -# logs_exp2["grad_vals"].append(float(gv)) -# -# logs_exp2["losses"].append(error.item()) -# logs_exp2["values"].append(action_value.mean().item()) -# logs_exp2["traj_count"].append( -# prev_traj_count + data["next", "done"].sum().item() -# ) -# prev_traj_count = logs_exp2["traj_count"][-1] -# if j % 10 == 0: -# with set_exploration_mode("mode"), torch.no_grad(): -# # execute a rollout. The `set_exploration_mode("mode")` has -# # no effect here since the policy is deterministic, but we add -# # it for completeness -# eval_rollout = test_env.rollout( -# max_steps=10000, -# policy=actor, -# ).cpu() -# logs_exp2["traj_lengths_eval"].append(eval_rollout.shape[-1]) -# logs_exp2["evals"].append(eval_rollout["next", "reward"].sum().item()) -# if len(logs_exp2["mavgs"]): -# logs_exp2["mavgs"].append( -# logs_exp2["evals"][-1] * 0.05 + logs_exp2["mavgs"][-1] * 0.95 -# ) -# else: -# logs_exp2["mavgs"].append(logs_exp2["evals"][-1]) -# logs_exp2["traj_count_eval"].append(logs_exp2["traj_count"][-1]) -# pbar.set_description( -# f"error: {error: 4.4f}, value: {action_value.mean(): 4.4f}, test return: {logs_exp2['evals'][-1]: 4.4f}" -# ) -# -# # update policy weights -# data_collector.update_policy_weights_() -# -# -# ############################################################################### -# # TD(:math:`\lambda`) performs significantly better than TD(0) because it -# # retrieves a much less biased estimate of the state-action value. -# # -# # .. code-block:: python -# # -# # plot(logs_exp2, "dqn_tdlambda.png") -# # -# # .. 
figure:: /_static/img/dqn_tdlambda.png -# # :alt: Cart Pole results with TD(lambda) -# # -# -# -# print("shutting down") -# data_collector.shutdown() -# del data_collector -# -# ############################################################################### -# # Let's compare the results on a single plot. Because the TD(lambda) version -# # works better, we'll have fewer episodes collected for a given number of -# # frames (as there are more frames per episode). -# # -# # **Note**: As already mentioned above, to get a more reasonable performance, -# # use a greater value for ``total_frames`` e.g. 500000. -# -# -# def plot_both(): -# frames_td0 = logs_exp1["frames"] -# frames_tdlambda = logs_exp2["frames"] -# evals_td0 = logs_exp1["evals"] -# evals_tdlambda = logs_exp2["evals"] -# mavgs_td0 = logs_exp1["mavgs"] -# mavgs_tdlambda = logs_exp2["mavgs"] -# traj_count_td0 = logs_exp1["traj_count_eval"] -# traj_count_tdlambda = logs_exp2["traj_count_eval"] -# -# plt.figure(figsize=(15, 10)) -# plt.subplot(1, 2, 1) -# plt.plot(frames_td0[-len(evals_td0) :], evals_td0, label="return (td0)", alpha=0.5) -# plt.plot( -# frames_tdlambda[-len(evals_tdlambda) :], -# evals_tdlambda, -# label="return (td(lambda))", -# alpha=0.5, -# ) -# plt.plot(frames_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") -# plt.plot( -# frames_tdlambda[-len(mavgs_tdlambda) :], -# mavgs_tdlambda, -# label="mavg (td(lambda))", -# ) -# plt.xlabel("frames collected") -# plt.ylabel("trajectory length (= return)") -# -# plt.subplot(1, 2, 2) -# plt.plot( -# traj_count_td0[-len(evals_td0) :], -# evals_td0, -# label="return (td0)", -# alpha=0.5, -# ) -# plt.plot( -# traj_count_tdlambda[-len(evals_tdlambda) :], -# evals_tdlambda, -# label="return (td(lambda))", -# alpha=0.5, -# ) -# plt.plot(traj_count_td0[-len(mavgs_td0) :], mavgs_td0, label="mavg (td0)") -# plt.plot( -# traj_count_tdlambda[-len(mavgs_tdlambda) :], -# mavgs_tdlambda, -# label="mavg (td(lambda))", -# ) -# plt.xlabel("trajectories collected") -# plt.legend() -# -# plt.savefig("dqn.png") -# -# -# ############################################################################### -# # .. code-block:: python -# # -# # plot_both() -# # -# # .. figure:: /_static/img/dqn.png -# # :alt: Cart Pole results from the TD(:math:`lambda`) trained policy. -# # -# # Finally, we generate a new video to check what the algorithm has learnt. -# # If all goes well, the duration should be significantly longer than with a -# # random rollout. -# # -# # To get the raw pixels of the rollout, we insert a -# # :class:`torchrl.envs.CatTensors` transform that precedes all others and copies -# # the ``"pixels"`` key onto a ``"pixels_save"`` key. This is necessary because -# # the other transforms that modify this key will update its value in-place in -# # the output tensordict. -# # -# -# test_env.transform.insert(0, CatTensors(["pixels"], "pixels_save", del_keys=False)) -# eval_rollout = test_env.rollout(max_steps=10000, policy=actor, auto_reset=True).cpu() -# -# # sphinx_gallery_start_ignore -# import imageio -# -# imageio.mimwrite("cartpole.gif", eval_rollout["pixels_save"].numpy(), fps=30) -# # sphinx_gallery_end_ignore -# -# del test_env -# -# ############################################################################### -# # The video of the rollout can be saved using the imageio package: -# # -# # .. code-block:: -# # -# # import imageio -# # imageio.mimwrite('cartpole.mp4', eval_rollout["pixels_save"].numpy(), fps=30); -# # -# # .. 
figure:: /_static/img/cartpole.gif -# # :alt: Cart Pole results from the TD(:math:`\lambda`) trained policy. -# -# ############################################################################### -# # Conclusion and possible improvements -# # ------------------------------------ -# # -# # In this tutorial we have learnt: -# # -# # - How to train a policy that read pixel-based states, what transforms to -# # include and how to normalize the data; -# # - How to create a policy that picks up the action with the highest value -# # with :class:`torchrl.modules.QValueNetwork`; -# # - How to build a multiprocessed data collector; -# # - How to train a DQN with TD(:math:`\lambda`) returns. -# # -# # We have seen that using TD(:math:`\lambda`) greatly improved the performance -# # of DQN. Other possible improvements could include: -# # -# # - Using the Multi-Step post-processing. Multi-step will project an action -# # to the nth following step, and create a discounted sum of the rewards in -# # between. This trick can make the algorithm noticebly less myopic. To use -# # this, simply create the collector with -# # -# # from torchrl.data.postprocs.postprocs import MultiStep -# # collector = CollectorClass(..., postproc=MultiStep(gamma, n)) -# # -# # where ``n`` is the number of looking-forward steps. Pay attention to the -# # fact that the ``gamma`` factor has to be corrected by the number of -# # steps till the next observation when being passed to -# # ``vec_td_lambda_advantage_estimate``: -# # -# # gamma = gamma ** tensordict["steps_to_next_obs"] -# # - A prioritized replay buffer could also be used. This will give a -# # higher priority to samples that have the worst value accuracy. -# # - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` -# # for more information). -# # - More fancy exploration techniques, such as NoisyLinear layers and such -# # (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the -# # ``MLP`` class used in our Dueling DQN). +############################################################################### +# We can now quickly check the CSVs with the results. + +# TODO + +############################################################################### +# Conclusion and possible improvements +# ------------------------------------ +# +# In this tutorial we have learnt: +# +# - How to train a policy that read pixel-based states, what transforms to +# include and how to normalize the data; +# - How to create a policy that picks up the action with the highest value +# with :class:`torchrl.modules.QValueNetwork`; +# - How to build a multiprocessed data collector; +# - How to train a DQN with TD(:math:`\lambda`) returns. +# +# We have seen that using TD(:math:`\lambda`) greatly improved the performance +# of DQN. Other possible improvements could include: +# +# - Using the Multi-Step post-processing. Multi-step will project an action +# to the nth following step, and create a discounted sum of the rewards in +# between. This trick can make the algorithm noticebly less myopic. To use +# this, simply create the collector with +# +# from torchrl.data.postprocs.postprocs import MultiStep +# collector = CollectorClass(..., postproc=MultiStep(gamma, n)) +# +# where ``n`` is the number of looking-forward steps. 
Pay attention to the +# fact that the ``gamma`` factor has to be corrected by the number of +# steps till the next observation when being passed to +# ``vec_td_lambda_advantage_estimate``: +# +# gamma = gamma ** tensordict["steps_to_next_obs"] +# - A prioritized replay buffer could also be used. This will give a +# higher priority to samples that have the worst value accuracy. +# - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` +# for more information). +# - More fancy exploration techniques, such as NoisyLinear layers and such +# (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the +# ``MLP`` class used in our Dueling DQN). From c957916cfcfe431e2edcf6c605bd57d2532a868d Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 28 Mar 2023 21:33:34 +0100 Subject: [PATCH 34/89] fix trainer --- test/test_trainer.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/test_trainer.py b/test/test_trainer.py index 1251d4edd48..8544efb6b6f 100644 --- a/test/test_trainer.py +++ b/test/test_trainer.py @@ -89,11 +89,10 @@ class MockingLossModule(nn.Module): def mocking_trainer(file=None, optimizer=_mocking_optim) -> Trainer: trainer = Trainer( - MockingCollector(), - *[ - None, - ] - * 2, + collector=MockingCollector(), + total_frames=None, + frame_skip=None, + optim_steps_per_batch=None, loss_module=MockingLossModule(), optimizer=optimizer, save_trainer_file=file, From 03007282a351f4c825af1cf8d0a783b1c5e9dca4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 08:55:36 +0100 Subject: [PATCH 35/89] no grad --- tutorials/sphinx-tutorials/coding_ppo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_ppo.py b/tutorials/sphinx-tutorials/coding_ppo.py index 77ed207837f..274269a3dac 100644 --- a/tutorials/sphinx-tutorials/coding_ppo.py +++ b/tutorials/sphinx-tutorials/coding_ppo.py @@ -602,7 +602,8 @@ # We'll need an "advantage" signal to make PPO work. # We re-compute it at each epoch as its value depends on the value # network which is updated in the inner loop. - advantage_module(tensordict_data) + with torch.no_grad(): + advantage_module(tensordict_data) data_view = tensordict_data.reshape(-1) replay_buffer.extend(data_view.cpu()) for _ in range(frames_per_batch // sub_batch_size): From 86915fe373a7e247a67d6af3a6fb870c0756e928 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 11:01:43 +0100 Subject: [PATCH 36/89] init --- torchrl/data/replay_buffers/replay_buffers.py | 426 +++++++++++++++--- torchrl/data/replay_buffers/samplers.py | 4 + 2 files changed, 374 insertions(+), 56 deletions(-) diff --git a/torchrl/data/replay_buffers/replay_buffers.py b/torchrl/data/replay_buffers/replay_buffers.py index 0a20dc6dff7..1e824817b92 100644 --- a/torchrl/data/replay_buffers/replay_buffers.py +++ b/torchrl/data/replay_buffers/replay_buffers.py @@ -82,32 +82,80 @@ def decorated_fun(self, *args, **kwargs): class ReplayBuffer: """A generic, composable replay buffer class. + All arguments are keyword-only arguments. + Args: storage (Storage, optional): the storage to be used. If none is provided - a default ListStorage with max_size of 1_000 will be created. - sampler (Sampler, optional): the sampler to be used. If none is provided - a default RandomSampler() will be used. + a default :class:`torchrl.data.replay_buffers.ListStorage` with + ``max_size`` of ``1_000`` will be created. + sampler (Sampler, optional): the sampler to be used. 
If none is provided, + a default :class:`torchrl.data.replay_buffers.RandomSampler` + will be used. writer (Writer, optional): the writer to be used. If none is provided - a default RoundRobinWriter() will be used. + a default :class:`torchrl.data.replay_buffers.RoundRobinWriter` + will be used. collate_fn (callable, optional): merges a list of samples to form a mini-batch of Tensor(s)/outputs. Used when using batched - loading from a map-style dataset. + loading from a map-style dataset. The default value will be decided + based on the storage type. pin_memory (bool): whether pin_memory() should be called on the rb samples. prefetch (int, optional): number of next batches to be prefetched - using multithreading. - transform (Transform, optional): Transform to be executed when sample() is called. - To chain transforms use the :obj:`Compose` class. + using multithreading. Defaults to None (no prefetching). + transform (Transform, optional): Transform to be executed when + sample() is called. + To chain transforms use the :class:`torchrl.envs.Compose` class. Transforms should be used with :class:`tensordict.TensorDict` content. If used with other structures, the transforms should be - encoded with a `"data"` leading key that will be used to + encoded with a ``"data"`` leading key that will be used to construct a tensordict from the non-tensordict content. - batch_size (int, optional): the batch size to be used when sample() is called. + batch_size (int, optional): the batch size to be used when sample() is + called. + .. note:: + The batch-size can be specified at construction time via the + ``batch_size`` argument, or at sampling time. The former should + be preferred whenever the batch-size is consistent across the + experiment. If the batch-size is likely to change, it can be + passed to the :meth:`~.sample` method. This option is + incompatible with prefetching (since this requires to know the + batch-size in advance) as well as with samplers that have a + ``drop_last`` argument. + Examples: + >>> import torch + >>> + >>> from torchrl.data import ReplayBuffer, ListStorage + >>> + >>> torch.manual_seed(0) + >>> rb = ReplayBuffer( + ... storage=ListStorage(max_size=1000), + ... batch_size=5, + ... ) + >>> # populate the replay buffer + >>> data = range(10) + >>> rb.extend(data) + >>> # sample will return as many elements as specified in the constructor + >>> sample = rb.sample() + >>> print(sample) + tensor([4, 9, 3, 0, 3]) + >>> # Passing the batch-size to the sample method overrides the one in the constructor + >>> sample = rb.sample(batch_size=3) + >>> print(sample) + tensor([9, 7, 3]) + >>> # one cans sample using the ``sample`` method or iterate over the buffer + >>> for i, batch in enumerate(rb): + ... print(i, batch) + ... if i == 3: + ... break + 0 tensor([7, 3, 1, 6, 6]) + 1 tensor([9, 8, 6, 6, 8]) + 2 tensor([4, 3, 6, 9, 1]) + 3 tensor([4, 4, 1, 9, 9]) """ def __init__( self, + *, storage: Optional[Storage] = None, sampler: Optional[Sampler] = None, writer: Optional[Writer] = None, @@ -147,10 +195,21 @@ def __init__( transform.eval() self._transform = transform - if batch_size is None: - warnings.warn( - "Constructing replay buffer without specifying behaviour is no longer " - "recommended, and will be deprecated in the future." + if batch_size is None and prefetch: + raise ValueError( + "Dynamic batch-size specification is incompatible " + "with multithreaded sampling. " + "When using prefetch, the batch-size must be specified in " + "advance. 
" + ) + if ( + batch_size is None + and hasattr(self._sampler, "drop_last") + and self._sampler.drop_last + ): + raise ValueError( + "Samplers with drop_last=True must work with a predictible batch-size. " + "Please pass the batch-size to the ReplayBuffer constructor." ) self._batch_size = batch_size @@ -247,6 +306,7 @@ def update_priority( def _sample(self, batch_size: int) -> Tuple[Any, dict]: with self._replay_lock: index, info = self._sampler.sample(self._storage, batch_size) + info["index"] = index data = self._storage[index] if not isinstance(index, INT_CLASSES): data = self._collate_fn(data) @@ -279,17 +339,26 @@ def sample( A batch of data selected in the replay buffer. A tuple containing this batch and info if return_info flag is set to True. """ - if batch_size is not None: + if ( + batch_size is not None + and self._batch_size is not None + and batch_size != self._batch_size + ): warnings.warn( - "batch_size argument in sample has been deprecated. Set the batch_size " - "when constructing the replay buffer instead." + f"Got conflicting batch_sizes in constructor ({self._batch_size}) " + f"and `sample` ({batch_size}). Refer to the ReplayBuffer documentation " + "for a proper usage of the batch-size arguments. " + "The batch-size provided to the sample method " + "will prevail." ) - elif self._batch_size is not None: + elif batch_size is None and self._batch_size is not None: batch_size = self._batch_size - else: + elif batch_size is None: raise RuntimeError( "batch_size not specified. You can specify the batch_size when " - "constructing the replay buffer" + "constructing the replay buffer, or pass it to the sample method. " + "Refer to the ReplayBuffer documentation " + "for a proper usage of the batch-size arguments." ) if not self._prefetch: ret = self._sample(batch_size) @@ -336,9 +405,12 @@ def insert_transform(self, index: int, transform: "Transform") -> None: # noqa- self._transform.insert(index, transform) def __iter__(self): + if self._sampler.ran_out: + self._sampler.ran_out = False if self._batch_size is None: raise RuntimeError( - "batch_size was not specified during construction of the replay buffer" + "Cannot iterate over the replay buffer. " + "Batch_size was not specified during construction of the replay buffer." ) while not self._sampler.ran_out: data = self.sample() @@ -348,6 +420,8 @@ def __iter__(self): class PrioritizedReplayBuffer(ReplayBuffer): """Prioritized replay buffer. + All arguments are keyword-only arguments. + Presented in "Schaul, T.; Quan, J.; Antonoglou, I.; and Silver, D. 2015. Prioritized experience replay." @@ -359,22 +433,75 @@ class PrioritizedReplayBuffer(ReplayBuffer): beta (float): importance sampling negative exponent. eps (float): delta added to the priorities to ensure that the buffer does not contain null priorities. - dtype (torch.dtype): type of the data. Can be torch.float or torch.double. storage (Storage, optional): the storage to be used. If none is provided - a default ListStorage with max_size of 1_000 will be created. + a default :class:`torchrl.data.replay_buffers.ListStorage` with + ``max_size`` of ``1_000`` will be created. collate_fn (callable, optional): merges a list of samples to form a mini-batch of Tensor(s)/outputs. Used when using batched - loading from a map-style dataset. + loading from a map-style dataset. The default value will be decided + based on the storage type. pin_memory (bool): whether pin_memory() should be called on the rb samples. 
prefetch (int, optional): number of next batches to be prefetched - using multithreading. - transform (Transform, optional): Transform to be executed when sample() is called. - To chain transforms use the :obj:`Compose` class. + using multithreading. Defaults to None (no prefetching). + transform (Transform, optional): Transform to be executed when + sample() is called. + To chain transforms use the :class:`torchrl.envs.Compose` class. + Transforms should be used with :class:`tensordict.TensorDict` + content. If used with other structures, the transforms should be + encoded with a ``"data"`` leading key that will be used to + construct a tensordict from the non-tensordict content. + batch_size (int, optional): the batch size to be used when sample() is + called. + .. note:: + The batch-size can be specified at construction time via the + ``batch_size`` argument, or at sampling time. The former should + be preferred whenever the batch-size is consistent across the + experiment. If the batch-size is likely to change, it can be + passed to the :meth:`~.sample` method. This option is + incompatible with prefetching (since this requires to know the + batch-size in advance) as well as with samplers that have a + ``drop_last`` argument. + + .. note:: + Generic prioritized replay buffers (ie. non-tensordict backed) require + calling :meth:`~.sample` with the ``return_info`` argument set to + ``True`` to have access to the indices, and hence update the priority. + Using :class:`tensordict.TensorDict` and the related + :class:`torchrl.data.TensorDictPrioritizedReplayBuffer` simplifies this + process. + + Examples: + >>> import torch + >>> + >>> from torchrl.data import ListStorage, PrioritizedReplayBuffer + >>> + >>> torch.manual_seed(0) + >>> + >>> rb = PrioritizedReplayBuffer(alpha=0.7, beta=0.9, storage=ListStorage(10)) + >>> data = range(10) + >>> rb.extend(data) + >>> sample = rb.sample(3) + >>> print(sample) + tensor([1, 0, 1]) + >>> # get the info to find what the indices are + >>> sample, info = rb.sample(5, return_info=True) + >>> print(sample, info) + tensor([2, 7, 4, 3, 5]) {'_weight': array([1., 1., 1., 1., 1.], dtype=float32), 'index': array([2, 7, 4, 3, 5])} + >>> # update priority + >>> priority = torch.ones(5) * 5 + >>> rb.update_priority(info["index"], priority) + >>> # and now a new sample, the weights should be updated + >>> sample, info = rb.sample(5, return_info=True) + >>> print(sample, info) + tensor([2, 5, 2, 2, 5]) {'_weight': array([0.36278465, 0.36278465, 0.36278465, 0.36278465, 0.36278465], + dtype=float32), 'index': array([2, 5, 2, 2, 5])} + """ def __init__( self, + *, alpha: float, beta: float, eps: float = 1e-8, @@ -401,15 +528,114 @@ def __init__( class TensorDictReplayBuffer(ReplayBuffer): - """TensorDict-specific wrapper around the ReplayBuffer class. + """TensorDict-specific wrapper around the :class:`torchrl.data.ReplayBuffer` class. + + All arguments are keyword-only arguments. Args: - priority_key (str): the key at which priority is assumed to be stored - within TensorDicts added to this ReplayBuffer. + storage (Storage, optional): the storage to be used. If none is provided + a default :class:`torchrl.data.replay_buffers.ListStorage` with + ``max_size`` of ``1_000`` will be created. + sampler (Sampler, optional): the sampler to be used. If none is provided + a default RandomSampler() will be used. + writer (Writer, optional): the writer to be used. If none is provided + a default :class:`torchrl.data.replay_buffers.RoundRobinWriter` + will be used. 
+ collate_fn (callable, optional): merges a list of samples to form a + mini-batch of Tensor(s)/outputs. Used when using batched + loading from a map-style dataset. The default value will be decided + based on the storage type. + pin_memory (bool): whether pin_memory() should be called on the rb + samples. + prefetch (int, optional): number of next batches to be prefetched + using multithreading. Defaults to None (no prefetching). + transform (Transform, optional): Transform to be executed when + sample() is called. + To chain transforms use the :class:`torchrl.envs.Compose` class. + Transforms should be used with :class:`tensordict.TensorDict` + content. If used with other structures, the transforms should be + encoded with a ``"data"`` leading key that will be used to + construct a tensordict from the non-tensordict content. + batch_size (int, optional): the batch size to be used when sample() is + called. + .. note:: + The batch-size can be specified at construction time via the + ``batch_size`` argument, or at sampling time. The former should + be preferred whenever the batch-size is consistent across the + experiment. If the batch-size is likely to change, it can be + passed to the :meth:`~.sample` method. This option is + incompatible with prefetching (since this requires to know the + batch-size in advance) as well as with samplers that have a + ``drop_last`` argument. + priority_key (str, optional): the key at which priority is assumed to + be stored within TensorDicts added to this ReplayBuffer. + This is to be used when the sampler is of type + :class:`torchrl.data.PrioritizedSampler`. + Defaults to ``"td_error"``. + + Examples: + >>> import torch + >>> + >>> from torchrl.data import LazyTensorStorage, TensorDictReplayBuffer + >>> from tensordict import TensorDict + >>> + >>> torch.manual_seed(0) + >>> + >>> rb = TensorDictReplayBuffer(storage=LazyTensorStorage(10), batch_size=5) + >>> data = TensorDict({"a": torch.ones(10, 3), ("b", "c"): torch.zeros(10, 1, 1)}, [10]) + >>> rb.extend(data) + >>> sample = rb.sample(3) + >>> # samples keep track of the index + >>> print(sample) + TensorDict( + fields={ + a: Tensor(shape=torch.Size([3, 3]), device=cpu, dtype=torch.float32, is_shared=False), + b: TensorDict( + fields={ + c: Tensor(shape=torch.Size([3, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([3]), + device=cpu, + is_shared=False), + index: Tensor(shape=torch.Size([3]), device=cpu, dtype=torch.int32, is_shared=False)}, + batch_size=torch.Size([3]), + device=cpu, + is_shared=False) + >>> # we can iterate over the buffer + >>> for i, data in enumerate(rb): + ... print(i, data) + ... if i == 2: + ... 
break + 0 TensorDict( + fields={ + a: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.float32, is_shared=False), + b: TensorDict( + fields={ + c: Tensor(shape=torch.Size([5, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False), + index: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.int32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False) + 1 TensorDict( + fields={ + a: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.float32, is_shared=False), + b: TensorDict( + fields={ + c: Tensor(shape=torch.Size([5, 1, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False), + index: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.int32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False) + """ - def __init__(self, *args, priority_key: str = "td_error", **kw) -> None: - super().__init__(*args, **kw) + def __init__(self, *, priority_key: str = "td_error", **kw) -> None: + super().__init__(**kw) self.priority_key = priority_key def _get_priority(self, tensordict: TensorDictBase) -> Optional[torch.Tensor]: @@ -498,8 +724,8 @@ def update_tensordict_priority(self, data: TensorDictBase) -> None: def sample( self, batch_size: Optional[int] = None, - include_info: bool = False, return_info: bool = False, + include_info: bool = None, ) -> TensorDictBase: """Samples a batch of data from the replay buffer. @@ -509,7 +735,6 @@ def sample( batch_size (int, optional): size of data to be collected. If none is provided, this method will sample a batch-size as indicated by the sampler. - include_info (bool): whether to add info to the returned tensordict. return_info (bool): whether to return info. If True, the result is a tuple (data, info). If False, the result is the data. @@ -517,10 +742,18 @@ def sample( A tensordict containing a batch of data selected in the replay buffer. A tuple containing this tensordict and info if return_info flag is set to True. """ + if include_info is not None: + warnings.warn( + "include_info is going to be deprecated soon." + "The default behaviour has changed to `include_info=True` " + "to avoid bugs linked to wrongly preassigned values in the " + "output tensordict." + ) + data, info = super().sample(batch_size, return_info=True) - if include_info: + if include_info in (True, None): for k, v in info.items(): - data.set(k, torch.tensor(v, device=data.device), inplace=True) + data.set(k, torch.tensor(v, device=data.device)) if "_batch_size" in data.keys(): # we need to reset the batch-size shape = data.pop("_batch_size") @@ -539,40 +772,119 @@ def sample( class TensorDictPrioritizedReplayBuffer(TensorDictReplayBuffer): - """TensorDict-specific wrapper around the PrioritizedReplayBuffer class. + """TensorDict-specific wrapper around the :class:`torchrl.data.PrioritizedReplayBuffer` class. - This class returns tensordicts with a new key "index" that represents + All arguments are keyword-only arguments. + + This class returns tensordicts with a new key ``"index"`` that represents the index of each element in the replay buffer. It also provides the - 'update_tensordict_priority' method that only requires for the + :meth:`~.update_tensordict_priority` method that only requires for the tensordict to be passed to it with its new priority value. Args: - alpha (float): exponent α determines how much prioritization is - used, with α = 0 corresponding to the uniform case. 
+ alpha (float): exponent α determines how much prioritization is used, + with α = 0 corresponding to the uniform case. beta (float): importance sampling negative exponent. - priority_key (str, optional): key where the priority value can be - found in the stored tensordicts. Default is :obj:`"td_error"` - eps (float, optional): delta added to the priorities to ensure that the - buffer does not contain null priorities. - dtype (torch.dtype): type of the data. Can be torch.float or torch.double. + eps (float): delta added to the priorities to ensure that the buffer + does not contain null priorities. storage (Storage, optional): the storage to be used. If none is provided - a default ListStorage with max_size of 1_000 will be created. + a default :class:`torchrl.data.replay_buffers.ListStorage` with + ``max_size`` of ``1_000`` will be created. collate_fn (callable, optional): merges a list of samples to form a - mini-batch of Tensor(s)/outputs. Used when using batched loading - from a map-style dataset. - pin_memory (bool, optional): whether pin_memory() should be called on - the rb samples. Default is ``False``. + mini-batch of Tensor(s)/outputs. Used when using batched + loading from a map-style dataset. The default value will be decided + based on the storage type. + pin_memory (bool): whether pin_memory() should be called on the rb + samples. prefetch (int, optional): number of next batches to be prefetched - using multithreading. - transform (Transform, optional): Transform to be executed when sample() is called. - To chain transforms use the :obj:`Compose` class. + using multithreading. Defaults to None (no prefetching). + transform (Transform, optional): Transform to be executed when + sample() is called. + To chain transforms use the :class:`torchrl.envs.Compose` class. + Transforms should be used with :class:`tensordict.TensorDict` + content. If used with other structures, the transforms should be + encoded with a ``"data"`` leading key that will be used to + construct a tensordict from the non-tensordict content. + batch_size (int, optional): the batch size to be used when sample() is + called. + .. note:: + The batch-size can be specified at construction time via the + ``batch_size`` argument, or at sampling time. The former should + be preferred whenever the batch-size is consistent across the + experiment. If the batch-size is likely to change, it can be + passed to the :meth:`~.sample` method. This option is + incompatible with prefetching (since this requires to know the + batch-size in advance) as well as with samplers that have a + ``drop_last`` argument. + priority_key (str, optional): the key at which priority is assumed to + be stored within TensorDicts added to this ReplayBuffer. + This is to be used when the sampler is of type + :class:`torchrl.data.PrioritizedSampler`. + Defaults to ``"td_error"``. reduction (str, optional): the reduction method for multidimensional tensordicts (ie stored trajectories). Can be one of "max", "min", "median" or "mean". 
+ + Examples: + >>> import torch + >>> + >>> from torchrl.data import LazyTensorStorage, TensorDictPrioritizedReplayBuffer + >>> from tensordict import TensorDict + >>> + >>> torch.manual_seed(0) + >>> + >>> rb = TensorDictPrioritizedReplayBuffer(alpha=0.7, beta=1.1, storage=LazyTensorStorage(10), batch_size=5) + >>> data = TensorDict({"a": torch.ones(10, 3), ("b", "c"): torch.zeros(10, 3, 1)}, [10]) + >>> rb.extend(data) + >>> print("len of rb", len(rb)) + len of rb 10 + >>> sample = rb.sample(5) + >>> print(sample) + TensorDict( + fields={ + _weight: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, is_shared=False), + a: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.float32, is_shared=False), + b: TensorDict( + fields={ + c: Tensor(shape=torch.Size([5, 3, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False), + index: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.int64, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False) + >>> print("index", sample["index"]) + index tensor([9, 5, 2, 2, 7]) + >>> # give a high priority to these samples... + >>> sample.set("td_error", 100*torch.ones(sample.shape)) + >>> # and update priority + >>> rb.update_tensordict_priority(sample) + >>> # the new sample should have a high overlap with the previous one + >>> sample = rb.sample(5) + >>> print(sample) + TensorDict( + fields={ + _weight: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.float32, is_shared=False), + a: Tensor(shape=torch.Size([5, 3]), device=cpu, dtype=torch.float32, is_shared=False), + b: TensorDict( + fields={ + c: Tensor(shape=torch.Size([5, 3, 1]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False), + index: Tensor(shape=torch.Size([5]), device=cpu, dtype=torch.int64, is_shared=False)}, + batch_size=torch.Size([5]), + device=cpu, + is_shared=False) + >>> print("index", sample["index"]) + index tensor([2, 5, 5, 9, 7]) + """ def __init__( self, + *, alpha: float, beta: float, priority_key: str = "td_error", @@ -612,10 +924,12 @@ def __init__(self, *args, **kwargs): def sample( self, batch_size: Optional[int] = None, - include_info: bool = False, + include_info: bool = None, return_info: bool = False, ) -> TensorDictBase: - return super().sample(batch_size, include_info, return_info) + return super().sample( + batch_size=batch_size, include_info=include_info, return_info=return_info + ) def add(self, data: TensorDictBase) -> int: return super().add(data) diff --git a/torchrl/data/replay_buffers/samplers.py b/torchrl/data/replay_buffers/samplers.py index 9fd0fab8af4..564b1197c2c 100644 --- a/torchrl/data/replay_buffers/samplers.py +++ b/torchrl/data/replay_buffers/samplers.py @@ -137,6 +137,10 @@ def sample(self, storage: Storage, batch_size: int) -> Tuple[Any, dict]: def ran_out(self): return self._ran_out + @ran_out.setter + def ran_out(self, value): + self._ran_out = value + class PrioritizedSampler(Sampler): """Prioritized sampler for replay buffer. 
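The batch-size handling introduced in this patch can be summarised with a small, self-contained sketch (class names and import paths follow the modules touched above; treat it as illustrative rather than as part of the patch itself):

    import torch

    from torchrl.data import ListStorage, ReplayBuffer
    from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement

    # Fixing the batch-size at construction time lets sample() be called
    # without arguments and makes the buffer directly iterable.
    rb = ReplayBuffer(
        storage=ListStorage(10),
        sampler=SamplerWithoutReplacement(drop_last=False),
        batch_size=5,
    )
    rb.extend(torch.zeros(10))

    for batch in rb:  # yields 2 batches of 5 elements, until the sampler runs out
        assert (batch == 0).all()
    batch = rb.sample()  # the constructor batch-size is reused here

The tests added in the next commit (``test_multi_loops`` and ``test_batch_errors``) exercise exactly this pattern, including the error raised when a sampler with ``drop_last=True`` is combined with an unspecified batch-size.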
From d23af8b1760a0879af6592922db350dc23771d7d Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 11:06:51 +0100 Subject: [PATCH 37/89] tests --- test/test_rb.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/test/test_rb.py b/test/test_rb.py index 6c829ca5668..8d75d287236 100644 --- a/test/test_rb.py +++ b/test/test_rb.py @@ -580,6 +580,54 @@ def test_index(self, rbtype, storage, size, prefetch): assert b +def test_multi_loops(): + """Tests that one can iterate multiple times over a buffer without rep.""" + rb = ReplayBuffer( + batch_size=5, storage=ListStorage(10), sampler=SamplerWithoutReplacement() + ) + rb.extend(torch.zeros(10)) + for i, d in enumerate(rb): # noqa: B007 + assert (d == 0).all() + assert i == 1 + for i, d in enumerate(rb): # noqa: B007 + assert (d == 0).all() + assert i == 1 + + +def test_batch_errors(): + """Tests error messages related to batch-size""" + rb = ReplayBuffer( + storage=ListStorage(10), sampler=SamplerWithoutReplacement(drop_last=False) + ) + rb.extend(torch.zeros(10)) + rb.sample(3) # that works + with pytest.raises( + RuntimeError, + match="Cannot iterate over the replay buffer. Batch_size was not specified", + ): + for _ in rb: + pass + with pytest.raises(RuntimeError, match="batch_size not specified"): + rb.sample() + with pytest.raises(ValueError, match="Samplers with drop_last=True"): + ReplayBuffer( + storage=ListStorage(10), sampler=SamplerWithoutReplacement(drop_last=True) + ) + # that works + ReplayBuffer( + storage=ListStorage(10), + ) + rb = ReplayBuffer( + storage=ListStorage(10), + sampler=SamplerWithoutReplacement(drop_last=False), + batch_size=3, + ) + rb.extend(torch.zeros(10)) + for _ in rb: + pass + rb.sample() + + @pytest.mark.parametrize("priority_key", ["pk", "td_error"]) @pytest.mark.parametrize("contiguous", [True, False]) @pytest.mark.parametrize("device", get_available_devices()) From 68c34425b1c9fb8658c440b91c82a4245b49a5a3 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 12:04:00 +0100 Subject: [PATCH 38/89] empty commit From dec5c56ba3a7b6742fd233e8f8d1ac16655ca54c Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 13:16:20 +0100 Subject: [PATCH 39/89] tests --- test/test_trainer.py | 18 +++++++------ test/test_transforms.py | 56 ++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/test/test_trainer.py b/test/test_trainer.py index 1251d4edd48..533fd4f0b0d 100644 --- a/test/test_trainer.py +++ b/test/test_trainer.py @@ -203,7 +203,9 @@ def test_rb_trainer(self, prioritized): S = 100 storage = ListStorage(S) if prioritized: - replay_buffer = TensorDictPrioritizedReplayBuffer(1.1, 0.9, storage=storage) + replay_buffer = TensorDictPrioritizedReplayBuffer( + alpha=1.1, beta=0.9, storage=storage + ) else: replay_buffer = TensorDictReplayBuffer(storage=storage) @@ -260,8 +262,8 @@ def test_rb_trainer_state_dict(self, prioritized, storage_type): if prioritized: replay_buffer = TensorDictPrioritizedReplayBuffer( - 1.1, - 0.9, + alpha=1.1, + beta=0.9, storage=storage, ) else: @@ -293,7 +295,7 @@ def test_rb_trainer_state_dict(self, prioritized, storage_type): trainer2 = mocking_trainer() if prioritized: replay_buffer2 = TensorDictPrioritizedReplayBuffer( - 1.1, 0.9, storage=storage + alpha=1.1, beta=0.9, storage=storage ) else: replay_buffer2 = TensorDictReplayBuffer(storage=storage) @@ -398,8 +400,8 @@ def make_storage(): storage = make_storage() if prioritized: replay_buffer = 
TensorDictPrioritizedReplayBuffer( - 1.1, - 0.9, + alpha=1.1, + beta=0.9, storage=storage, ) else: @@ -430,8 +432,8 @@ def make_storage(): storage2 = make_storage() if prioritized: replay_buffer2 = TensorDictPrioritizedReplayBuffer( - 1.1, - 0.9, + alpha=1.1, + beta=0.9, storage=storage2, ) else: diff --git a/test/test_transforms.py b/test/test_transforms.py index 0b7d9391e6a..b28cb9a758a 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -281,7 +281,7 @@ def test_transform_rb(self): batch = [20] torch.manual_seed(0) br = BinarizeReward() - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(br) reward = torch.randn(*batch, 1, device=device) misc = torch.randn(*batch, 1, device=device) @@ -419,7 +419,7 @@ def test_transform_rb(self): key_tensors = [key1_tensor, key2_tensor] td = TensorDict(dict(zip(keys, key_tensors)), batch_size, device=device) cat_frames = CatFrames(N=N, in_keys=keys, dim=dim) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(cat_frames) rb.extend(td) with pytest.raises( @@ -651,7 +651,7 @@ def test_transform_rb(self, model, device): out_keys=out_keys, tensor_pixels_keys=tensor_pixels_key, ) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(r3m) td = TensorDict({"pixels": torch.randint(255, (10, 244, 244, 3))}, [10]) rb.extend(td) @@ -1027,7 +1027,7 @@ def test_transform_env(self): def test_transform_rb(self): transform = StepCounter(10) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) td = TensorDict({"a": torch.randn(10)}, [10]) rb.extend(td) rb.append_transform(transform) @@ -1345,7 +1345,7 @@ def test_transform_rb(self): dim=-1, del_keys=True, ) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(ct) td = ( TensorDict( @@ -1525,7 +1525,7 @@ def test_transform_rb( batch, ) td.set("dont touch", dont_touch.clone()) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(cc) rb.extend(td) td = rb.sample(10) @@ -1668,7 +1668,7 @@ def test_transform_model(self, include_forward): @pytest.mark.parametrize("include_forward", [True, False]) def test_transform_rb(self, include_forward): - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) t = DiscreteActionProjection(7, 10, include_forward=include_forward) rb.append_transform(t) td = TensorDict( @@ -1863,7 +1863,7 @@ def test_transform_model(self, dtype_fixture): # noqa: F811 def test_transform_rb( self, ): - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) t = DoubleToFloat(in_keys=["observation"], in_keys_inv=["action"]) rb.append_transform(t) td = TensorDict( @@ -2029,7 +2029,7 @@ def test_transform_model(self): def test_transform_rb(self): t = ExcludeTransform("a") - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict( { @@ -2193,7 +2193,7 @@ def test_transform_model(self): def test_transform_rb(self): t = SelectTransform("b", "c") - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict( { @@ -2377,7 +2377,7 @@ def test_transform_model(self, out_keys): def test_transform_rb(self, out_keys): t = FlattenObservation(-3, -1, 
out_keys=out_keys) td = TensorDict({"pixels": torch.randint(255, (10, 10, 3))}, []).expand(10) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) rb.extend(td) td = rb.sample(2) @@ -2480,7 +2480,7 @@ def test_transform_model(self): def test_transform_rb(self): t = FrameSkipTransform(2) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) tensordict = TensorDict({"a": torch.zeros(10)}, [10]) rb.extend(tensordict) @@ -2678,7 +2678,7 @@ def test_transform_model(self, out_keys): @pytest.mark.parametrize("out_keys", [None, ["stuff"]]) def test_transform_rb(self, out_keys): td = TensorDict({"pixels": torch.rand(3, 12, 12)}, []).expand(3) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(GrayScale(out_keys=out_keys)) rb.extend(td) r = rb.sample(3) @@ -2751,7 +2751,7 @@ def test_transform_model(self): def test_transform_rb(self): t = NoopResetEnv() - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict({}, [10]) rb.extend(td) @@ -3025,7 +3025,7 @@ def test_transform_rb(self): standard_normal=standard_normal, ) ) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) obs = torch.randn(7) @@ -3449,7 +3449,7 @@ def test_transform_model(self): def test_transform_rb(self): t = Resize(20, 21, in_keys=["pixels"]) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict({"pixels": torch.randn(3, 32, 32)}, []).expand(10) rb.extend(td) @@ -3527,7 +3527,7 @@ def test_transform_model(self): def test_transform_rb(self): t = RewardClipping(-0.1, 0.1) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) td = TensorDict({"reward": torch.randn(10)}, []).expand(10) rb.append_transform(t) rb.extend(td) @@ -3677,7 +3677,7 @@ def test_transform_rb(self, standard_normal): loc = 0.5 scale = 1.5 t = RewardScaling(0.5, 1.5, standard_normal=standard_normal) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) reward = torch.randn(10) td = TensorDict({"reward": reward}, []).expand(10) rb.append_transform(t) @@ -3768,7 +3768,7 @@ def test_transform_rb( self, ): t = RewardSum() - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) reward = torch.randn(10) td = TensorDict({("next", "reward"): reward}, []).expand(10) rb.append_transform(t) @@ -4102,7 +4102,7 @@ def test_transform_rb(self, out_keys, unsqueeze_dim): out_keys=out_keys, allow_positive_dim=True, ) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict( {"observation": TensorDict({"stuff": torch.randn(3, 4)}, [3, 4])}, [] @@ -4349,7 +4349,7 @@ def test_transform_rb(self, out_keys): out_keys=out_keys, allow_positive_dim=True, ) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict( {"observation": TensorDict({"stuff": torch.randn(3, 1, 4)}, [3, 1, 4])}, [] @@ -4544,7 +4544,7 @@ def test_transform_model(self, out_keys): @pytest.mark.parametrize("out_keys", [None, ["stuff"]]) def test_transform_rb(self, out_keys): t = ToTensorImage(in_keys=["pixels"], out_keys=out_keys) - rb = 
ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict({"pixels": torch.randint(255, (21, 22, 3))}, []) rb.extend(td.expand(10)) @@ -4587,7 +4587,7 @@ def test_transform_model(self): def test_transform_rb(self): batch_size = (2,) t = TensorDictPrimer(mykey=UnboundedContinuousTensorSpec([*batch_size, 3])) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict({"a": torch.zeros(())}, []) rb.extend(td.expand(10)) @@ -4882,7 +4882,7 @@ def test_transform_rb(self): in_keys=["observation"], T=3, ) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(t) rb.extend(td) with pytest.raises( @@ -5010,7 +5010,7 @@ def test_transform_rb(self): action_dim = 5 batch_size = (2,) t = gSDENoise(state_dim=state_dim, action_dim=action_dim, shape=batch_size) - rb = ReplayBuffer(LazyTensorStorage(10)) + rb = ReplayBuffer(storage=LazyTensorStorage(10)) rb.append_transform(t) td = TensorDict({"a": torch.zeros(())}, []) rb.extend(td.expand(10)) @@ -5158,7 +5158,7 @@ def test_transform_rb(self, model, device): out_keys=out_keys, tensor_pixels_keys=tensor_pixels_key, ) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(vip) td = TensorDict({"pixels": torch.randint(255, (10, 244, 244, 3))}, [10]) rb.extend(td) @@ -6583,7 +6583,7 @@ def test_transform_rb(self, create_copy, inverse): else: t = RenameTransform(["a"], ["b"], ["a"], ["b"], create_copy=create_copy) tensordict = TensorDict({"b": torch.randn(())}, []).expand(10) - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(t) rb.extend(tensordict) assert "a" in rb._storage._storage.keys() @@ -6679,7 +6679,7 @@ def test_transform_model(self): def test_transform_rb(self): batch = [1] device = "cpu" - rb = ReplayBuffer(LazyTensorStorage(20)) + rb = ReplayBuffer(storage=LazyTensorStorage(20)) rb.append_transform(InitTracker()) reward = torch.randn(*batch, 1, device=device) misc = torch.randn(*batch, 1, device=device) From 76120983cde50ca84e3d52071c81f184e83ed518 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 13:18:29 +0100 Subject: [PATCH 40/89] tests --- test/test_rb_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_rb_distributed.py b/test/test_rb_distributed.py index 252913500f3..7443601c76d 100644 --- a/test/test_rb_distributed.py +++ b/test/test_rb_distributed.py @@ -53,7 +53,7 @@ def sample_from_buffer_remotely_returns_correct_tensordict_test(rank, name, worl _, inserted = _add_random_tensor_dict_to_buffer(buffer) sampled = _sample_from_buffer(buffer, 1) assert type(sampled) is type(inserted) is TensorDict - assert (sampled == inserted)["a"].item() + assert (sampled["a"] == inserted["a"]).all() @pytest.mark.skipif( From 69eb921397db5dcac91c2f99b696ca59ed0ff8a7 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 13:25:32 +0100 Subject: [PATCH 41/89] amend --- tutorials/sphinx-tutorials/coding_ddpg.py | 2 ++ tutorials/sphinx-tutorials/coding_dqn.py | 30 +++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 0d1c353d472..53fd830f3be 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -394,6 +394,8 @@ def 
_forward(self, input_tensordict: TensorDictBase) -> TensorDict: class DDPGLoss(LossModule): default_value_estimator = default_value_estimator + make_value_estimator = make_value_estimator + __init__ = _init forward = _forward loss_value = _loss_value diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 9313e6e8b08..a0943107486 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -84,6 +84,8 @@ # of this algorithm. # sphinx_gallery_start_ignore +import os +import uuid import warnings from torchrl.objectives import DQNLoss, SoftUpdate @@ -550,7 +552,8 @@ def get_loss_module(actor, gamma): optimizer = torch.optim.Adam( loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas ) -logger = CSVLogger(exp_name="dqn_exp", log_dir="./") +exp_name = f"dqn_exp_{uuid.uuid1()}" +logger = CSVLogger(exp_name=exp_name, log_dir="./") trainer = Trainer( collector=collector, @@ -613,7 +616,30 @@ def get_loss_module(actor, gamma): ############################################################################### # We can now quickly check the CSVs with the results. -# TODO +def print_csv_files_in_folder(folder_path): + """ + Find all CSV files in a folder and print the first 10 lines of each file. + + Args: + folder_path (str): The relative path to the folder. + + Returns: + list: A list of all CSV files in the folder. + """ + csv_files = [] + for file in os.listdir(folder_path): + if file.endswith(".csv"): + csv_files.append(os.path.join(folder_path, file)) + for csv_file in csv_files: + print(f"File: {csv_file}") + with open(csv_file, "r") as f: + for i, line in enumerate(f): + if i == 10: + break + print(line.strip()) + print("\n") + +print_csv_files_in_folder(exp_name) ############################################################################### # Conclusion and possible improvements From 5d32e102d2461e772f07b34e5db7546fc5ca6cdb Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 13:53:38 +0100 Subject: [PATCH 42/89] amend --- torchrl/data/replay_buffers/replay_buffers.py | 2 ++ tutorials/sphinx-tutorials/coding_dqn.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/torchrl/data/replay_buffers/replay_buffers.py b/torchrl/data/replay_buffers/replay_buffers.py index 0a20dc6dff7..a8044c063ec 100644 --- a/torchrl/data/replay_buffers/replay_buffers.py +++ b/torchrl/data/replay_buffers/replay_buffers.py @@ -482,6 +482,8 @@ def extend(self, tensordicts: Union[List, TensorDictBase]) -> torch.Tensor: return index def update_tensordict_priority(self, data: TensorDictBase) -> None: + if not isinstance(self._sampler, PrioritizedSampler): + return priority = torch.tensor( [self._get_priority(td) for td in data], dtype=torch.float, diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index a0943107486..21fda673cd2 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -616,6 +616,7 @@ def get_loss_module(actor, gamma): ############################################################################### # We can now quickly check the CSVs with the results. + def print_csv_files_in_folder(folder_path): """ Find all CSV files in a folder and print the first 10 lines of each file. 
@@ -639,6 +640,7 @@ def print_csv_files_in_folder(folder_path): print(line.strip()) print("\n") + print_csv_files_in_folder(exp_name) ############################################################################### From c471b96a31d2ade23d50566c82674f5c4409e3eb Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:03:05 +0100 Subject: [PATCH 43/89] fix examples --- torchrl/trainers/helpers/replay_buffer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchrl/trainers/helpers/replay_buffer.py b/torchrl/trainers/helpers/replay_buffer.py index 4f9c48bf4b9..229a22cbe8e 100644 --- a/torchrl/trainers/helpers/replay_buffer.py +++ b/torchrl/trainers/helpers/replay_buffer.py @@ -35,6 +35,7 @@ def make_replay_buffer( sampler=sampler, pin_memory=device != torch.device("cpu"), prefetch=cfg.buffer_prefetch, + batch_size=cfg.batch_size, ) return buffer From d9ab47770cfaeeb9097a01dea9baa7bf9b7cbeb4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:12:27 +0100 Subject: [PATCH 44/89] fix dqn updater --- tutorials/sphinx-tutorials/coding_dqn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 21fda673cd2..15d98ad092a 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -546,6 +546,8 @@ def get_loss_module(actor, gamma): # Get model actor, actor_explore = make_model(test_env) loss_module, target_net_updater = get_loss_module(actor, gamma) +target_net_updater.init_() + collector = get_collector( stats, num_collectors, actor_explore, frames_per_batch, total_frames, device ) From fb7d5de0dc2122de9fab64346c99c98cdd3e50a6 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:18:05 +0100 Subject: [PATCH 45/89] fix doc --- docs/source/reference/data.rst | 4 +--- docs/source/reference/envs.rst | 1 - docs/source/reference/objectives.rst | 2 +- torchrl/data/datasets/__init__.py | 1 + torchrl/data/postprocs/postprocs.py | 6 +++--- torchrl/objectives/common.py | 2 +- torchrl/objectives/utils.py | 4 ++-- 7 files changed, 9 insertions(+), 11 deletions(-) diff --git a/docs/source/reference/data.rst b/docs/source/reference/data.rst index 7a0143e746e..079e5877654 100644 --- a/docs/source/reference/data.rst +++ b/docs/source/reference/data.rst @@ -180,8 +180,6 @@ Here's an example: .. currentmodule:: torchrl.data.datasets -.. currentmodule:: torchrl.data.datasets - .. autosummary:: :toctree: generated/ :template: rl_template.rst @@ -220,7 +218,7 @@ Check the :obj:`torchrl.envs.utils.check_env_specs` method for a sanity check. Utils ----- -.. currentmodule:: torchrl.data.datasets +.. currentmodule:: torchrl.data .. autosummary:: :toctree: generated/ diff --git a/docs/source/reference/envs.rst b/docs/source/reference/envs.rst index 8b661bfa391..430dea36996 100644 --- a/docs/source/reference/envs.rst +++ b/docs/source/reference/envs.rst @@ -114,7 +114,6 @@ provides more information on how to design a custom environment from scratch. 
EnvBase GymLikeEnv EnvMetaData - Specs Vectorized envs --------------- diff --git a/docs/source/reference/objectives.rst b/docs/source/reference/objectives.rst index 1eb9d17bb16..384117de4c9 100644 --- a/docs/source/reference/objectives.rst +++ b/docs/source/reference/objectives.rst @@ -218,5 +218,5 @@ Utils next_state_value SoftUpdate HardUpdate - ValueFunctions + ValueEstimators default_value_kwargs diff --git a/torchrl/data/datasets/__init__.py b/torchrl/data/datasets/__init__.py index 6fcc35a0d46..81a668648d0 100644 --- a/torchrl/data/datasets/__init__.py +++ b/torchrl/data/datasets/__init__.py @@ -1 +1,2 @@ from .d4rl import D4RLExperienceReplay +from .openml import OpenMLExperienceReplay diff --git a/torchrl/data/postprocs/postprocs.py b/torchrl/data/postprocs/postprocs.py index dba8cafde4a..21f51115d6c 100644 --- a/torchrl/data/postprocs/postprocs.py +++ b/torchrl/data/postprocs/postprocs.py @@ -82,9 +82,9 @@ def _get_reward( class MultiStep(nn.Module): """Multistep reward transform. - Presented in 'Sutton, R. S. 1988. Learning to - predict by the methods of temporal differences. Machine learning 3( - 1):9–44.' + Presented in + + | Sutton, R. S. 1988. Learning to predict by the methods of temporal differences. Machine learning 3(1):9–44. This module maps the "next" observation to the t + n "next" observation. It is an identity transform whenever :attr:`n_steps` is 0. diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 770d3f3e406..9c37b1cbdca 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -392,7 +392,7 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): this method. Args: - value_type (ValueEstimators): A :class:`torchrl.objectives.utils.ValueFunctions` + value_type (ValueEstimators): A :class:`torchrl.objectives.utils.ValueEstimators` enum type indicating the value function to use. **hyperparams: hyperparameters to use for the value function. If not provided, the value indicated by diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 3daf5e70876..9d393a51d05 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -18,7 +18,7 @@ _GAMMA_LMBDA_DEPREC_WARNING = ( "Passing gamma / lambda parameters through the loss constructor " "is deprecated and will be removed soon. To customize your value function, " - "run `loss_module.make_value_estimator(ValueFunctions., gamma=val)`." + "run `loss_module.make_value_estimator(ValueEstimators., gamma=val)`." ) @@ -45,7 +45,7 @@ def default_value_kwargs(value_type: ValueEstimators): Args: value_type (Enum.value): the value function type, from the - :class:`torchrl.objectives.utils.ValueFunctions` class. + :class:`torchrl.objectives.utils.ValueEstimators` class. 
Examples: >>> kwargs = default_value_kwargs(ValueEstimators.TDLambda) From 180b5b2becceef318c20291c9c198f3e49b5cf1e Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:21:46 +0100 Subject: [PATCH 46/89] print td shape --- torchrl/data/datasets/openml.py | 9 +++++++-- tutorials/sphinx-tutorials/coding_ddpg.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/torchrl/data/datasets/openml.py b/torchrl/data/datasets/openml.py index 78b90793682..76ccb66f601 100644 --- a/torchrl/data/datasets/openml.py +++ b/torchrl/data/datasets/openml.py @@ -8,8 +8,13 @@ import numpy as np from tensordict.tensordict import TensorDict -from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer -from torchrl.data.replay_buffers import Sampler, SamplerWithoutReplacement, Writer +from torchrl.data.replay_buffers import ( + LazyMemmapStorage, + Sampler, + SamplerWithoutReplacement, + TensorDictReplayBuffer, + Writer, +) class OpenMLExperienceReplay(TensorDictReplayBuffer): diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 53fd830f3be..3b4176a817c 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -1063,6 +1063,7 @@ def make_replay_buffer(buffer_size, prefetch=3): # extend the replay buffer with the new data current_frames = tensordict.numel() collected_frames += current_frames + print("Tensordict shape: ", tensordict.shape) replay_buffer.extend(tensordict.cpu()) # optimization steps From 921b91b954493be83d6b7f1c65632ca1035ec0dc Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:33:01 +0100 Subject: [PATCH 47/89] fix recorder --- test/test_trainer.py | 8 ++++---- torchrl/trainers/trainers.py | 17 +++++++++-------- tutorials/sphinx-tutorials/coding_ddpg.py | 1 - 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_trainer.py b/test/test_trainer.py index 8544efb6b6f..c83460eb753 100644 --- a/test/test_trainer.py +++ b/test/test_trainer.py @@ -859,7 +859,7 @@ def test_recorder(self, N=8): with tempfile.TemporaryDirectory() as folder: logger = TensorboardLogger(exp_name=folder) - recorder = transformed_env_constructor( + environment = transformed_env_constructor( args, video_tag="tmp", norm_obs_only=True, @@ -871,7 +871,7 @@ def test_recorder(self, N=8): record_frames=args.record_frames, frame_skip=args.frame_skip, policy_exploration=None, - recorder=recorder, + environment=environment, record_interval=args.record_interval, ) trainer = mocking_trainer() @@ -933,7 +933,7 @@ def _make_recorder_and_trainer(tmpdirname): raise NotImplementedError trainer = mocking_trainer(file) - recorder = transformed_env_constructor( + environment = transformed_env_constructor( args, video_tag="tmp", norm_obs_only=True, @@ -945,7 +945,7 @@ def _make_recorder_and_trainer(tmpdirname): record_frames=args.record_frames, frame_skip=args.frame_skip, policy_exploration=None, - recorder=recorder, + environment=environment, record_interval=args.record_interval, ) recorder.register(trainer) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 1608f853ad4..52d58542442 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -1163,10 +1163,11 @@ def __init__( ) -> None: if environment is None and recorder is not None: warnings.warn(self.ENV_DEPREC) + environment = recorder elif environment is not None and recorder is not None: raise ValueError("environment and recorder conflict.") self.policy_exploration = policy_exploration 
- self.recorder = recorder + self.environment = environment self.record_frames = record_frames self.frame_skip = frame_skip self._count = 0 @@ -1189,8 +1190,8 @@ def __call__(self, batch: TensorDictBase) -> Dict: with set_exploration_mode(self.exploration_mode): if isinstance(self.policy_exploration, torch.nn.Module): self.policy_exploration.eval() - self.recorder.eval() - td_record = self.recorder.rollout( + self.environment.eval() + td_record = self.environment.rollout( policy=self.policy_exploration, max_steps=self.record_frames, auto_reset=True, @@ -1199,8 +1200,8 @@ def __call__(self, batch: TensorDictBase) -> Dict: ).clone() if isinstance(self.policy_exploration, torch.nn.Module): self.policy_exploration.train() - self.recorder.train() - self.recorder.transform.dump(suffix=self.suffix) + self.environment.train() + self.environment.transform.dump(suffix=self.suffix) out = {} for key in self.log_keys: @@ -1214,18 +1215,18 @@ def __call__(self, batch: TensorDictBase) -> Dict: out[self.out_keys[key]] = value out["log_pbar"] = self.log_pbar self._count += 1 - self.recorder.close() + self.environment.close() return out def state_dict(self) -> Dict: return { "_count": self._count, - "recorder_state_dict": self.recorder.state_dict(), + "recorder_state_dict": self.environment.state_dict(), } def load_state_dict(self, state_dict: Dict) -> None: self._count = state_dict["_count"] - self.recorder.load_state_dict(state_dict["recorder_state_dict"]) + self.environment.load_state_dict(state_dict["recorder_state_dict"]) def register(self, trainer: Trainer, name: str = "recorder"): trainer.register_module(name, self) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 3b4176a817c..53fd830f3be 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -1063,7 +1063,6 @@ def make_replay_buffer(buffer_size, prefetch=3): # extend the replay buffer with the new data current_frames = tensordict.numel() collected_frames += current_frames - print("Tensordict shape: ", tensordict.shape) replay_buffer.extend(tensordict.cpu()) # optimization steps From 8984654d4bbaf1d833955af32116dace1e6359e4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:39:33 +0100 Subject: [PATCH 48/89] fix examples --- examples/discrete_sac/discrete_sac.py | 6 +++--- examples/iql/iql_online.py | 13 +++++++++---- examples/td3/td3.py | 14 ++++++++++---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/examples/discrete_sac/discrete_sac.py b/examples/discrete_sac/discrete_sac.py index 6fc101ff533..987571747f6 100644 --- a/examples/discrete_sac/discrete_sac.py +++ b/examples/discrete_sac/discrete_sac.py @@ -44,7 +44,7 @@ def make_replay_buffer( batch_size=256, buffer_scratch_dir="/tmp/", device="cpu", - make_replay_buffer=3, + prefetch=3, ): if prb: replay_buffer = TensorDictPrioritizedReplayBuffer( @@ -52,7 +52,7 @@ def make_replay_buffer( beta=0.5, pin_memory=False, batch_size=batch_size, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, @@ -63,7 +63,7 @@ def make_replay_buffer( replay_buffer = TensorDictReplayBuffer( pin_memory=False, batch_size=batch_size, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, diff --git a/examples/iql/iql_online.py b/examples/iql/iql_online.py index 4dcc5bea747..1512f471f10 100644 --- a/examples/iql/iql_online.py +++ 
b/examples/iql/iql_online.py @@ -36,33 +36,36 @@ def env_maker(env_name, frame_skip=1, device="cpu", from_pixels=False): def make_replay_buffer( + batch_size, prb=False, buffer_size=1000000, buffer_scratch_dir="/tmp/", device="cpu", - make_replay_buffer=3, + prefetch=3, ): if prb: replay_buffer = TensorDictPrioritizedReplayBuffer( alpha=0.7, beta=0.5, pin_memory=False, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, device=device, ), + batch_size=batch_size, ) else: replay_buffer = TensorDictReplayBuffer( pin_memory=False, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, device=device, ), + batch_size=batch_size, ) return replay_buffer @@ -218,7 +221,9 @@ def env_factory(num_workers): collector.set_seed(cfg.seed) # Make Replay Buffer - replay_buffer = make_replay_buffer(buffer_size=cfg.buffer_size, device=device) + replay_buffer = make_replay_buffer( + buffer_size=cfg.buffer_size, device=device, batch_size=cfg.batch_size + ) # Optimizers params = list(loss_module.parameters()) diff --git a/examples/td3/td3.py b/examples/td3/td3.py index 659da599240..a285c29acef 100644 --- a/examples/td3/td3.py +++ b/examples/td3/td3.py @@ -60,33 +60,36 @@ def apply_env_transforms(env, reward_scaling=1.0): def make_replay_buffer( + batch_size, prb=False, buffer_size=1000000, buffer_scratch_dir="/tmp/", device="cpu", - make_replay_buffer=3, + prefetch=3, ): if prb: replay_buffer = TensorDictPrioritizedReplayBuffer( alpha=0.7, beta=0.5, pin_memory=False, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, device=device, ), + batch_size=batch_size, ) else: replay_buffer = TensorDictReplayBuffer( pin_memory=False, - prefetch=make_replay_buffer, + prefetch=prefetch, storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, device=device, ), + batch_size=batch_size, ) return replay_buffer @@ -239,7 +242,10 @@ def main(cfg: "DictConfig"): # noqa: F821 # Make Replay Buffer replay_buffer = make_replay_buffer( - prb=cfg.prb, buffer_size=cfg.buffer_size, device=device + batch_size=cfg.batch_size, + prb=cfg.prb, + buffer_size=cfg.buffer_size, + device=device, ) # Optimizers From a3f76d18f5d776e86826fcacd5b3fbeba99cda89 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:46:14 +0100 Subject: [PATCH 49/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 53fd830f3be..b1b74764c44 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -68,7 +68,7 @@ from tensordict.nn import TensorDictModule from tensordict.tensordict import TensorDict, TensorDictBase from torch import nn, optim -from torchrl.collectors import MultiaSyncDataCollector +from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector from torchrl.data import CompositeSpec, TensorDictReplayBuffer from torchrl.data.postprocs import MultiStep from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler @@ -987,8 +987,10 @@ def make_replay_buffer(buffer_size, prefetch=3): else: multistep = None -collector = MultiaSyncDataCollector( - create_env_fn=[create_env_fn, create_env_fn], +warnings.warn("Change collector!!") + +collector = SyncDataCollector( + 
create_env_fn=create_env_fn, policy=actor_model_explore, total_frames=total_frames, max_frames_per_traj=max_frames_per_traj, @@ -998,7 +1000,7 @@ def make_replay_buffer(buffer_size, prefetch=3): postproc=multistep, split_trajs=True, device=device, # device for execution - storing_devices=[device, device], # device where data will be stored and passed + storing_device=device, # device where data will be stored and passed update_at_each_batch=False, exploration_mode="random", ) From a10900acc6c94bded874b330f081798735761e40 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:46:55 +0100 Subject: [PATCH 50/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index b1b74764c44..288bc8a508f 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -818,7 +818,8 @@ def make_replay_buffer(buffer_size, prefetch=3): ) # Number of environments in each data collector -env_per_collector = 2 +warnings.warn("More envs!") +env_per_collector = 1 # Total frames we will use during training. Scale up to 500K - 1M for a more # meaningful training From 2e65eef20f5d8483afb689e359d8d473f449a772 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:49:31 +0100 Subject: [PATCH 51/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 2062 ++++++++++----------- 1 file changed, 1031 insertions(+), 1031 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 288bc8a508f..81542a4c1d0 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -60,1089 +60,1089 @@ warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore - -import numpy as np -import torch.cuda -import tqdm -from matplotlib import pyplot as plt -from tensordict.nn import TensorDictModule -from tensordict.tensordict import TensorDict, TensorDictBase -from torch import nn, optim -from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector -from torchrl.data import CompositeSpec, TensorDictReplayBuffer -from torchrl.data.postprocs import MultiStep -from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler -from torchrl.data.replay_buffers.storages import LazyMemmapStorage -from torchrl.envs import ( - CatTensors, - DoubleToFloat, - EnvCreator, - ObservationNorm, - ParallelEnv, -) -from torchrl.envs.libs.dm_control import DMControlEnv -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.transforms import RewardScaling, TransformedEnv -from torchrl.envs.utils import set_exploration_mode -from torchrl.modules import ( - Actor, - ActorCriticWrapper, - MLP, - OrnsteinUhlenbeckProcessWrapper, - ValueOperator, -) -from torchrl.objectives.utils import distance_loss, SoftUpdate -from torchrl.trainers import Recorder - -############################################################################### -# torchrl :class:`torchrl.objectives.LossModule` -# ---------------------------------------------- -# -# TorchRL provides a series of losses to use in your training scripts. -# The aim is to have losses that are easily reusable/swappable and that have -# a simple signature. 
-# -# The main characteristics of TorchRL losses are: -# -# - they are stateful objects: they contain a copy of the trainable parameters -# such that ``loss_module.parameters()`` gives whatever is needed to train the -# algorithm. -# - They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` -# method will receive a tensordict as input that contains all the necessary -# information to return a loss value. -# -# >>> data = replay_buffer.sample() -# >>> loss_dict = loss_module(data) -# -# - They output a :class:`tensordict.TensorDict` instance with the loss values -# written under a ``"loss_"`` where ``smth`` is a string describing the -# loss. Additional keys in the tensordict may be useful metrics to log during -# training time. -# .. note:: -# The reason we return independent losses is to let the user use a different -# optimizer for different sets of parameters for instance. Summing the losses -# can be simply done via -# -# >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) -# -# The ``__init__`` method -# ~~~~~~~~~~~~~~~~~~~~~~~ -# -# The parent class of all losses is :class:`torchrl.objectives.LossModule`. -# As many other components of the library, its :meth:`torchrl.objectives.LossModule.forward` method expects -# as input a :class:`tensordict.TensorDict` instance sampled from an experience -# replay buffer, or any similar data structure. Using this format makes it -# possible to re-use the module across -# modalities, or in complex settings where the model needs to read multiple -# entries for instance. In other words, it allows us to code a loss module that -# is oblivious to the data type that is being given to is and that focuses on -# running the elementary steps of the loss function and only those. -# -# To keep the tutorial as didactic as we can, we'll be displaying each method -# of the class independently and we'll be populating the class at a later -# stage. -# -# Let us start with the :meth:`torchrl.objectives.LossModule.__init__` -# method. DDPG aims at solving a control task with a simple strategy: -# training a policy to output actions that maximise the value predicted by -# a value network. Hence, our loss module needs to receive two networks in its -# constructor: an actor and a value networks. We expect both of these to be -# tensordict-compatible objects, such as -# :class:`tensordict.nn.TensorDictModule`. -# Our loss function will need to compute a target value and fit the value -# network to this, and generate an action and fit the policy such that its -# value estimate is maximised. -# -# The crucial step of the :meth:`LossModule.__init__` method is the call to -# :meth:`torchrl.LossModule.convert_to_functional`. This method will extract -# the parameters from the module and convert it to a functional module. -# Strictly speaking, this is not necessary and one may perfectly code all -# the losses without it. However, we encourage its usage for the following -# reason. -# -# The reason TorchRL does this is that RL algorithms often execute the same -# model with different sets of parameters, called "trainable" and "target" -# parameters. -# The "trainable" parameters are those that the optimizer needs to fit. The -# "target" parameters are usually a copy of the formers with some time lag -# (absolute or diluted through a moving average). -# These target parameters are used to compute the value associated with the -# next observation. 
One the advantages of using a set of target parameters -# for the value model that do not match exactly the current configuration is -# that they provide a pessimistic bound on the value function being computed. -# Pay attention to the ``create_target_params`` keyword argument below: this -# argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` -# method to create a set of target parameters in the loss module to be used -# for target value computation. If this is set to ``False`` (see the actor network -# for instance) the ``target_actor_network_params`` attribute will still be -# accessible but this will just return a **detached** version of the -# actor parameters. -# -# Later, we will see how the target parameters should be updated in torchrl. -# - - -def _init( - self, - actor_network: TensorDictModule, - value_network: TensorDictModule, -) -> None: - super(type(self), self).__init__() - - self.convert_to_functional( - actor_network, - "actor_network", - create_target_params=False, +if __name__ == "__main__": + import numpy as np + import torch.cuda + import tqdm + from matplotlib import pyplot as plt + from tensordict.nn import TensorDictModule + from tensordict.tensordict import TensorDict, TensorDictBase + from torch import nn, optim + from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector + from torchrl.data import CompositeSpec, TensorDictReplayBuffer + from torchrl.data.postprocs import MultiStep + from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler + from torchrl.data.replay_buffers.storages import LazyMemmapStorage + from torchrl.envs import ( + CatTensors, + DoubleToFloat, + EnvCreator, + ObservationNorm, + ParallelEnv, ) - self.convert_to_functional( - value_network, - "value_network", - create_target_params=True, - compare_against=list(actor_network.parameters()), + from torchrl.envs.libs.dm_control import DMControlEnv + from torchrl.envs.libs.gym import GymEnv + from torchrl.envs.transforms import RewardScaling, TransformedEnv + from torchrl.envs.utils import set_exploration_mode + from torchrl.modules import ( + Actor, + ActorCriticWrapper, + MLP, + OrnsteinUhlenbeckProcessWrapper, + ValueOperator, ) - - self.actor_in_keys = actor_network.in_keys - - # Since the value we'll be using is based on the actor and value network, - # we put them together in a single actor-critic container. - actor_critic = ActorCriticWrapper(actor_network, value_network) - self.actor_critic = actor_critic - self.loss_funtion = "l2" - - -############################################################################### -# The value estimator loss method -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# In many RL algorithm, the value network (or Q-value network) is trained based -# on an empirical value estimate. This can be bootstrapped (TD(0), low -# variance, high bias), meaning -# that the target value is obtained using the next reward and nothing else, or -# a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole -# sequence of upcoming rewards will be used (high variance, low bias). An -# intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise -# bias and variance. -# TorchRL makes it easy to use one or the other estimator via the -# :class:`torchrl.objectives.utils.ValueEstimators` Enum class, which contains -# pointers to all the value estimators implemented. Let us define the default -# value function here. 
We will take the simplest version (TD(0)), and show later -# on how this can be changed. - -from torchrl.objectives.utils import ValueEstimators - -default_value_estimator = ValueEstimators.TD0 - -############################################################################### -# We also need to give some instructions to DDPG on how to build the value -# estimator, depending on the user query. Depending on the estimator provided, -# we will build the corresponding module to be used at train time: - -from torchrl.objectives.utils import default_value_kwargs -from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator - - -def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): - hp = dict(default_value_kwargs(value_type)) - if hasattr(self, "gamma"): - hp["gamma"] = self.gamma - hp.update(hyperparams) - value_key = "state_action_value" - if value_type == ValueEstimators.TD1: - self._value_estimator = TD1Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) - elif value_type == ValueEstimators.TD0: - self._value_estimator = TD0Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) - elif value_type == ValueEstimators.GAE: - raise NotImplementedError( - f"Value type {value_type} it not implemented for loss {type(self)}." + from torchrl.objectives.utils import distance_loss, SoftUpdate + from torchrl.trainers import Recorder + + ############################################################################### + # torchrl :class:`torchrl.objectives.LossModule` + # ---------------------------------------------- + # + # TorchRL provides a series of losses to use in your training scripts. + # The aim is to have losses that are easily reusable/swappable and that have + # a simple signature. + # + # The main characteristics of TorchRL losses are: + # + # - they are stateful objects: they contain a copy of the trainable parameters + # such that ``loss_module.parameters()`` gives whatever is needed to train the + # algorithm. + # - They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` + # method will receive a tensordict as input that contains all the necessary + # information to return a loss value. + # + # >>> data = replay_buffer.sample() + # >>> loss_dict = loss_module(data) + # + # - They output a :class:`tensordict.TensorDict` instance with the loss values + # written under a ``"loss_"`` where ``smth`` is a string describing the + # loss. Additional keys in the tensordict may be useful metrics to log during + # training time. + # .. note:: + # The reason we return independent losses is to let the user use a different + # optimizer for different sets of parameters for instance. Summing the losses + # can be simply done via + # + # >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) + # + # The ``__init__`` method + # ~~~~~~~~~~~~~~~~~~~~~~~ + # + # The parent class of all losses is :class:`torchrl.objectives.LossModule`. + # As many other components of the library, its :meth:`torchrl.objectives.LossModule.forward` method expects + # as input a :class:`tensordict.TensorDict` instance sampled from an experience + # replay buffer, or any similar data structure. Using this format makes it + # possible to re-use the module across + # modalities, or in complex settings where the model needs to read multiple + # entries for instance. 
In other words, it allows us to code a loss module that + # is oblivious to the data type that is being given to is and that focuses on + # running the elementary steps of the loss function and only those. + # + # To keep the tutorial as didactic as we can, we'll be displaying each method + # of the class independently and we'll be populating the class at a later + # stage. + # + # Let us start with the :meth:`torchrl.objectives.LossModule.__init__` + # method. DDPG aims at solving a control task with a simple strategy: + # training a policy to output actions that maximise the value predicted by + # a value network. Hence, our loss module needs to receive two networks in its + # constructor: an actor and a value networks. We expect both of these to be + # tensordict-compatible objects, such as + # :class:`tensordict.nn.TensorDictModule`. + # Our loss function will need to compute a target value and fit the value + # network to this, and generate an action and fit the policy such that its + # value estimate is maximised. + # + # The crucial step of the :meth:`LossModule.__init__` method is the call to + # :meth:`torchrl.LossModule.convert_to_functional`. This method will extract + # the parameters from the module and convert it to a functional module. + # Strictly speaking, this is not necessary and one may perfectly code all + # the losses without it. However, we encourage its usage for the following + # reason. + # + # The reason TorchRL does this is that RL algorithms often execute the same + # model with different sets of parameters, called "trainable" and "target" + # parameters. + # The "trainable" parameters are those that the optimizer needs to fit. The + # "target" parameters are usually a copy of the formers with some time lag + # (absolute or diluted through a moving average). + # These target parameters are used to compute the value associated with the + # next observation. One the advantages of using a set of target parameters + # for the value model that do not match exactly the current configuration is + # that they provide a pessimistic bound on the value function being computed. + # Pay attention to the ``create_target_params`` keyword argument below: this + # argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` + # method to create a set of target parameters in the loss module to be used + # for target value computation. If this is set to ``False`` (see the actor network + # for instance) the ``target_actor_network_params`` attribute will still be + # accessible but this will just return a **detached** version of the + # actor parameters. + # + # Later, we will see how the target parameters should be updated in torchrl. 
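    # As a preview, the "soft" update used at the end of this tutorial amounts
    # to a Polyak average of the trainable parameters into the target ones.
    # A minimal sketch in plain PyTorch (toy networks; we assume ``eps`` plays
    # the same role as the value later passed to ``SoftUpdate``, i.e. the
    # fraction of the target parameters kept at each update):
    #
    # .. code-block:: python
    #
    #    import torch
    #    from torch import nn
    #
    #    value_net = nn.Linear(4, 1)
    #    target_net = nn.Linear(4, 1)
    #    target_net.load_state_dict(value_net.state_dict())
    #
    #    eps = 0.98
    #    with torch.no_grad():
    #        for p, p_target in zip(value_net.parameters(), target_net.parameters()):
    #            # the target lags behind the trained parameters
    #            p_target.mul_(eps).add_((1 - eps) * p)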
+ # + + + def _init( + self, + actor_network: TensorDictModule, + value_network: TensorDictModule, + ) -> None: + super(type(self), self).__init__() + + self.convert_to_functional( + actor_network, + "actor_network", + create_target_params=False, ) - elif value_type == ValueEstimators.TDLambda: - self._value_estimator = TDLambdaEstimator( - value_network=self.actor_critic, value_key=value_key, **hp + self.convert_to_functional( + value_network, + "value_network", + create_target_params=True, + compare_against=list(actor_network.parameters()), ) - else: - raise NotImplementedError(f"Unknown value type {value_type}") - -############################################################################### -# The ``make_value_estimator`` method can but does not need to be called: if -# not, the :class:`torchrl.objectives.LossModule` will query this method with -# its default estimator. -# -# The actor loss method -# ~~~~~~~~~~~~~~~~~~~~~ -# -# The central piece of an RL algorithm is the training loss for the actor. -# In the case of DDPG, this function is quite simple: we just need to compute -# the value associated with an action computed using the policy and optimize -# the actor weights to maximise this value. -# -# When computing this value, we must make sure to take the value parameters out -# of the graph, otherwise the actor and value loss will be mixed up. -# For this, the :func:`torchrl.objectives.utils.hold_out_params` function -# can be used. - -from torchrl.objectives.utils import hold_out_params - - -def _loss_actor( - self, - tensordict, -) -> torch.Tensor: - td_copy = tensordict.select(*self.actor_in_keys).detach() - # Get an action from the actor network - td_copy = self.actor_network( - td_copy, - params=self.actor_network_params, - ) - # get the value associated with that action - with hold_out_params(self.value_network_params) as params: - td_copy = self.value_network( + self.actor_in_keys = actor_network.in_keys + + # Since the value we'll be using is based on the actor and value network, + # we put them together in a single actor-critic container. + actor_critic = ActorCriticWrapper(actor_network, value_network) + self.actor_critic = actor_critic + self.loss_funtion = "l2" + + + ############################################################################### + # The value estimator loss method + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # In many RL algorithm, the value network (or Q-value network) is trained based + # on an empirical value estimate. This can be bootstrapped (TD(0), low + # variance, high bias), meaning + # that the target value is obtained using the next reward and nothing else, or + # a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole + # sequence of upcoming rewards will be used (high variance, low bias). An + # intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise + # bias and variance. + # TorchRL makes it easy to use one or the other estimator via the + # :class:`torchrl.objectives.utils.ValueEstimators` Enum class, which contains + # pointers to all the value estimators implemented. Let us define the default + # value function here. We will take the simplest version (TD(0)), and show later + # on how this can be changed. 
+ + from torchrl.objectives.utils import ValueEstimators + + default_value_estimator = ValueEstimators.TD0 + + ############################################################################### + # We also need to give some instructions to DDPG on how to build the value + # estimator, depending on the user query. Depending on the estimator provided, + # we will build the corresponding module to be used at train time: + + from torchrl.objectives.utils import default_value_kwargs + from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator + + + def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma + hp.update(hyperparams) + value_key = "state_action_value" + if value_type == ValueEstimators.TD1: + self._value_estimator = TD1Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.TD0: + self._value_estimator = TD0Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." + ) + elif value_type == ValueEstimators.TDLambda: + self._value_estimator = TDLambdaEstimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") + + + ############################################################################### + # The ``make_value_estimator`` method can but does not need to be called: if + # not, the :class:`torchrl.objectives.LossModule` will query this method with + # its default estimator. + # + # The actor loss method + # ~~~~~~~~~~~~~~~~~~~~~ + # + # The central piece of an RL algorithm is the training loss for the actor. + # In the case of DDPG, this function is quite simple: we just need to compute + # the value associated with an action computed using the policy and optimize + # the actor weights to maximise this value. + # + # When computing this value, we must make sure to take the value parameters out + # of the graph, otherwise the actor and value loss will be mixed up. + # For this, the :func:`torchrl.objectives.utils.hold_out_params` function + # can be used. + + from torchrl.objectives.utils import hold_out_params + + + def _loss_actor( + self, + tensordict, + ) -> torch.Tensor: + td_copy = tensordict.select(*self.actor_in_keys).detach() + # Get an action from the actor network + td_copy = self.actor_network( td_copy, - params=params, + params=self.actor_network_params, ) - return -td_copy.get("state_action_value") - - -############################################################################### -# The value loss method -# ~~~~~~~~~~~~~~~~~~~~~ -# -# We now need to optimize our value network parameters. -# To do this, we will rely on the value estimator of our class: -# - - -def _loss_value( - self, - tensordict, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - td_copy = tensordict.detach() - - # V(s, a) - self.value_network(td_copy, params=self.value_network_params) - pred_val = td_copy.get("state_action_value").squeeze(-1) - - # we manually reconstruct the parameters of the actor-critic, where the first - # set of parameters belongs to the actor and the second to the value function. 
- target_params = TensorDict( - { - "module": { - "0": self.target_actor_network_params, - "1": self.target_value_network_params, - } - }, - batch_size=self.target_actor_network_params.batch_size, - device=self.target_actor_network_params.device, - ) - with set_exploration_mode("mode"): # we make sure that no exploration is performed - target_value = self.value_estimator.value_estimate( - tensordict, target_params=target_params - ).squeeze(-1) - - # td_error = pred_val - target_value - loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) - td_error = (pred_val - target_value).pow(2) - - return loss_value, td_error, pred_val, target_value - - -############################################################################### -# Putting things together in a forward call -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The only missing piece is the forward method, which will glue together the -# value and actor loss, collect the cost values and write them in a tensordict -# delivered to the user. - - -def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: - if not input_tensordict.device == self.device: - raise RuntimeError( - f"Got device={input_tensordict.device} but " - f"actor_network.device={self.device} (self.device={self.device})" + # get the value associated with that action + with hold_out_params(self.value_network_params) as params: + td_copy = self.value_network( + td_copy, + params=params, + ) + return -td_copy.get("state_action_value") + + + ############################################################################### + # The value loss method + # ~~~~~~~~~~~~~~~~~~~~~ + # + # We now need to optimize our value network parameters. + # To do this, we will rely on the value estimator of our class: + # + + + def _loss_value( + self, + tensordict, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + td_copy = tensordict.detach() + + # V(s, a) + self.value_network(td_copy, params=self.value_network_params) + pred_val = td_copy.get("state_action_value").squeeze(-1) + + # we manually reconstruct the parameters of the actor-critic, where the first + # set of parameters belongs to the actor and the second to the value function. 
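        # (A note on the structure below: the nested ("module", "0") and
        # ("module", "1") entries are assumed to mirror how the functional
        # ``ActorCriticWrapper`` built in ``_init`` stores its sub-modules'
        # parameters, the actor at index 0 and the value network at index 1.)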
+ target_params = TensorDict( + { + "module": { + "0": self.target_actor_network_params, + "1": self.target_value_network_params, + } + }, + batch_size=self.target_actor_network_params.batch_size, + device=self.target_actor_network_params.device, ) + with set_exploration_mode("mode"): # we make sure that no exploration is performed + target_value = self.value_estimator.value_estimate( + tensordict, target_params=target_params + ).squeeze(-1) - loss_value, td_error, pred_val, target_value = self.loss_value( - input_tensordict, - ) - td_error = td_error.detach() - td_error = td_error.unsqueeze(input_tensordict.ndimension()) - if input_tensordict.device is not None: - td_error = td_error.to(input_tensordict.device) - input_tensordict.set( - "td_error", - td_error, - inplace=True, - ) - loss_actor = self.loss_actor(input_tensordict) - return TensorDict( - source={ - "loss_actor": loss_actor.mean(), - "loss_value": loss_value.mean(), - "pred_value": pred_val.mean().detach(), - "target_value": target_value.mean().detach(), - "pred_value_max": pred_val.max().detach(), - "target_value_max": target_value.max().detach(), - }, - batch_size=[], - ) + # td_error = pred_val - target_value + loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) + td_error = (pred_val - target_value).pow(2) + return loss_value, td_error, pred_val, target_value -class DDPGLoss(LossModule): - default_value_estimator = default_value_estimator - make_value_estimator = make_value_estimator - __init__ = _init - forward = _forward - loss_value = _loss_value - loss_actor = _loss_actor + ############################################################################### + # Putting things together in a forward call + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # The only missing piece is the forward method, which will glue together the + # value and actor loss, collect the cost values and write them in a tensordict + # delivered to the user. -############################################################################### -# Now that we have our loss, we can use it to train a policy to solve a -# control task. -# -# Environment -# ----------- -# -# In most algorithms, the first thing that needs to be taken care of is the -# construction of the environment as it conditions the remainder of the -# training script. -# -# For this example, we will be using the ``"cheetah"`` task. The goal is to make -# a half-cheetah run as fast as possible. -# -# In TorchRL, one can create such a task by relying on dm_control or gym: -# -# .. code-block:: python -# -# env = GymEnv("HalfCheetah-v4") -# -# or -# -# .. code-block:: python -# -# env = DMControlEnv("cheetah", "run") -# -# By default, these environment disable rendering. Training from states is -# usually easier than training from images. To keep things simple, we focus -# on learning from states only. To pass the pixels to the tensordicts that -# are collected by :func:`env.step()`, simply pass the ``from_pixels=True`` -# argument to the constructor: -# -# .. code-block:: python -# -# env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True) -# -# We write a :func:`make_env` helper function that will create an environment -# with either one of the two backends considered above (dm-control or gym). 
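# Before writing the helper, a quick sanity check of the raw environment can
# be useful. The sketch below assumes the gym backend (and the MuJoCo
# binaries) is installed; ``rollout(3)`` returns a 3-step random-policy
# trajectory as a TensorDict.
#
# .. code-block:: python
#
#    from torchrl.envs.libs.gym import GymEnv
#
#    env = GymEnv("HalfCheetah-v4")
#    print(env.rollout(3))
#    env.close()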
-# - -env_library = None -env_name = None - - -def make_env(): - """Create a base env.""" - global env_library - global env_name - - if backend == "dm_control": - env_name = "cheetah" - env_task = "run" - env_args = (env_name, env_task) - env_library = DMControlEnv - elif backend == "gym": - env_name = "HalfCheetah-v4" - env_args = (env_name,) - env_library = GymEnv - else: - raise NotImplementedError - - env_kwargs = { - "device": device, - "frame_skip": frame_skip, - "from_pixels": from_pixels, - "pixels_only": from_pixels, - } - env = env_library(*env_args, **env_kwargs) - return env - - -############################################################################### -# Transforms -# ~~~~~~~~~~ -# -# Now that we have a base environment, we may want to modify its representation -# to make it more policy-friendly. In TorchRL, transforms are appended to the -# base environment in a specialized :class:`torchr.envs.TransformedEnv` class. -# -# - It is common in DDPG to rescale the reward using some heuristic value. We -# will multiply the reward by 5 in this example. -# -# - If we are using :mod:`dm_control`, it is also important to build an interface -# between the simulator which works with double precision numbers, and our -# script which presumably uses single precision ones. This transformation goes -# both ways: when calling :func:`env.step`, our actions will need to be -# represented in double precision, and the output will need to be transformed -# to single precision. -# The :class:`torchrl.envs.DoubleToFloat` transform does exactly this: the -# ``in_keys`` list refers to the keys that will need to be transformed from -# double to float, while the ``in_keys_inv`` refers to those that need to -# be transformed to double before being passed to the environment. -# -# - We concatenate the state keys together using the :class:`torchrl.envs.CatTensors` -# transform. -# -# - Finally, we also leave the possibility of normalizing the states: we will -# take care of computing the normalizing constants later on. -# - + def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: + if not input_tensordict.device == self.device: + raise RuntimeError( + f"Got device={input_tensordict.device} but " + f"actor_network.device={self.device} (self.device={self.device})" + ) -def make_transformed_env( - env, -): - """Apply transforms to the env (such as reward scaling and state normalization).""" + loss_value, td_error, pred_val, target_value = self.loss_value( + input_tensordict, + ) + td_error = td_error.detach() + td_error = td_error.unsqueeze(input_tensordict.ndimension()) + if input_tensordict.device is not None: + td_error = td_error.to(input_tensordict.device) + input_tensordict.set( + "td_error", + td_error, + inplace=True, + ) + loss_actor = self.loss_actor(input_tensordict) + return TensorDict( + source={ + "loss_actor": loss_actor.mean(), + "loss_value": loss_value.mean(), + "pred_value": pred_val.mean().detach(), + "target_value": target_value.mean().detach(), + "pred_value_max": pred_val.max().detach(), + "target_value_max": target_value.max().detach(), + }, + batch_size=[], + ) - env = TransformedEnv(env) - # we append transforms one by one, although we might as well create the - # transformed environment using the `env = TransformedEnv(base_env, transforms)` - # syntax. 
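    # For instance, an equivalent construction (sketched, with the same
    # transforms as below) would be:
    #
    #   from torchrl.envs.transforms import Compose
    #   env = TransformedEnv(
    #       base_env, Compose(RewardScaling(loc=0.0, scale=reward_scaling), ...)
    #   )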
- env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling)) + class DDPGLoss(LossModule): + default_value_estimator = default_value_estimator + make_value_estimator = make_value_estimator + + __init__ = _init + forward = _forward + loss_value = _loss_value + loss_actor = _loss_actor + + + ############################################################################### + # Now that we have our loss, we can use it to train a policy to solve a + # control task. + # + # Environment + # ----------- + # + # In most algorithms, the first thing that needs to be taken care of is the + # construction of the environment as it conditions the remainder of the + # training script. + # + # For this example, we will be using the ``"cheetah"`` task. The goal is to make + # a half-cheetah run as fast as possible. + # + # In TorchRL, one can create such a task by relying on dm_control or gym: + # + # .. code-block:: python + # + # env = GymEnv("HalfCheetah-v4") + # + # or + # + # .. code-block:: python + # + # env = DMControlEnv("cheetah", "run") + # + # By default, these environment disable rendering. Training from states is + # usually easier than training from images. To keep things simple, we focus + # on learning from states only. To pass the pixels to the tensordicts that + # are collected by :func:`env.step()`, simply pass the ``from_pixels=True`` + # argument to the constructor: + # + # .. code-block:: python + # + # env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True) + # + # We write a :func:`make_env` helper function that will create an environment + # with either one of the two backends considered above (dm-control or gym). + # + + env_library = None + env_name = None + + + def make_env(): + """Create a base env.""" + global env_library + global env_name + + if backend == "dm_control": + env_name = "cheetah" + env_task = "run" + env_args = (env_name, env_task) + env_library = DMControlEnv + elif backend == "gym": + env_name = "HalfCheetah-v4" + env_args = (env_name,) + env_library = GymEnv + else: + raise NotImplementedError + + env_kwargs = { + "device": device, + "frame_skip": frame_skip, + "from_pixels": from_pixels, + "pixels_only": from_pixels, + } + env = env_library(*env_args, **env_kwargs) + return env + + + ############################################################################### + # Transforms + # ~~~~~~~~~~ + # + # Now that we have a base environment, we may want to modify its representation + # to make it more policy-friendly. In TorchRL, transforms are appended to the + # base environment in a specialized :class:`torchr.envs.TransformedEnv` class. + # + # - It is common in DDPG to rescale the reward using some heuristic value. We + # will multiply the reward by 5 in this example. + # + # - If we are using :mod:`dm_control`, it is also important to build an interface + # between the simulator which works with double precision numbers, and our + # script which presumably uses single precision ones. This transformation goes + # both ways: when calling :func:`env.step`, our actions will need to be + # represented in double precision, and the output will need to be transformed + # to single precision. + # The :class:`torchrl.envs.DoubleToFloat` transform does exactly this: the + # ``in_keys`` list refers to the keys that will need to be transformed from + # double to float, while the ``in_keys_inv`` refers to those that need to + # be transformed to double before being passed to the environment. 
+ # + # - We concatenate the state keys together using the :class:`torchrl.envs.CatTensors` + # transform. + # + # - Finally, we also leave the possibility of normalizing the states: we will + # take care of computing the normalizing constants later on. + # + + + def make_transformed_env( + env, + ): + """Apply transforms to the env (such as reward scaling and state normalization).""" + + env = TransformedEnv(env) + + # we append transforms one by one, although we might as well create the + # transformed environment using the `env = TransformedEnv(base_env, transforms)` + # syntax. + env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling)) + + double_to_float_list = [] + double_to_float_inv_list = [] + if env_library is DMControlEnv: + # DMControl requires double-precision + double_to_float_list += [ + "reward", + "action", + ] + double_to_float_inv_list += ["action"] + + # We concatenate all states into a single "observation_vector" + # even if there is a single tensor, it'll be renamed in "observation_vector". + # This facilitates the downstream operations as we know the name of the + # output tensor. + # In some environments (not half-cheetah), there may be more than one + # observation vector: in this case this code snippet will concatenate them + # all. + selected_keys = list(env.observation_spec.keys()) + out_key = "observation_vector" + env.append_transform(CatTensors(in_keys=selected_keys, out_key=out_key)) + + # we normalize the states, but for now let's just instantiate a stateless + # version of the transform + env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True)) + + double_to_float_list.append(out_key) + env.append_transform( + DoubleToFloat( + in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list + ) + ) - double_to_float_list = [] - double_to_float_inv_list = [] - if env_library is DMControlEnv: - # DMControl requires double-precision - double_to_float_list += [ - "reward", - "action", + return env + + + ############################################################################### + # Normalization of the observations + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # To compute the normalizing statistics, we run an arbitrary number of random + # steps in the environment and compute the mean and standard deviation of the + # collected observations. The :func:`ObservationNorm.init_stats()` method can + # be used for this purpose. To get the summary statistics, we create a dummy + # environment and run it for a given number of steps, collect data over a given + # number of steps and compute its summary statistics. + # + + + def get_env_stats(): + """Gets the stats of an environment.""" + proof_env = make_transformed_env(make_env()) + proof_env.set_seed(seed) + t = proof_env.transform[2] + t.init_stats(init_env_steps) + transform_state_dict = t.state_dict() + proof_env.close() + return transform_state_dict + + + ############################################################################### + # Parallel execution + # ~~~~~~~~~~~~~~~~~~ + # + # The following helper function allows us to run environments in parallel. + # Running environments in parallel can significantly speed up the collection + # throughput. When using transformed environment, we need to choose whether we + # want to execute the transform individually for each environment, or + # centralize the data and transform it in batch. Both approaches are easy to + # code: + # + # .. 
code-block:: python + # + # env = ParallelEnv( + # lambda: TransformedEnv(GymEnv("HalfCheetah-v4"), transforms), + # num_workers=4 + # ) + # env = TransformedEnv( + # ParallelEnv(lambda: GymEnv("HalfCheetah-v4"), num_workers=4), + # transforms + # ) + # + # To leverage the vectorization capabilities of PyTorch, we adopt + # the first method: + # + + + def parallel_env_constructor( + transform_state_dict, + ): + if env_per_collector == 1: + + def make_t_env(): + env = make_transformed_env(make_env()) + env.transform[2].init_stats(3) + env.transform[2].loc.copy_(transform_state_dict["loc"]) + env.transform[2].scale.copy_(transform_state_dict["scale"]) + return env + + env_creator = EnvCreator(make_t_env) + return env_creator + + parallel_env = ParallelEnv( + num_workers=env_per_collector, + create_env_fn=EnvCreator(lambda: make_env()), + create_env_kwargs=None, + pin_memory=False, + ) + env = make_transformed_env(parallel_env) + # we call `init_stats` for a limited number of steps, just to instantiate + # the lazy buffers. + env.transform[2].init_stats(3, cat_dim=1, reduce_dim=[0, 1]) + env.transform[2].load_state_dict(transform_state_dict) + return env + + + ############################################################################### + # Building the model + # ------------------ + # + # We now turn to the setup of the model. As we have seen, DDPG requires a + # value network, trained to estimate the value of a state-action pair, and a + # parametric actor that learns how to select actions that maximize this value. + # + # Recall that building a TorchRL module requires two steps: + # + # - writing the :class:`torch.nn.Module` that will be used as network, + # - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the + # data flow is handled by specifying the input and output keys. + # + # In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can + # also be used. + # + # + # The Q-Value network is wrapped in a :class:`torchrl.modules.ValueOperator` + # that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value + # networks and ``state_value`` for other value networks. + # + # Since we use lazy modules, it is necessary to materialize the lazy modules + # before being able to move the policy from device to device and achieve other + # operations. Hence, it is good practice to run the modules with a small + # sample of data. For this purpose, we generate fake data from the + # environment specs. + # + + + def make_ddpg_actor( + transform_state_dict, + device="cpu", + ): + proof_environment = make_transformed_env(make_env()) + proof_environment.transform[2].init_stats(3) + proof_environment.transform[2].load_state_dict(transform_state_dict) + + env_specs = proof_environment.specs + in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ + -1 ] - double_to_float_inv_list += ["action"] - - # We concatenate all states into a single "observation_vector" - # even if there is a single tensor, it'll be renamed in "observation_vector". - # This facilitates the downstream operations as we know the name of the - # output tensor. - # In some environments (not half-cheetah), there may be more than one - # observation vector: in this case this code snippet will concatenate them - # all. 
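    # (As an indication: with the gym backend the only entry is typically
    # "observation", whereas dm_control's cheetah exposes separate "position"
    # and "velocity" entries.)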
- selected_keys = list(env.observation_spec.keys()) - out_key = "observation_vector" - env.append_transform(CatTensors(in_keys=selected_keys, out_key=out_key)) - - # we normalize the states, but for now let's just instantiate a stateless - # version of the transform - env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True)) - - double_to_float_list.append(out_key) - env.append_transform( - DoubleToFloat( - in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list + out_features = env_specs["input_spec"]["action"].shape[-1] + + actor_net = MLP( + in_features=in_features, + out_features=out_features, + num_cells=[num_cells] * num_layers, + activation_class=nn.Tanh, + activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain + ) + in_keys = ["observation_vector"] + out_keys = ["action"] + + actor = Actor( + actor_net, + in_keys=in_keys, + out_keys=out_keys, + spec=CompositeSpec(action=env_specs["input_spec"]["action"]), + ).to(device) + + q_net = MLP( + in_features=in_features + + out_features, # receives an action and an observation as input + out_features=1, + num_cells=[num_cells] * num_layers, + activation_class=nn.Tanh, ) - ) - - return env + in_keys = in_keys + ["action"] + qnet = ValueOperator( + in_keys=in_keys, + module=q_net, + ).to(device) + + return actor, qnet + + + ############################################################################### + # Evaluator: building your recorder object + # ---------------------------------------- + # + # As the training data is obtained using some exploration strategy, the true + # performance of our algorithm needs to be assessed in deterministic mode. We + # do this using a dedicated class, ``Recorder``, which executes the policy in + # the environment at a given frequency and returns some statistics obtained + # from these simulations. + # + # The following helper function builds this object: + + + def make_recorder(actor_model_explore, transform_state_dict): + base_env = make_env() + recorder = make_transformed_env(base_env) + recorder.transform[2].init_stats(3) + recorder.transform[2].load_state_dict(transform_state_dict) + + recorder_obj = Recorder( + record_frames=1000, + frame_skip=frame_skip, + policy_exploration=actor_model_explore, + recorder=recorder, + exploration_mode="mean", + record_interval=record_interval, + ) + return recorder_obj + + + ############################################################################### + # Replay buffer + # ------------- + # + # Replay buffers come in two flavors: prioritized (where some error signal + # is used to give a higher likelihood of sampling to some items than others) + # and regular, circular experience replay. + # + # TorchRL replay buffers are composable: one can pick up the storage, sampling + # and writing strategies. It is also possible to + # store tensors on physical memory using a memory-mapped array. 
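    # As a minimal, self-contained sketch of this composability (toy sizes,
    # default scratch location for the memory-mapped storage):
    #
    # .. code-block:: python
    #
    #    import torch
    #    from tensordict.tensordict import TensorDict
    #    from torchrl.data import TensorDictReplayBuffer
    #    from torchrl.data.replay_buffers.samplers import RandomSampler
    #    from torchrl.data.replay_buffers.storages import LazyMemmapStorage
    #
    #    rb = TensorDictReplayBuffer(
    #        storage=LazyMemmapStorage(1000),
    #        sampler=RandomSampler(),
    #    )
    #    rb.extend(
    #        TensorDict({"observation_vector": torch.randn(10, 17)}, batch_size=[10])
    #    )
    #    sample = rb.sample(4)  # a TensorDict with batch_size [4]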
The following + # function takes care of creating the replay buffer with the desired + # hyperparameters: + # + + + def make_replay_buffer(buffer_size, prefetch=3): + if prb: + sampler = PrioritizedSampler( + max_capacity=buffer_size, + alpha=0.7, + beta=0.5, + ) + else: + sampler = RandomSampler() + replay_buffer = TensorDictReplayBuffer( + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + sampler=sampler, + pin_memory=False, + prefetch=prefetch, + ) + return replay_buffer -############################################################################### -# Normalization of the observations -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# To compute the normalizing statistics, we run an arbitrary number of random -# steps in the environment and compute the mean and standard deviation of the -# collected observations. The :func:`ObservationNorm.init_stats()` method can -# be used for this purpose. To get the summary statistics, we create a dummy -# environment and run it for a given number of steps, collect data over a given -# number of steps and compute its summary statistics. -# + ############################################################################### + # Hyperparameters + # --------------- + # + # After having written our helper functions, it is time to set the + # experiment hyperparameters: -def get_env_stats(): - """Gets the stats of an environment.""" - proof_env = make_transformed_env(make_env()) - proof_env.set_seed(seed) - t = proof_env.transform[2] - t.init_stats(init_env_steps) - transform_state_dict = t.state_dict() - proof_env.close() - return transform_state_dict + ############################################################################### + # Environment + # ~~~~~~~~~~~ + # The backend can be gym or dm_control + backend = "gym" -############################################################################### -# Parallel execution -# ~~~~~~~~~~~~~~~~~~ -# -# The following helper function allows us to run environments in parallel. -# Running environments in parallel can significantly speed up the collection -# throughput. When using transformed environment, we need to choose whether we -# want to execute the transform individually for each environment, or -# centralize the data and transform it in batch. Both approaches are easy to -# code: -# -# .. code-block:: python -# -# env = ParallelEnv( -# lambda: TransformedEnv(GymEnv("HalfCheetah-v4"), transforms), -# num_workers=4 -# ) -# env = TransformedEnv( -# ParallelEnv(lambda: GymEnv("HalfCheetah-v4"), num_workers=4), -# transforms -# ) -# -# To leverage the vectorization capabilities of PyTorch, we adopt -# the first method: -# + exp_name = "cheetah" + # frame_skip batches multiple step together with a single action + # If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to + # be adjusted to have a consistent total number of frames collected across + # experiments. + frame_skip = 2 + from_pixels = False + # Scaling the reward helps us control the signal magnitude for a more + # efficient learning. 
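    # (Recall that ``RewardScaling(loc=0.0, scale=reward_scaling)`` appended in
    # ``make_transformed_env`` rescales the raw reward to roughly
    # ``reward * reward_scaling``.)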
+ reward_scaling = 5.0 -def parallel_env_constructor( - transform_state_dict, -): - if env_per_collector == 1: + # Number of random steps used as for stats computation using ObservationNorm + init_env_steps = 1000 - def make_t_env(): - env = make_transformed_env(make_env()) - env.transform[2].init_stats(3) - env.transform[2].loc.copy_(transform_state_dict["loc"]) - env.transform[2].scale.copy_(transform_state_dict["scale"]) - return env + # Exploration: Number of frames before OU noise becomes null + annealing_frames = 1000000 // frame_skip - env_creator = EnvCreator(make_t_env) - return env_creator + ############################################################################### + # Collection + # ~~~~~~~~~~ - parallel_env = ParallelEnv( - num_workers=env_per_collector, - create_env_fn=EnvCreator(lambda: make_env()), - create_env_kwargs=None, - pin_memory=False, + # We will execute the policy on cuda if available + device = ( + torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") ) - env = make_transformed_env(parallel_env) - # we call `init_stats` for a limited number of steps, just to instantiate - # the lazy buffers. - env.transform[2].init_stats(3, cat_dim=1, reduce_dim=[0, 1]) - env.transform[2].load_state_dict(transform_state_dict) - return env - -############################################################################### -# Building the model -# ------------------ -# -# We now turn to the setup of the model. As we have seen, DDPG requires a -# value network, trained to estimate the value of a state-action pair, and a -# parametric actor that learns how to select actions that maximize this value. -# -# Recall that building a TorchRL module requires two steps: -# -# - writing the :class:`torch.nn.Module` that will be used as network, -# - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the -# data flow is handled by specifying the input and output keys. -# -# In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can -# also be used. -# -# -# The Q-Value network is wrapped in a :class:`torchrl.modules.ValueOperator` -# that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value -# networks and ``state_value`` for other value networks. -# -# Since we use lazy modules, it is necessary to materialize the lazy modules -# before being able to move the policy from device to device and achieve other -# operations. Hence, it is good practice to run the modules with a small -# sample of data. For this purpose, we generate fake data from the -# environment specs. -# - - -def make_ddpg_actor( - transform_state_dict, - device="cpu", -): - proof_environment = make_transformed_env(make_env()) - proof_environment.transform[2].init_stats(3) - proof_environment.transform[2].load_state_dict(transform_state_dict) - - env_specs = proof_environment.specs - in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ - -1 - ] - out_features = env_specs["input_spec"]["action"].shape[-1] - - actor_net = MLP( - in_features=in_features, - out_features=out_features, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, - activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain + # Number of environments in each data collector + warnings.warn("More envs!") + env_per_collector = 1 + + # Total frames we will use during training. 
Scale up to 500K - 1M for a more + # meaningful training + total_frames = 10000 // frame_skip + + # Number of frames returned by the collector at each iteration of the outer loop. + # We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] + frames_per_batch = env_per_collector * 1000 // frame_skip + max_frames_per_traj = 1000 // frame_skip + init_random_frames = 0 + # We'll be using the MultiStep class to have a less myopic representation of + # upcoming states + n_steps_forward = 3 + + # record every 10 batch collected + record_interval = 10 + + ############################################################################### + # Optimizer and optimization + # ~~~~~~~~~~~~~~~~~~~~~~~~~~ + + lr = 5e-4 + weight_decay = 0.0 + # UTD: Number of iterations of the inner loop + update_to_data = 32 + batch_size = 128 + + ############################################################################### + # Model + # ~~~~~ + + gamma = 0.99 + tau = 0.005 # Decay factor for the target network + + # Network specs + num_cells = 64 + num_layers = 2 + + ############################################################################### + # Replay buffer + # ~~~~~~~~~~~~~ + + # If True, a Prioritized replay buffer will be used + prb = True + # Number of frames stored in the buffer + traj_len_collector = frames_per_batch // env_per_collector + buffer_size = min(total_frames, 1_000_000 // traj_len_collector) + buffer_scratch_dir = "/tmp/" + + seed = 0 + + ############################################################################### + # Initialization + # -------------- + # + # To initialize the experiment, we first acquire the observation statistics, + # then build the networks, wrap them in an exploration wrapper (following the + # seminal DDPG paper, we used an Ornstein-Uhlenbeck process to add noise to the + # sampled actions). + + + # Seeding + torch.manual_seed(seed) + np.random.seed(seed) + + ############################################################################### + # Normalization stats + # ~~~~~~~~~~~~~~~~~~~ + + transform_state_dict = get_env_stats() + + ############################################################################### + # Models: policy and q-value network + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + actor, qnet = make_ddpg_actor( + transform_state_dict=transform_state_dict, + device=device, ) - in_keys = ["observation_vector"] - out_keys = ["action"] - - actor = Actor( - actor_net, - in_keys=in_keys, - out_keys=out_keys, - spec=CompositeSpec(action=env_specs["input_spec"]["action"]), + if device == torch.device("cpu"): + actor.share_memory() + + + ############################################################################### + # Loss module + # ~~~~~~~~~~~ + # We build our loss module with the actor and qnet we've just created. + # Because we have target parameters to update, we _must_ create a target network + # updater. + # + loss_module = DDPGLoss(actor, qnet) + # let's use the TD(lambda) estimator! 
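    # (The estimator hyperparameters default to ``default_value_kwargs``;
    # thanks to the ``hp.update(hyperparams)`` line in ``make_value_estimator``
    # they could also be overridden here, e.g.
    # ``loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=0.95)``.)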
+ loss_module.make_value_estimator(ValueEstimators.TDLambda) + target_net_updater = SoftUpdate(loss_module, eps=0.98) + target_net_updater.init_() + + ############################################################################### + # The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` + # exploration module: + + actor_model_explore = OrnsteinUhlenbeckProcessWrapper( + actor, + annealing_num_steps=annealing_frames, ).to(device) - - q_net = MLP( - in_features=in_features - + out_features, # receives an action and an observation as input - out_features=1, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, + if device == torch.device("cpu"): + actor_model_explore.share_memory() + + ############################################################################### + # Parallel environment creation + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # We pass the stats computed earlier to normalize the output of our + # environment: + + create_env_fn = parallel_env_constructor( + transform_state_dict=transform_state_dict, ) - in_keys = in_keys + ["action"] - qnet = ValueOperator( - in_keys=in_keys, - module=q_net, - ).to(device) - - return actor, qnet - - -############################################################################### -# Evaluator: building your recorder object -# ---------------------------------------- -# -# As the training data is obtained using some exploration strategy, the true -# performance of our algorithm needs to be assessed in deterministic mode. We -# do this using a dedicated class, ``Recorder``, which executes the policy in -# the environment at a given frequency and returns some statistics obtained -# from these simulations. -# -# The following helper function builds this object: - - -def make_recorder(actor_model_explore, transform_state_dict): - base_env = make_env() - recorder = make_transformed_env(base_env) - recorder.transform[2].init_stats(3) - recorder.transform[2].load_state_dict(transform_state_dict) - - recorder_obj = Recorder( - record_frames=1000, - frame_skip=frame_skip, - policy_exploration=actor_model_explore, - recorder=recorder, - exploration_mode="mean", - record_interval=record_interval, - ) - return recorder_obj - - -############################################################################### -# Replay buffer -# ------------- -# -# Replay buffers come in two flavors: prioritized (where some error signal -# is used to give a higher likelihood of sampling to some items than others) -# and regular, circular experience replay. -# -# TorchRL replay buffers are composable: one can pick up the storage, sampling -# and writing strategies. It is also possible to -# store tensors on physical memory using a memory-mapped array. The following -# function takes care of creating the replay buffer with the desired -# hyperparameters: -# - - -def make_replay_buffer(buffer_size, prefetch=3): - if prb: - sampler = PrioritizedSampler( - max_capacity=buffer_size, - alpha=0.7, - beta=0.5, - ) + ############################################################################### + # Data collector + # ~~~~~~~~~~~~~~ + # + # TorchRL provides specialized classes to help you collect data by executing + # the policy in the environment. These "data collectors" iteratively compute + # the action to be executed at a given time, then execute a step in the + # environment and reset it when required. 
+ # Data collectors are designed to help developers have a tight control + # on the number of frames per batch of data, on the (a)sync nature of this + # collection and on the resources allocated to the data collection (e.g. GPU, + # number of workers etc). + # + # Here we will use + # :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that + # will be executed in an async manner (i.e. data will be collected while + # the policy is being optimized). With the :class:`MultiaSyncDataCollector`, + # multiple workers are running rollouts separately. When a batch is asked, it + # is gathered from the first worker that can provide it. + # + # The parameters to specify are: + # + # - the list of environment creation functions, + # - the policy, + # - the total number of frames before the collector is considered empty, + # - the maximum number of frames per trajectory (useful for non-terminating + # environments, like dm_control ones). + # + # One should also pass: + # + # - the number of frames in each batch collected, + # - the number of random steps executed independently from the policy, + # - the devices used for policy execution + # - the devices used to store data before the data is passed to the main + # process. + # + # Collectors also accept post-processing hooks. + # For instance, the :class:`torchrl.data.postprocs.MultiStep` class passed as + # ``postproc`` makes it so that the rewards of the ``n`` upcoming steps are + # summed (with some discount factor) and the next observation is changed to + # be the n-step forward observation. One could pass other transforms too: + # using :class:`tensordict.nn.TensorDictModule` and + # :class:`tensordict.nn.TensorDictSequential` we can seamlessly append a + # wide range of transforms to our collector. + + if n_steps_forward > 0: + multistep = MultiStep(n_steps=n_steps_forward, gamma=gamma) else: - sampler = RandomSampler() - replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage( - buffer_size, - scratch_dir=buffer_scratch_dir, - device=device, - ), - sampler=sampler, - pin_memory=False, - prefetch=prefetch, + multistep = None + + warnings.warn("Change collector!!") + + collector = MultiaSyncDataCollector( + create_env_fn=[create_env_fn, create_env_fn], + policy=actor_model_explore, + total_frames=total_frames, + max_frames_per_traj=max_frames_per_traj, + frames_per_batch=frames_per_batch, + init_random_frames=init_random_frames, + reset_at_each_iter=False, + postproc=multistep, + split_trajs=True, + device=device, # device for execution + storing_device=device, # device where data will be stored and passed + update_at_each_batch=False, + exploration_mode="random", ) - return replay_buffer + collector.set_seed(seed) -############################################################################### -# Hyperparameters -# --------------- -# -# After having written our helper functions, it is time to set the -# experiment hyperparameters: - -############################################################################### -# Environment -# ~~~~~~~~~~~ - -# The backend can be gym or dm_control -backend = "gym" - -exp_name = "cheetah" - -# frame_skip batches multiple step together with a single action -# If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to -# be adjusted to have a consistent total number of frames collected across -# experiments. -frame_skip = 2 -from_pixels = False -# Scaling the reward helps us control the signal magnitude for a more -# efficient learning. 
-reward_scaling = 5.0 - -# Number of random steps used as for stats computation using ObservationNorm -init_env_steps = 1000 - -# Exploration: Number of frames before OU noise becomes null -annealing_frames = 1000000 // frame_skip - -############################################################################### -# Collection -# ~~~~~~~~~~ - -# We will execute the policy on cuda if available -device = ( - torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") -) - -# Number of environments in each data collector -warnings.warn("More envs!") -env_per_collector = 1 - -# Total frames we will use during training. Scale up to 500K - 1M for a more -# meaningful training -total_frames = 10000 // frame_skip - -# Number of frames returned by the collector at each iteration of the outer loop. -# We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] -frames_per_batch = env_per_collector * 1000 // frame_skip -max_frames_per_traj = 1000 // frame_skip -init_random_frames = 0 -# We'll be using the MultiStep class to have a less myopic representation of -# upcoming states -n_steps_forward = 3 - -# record every 10 batch collected -record_interval = 10 - -############################################################################### -# Optimizer and optimization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ - -lr = 5e-4 -weight_decay = 0.0 -# UTD: Number of iterations of the inner loop -update_to_data = 32 -batch_size = 128 - -############################################################################### -# Model -# ~~~~~ - -gamma = 0.99 -tau = 0.005 # Decay factor for the target network - -# Network specs -num_cells = 64 -num_layers = 2 - -############################################################################### -# Replay buffer -# ~~~~~~~~~~~~~ - -# If True, a Prioritized replay buffer will be used -prb = True -# Number of frames stored in the buffer -traj_len_collector = frames_per_batch // env_per_collector -buffer_size = min(total_frames, 1_000_000 // traj_len_collector) -buffer_scratch_dir = "/tmp/" - -seed = 0 - -############################################################################### -# Initialization -# -------------- -# -# To initialize the experiment, we first acquire the observation statistics, -# then build the networks, wrap them in an exploration wrapper (following the -# seminal DDPG paper, we used an Ornstein-Uhlenbeck process to add noise to the -# sampled actions). - - -# Seeding -torch.manual_seed(seed) -np.random.seed(seed) - -############################################################################### -# Normalization stats -# ~~~~~~~~~~~~~~~~~~~ - -transform_state_dict = get_env_stats() - -############################################################################### -# Models: policy and q-value network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -actor, qnet = make_ddpg_actor( - transform_state_dict=transform_state_dict, - device=device, -) -if device == torch.device("cpu"): - actor.share_memory() - - -############################################################################### -# Loss module -# ~~~~~~~~~~~ -# We build our loss module with the actor and qnet we've just created. -# Because we have target parameters to update, we _must_ create a target network -# updater. -# -loss_module = DDPGLoss(actor, qnet) -# let's use the TD(lambda) estimator! 
-loss_module.make_value_estimator(ValueEstimators.TDLambda) -target_net_updater = SoftUpdate(loss_module, eps=0.98) -target_net_updater.init_() - -############################################################################### -# The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` -# exploration module: - -actor_model_explore = OrnsteinUhlenbeckProcessWrapper( - actor, - annealing_num_steps=annealing_frames, -).to(device) -if device == torch.device("cpu"): - actor_model_explore.share_memory() - -############################################################################### -# Parallel environment creation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We pass the stats computed earlier to normalize the output of our -# environment: + ############################################################################### + # Replay buffer + # ~~~~~~~~~~~~~ + # -create_env_fn = parallel_env_constructor( - transform_state_dict=transform_state_dict, -) + replay_buffer = make_replay_buffer(buffer_size, prefetch=3) -############################################################################### -# Data collector -# ~~~~~~~~~~~~~~ -# -# TorchRL provides specialized classes to help you collect data by executing -# the policy in the environment. These "data collectors" iteratively compute -# the action to be executed at a given time, then execute a step in the -# environment and reset it when required. -# Data collectors are designed to help developers have a tight control -# on the number of frames per batch of data, on the (a)sync nature of this -# collection and on the resources allocated to the data collection (e.g. GPU, -# number of workers etc). -# -# Here we will use -# :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that -# will be executed in an async manner (i.e. data will be collected while -# the policy is being optimized). With the :class:`MultiaSyncDataCollector`, -# multiple workers are running rollouts separately. When a batch is asked, it -# is gathered from the first worker that can provide it. -# -# The parameters to specify are: -# -# - the list of environment creation functions, -# - the policy, -# - the total number of frames before the collector is considered empty, -# - the maximum number of frames per trajectory (useful for non-terminating -# environments, like dm_control ones). -# -# One should also pass: -# -# - the number of frames in each batch collected, -# - the number of random steps executed independently from the policy, -# - the devices used for policy execution -# - the devices used to store data before the data is passed to the main -# process. -# -# Collectors also accept post-processing hooks. -# For instance, the :class:`torchrl.data.postprocs.MultiStep` class passed as -# ``postproc`` makes it so that the rewards of the ``n`` upcoming steps are -# summed (with some discount factor) and the next observation is changed to -# be the n-step forward observation. One could pass other transforms too: -# using :class:`tensordict.nn.TensorDictModule` and -# :class:`tensordict.nn.TensorDictSequential` we can seamlessly append a -# wide range of transforms to our collector. 
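# As an aside, the iteration pattern is the same for the simpler,
# single-process :class:`torchrl.collectors.SyncDataCollector`. A throwaway
# sketch (it assumes ``Pendulum-v1`` is available; the names ``toy_policy``
# and ``toy_collector`` are ours):
#
# .. code-block:: python
#
#    from tensordict.nn import TensorDictModule
#    from torch import nn
#    from torchrl.collectors import SyncDataCollector
#    from torchrl.envs.libs.gym import GymEnv
#
#    toy_policy = TensorDictModule(
#        nn.Linear(3, 1), in_keys=["observation"], out_keys=["action"]
#    )
#    toy_collector = SyncDataCollector(
#        create_env_fn=lambda: GymEnv("Pendulum-v1"),
#        policy=toy_policy,
#        frames_per_batch=50,
#        total_frames=200,
#    )
#    for batch in toy_collector:
#        print(batch)  # a TensorDict of 50 transitions per iteration
#    toy_collector.shutdown()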
- -if n_steps_forward > 0: - multistep = MultiStep(n_steps=n_steps_forward, gamma=gamma) -else: - multistep = None - -warnings.warn("Change collector!!") - -collector = SyncDataCollector( - create_env_fn=create_env_fn, - policy=actor_model_explore, - total_frames=total_frames, - max_frames_per_traj=max_frames_per_traj, - frames_per_batch=frames_per_batch, - init_random_frames=init_random_frames, - reset_at_each_iter=False, - postproc=multistep, - split_trajs=True, - device=device, # device for execution - storing_device=device, # device where data will be stored and passed - update_at_each_batch=False, - exploration_mode="random", -) - -collector.set_seed(seed) - -############################################################################### -# Replay buffer -# ~~~~~~~~~~~~~ -# + ############################################################################### + # Recorder + # ~~~~~~~~ -replay_buffer = make_replay_buffer(buffer_size, prefetch=3) + recorder = make_recorder(actor_model_explore, transform_state_dict) -############################################################################### -# Recorder -# ~~~~~~~~ + ############################################################################### + # Optimizer + # ~~~~~~~~~ + # + # Finally, we will use the Adam optimizer for the policy and value network, + # with the same learning rate for both. -recorder = make_recorder(actor_model_explore, transform_state_dict) - -############################################################################### -# Optimizer -# ~~~~~~~~~ -# -# Finally, we will use the Adam optimizer for the policy and value network, -# with the same learning rate for both. + optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) + total_collection_steps = total_frames // frames_per_batch -optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) -total_collection_steps = total_frames // frames_per_batch - -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=total_collection_steps -) - -############################################################################### -# Time to train the policy -# ------------------------ -# -# The training loop is pretty straightforward now that we have built all the -# modules we need. -# - -rewards = [] -rewards_eval = [] - -# Main loop -norm_factor_training = ( - sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 -) - -collected_frames = 0 -pbar = tqdm.tqdm(total=total_frames) -r0 = None -for i, tensordict in enumerate(collector): - - # update weights of the inference policy - collector.update_policy_weights_() - - if r0 is None: - r0 = tensordict["next", "reward"].mean().item() - pbar.update(tensordict.numel()) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=total_collection_steps + ) - # extend the replay buffer with the new data - current_frames = tensordict.numel() - collected_frames += current_frames - replay_buffer.extend(tensordict.cpu()) + ############################################################################### + # Time to train the policy + # ------------------------ + # + # The training loop is pretty straightforward now that we have built all the + # modules we need. 
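    # The loop below has two levels: an outer iteration over the batches
    # yielded by the collector and, for each batch, ``update_to_data`` inner
    # optimization steps on samples drawn from the replay buffer.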
+ # - # optimization steps - if collected_frames >= init_random_frames: - for _ in range(update_to_data): - # sample from replay buffer - sampled_tensordict = replay_buffer.sample(batch_size).clone() + rewards = [] + rewards_eval = [] - # Compute loss - loss_dict = loss_module(sampled_tensordict) + # Main loop + norm_factor_training = ( + sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 + ) - # optimize - loss_val = sum( - value for key, value in loss_dict.items() if key.startswith("loss") + collected_frames = 0 + pbar = tqdm.tqdm(total=total_frames) + r0 = None + for i, tensordict in enumerate(collector): + + # update weights of the inference policy + collector.update_policy_weights_() + + if r0 is None: + r0 = tensordict["next", "reward"].mean().item() + pbar.update(tensordict.numel()) + + # extend the replay buffer with the new data + current_frames = tensordict.numel() + collected_frames += current_frames + replay_buffer.extend(tensordict.cpu()) + + # optimization steps + if collected_frames >= init_random_frames: + for _ in range(update_to_data): + # sample from replay buffer + sampled_tensordict = replay_buffer.sample(batch_size).clone() + + # Compute loss + loss_dict = loss_module(sampled_tensordict) + + # optimize + loss_val = sum( + value for key, value in loss_dict.items() if key.startswith("loss") + ) + loss_val.backward() + optimizer.step() + optimizer.zero_grad() + + # update priority + if prb: + replay_buffer.update_tensordict_priority(sampled_tensordict) + # update target network + target_net_updater.step() + + rewards.append( + ( + i, + tensordict["next", "reward"].mean().item() + / norm_factor_training + / frame_skip, ) - loss_val.backward() - optimizer.step() - optimizer.zero_grad() - - # update priority - if prb: - replay_buffer.update_tensordict_priority(sampled_tensordict) - # update target network - target_net_updater.step() - - rewards.append( - ( - i, - tensordict["next", "reward"].mean().item() - / norm_factor_training - / frame_skip, ) - ) - td_record = recorder(None) - if td_record is not None: - rewards_eval.append((i, td_record["r_evaluation"].item())) - if len(rewards_eval): - pbar.set_description( - f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}" - ) - - # update the exploration strategy - actor_model_explore.step(current_frames) - if collected_frames >= init_random_frames: - scheduler.step() - -collector.shutdown() -del collector + td_record = recorder(None) + if td_record is not None: + rewards_eval.append((i, td_record["r_evaluation"].item())) + if len(rewards_eval): + pbar.set_description( + f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}" + ) -############################################################################### -# Experiment results -# ------------------ -# -# We make a simple plot of the average rewards during training. We can observe -# that our policy learned quite well to solve the task. -# -# **Note**: As already mentioned above, to get a more reasonable performance, -# use a greater value for ``total_frames`` e.g. 1M. 
- -plt.figure() -plt.plot(*zip(*rewards), label="training") -plt.plot(*zip(*rewards_eval), label="eval") -plt.legend() -plt.xlabel("iter") -plt.ylabel("reward") -plt.tight_layout() - -############################################################################### -# Conclusion -# ---------- -# -# In this tutorial, we have learnt how to code a loss module in TorchRL given -# the concrete example of DDPG. -# -# The key takeaways are: -# -# - How to use the :class:`torchrl.objectives.LossModule` class to register components; -# - How to use (or not) a target network, and how to update its parameters; -# - How to create an optimizer associated with a loss module. + # update the exploration strategy + actor_model_explore.step(current_frames) + if collected_frames >= init_random_frames: + scheduler.step() + + collector.shutdown() + del collector + + ############################################################################### + # Experiment results + # ------------------ + # + # We make a simple plot of the average rewards during training. We can observe + # that our policy learned quite well to solve the task. + # + # **Note**: As already mentioned above, to get a more reasonable performance, + # use a greater value for ``total_frames`` e.g. 1M. + + plt.figure() + plt.plot(*zip(*rewards), label="training") + plt.plot(*zip(*rewards_eval), label="eval") + plt.legend() + plt.xlabel("iter") + plt.ylabel("reward") + plt.tight_layout() + + ############################################################################### + # Conclusion + # ---------- + # + # In this tutorial, we have learnt how to code a loss module in TorchRL given + # the concrete example of DDPG. + # + # The key takeaways are: + # + # - How to use the :class:`torchrl.objectives.LossModule` class to register components; + # - How to use (or not) a target network, and how to update its parameters; + # - How to create an optimizer associated with a loss module. # From aca694621c0751888b51abcc7f3df1d9873f82da Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:51:16 +0100 Subject: [PATCH 52/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 81542a4c1d0..362f3b4467a 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -818,8 +818,7 @@ def make_replay_buffer(buffer_size, prefetch=3): ) # Number of environments in each data collector - warnings.warn("More envs!") - env_per_collector = 1 + env_per_collector = 2 # Total frames we will use during training. 
Scale up to 500K - 1M for a more # meaningful training From 206830ab86c52bb392ceda4168934f9d39f8e301 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:51:25 +0100 Subject: [PATCH 53/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 362f3b4467a..94e5acceacd 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -987,8 +987,6 @@ def make_replay_buffer(buffer_size, prefetch=3): else: multistep = None - warnings.warn("Change collector!!") - collector = MultiaSyncDataCollector( create_env_fn=[create_env_fn, create_env_fn], policy=actor_model_explore, From 3b4e0e786df226043cad35b2c3b109f8492ba4d2 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 14:55:02 +0100 Subject: [PATCH 54/89] tmp --- tutorials/sphinx-tutorials/coding_ddpg.py | 2054 +++++++++++---------- 1 file changed, 1030 insertions(+), 1024 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 94e5acceacd..dab9d740216 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -60,1086 +60,1092 @@ warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore -if __name__ == "__main__": - import numpy as np - import torch.cuda - import tqdm - from matplotlib import pyplot as plt - from tensordict.nn import TensorDictModule - from tensordict.tensordict import TensorDict, TensorDictBase - from torch import nn, optim - from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector - from torchrl.data import CompositeSpec, TensorDictReplayBuffer - from torchrl.data.postprocs import MultiStep - from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler - from torchrl.data.replay_buffers.storages import LazyMemmapStorage - from torchrl.envs import ( - CatTensors, - DoubleToFloat, - EnvCreator, - ObservationNorm, - ParallelEnv, +import numpy as np +import torch.cuda +import tqdm +from matplotlib import pyplot as plt +from tensordict.nn import TensorDictModule +from tensordict.tensordict import TensorDict, TensorDictBase +from torch import nn, optim +from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector +from torchrl.data import CompositeSpec, TensorDictReplayBuffer +from torchrl.data.postprocs import MultiStep +from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler +from torchrl.data.replay_buffers.storages import LazyMemmapStorage +from torchrl.envs import ( + CatTensors, + DoubleToFloat, + EnvCreator, + ObservationNorm, + ParallelEnv, +) +from torchrl.envs.libs.dm_control import DMControlEnv +from torchrl.envs.libs.gym import GymEnv +from torchrl.envs.transforms import RewardScaling, TransformedEnv +from torchrl.envs.utils import set_exploration_mode +from torchrl.modules import ( + Actor, + ActorCriticWrapper, + MLP, + OrnsteinUhlenbeckProcessWrapper, + ValueOperator, +) +from torchrl.objectives.utils import distance_loss, SoftUpdate +from torchrl.trainers import Recorder + +############################################################################### +# torchrl :class:`torchrl.objectives.LossModule` +# ---------------------------------------------- +# +# TorchRL provides a series of losses to use in your training scripts. +# The aim is to have losses that are easily reusable/swappable and that have +# a simple signature. 
+# +# The main characteristics of TorchRL losses are: +# +# - they are stateful objects: they contain a copy of the trainable parameters +# such that ``loss_module.parameters()`` gives whatever is needed to train the +# algorithm. +# - They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` +# method will receive a tensordict as input that contains all the necessary +# information to return a loss value. +# +# >>> data = replay_buffer.sample() +# >>> loss_dict = loss_module(data) +# +# - They output a :class:`tensordict.TensorDict` instance with the loss values +# written under a ``"loss_"`` where ``smth`` is a string describing the +# loss. Additional keys in the tensordict may be useful metrics to log during +# training time. +# .. note:: +# The reason we return independent losses is to let the user use a different +# optimizer for different sets of parameters for instance. Summing the losses +# can be simply done via +# +# >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) +# +# The ``__init__`` method +# ~~~~~~~~~~~~~~~~~~~~~~~ +# +# The parent class of all losses is :class:`torchrl.objectives.LossModule`. +# As many other components of the library, its :meth:`torchrl.objectives.LossModule.forward` method expects +# as input a :class:`tensordict.TensorDict` instance sampled from an experience +# replay buffer, or any similar data structure. Using this format makes it +# possible to re-use the module across +# modalities, or in complex settings where the model needs to read multiple +# entries for instance. In other words, it allows us to code a loss module that +# is oblivious to the data type that is being given to is and that focuses on +# running the elementary steps of the loss function and only those. +# +# To keep the tutorial as didactic as we can, we'll be displaying each method +# of the class independently and we'll be populating the class at a later +# stage. +# +# Let us start with the :meth:`torchrl.objectives.LossModule.__init__` +# method. DDPG aims at solving a control task with a simple strategy: +# training a policy to output actions that maximise the value predicted by +# a value network. Hence, our loss module needs to receive two networks in its +# constructor: an actor and a value networks. We expect both of these to be +# tensordict-compatible objects, such as +# :class:`tensordict.nn.TensorDictModule`. +# Our loss function will need to compute a target value and fit the value +# network to this, and generate an action and fit the policy such that its +# value estimate is maximised. +# +# The crucial step of the :meth:`LossModule.__init__` method is the call to +# :meth:`torchrl.LossModule.convert_to_functional`. This method will extract +# the parameters from the module and convert it to a functional module. +# Strictly speaking, this is not necessary and one may perfectly code all +# the losses without it. However, we encourage its usage for the following +# reason. +# +# The reason TorchRL does this is that RL algorithms often execute the same +# model with different sets of parameters, called "trainable" and "target" +# parameters. +# The "trainable" parameters are those that the optimizer needs to fit. The +# "target" parameters are usually a copy of the formers with some time lag +# (absolute or diluted through a moving average). +# These target parameters are used to compute the value associated with the +# next observation. 
One the advantages of using a set of target parameters +# for the value model that do not match exactly the current configuration is +# that they provide a pessimistic bound on the value function being computed. +# Pay attention to the ``create_target_params`` keyword argument below: this +# argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` +# method to create a set of target parameters in the loss module to be used +# for target value computation. If this is set to ``False`` (see the actor network +# for instance) the ``target_actor_network_params`` attribute will still be +# accessible but this will just return a **detached** version of the +# actor parameters. +# +# Later, we will see how the target parameters should be updated in torchrl. +# + + +def _init( + self, + actor_network: TensorDictModule, + value_network: TensorDictModule, +) -> None: + super(type(self), self).__init__() + + self.convert_to_functional( + actor_network, + "actor_network", + create_target_params=False, ) - from torchrl.envs.libs.dm_control import DMControlEnv - from torchrl.envs.libs.gym import GymEnv - from torchrl.envs.transforms import RewardScaling, TransformedEnv - from torchrl.envs.utils import set_exploration_mode - from torchrl.modules import ( - Actor, - ActorCriticWrapper, - MLP, - OrnsteinUhlenbeckProcessWrapper, - ValueOperator, + self.convert_to_functional( + value_network, + "value_network", + create_target_params=True, + compare_against=list(actor_network.parameters()), ) - from torchrl.objectives.utils import distance_loss, SoftUpdate - from torchrl.trainers import Recorder - - ############################################################################### - # torchrl :class:`torchrl.objectives.LossModule` - # ---------------------------------------------- - # - # TorchRL provides a series of losses to use in your training scripts. - # The aim is to have losses that are easily reusable/swappable and that have - # a simple signature. - # - # The main characteristics of TorchRL losses are: - # - # - they are stateful objects: they contain a copy of the trainable parameters - # such that ``loss_module.parameters()`` gives whatever is needed to train the - # algorithm. - # - They follow the ``tensordict`` convention: the :meth:`torch.nn.Module.forward` - # method will receive a tensordict as input that contains all the necessary - # information to return a loss value. - # - # >>> data = replay_buffer.sample() - # >>> loss_dict = loss_module(data) - # - # - They output a :class:`tensordict.TensorDict` instance with the loss values - # written under a ``"loss_"`` where ``smth`` is a string describing the - # loss. Additional keys in the tensordict may be useful metrics to log during - # training time. - # .. note:: - # The reason we return independent losses is to let the user use a different - # optimizer for different sets of parameters for instance. Summing the losses - # can be simply done via - # - # >>> loss_val = sum(loss for key, loss in loss_dict.items() if key.startswith("loss_")) - # - # The ``__init__`` method - # ~~~~~~~~~~~~~~~~~~~~~~~ - # - # The parent class of all losses is :class:`torchrl.objectives.LossModule`. - # As many other components of the library, its :meth:`torchrl.objectives.LossModule.forward` method expects - # as input a :class:`tensordict.TensorDict` instance sampled from an experience - # replay buffer, or any similar data structure. 
Using this format makes it - # possible to re-use the module across - # modalities, or in complex settings where the model needs to read multiple - # entries for instance. In other words, it allows us to code a loss module that - # is oblivious to the data type that is being given to is and that focuses on - # running the elementary steps of the loss function and only those. - # - # To keep the tutorial as didactic as we can, we'll be displaying each method - # of the class independently and we'll be populating the class at a later - # stage. - # - # Let us start with the :meth:`torchrl.objectives.LossModule.__init__` - # method. DDPG aims at solving a control task with a simple strategy: - # training a policy to output actions that maximise the value predicted by - # a value network. Hence, our loss module needs to receive two networks in its - # constructor: an actor and a value networks. We expect both of these to be - # tensordict-compatible objects, such as - # :class:`tensordict.nn.TensorDictModule`. - # Our loss function will need to compute a target value and fit the value - # network to this, and generate an action and fit the policy such that its - # value estimate is maximised. - # - # The crucial step of the :meth:`LossModule.__init__` method is the call to - # :meth:`torchrl.LossModule.convert_to_functional`. This method will extract - # the parameters from the module and convert it to a functional module. - # Strictly speaking, this is not necessary and one may perfectly code all - # the losses without it. However, we encourage its usage for the following - # reason. - # - # The reason TorchRL does this is that RL algorithms often execute the same - # model with different sets of parameters, called "trainable" and "target" - # parameters. - # The "trainable" parameters are those that the optimizer needs to fit. The - # "target" parameters are usually a copy of the formers with some time lag - # (absolute or diluted through a moving average). - # These target parameters are used to compute the value associated with the - # next observation. One the advantages of using a set of target parameters - # for the value model that do not match exactly the current configuration is - # that they provide a pessimistic bound on the value function being computed. - # Pay attention to the ``create_target_params`` keyword argument below: this - # argument tells the :meth:`torchrl.objectives.LossModule.convert_to_functional` - # method to create a set of target parameters in the loss module to be used - # for target value computation. If this is set to ``False`` (see the actor network - # for instance) the ``target_actor_network_params`` attribute will still be - # accessible but this will just return a **detached** version of the - # actor parameters. - # - # Later, we will see how the target parameters should be updated in torchrl. - # - - - def _init( - self, - actor_network: TensorDictModule, - value_network: TensorDictModule, - ) -> None: - super(type(self), self).__init__() - - self.convert_to_functional( - actor_network, - "actor_network", - create_target_params=False, + + self.actor_in_keys = actor_network.in_keys + + # Since the value we'll be using is based on the actor and value network, + # we put them together in a single actor-critic container. 
+ actor_critic = ActorCriticWrapper(actor_network, value_network) + self.actor_critic = actor_critic + self.loss_funtion = "l2" + + +############################################################################### +# The value estimator loss method +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# In many RL algorithm, the value network (or Q-value network) is trained based +# on an empirical value estimate. This can be bootstrapped (TD(0), low +# variance, high bias), meaning +# that the target value is obtained using the next reward and nothing else, or +# a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole +# sequence of upcoming rewards will be used (high variance, low bias). An +# intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise +# bias and variance. +# TorchRL makes it easy to use one or the other estimator via the +# :class:`torchrl.objectives.utils.ValueEstimators` Enum class, which contains +# pointers to all the value estimators implemented. Let us define the default +# value function here. We will take the simplest version (TD(0)), and show later +# on how this can be changed. + +from torchrl.objectives.utils import ValueEstimators + +default_value_estimator = ValueEstimators.TD0 + +############################################################################### +# We also need to give some instructions to DDPG on how to build the value +# estimator, depending on the user query. Depending on the estimator provided, +# we will build the corresponding module to be used at train time: + +from torchrl.objectives.utils import default_value_kwargs +from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator + + +def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): + hp = dict(default_value_kwargs(value_type)) + if hasattr(self, "gamma"): + hp["gamma"] = self.gamma + hp.update(hyperparams) + value_key = "state_action_value" + if value_type == ValueEstimators.TD1: + self._value_estimator = TD1Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.TD0: + self._value_estimator = TD0Estimator( + value_network=self.actor_critic, value_key=value_key, **hp + ) + elif value_type == ValueEstimators.GAE: + raise NotImplementedError( + f"Value type {value_type} it not implemented for loss {type(self)}." ) - self.convert_to_functional( - value_network, - "value_network", - create_target_params=True, - compare_against=list(actor_network.parameters()), + elif value_type == ValueEstimators.TDLambda: + self._value_estimator = TDLambdaEstimator( + value_network=self.actor_critic, value_key=value_key, **hp ) + else: + raise NotImplementedError(f"Unknown value type {value_type}") - self.actor_in_keys = actor_network.in_keys - - # Since the value we'll be using is based on the actor and value network, - # we put them together in a single actor-critic container. - actor_critic = ActorCriticWrapper(actor_network, value_network) - self.actor_critic = actor_critic - self.loss_funtion = "l2" - - - ############################################################################### - # The value estimator loss method - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # In many RL algorithm, the value network (or Q-value network) is trained based - # on an empirical value estimate. 
This can be bootstrapped (TD(0), low - # variance, high bias), meaning - # that the target value is obtained using the next reward and nothing else, or - # a Monte-Carlo estimate can be obtained (TD(1)) in which case the whole - # sequence of upcoming rewards will be used (high variance, low bias). An - # intermediate estimator (TD(:math:`\lambda`)) can also be used to compromise - # bias and variance. - # TorchRL makes it easy to use one or the other estimator via the - # :class:`torchrl.objectives.utils.ValueEstimators` Enum class, which contains - # pointers to all the value estimators implemented. Let us define the default - # value function here. We will take the simplest version (TD(0)), and show later - # on how this can be changed. - - from torchrl.objectives.utils import ValueEstimators - - default_value_estimator = ValueEstimators.TD0 - - ############################################################################### - # We also need to give some instructions to DDPG on how to build the value - # estimator, depending on the user query. Depending on the estimator provided, - # we will build the corresponding module to be used at train time: - - from torchrl.objectives.utils import default_value_kwargs - from torchrl.objectives.value import TD0Estimator, TD1Estimator, TDLambdaEstimator - - - def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): - hp = dict(default_value_kwargs(value_type)) - if hasattr(self, "gamma"): - hp["gamma"] = self.gamma - hp.update(hyperparams) - value_key = "state_action_value" - if value_type == ValueEstimators.TD1: - self._value_estimator = TD1Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) - elif value_type == ValueEstimators.TD0: - self._value_estimator = TD0Estimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) - elif value_type == ValueEstimators.GAE: - raise NotImplementedError( - f"Value type {value_type} it not implemented for loss {type(self)}." - ) - elif value_type == ValueEstimators.TDLambda: - self._value_estimator = TDLambdaEstimator( - value_network=self.actor_critic, value_key=value_key, **hp - ) - else: - raise NotImplementedError(f"Unknown value type {value_type}") - - - ############################################################################### - # The ``make_value_estimator`` method can but does not need to be called: if - # not, the :class:`torchrl.objectives.LossModule` will query this method with - # its default estimator. - # - # The actor loss method - # ~~~~~~~~~~~~~~~~~~~~~ - # - # The central piece of an RL algorithm is the training loss for the actor. - # In the case of DDPG, this function is quite simple: we just need to compute - # the value associated with an action computed using the policy and optimize - # the actor weights to maximise this value. - # - # When computing this value, we must make sure to take the value parameters out - # of the graph, otherwise the actor and value loss will be mixed up. - # For this, the :func:`torchrl.objectives.utils.hold_out_params` function - # can be used. 
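###############################################################################
# To make this concrete, here is a minimal, self-contained illustration of the
# gradient flow we are after (toy ``nn.Linear`` modules invented for the
# example, not part of the DDPG model): the actor loss must populate gradients
# for the policy parameters only, even though the value network participates
# in the forward pass. Holding the value parameters out -- which is what the
# torchrl helper does for us in a context-manager fashion -- amounts to this:

toy_actor = nn.Linear(3, 2)  # toy policy: observation -> action
toy_value = nn.Linear(5, 1)  # toy Q-network: (observation, action) -> value

toy_obs = torch.randn(4, 3)
toy_action = toy_actor(toy_obs)
# freeze the value parameters for this computation only
for p in toy_value.parameters():
    p.requires_grad_(False)
toy_q = toy_value(torch.cat([toy_obs, toy_action], dim=-1))
(-toy_q.mean()).backward()
for p in toy_value.parameters():
    p.requires_grad_(True)

# the actor has gradients, the value network does not
print([p.grad is not None for p in toy_actor.parameters()])  # [True, True]
print([p.grad is not None for p in toy_value.parameters()])  # [False, False]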
- - from torchrl.objectives.utils import hold_out_params - - - def _loss_actor( - self, - tensordict, - ) -> torch.Tensor: - td_copy = tensordict.select(*self.actor_in_keys).detach() - # Get an action from the actor network - td_copy = self.actor_network( + +############################################################################### +# The ``make_value_estimator`` method can but does not need to be called: if +# not, the :class:`torchrl.objectives.LossModule` will query this method with +# its default estimator. +# +# The actor loss method +# ~~~~~~~~~~~~~~~~~~~~~ +# +# The central piece of an RL algorithm is the training loss for the actor. +# In the case of DDPG, this function is quite simple: we just need to compute +# the value associated with an action computed using the policy and optimize +# the actor weights to maximise this value. +# +# When computing this value, we must make sure to take the value parameters out +# of the graph, otherwise the actor and value loss will be mixed up. +# For this, the :func:`torchrl.objectives.utils.hold_out_params` function +# can be used. + +from torchrl.objectives.utils import hold_out_params + + +def _loss_actor( + self, + tensordict, +) -> torch.Tensor: + td_copy = tensordict.select(*self.actor_in_keys).detach() + # Get an action from the actor network + td_copy = self.actor_network( + td_copy, + params=self.actor_network_params, + ) + # get the value associated with that action + with hold_out_params(self.value_network_params) as params: + td_copy = self.value_network( td_copy, - params=self.actor_network_params, + params=params, ) - # get the value associated with that action - with hold_out_params(self.value_network_params) as params: - td_copy = self.value_network( - td_copy, - params=params, - ) - return -td_copy.get("state_action_value") - - - ############################################################################### - # The value loss method - # ~~~~~~~~~~~~~~~~~~~~~ - # - # We now need to optimize our value network parameters. - # To do this, we will rely on the value estimator of our class: - # - - - def _loss_value( - self, - tensordict, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - td_copy = tensordict.detach() - - # V(s, a) - self.value_network(td_copy, params=self.value_network_params) - pred_val = td_copy.get("state_action_value").squeeze(-1) - - # we manually reconstruct the parameters of the actor-critic, where the first - # set of parameters belongs to the actor and the second to the value function. - target_params = TensorDict( - { - "module": { - "0": self.target_actor_network_params, - "1": self.target_value_network_params, - } - }, - batch_size=self.target_actor_network_params.batch_size, - device=self.target_actor_network_params.device, - ) - with set_exploration_mode("mode"): # we make sure that no exploration is performed - target_value = self.value_estimator.value_estimate( - tensordict, target_params=target_params - ).squeeze(-1) + return -td_copy.get("state_action_value") - # td_error = pred_val - target_value - loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) - td_error = (pred_val - target_value).pow(2) - return loss_value, td_error, pred_val, target_value +############################################################################### +# The value loss method +# ~~~~~~~~~~~~~~~~~~~~~ +# +# We now need to optimize our value network parameters. 
+# To do this, we will rely on the value estimator of our class: +# - ############################################################################### - # Putting things together in a forward call - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # The only missing piece is the forward method, which will glue together the - # value and actor loss, collect the cost values and write them in a tensordict - # delivered to the user. +def _loss_value( + self, + tensordict, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + td_copy = tensordict.detach() + + # V(s, a) + self.value_network(td_copy, params=self.value_network_params) + pred_val = td_copy.get("state_action_value").squeeze(-1) + + # we manually reconstruct the parameters of the actor-critic, where the first + # set of parameters belongs to the actor and the second to the value function. + target_params = TensorDict( + { + "module": { + "0": self.target_actor_network_params, + "1": self.target_value_network_params, + } + }, + batch_size=self.target_actor_network_params.batch_size, + device=self.target_actor_network_params.device, + ) + with set_exploration_mode("mode"): # we make sure that no exploration is performed + target_value = self.value_estimator.value_estimate( + tensordict, target_params=target_params + ).squeeze(-1) + # td_error = pred_val - target_value + loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) + td_error = (pred_val - target_value).pow(2) - def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: - if not input_tensordict.device == self.device: - raise RuntimeError( - f"Got device={input_tensordict.device} but " - f"actor_network.device={self.device} (self.device={self.device})" - ) + return loss_value, td_error, pred_val, target_value - loss_value, td_error, pred_val, target_value = self.loss_value( - input_tensordict, - ) - td_error = td_error.detach() - td_error = td_error.unsqueeze(input_tensordict.ndimension()) - if input_tensordict.device is not None: - td_error = td_error.to(input_tensordict.device) - input_tensordict.set( - "td_error", - td_error, - inplace=True, - ) - loss_actor = self.loss_actor(input_tensordict) - return TensorDict( - source={ - "loss_actor": loss_actor.mean(), - "loss_value": loss_value.mean(), - "pred_value": pred_val.mean().detach(), - "target_value": target_value.mean().detach(), - "pred_value_max": pred_val.max().detach(), - "target_value_max": target_value.max().detach(), - }, - batch_size=[], - ) +############################################################################### +# Putting things together in a forward call +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# The only missing piece is the forward method, which will glue together the +# value and actor loss, collect the cost values and write them in a tensordict +# delivered to the user. - class DDPGLoss(LossModule): - default_value_estimator = default_value_estimator - make_value_estimator = make_value_estimator - - __init__ = _init - forward = _forward - loss_value = _loss_value - loss_actor = _loss_actor - - - ############################################################################### - # Now that we have our loss, we can use it to train a policy to solve a - # control task. - # - # Environment - # ----------- - # - # In most algorithms, the first thing that needs to be taken care of is the - # construction of the environment as it conditions the remainder of the - # training script. 
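#
# Before settling on a task, it can be useful to look at what a TorchRL
# environment returns in practice. The snippet below is only a sketch (it
# assumes a working gym installation and uses the small ``Pendulum-v1`` task
# purely for illustration): resetting the environment and taking a random step
# yields a tensordict that carries the observation and action together with
# the reward and done flags produced by the step.
#
# .. code-block:: python
#
#    env = GymEnv("Pendulum-v1")
#    tensordict = env.reset()
#    tensordict = env.rand_step(tensordict)
#    print(tensordict)
#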
- # - # For this example, we will be using the ``"cheetah"`` task. The goal is to make - # a half-cheetah run as fast as possible. - # - # In TorchRL, one can create such a task by relying on dm_control or gym: - # - # .. code-block:: python - # - # env = GymEnv("HalfCheetah-v4") - # - # or - # - # .. code-block:: python - # - # env = DMControlEnv("cheetah", "run") - # - # By default, these environment disable rendering. Training from states is - # usually easier than training from images. To keep things simple, we focus - # on learning from states only. To pass the pixels to the tensordicts that - # are collected by :func:`env.step()`, simply pass the ``from_pixels=True`` - # argument to the constructor: - # - # .. code-block:: python - # - # env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True) - # - # We write a :func:`make_env` helper function that will create an environment - # with either one of the two backends considered above (dm-control or gym). - # - - env_library = None - env_name = None - - - def make_env(): - """Create a base env.""" - global env_library - global env_name - - if backend == "dm_control": - env_name = "cheetah" - env_task = "run" - env_args = (env_name, env_task) - env_library = DMControlEnv - elif backend == "gym": - env_name = "HalfCheetah-v4" - env_args = (env_name,) - env_library = GymEnv - else: - raise NotImplementedError - - env_kwargs = { - "device": device, - "frame_skip": frame_skip, - "from_pixels": from_pixels, - "pixels_only": from_pixels, - } - env = env_library(*env_args, **env_kwargs) - return env - - - ############################################################################### - # Transforms - # ~~~~~~~~~~ - # - # Now that we have a base environment, we may want to modify its representation - # to make it more policy-friendly. In TorchRL, transforms are appended to the - # base environment in a specialized :class:`torchr.envs.TransformedEnv` class. - # - # - It is common in DDPG to rescale the reward using some heuristic value. We - # will multiply the reward by 5 in this example. - # - # - If we are using :mod:`dm_control`, it is also important to build an interface - # between the simulator which works with double precision numbers, and our - # script which presumably uses single precision ones. This transformation goes - # both ways: when calling :func:`env.step`, our actions will need to be - # represented in double precision, and the output will need to be transformed - # to single precision. - # The :class:`torchrl.envs.DoubleToFloat` transform does exactly this: the - # ``in_keys`` list refers to the keys that will need to be transformed from - # double to float, while the ``in_keys_inv`` refers to those that need to - # be transformed to double before being passed to the environment. - # - # - We concatenate the state keys together using the :class:`torchrl.envs.CatTensors` - # transform. - # - # - Finally, we also leave the possibility of normalizing the states: we will - # take care of computing the normalizing constants later on. - # - - - def make_transformed_env( - env, - ): - """Apply transforms to the env (such as reward scaling and state normalization).""" - - env = TransformedEnv(env) - - # we append transforms one by one, although we might as well create the - # transformed environment using the `env = TransformedEnv(base_env, transforms)` - # syntax. 
- env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling)) - - double_to_float_list = [] - double_to_float_inv_list = [] - if env_library is DMControlEnv: - # DMControl requires double-precision - double_to_float_list += [ - "reward", - "action", - ] - double_to_float_inv_list += ["action"] - - # We concatenate all states into a single "observation_vector" - # even if there is a single tensor, it'll be renamed in "observation_vector". - # This facilitates the downstream operations as we know the name of the - # output tensor. - # In some environments (not half-cheetah), there may be more than one - # observation vector: in this case this code snippet will concatenate them - # all. - selected_keys = list(env.observation_spec.keys()) - out_key = "observation_vector" - env.append_transform(CatTensors(in_keys=selected_keys, out_key=out_key)) - - # we normalize the states, but for now let's just instantiate a stateless - # version of the transform - env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True)) - - double_to_float_list.append(out_key) - env.append_transform( - DoubleToFloat( - in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list - ) - ) - return env - - - ############################################################################### - # Normalization of the observations - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # To compute the normalizing statistics, we run an arbitrary number of random - # steps in the environment and compute the mean and standard deviation of the - # collected observations. The :func:`ObservationNorm.init_stats()` method can - # be used for this purpose. To get the summary statistics, we create a dummy - # environment and run it for a given number of steps, collect data over a given - # number of steps and compute its summary statistics. - # - - - def get_env_stats(): - """Gets the stats of an environment.""" - proof_env = make_transformed_env(make_env()) - proof_env.set_seed(seed) - t = proof_env.transform[2] - t.init_stats(init_env_steps) - transform_state_dict = t.state_dict() - proof_env.close() - return transform_state_dict - - - ############################################################################### - # Parallel execution - # ~~~~~~~~~~~~~~~~~~ - # - # The following helper function allows us to run environments in parallel. - # Running environments in parallel can significantly speed up the collection - # throughput. When using transformed environment, we need to choose whether we - # want to execute the transform individually for each environment, or - # centralize the data and transform it in batch. Both approaches are easy to - # code: - # - # .. 
code-block:: python - # - # env = ParallelEnv( - # lambda: TransformedEnv(GymEnv("HalfCheetah-v4"), transforms), - # num_workers=4 - # ) - # env = TransformedEnv( - # ParallelEnv(lambda: GymEnv("HalfCheetah-v4"), num_workers=4), - # transforms - # ) - # - # To leverage the vectorization capabilities of PyTorch, we adopt - # the first method: - # - - - def parallel_env_constructor( - transform_state_dict, - ): - if env_per_collector == 1: - - def make_t_env(): - env = make_transformed_env(make_env()) - env.transform[2].init_stats(3) - env.transform[2].loc.copy_(transform_state_dict["loc"]) - env.transform[2].scale.copy_(transform_state_dict["scale"]) - return env - - env_creator = EnvCreator(make_t_env) - return env_creator - - parallel_env = ParallelEnv( - num_workers=env_per_collector, - create_env_fn=EnvCreator(lambda: make_env()), - create_env_kwargs=None, - pin_memory=False, +def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: + if not input_tensordict.device == self.device: + raise RuntimeError( + f"Got device={input_tensordict.device} but " + f"actor_network.device={self.device} (self.device={self.device})" ) - env = make_transformed_env(parallel_env) - # we call `init_stats` for a limited number of steps, just to instantiate - # the lazy buffers. - env.transform[2].init_stats(3, cat_dim=1, reduce_dim=[0, 1]) - env.transform[2].load_state_dict(transform_state_dict) - return env - - - ############################################################################### - # Building the model - # ------------------ - # - # We now turn to the setup of the model. As we have seen, DDPG requires a - # value network, trained to estimate the value of a state-action pair, and a - # parametric actor that learns how to select actions that maximize this value. - # - # Recall that building a TorchRL module requires two steps: - # - # - writing the :class:`torch.nn.Module` that will be used as network, - # - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the - # data flow is handled by specifying the input and output keys. - # - # In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can - # also be used. - # - # - # The Q-Value network is wrapped in a :class:`torchrl.modules.ValueOperator` - # that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value - # networks and ``state_value`` for other value networks. - # - # Since we use lazy modules, it is necessary to materialize the lazy modules - # before being able to move the policy from device to device and achieve other - # operations. Hence, it is good practice to run the modules with a small - # sample of data. For this purpose, we generate fake data from the - # environment specs. 
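#
# One possible way of doing this (a sketch only -- the helper calls are
# assumptions, not necessarily the exact code used below) is to reset a proof
# environment, draw a random action from its spec and pass the resulting
# tensordict through both modules once, so that the lazy layers can infer
# their input sizes:
#
# .. code-block:: python
#
#    td = proof_environment.reset()
#    td["action"] = proof_environment.action_spec.rand()
#    actor(td)  # materializes the lazy layers of the policy
#    qnet(td)   # materializes the lazy layers of the q-value network
#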
- # - - - def make_ddpg_actor( - transform_state_dict, - device="cpu", - ): - proof_environment = make_transformed_env(make_env()) - proof_environment.transform[2].init_stats(3) - proof_environment.transform[2].load_state_dict(transform_state_dict) - - env_specs = proof_environment.specs - in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ - -1 + + loss_value, td_error, pred_val, target_value = self.loss_value( + input_tensordict, + ) + td_error = td_error.detach() + td_error = td_error.unsqueeze(input_tensordict.ndimension()) + if input_tensordict.device is not None: + td_error = td_error.to(input_tensordict.device) + input_tensordict.set( + "td_error", + td_error, + inplace=True, + ) + loss_actor = self.loss_actor(input_tensordict) + return TensorDict( + source={ + "loss_actor": loss_actor.mean(), + "loss_value": loss_value.mean(), + "pred_value": pred_val.mean().detach(), + "target_value": target_value.mean().detach(), + "pred_value_max": pred_val.max().detach(), + "target_value_max": target_value.max().detach(), + }, + batch_size=[], + ) + + +class DDPGLoss(LossModule): + default_value_estimator = default_value_estimator + make_value_estimator = make_value_estimator + + __init__ = _init + forward = _forward + loss_value = _loss_value + loss_actor = _loss_actor + + +############################################################################### +# Now that we have our loss, we can use it to train a policy to solve a +# control task. +# +# Environment +# ----------- +# +# In most algorithms, the first thing that needs to be taken care of is the +# construction of the environment as it conditions the remainder of the +# training script. +# +# For this example, we will be using the ``"cheetah"`` task. The goal is to make +# a half-cheetah run as fast as possible. +# +# In TorchRL, one can create such a task by relying on dm_control or gym: +# +# .. code-block:: python +# +# env = GymEnv("HalfCheetah-v4") +# +# or +# +# .. code-block:: python +# +# env = DMControlEnv("cheetah", "run") +# +# By default, these environment disable rendering. Training from states is +# usually easier than training from images. To keep things simple, we focus +# on learning from states only. To pass the pixels to the tensordicts that +# are collected by :func:`env.step()`, simply pass the ``from_pixels=True`` +# argument to the constructor: +# +# .. code-block:: python +# +# env = GymEnv("HalfCheetah-v4", from_pixels=True, pixels_only=True) +# +# We write a :func:`make_env` helper function that will create an environment +# with either one of the two backends considered above (dm-control or gym). +# + +env_library = None +env_name = None + + +def make_env(): + """Create a base env.""" + global env_library + global env_name + + if backend == "dm_control": + env_name = "cheetah" + env_task = "run" + env_args = (env_name, env_task) + env_library = DMControlEnv + elif backend == "gym": + env_name = "HalfCheetah-v4" + env_args = (env_name,) + env_library = GymEnv + else: + raise NotImplementedError + + env_kwargs = { + "device": device, + "frame_skip": frame_skip, + "from_pixels": from_pixels, + "pixels_only": from_pixels, + } + env = env_library(*env_args, **env_kwargs) + return env + + +############################################################################### +# Transforms +# ~~~~~~~~~~ +# +# Now that we have a base environment, we may want to modify its representation +# to make it more policy-friendly. 
In TorchRL, transforms are appended to the +# base environment in a specialized :class:`torchr.envs.TransformedEnv` class. +# +# - It is common in DDPG to rescale the reward using some heuristic value. We +# will multiply the reward by 5 in this example. +# +# - If we are using :mod:`dm_control`, it is also important to build an interface +# between the simulator which works with double precision numbers, and our +# script which presumably uses single precision ones. This transformation goes +# both ways: when calling :func:`env.step`, our actions will need to be +# represented in double precision, and the output will need to be transformed +# to single precision. +# The :class:`torchrl.envs.DoubleToFloat` transform does exactly this: the +# ``in_keys`` list refers to the keys that will need to be transformed from +# double to float, while the ``in_keys_inv`` refers to those that need to +# be transformed to double before being passed to the environment. +# +# - We concatenate the state keys together using the :class:`torchrl.envs.CatTensors` +# transform. +# +# - Finally, we also leave the possibility of normalizing the states: we will +# take care of computing the normalizing constants later on. +# + + +def make_transformed_env( + env, +): + """Apply transforms to the env (such as reward scaling and state normalization).""" + + env = TransformedEnv(env) + + # we append transforms one by one, although we might as well create the + # transformed environment using the `env = TransformedEnv(base_env, transforms)` + # syntax. + env.append_transform(RewardScaling(loc=0.0, scale=reward_scaling)) + + double_to_float_list = [] + double_to_float_inv_list = [] + if env_library is DMControlEnv: + # DMControl requires double-precision + double_to_float_list += [ + "reward", + "action", ] - out_features = env_specs["input_spec"]["action"].shape[-1] - - actor_net = MLP( - in_features=in_features, - out_features=out_features, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, - activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain - ) - in_keys = ["observation_vector"] - out_keys = ["action"] - - actor = Actor( - actor_net, - in_keys=in_keys, - out_keys=out_keys, - spec=CompositeSpec(action=env_specs["input_spec"]["action"]), - ).to(device) - - q_net = MLP( - in_features=in_features - + out_features, # receives an action and an observation as input - out_features=1, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, + double_to_float_inv_list += ["action"] + + # We concatenate all states into a single "observation_vector" + # even if there is a single tensor, it'll be renamed in "observation_vector". + # This facilitates the downstream operations as we know the name of the + # output tensor. + # In some environments (not half-cheetah), there may be more than one + # observation vector: in this case this code snippet will concatenate them + # all. 
+ selected_keys = list(env.observation_spec.keys()) + out_key = "observation_vector" + env.append_transform(CatTensors(in_keys=selected_keys, out_key=out_key)) + + # we normalize the states, but for now let's just instantiate a stateless + # version of the transform + env.append_transform(ObservationNorm(in_keys=[out_key], standard_normal=True)) + + double_to_float_list.append(out_key) + env.append_transform( + DoubleToFloat( + in_keys=double_to_float_list, in_keys_inv=double_to_float_inv_list ) + ) + + return env - in_keys = in_keys + ["action"] - qnet = ValueOperator( - in_keys=in_keys, - module=q_net, - ).to(device) - - return actor, qnet - - - ############################################################################### - # Evaluator: building your recorder object - # ---------------------------------------- - # - # As the training data is obtained using some exploration strategy, the true - # performance of our algorithm needs to be assessed in deterministic mode. We - # do this using a dedicated class, ``Recorder``, which executes the policy in - # the environment at a given frequency and returns some statistics obtained - # from these simulations. - # - # The following helper function builds this object: - - - def make_recorder(actor_model_explore, transform_state_dict): - base_env = make_env() - recorder = make_transformed_env(base_env) - recorder.transform[2].init_stats(3) - recorder.transform[2].load_state_dict(transform_state_dict) - - recorder_obj = Recorder( - record_frames=1000, - frame_skip=frame_skip, - policy_exploration=actor_model_explore, - recorder=recorder, - exploration_mode="mean", - record_interval=record_interval, - ) - return recorder_obj - - - ############################################################################### - # Replay buffer - # ------------- - # - # Replay buffers come in two flavors: prioritized (where some error signal - # is used to give a higher likelihood of sampling to some items than others) - # and regular, circular experience replay. - # - # TorchRL replay buffers are composable: one can pick up the storage, sampling - # and writing strategies. It is also possible to - # store tensors on physical memory using a memory-mapped array. The following - # function takes care of creating the replay buffer with the desired - # hyperparameters: - # - - - def make_replay_buffer(buffer_size, prefetch=3): - if prb: - sampler = PrioritizedSampler( - max_capacity=buffer_size, - alpha=0.7, - beta=0.5, - ) - else: - sampler = RandomSampler() - replay_buffer = TensorDictReplayBuffer( - storage=LazyMemmapStorage( - buffer_size, - scratch_dir=buffer_scratch_dir, - device=device, - ), - sampler=sampler, - pin_memory=False, - prefetch=prefetch, - ) - return replay_buffer +############################################################################### +# Normalization of the observations +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# To compute the normalizing statistics, we run an arbitrary number of random +# steps in the environment and compute the mean and standard deviation of the +# collected observations. The :func:`ObservationNorm.init_stats()` method can +# be used for this purpose. To get the summary statistics, we create a dummy +# environment and run it for a given number of steps, collect data over a given +# number of steps and compute its summary statistics. 
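#
# In other words (a sketch of the effect, assuming ``standard_normal=True`` as
# set above), after ``init_stats`` the transform holds a ``loc`` and a
# ``scale`` that approximate the empirical mean and standard deviation of the
# collected observations, and it rescales incoming observations as:
#
# .. code-block:: python
#
#    obs_norm = (obs - loc) / scale
#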
+# + + +def get_env_stats(): + """Gets the stats of an environment.""" + proof_env = make_transformed_env(make_env()) + proof_env.set_seed(seed) + t = proof_env.transform[2] + t.init_stats(init_env_steps) + transform_state_dict = t.state_dict() + proof_env.close() + return transform_state_dict + + +############################################################################### +# Parallel execution +# ~~~~~~~~~~~~~~~~~~ +# +# The following helper function allows us to run environments in parallel. +# Running environments in parallel can significantly speed up the collection +# throughput. When using transformed environment, we need to choose whether we +# want to execute the transform individually for each environment, or +# centralize the data and transform it in batch. Both approaches are easy to +# code: +# +# .. code-block:: python +# +# env = ParallelEnv( +# lambda: TransformedEnv(GymEnv("HalfCheetah-v4"), transforms), +# num_workers=4 +# ) +# env = TransformedEnv( +# ParallelEnv(lambda: GymEnv("HalfCheetah-v4"), num_workers=4), +# transforms +# ) +# +# To leverage the vectorization capabilities of PyTorch, we adopt +# the first method: +# - ############################################################################### - # Hyperparameters - # --------------- - # - # After having written our helper functions, it is time to set the - # experiment hyperparameters: - ############################################################################### - # Environment - # ~~~~~~~~~~~ +def parallel_env_constructor( + transform_state_dict, +): + if env_per_collector == 1: - # The backend can be gym or dm_control - backend = "gym" + def make_t_env(): + env = make_transformed_env(make_env()) + env.transform[2].init_stats(3) + env.transform[2].loc.copy_(transform_state_dict["loc"]) + env.transform[2].scale.copy_(transform_state_dict["scale"]) + return env - exp_name = "cheetah" + env_creator = EnvCreator(make_t_env) + return env_creator - # frame_skip batches multiple step together with a single action - # If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to - # be adjusted to have a consistent total number of frames collected across - # experiments. - frame_skip = 2 - from_pixels = False - # Scaling the reward helps us control the signal magnitude for a more - # efficient learning. - reward_scaling = 5.0 + parallel_env = ParallelEnv( + num_workers=env_per_collector, + create_env_fn=EnvCreator(lambda: make_env()), + create_env_kwargs=None, + pin_memory=False, + ) + env = make_transformed_env(parallel_env) + # we call `init_stats` for a limited number of steps, just to instantiate + # the lazy buffers. + env.transform[2].init_stats(3, cat_dim=1, reduce_dim=[0, 1]) + env.transform[2].load_state_dict(transform_state_dict) + return env - # Number of random steps used as for stats computation using ObservationNorm - init_env_steps = 1000 - # Exploration: Number of frames before OU noise becomes null - annealing_frames = 1000000 // frame_skip +############################################################################### +# Building the model +# ------------------ +# +# We now turn to the setup of the model. As we have seen, DDPG requires a +# value network, trained to estimate the value of a state-action pair, and a +# parametric actor that learns how to select actions that maximize this value. 
+# +# Recall that building a TorchRL module requires two steps: +# +# - writing the :class:`torch.nn.Module` that will be used as network, +# - wrapping the network in a :class:`tensordict.nn.TensorDictModule` where the +# data flow is handled by specifying the input and output keys. +# +# In more complex scenarios, :class:`tensordict.nn.TensorDictSequential` can +# also be used. +# +# +# The Q-Value network is wrapped in a :class:`torchrl.modules.ValueOperator` +# that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value +# networks and ``state_value`` for other value networks. +# +# Since we use lazy modules, it is necessary to materialize the lazy modules +# before being able to move the policy from device to device and achieve other +# operations. Hence, it is good practice to run the modules with a small +# sample of data. For this purpose, we generate fake data from the +# environment specs. +# - ############################################################################### - # Collection - # ~~~~~~~~~~ - # We will execute the policy on cuda if available - device = ( - torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") +def make_ddpg_actor( + transform_state_dict, + device="cpu", +): + proof_environment = make_transformed_env(make_env()) + proof_environment.transform[2].init_stats(3) + proof_environment.transform[2].load_state_dict(transform_state_dict) + + env_specs = proof_environment.specs + in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ + -1 + ] + out_features = env_specs["input_spec"]["action"].shape[-1] + + actor_net = MLP( + in_features=in_features, + out_features=out_features, + num_cells=[num_cells] * num_layers, + activation_class=nn.Tanh, + activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain ) + in_keys = ["observation_vector"] + out_keys = ["action"] + + actor = Actor( + actor_net, + in_keys=in_keys, + out_keys=out_keys, + spec=CompositeSpec(action=env_specs["input_spec"]["action"]), + ).to(device) - # Number of environments in each data collector - env_per_collector = 2 - - # Total frames we will use during training. Scale up to 500K - 1M for a more - # meaningful training - total_frames = 10000 // frame_skip - - # Number of frames returned by the collector at each iteration of the outer loop. 
- # We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] - frames_per_batch = env_per_collector * 1000 // frame_skip - max_frames_per_traj = 1000 // frame_skip - init_random_frames = 0 - # We'll be using the MultiStep class to have a less myopic representation of - # upcoming states - n_steps_forward = 3 - - # record every 10 batch collected - record_interval = 10 - - ############################################################################### - # Optimizer and optimization - # ~~~~~~~~~~~~~~~~~~~~~~~~~~ - - lr = 5e-4 - weight_decay = 0.0 - # UTD: Number of iterations of the inner loop - update_to_data = 32 - batch_size = 128 - - ############################################################################### - # Model - # ~~~~~ - - gamma = 0.99 - tau = 0.005 # Decay factor for the target network - - # Network specs - num_cells = 64 - num_layers = 2 - - ############################################################################### - # Replay buffer - # ~~~~~~~~~~~~~ - - # If True, a Prioritized replay buffer will be used - prb = True - # Number of frames stored in the buffer - traj_len_collector = frames_per_batch // env_per_collector - buffer_size = min(total_frames, 1_000_000 // traj_len_collector) - buffer_scratch_dir = "/tmp/" - - seed = 0 - - ############################################################################### - # Initialization - # -------------- - # - # To initialize the experiment, we first acquire the observation statistics, - # then build the networks, wrap them in an exploration wrapper (following the - # seminal DDPG paper, we used an Ornstein-Uhlenbeck process to add noise to the - # sampled actions). - - - # Seeding - torch.manual_seed(seed) - np.random.seed(seed) - - ############################################################################### - # Normalization stats - # ~~~~~~~~~~~~~~~~~~~ - - transform_state_dict = get_env_stats() - - ############################################################################### - # Models: policy and q-value network - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - actor, qnet = make_ddpg_actor( - transform_state_dict=transform_state_dict, - device=device, + q_net = MLP( + in_features=in_features + + out_features, # receives an action and an observation as input + out_features=1, + num_cells=[num_cells] * num_layers, + activation_class=nn.Tanh, ) - if device == torch.device("cpu"): - actor.share_memory() - - - ############################################################################### - # Loss module - # ~~~~~~~~~~~ - # We build our loss module with the actor and qnet we've just created. - # Because we have target parameters to update, we _must_ create a target network - # updater. - # - loss_module = DDPGLoss(actor, qnet) - # let's use the TD(lambda) estimator! 
- loss_module.make_value_estimator(ValueEstimators.TDLambda) - target_net_updater = SoftUpdate(loss_module, eps=0.98) - target_net_updater.init_() - - ############################################################################### - # The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` - # exploration module: - - actor_model_explore = OrnsteinUhlenbeckProcessWrapper( - actor, - annealing_num_steps=annealing_frames, + + in_keys = in_keys + ["action"] + qnet = ValueOperator( + in_keys=in_keys, + module=q_net, ).to(device) - if device == torch.device("cpu"): - actor_model_explore.share_memory() - - ############################################################################### - # Parallel environment creation - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # We pass the stats computed earlier to normalize the output of our - # environment: - - create_env_fn = parallel_env_constructor( - transform_state_dict=transform_state_dict, + + return actor, qnet + + +############################################################################### +# Evaluator: building your recorder object +# ---------------------------------------- +# +# As the training data is obtained using some exploration strategy, the true +# performance of our algorithm needs to be assessed in deterministic mode. We +# do this using a dedicated class, ``Recorder``, which executes the policy in +# the environment at a given frequency and returns some statistics obtained +# from these simulations. +# +# The following helper function builds this object: + + +def make_recorder(actor_model_explore, transform_state_dict): + base_env = make_env() + recorder = make_transformed_env(base_env) + recorder.transform[2].init_stats(3) + recorder.transform[2].load_state_dict(transform_state_dict) + + recorder_obj = Recorder( + record_frames=1000, + frame_skip=frame_skip, + policy_exploration=actor_model_explore, + recorder=recorder, + exploration_mode="mean", + record_interval=record_interval, ) + return recorder_obj + - ############################################################################### - # Data collector - # ~~~~~~~~~~~~~~ - # - # TorchRL provides specialized classes to help you collect data by executing - # the policy in the environment. These "data collectors" iteratively compute - # the action to be executed at a given time, then execute a step in the - # environment and reset it when required. - # Data collectors are designed to help developers have a tight control - # on the number of frames per batch of data, on the (a)sync nature of this - # collection and on the resources allocated to the data collection (e.g. GPU, - # number of workers etc). - # - # Here we will use - # :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that - # will be executed in an async manner (i.e. data will be collected while - # the policy is being optimized). With the :class:`MultiaSyncDataCollector`, - # multiple workers are running rollouts separately. When a batch is asked, it - # is gathered from the first worker that can provide it. - # - # The parameters to specify are: - # - # - the list of environment creation functions, - # - the policy, - # - the total number of frames before the collector is considered empty, - # - the maximum number of frames per trajectory (useful for non-terminating - # environments, like dm_control ones). 
- # - # One should also pass: - # - # - the number of frames in each batch collected, - # - the number of random steps executed independently from the policy, - # - the devices used for policy execution - # - the devices used to store data before the data is passed to the main - # process. - # - # Collectors also accept post-processing hooks. - # For instance, the :class:`torchrl.data.postprocs.MultiStep` class passed as - # ``postproc`` makes it so that the rewards of the ``n`` upcoming steps are - # summed (with some discount factor) and the next observation is changed to - # be the n-step forward observation. One could pass other transforms too: - # using :class:`tensordict.nn.TensorDictModule` and - # :class:`tensordict.nn.TensorDictSequential` we can seamlessly append a - # wide range of transforms to our collector. - - if n_steps_forward > 0: - multistep = MultiStep(n_steps=n_steps_forward, gamma=gamma) +############################################################################### +# Replay buffer +# ------------- +# +# Replay buffers come in two flavors: prioritized (where some error signal +# is used to give a higher likelihood of sampling to some items than others) +# and regular, circular experience replay. +# +# TorchRL replay buffers are composable: one can pick up the storage, sampling +# and writing strategies. It is also possible to +# store tensors on physical memory using a memory-mapped array. The following +# function takes care of creating the replay buffer with the desired +# hyperparameters: +# + + +def make_replay_buffer(buffer_size, batch_size, prefetch=3): + if prb: + sampler = PrioritizedSampler( + max_capacity=buffer_size, + alpha=0.7, + beta=0.5, + ) else: - multistep = None - - collector = MultiaSyncDataCollector( - create_env_fn=[create_env_fn, create_env_fn], - policy=actor_model_explore, - total_frames=total_frames, - max_frames_per_traj=max_frames_per_traj, - frames_per_batch=frames_per_batch, - init_random_frames=init_random_frames, - reset_at_each_iter=False, - postproc=multistep, - split_trajs=True, - device=device, # device for execution - storing_device=device, # device where data will be stored and passed - update_at_each_batch=False, - exploration_mode="random", + sampler = RandomSampler() + replay_buffer = TensorDictReplayBuffer( + storage=LazyMemmapStorage( + buffer_size, + scratch_dir=buffer_scratch_dir, + device=device, + ), + batch_size=batch_size, + sampler=sampler, + pin_memory=False, + prefetch=prefetch, ) + return replay_buffer - collector.set_seed(seed) - ############################################################################### - # Replay buffer - # ~~~~~~~~~~~~~ - # +############################################################################### +# Hyperparameters +# --------------- +# +# After having written our helper functions, it is time to set the +# experiment hyperparameters: + +############################################################################### +# Environment +# ~~~~~~~~~~~ + +# The backend can be gym or dm_control +backend = "gym" + +exp_name = "cheetah" + +# frame_skip batches multiple step together with a single action +# If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to +# be adjusted to have a consistent total number of frames collected across +# experiments. +frame_skip = 2 +from_pixels = False +# Scaling the reward helps us control the signal magnitude for a more +# efficient learning. 
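+# A :class:`torchrl.envs.transforms.RewardScaling` transform is typically in
+# charge of this rescaling when the environment is built. As a hedged sketch
+# of its effect (assuming the shift ``loc`` is left at zero), every collected
+# reward is simply multiplied by the scaling factor:
+#
+# .. code-block:: python
+#
+#    scaled_reward = reward * reward_scaling  # plus ``loc``, if a shift is used
+#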
+reward_scaling = 5.0 + +# Number of random steps used as for stats computation using ObservationNorm +init_env_steps = 1000 + +# Exploration: Number of frames before OU noise becomes null +annealing_frames = 1000000 // frame_skip + +############################################################################### +# Collection +# ~~~~~~~~~~ + +# We will execute the policy on cuda if available +device = ( + torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") +) + +# Number of environments in each data collector +env_per_collector = 2 + +# Total frames we will use during training. Scale up to 500K - 1M for a more +# meaningful training +total_frames = 10000 // frame_skip + +# Number of frames returned by the collector at each iteration of the outer loop. +# We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] +frames_per_batch = env_per_collector * 1000 // frame_skip +max_frames_per_traj = 1000 // frame_skip +init_random_frames = 0 +# We'll be using the MultiStep class to have a less myopic representation of +# upcoming states +n_steps_forward = 3 + +# record every 10 batch collected +record_interval = 10 + +############################################################################### +# Optimizer and optimization +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ + +lr = 5e-4 +weight_decay = 0.0 +# UTD: Number of iterations of the inner loop +update_to_data = 32 +batch_size = 128 + +############################################################################### +# Model +# ~~~~~ + +gamma = 0.99 +tau = 0.005 # Decay factor for the target network + +# Network specs +num_cells = 64 +num_layers = 2 + +############################################################################### +# Replay buffer +# ~~~~~~~~~~~~~ + +# If True, a Prioritized replay buffer will be used +prb = True +# Number of frames stored in the buffer +traj_len_collector = frames_per_batch // env_per_collector +buffer_size = min(total_frames, 1_000_000 // traj_len_collector) +buffer_scratch_dir = "/tmp/" + +seed = 0 + +############################################################################### +# Initialization +# -------------- +# +# To initialize the experiment, we first acquire the observation statistics, +# then build the networks, wrap them in an exploration wrapper (following the +# seminal DDPG paper, we used an Ornstein-Uhlenbeck process to add noise to the +# sampled actions). - replay_buffer = make_replay_buffer(buffer_size, prefetch=3) - ############################################################################### - # Recorder - # ~~~~~~~~ +# Seeding +torch.manual_seed(seed) +np.random.seed(seed) - recorder = make_recorder(actor_model_explore, transform_state_dict) +############################################################################### +# Normalization stats +# ~~~~~~~~~~~~~~~~~~~ - ############################################################################### - # Optimizer - # ~~~~~~~~~ - # - # Finally, we will use the Adam optimizer for the policy and value network, - # with the same learning rate for both. 
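+# The ``get_env_stats`` helper defined earlier runs ``init_env_steps`` random
+# steps in a throwaway environment and returns the state-dict (``loc`` and
+# ``scale``) of its :class:`torchrl.envs.transforms.ObservationNorm` transform.
+# As a hedged sketch of what it does (the transform index follows the order
+# used in ``make_transformed_env``):
+#
+# .. code-block:: python
+#
+#    proof_env = make_transformed_env(make_env())
+#    proof_env.transform[2].init_stats(init_env_steps)     # fills loc / scale
+#    transform_state_dict = proof_env.transform[2].state_dict()
+#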
+transform_state_dict = get_env_stats() - optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) - total_collection_steps = total_frames // frames_per_batch +############################################################################### +# Models: policy and q-value network +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=total_collection_steps - ) +actor, qnet = make_ddpg_actor( + transform_state_dict=transform_state_dict, + device=device, +) +if device == torch.device("cpu"): + actor.share_memory() - ############################################################################### - # Time to train the policy - # ------------------------ - # - # The training loop is pretty straightforward now that we have built all the - # modules we need. - # - rewards = [] - rewards_eval = [] +############################################################################### +# Loss module +# ~~~~~~~~~~~ +# We build our loss module with the actor and qnet we've just created. +# Because we have target parameters to update, we _must_ create a target network +# updater. +# +loss_module = DDPGLoss(actor, qnet) +# let's use the TD(lambda) estimator! +loss_module.make_value_estimator(ValueEstimators.TDLambda) +target_net_updater = SoftUpdate(loss_module, eps=0.98) +target_net_updater.init_() + +############################################################################### +# The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` +# exploration module: + +actor_model_explore = OrnsteinUhlenbeckProcessWrapper( + actor, + annealing_num_steps=annealing_frames, +).to(device) +if device == torch.device("cpu"): + actor_model_explore.share_memory() + +############################################################################### +# Parallel environment creation +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We pass the stats computed earlier to normalize the output of our +# environment: - # Main loop - norm_factor_training = ( - sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 - ) +create_env_fn = parallel_env_constructor( + transform_state_dict=transform_state_dict, +) - collected_frames = 0 - pbar = tqdm.tqdm(total=total_frames) - r0 = None - for i, tensordict in enumerate(collector): +############################################################################### +# Data collector +# ~~~~~~~~~~~~~~ +# +# TorchRL provides specialized classes to help you collect data by executing +# the policy in the environment. These "data collectors" iteratively compute +# the action to be executed at a given time, then execute a step in the +# environment and reset it when required. +# Data collectors are designed to help developers have a tight control +# on the number of frames per batch of data, on the (a)sync nature of this +# collection and on the resources allocated to the data collection (e.g. GPU, +# number of workers etc). +# +# Here we will use +# :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that +# will be executed in an async manner (i.e. data will be collected while +# the policy is being optimized). With the :class:`MultiaSyncDataCollector`, +# multiple workers are running rollouts separately. When a batch is asked, it +# is gathered from the first worker that can provide it. 
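+#
+# For reference, the single-process analogue would be a
+# :class:`torchrl.collectors.SyncDataCollector` iterated directly in the
+# training process. The following is only an illustrative sketch (it mirrors
+# the arguments used further down), not what this tutorial actually runs:
+#
+# .. code-block:: python
+#
+#    collector = SyncDataCollector(
+#        create_env_fn,
+#        actor_model_explore,
+#        total_frames=total_frames,
+#        frames_per_batch=frames_per_batch,
+#        max_frames_per_traj=max_frames_per_traj,
+#    )
+#    for batch in collector:
+#        ...  # extend the replay buffer and run the optimization steps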
+# +# The parameters to specify are: +# +# - the list of environment creation functions, +# - the policy, +# - the total number of frames before the collector is considered empty, +# - the maximum number of frames per trajectory (useful for non-terminating +# environments, like dm_control ones). +# +# One should also pass: +# +# - the number of frames in each batch collected, +# - the number of random steps executed independently from the policy, +# - the devices used for policy execution +# - the devices used to store data before the data is passed to the main +# process. +# +# Collectors also accept post-processing hooks. +# For instance, the :class:`torchrl.data.postprocs.MultiStep` class passed as +# ``postproc`` makes it so that the rewards of the ``n`` upcoming steps are +# summed (with some discount factor) and the next observation is changed to +# be the n-step forward observation. One could pass other transforms too: +# using :class:`tensordict.nn.TensorDictModule` and +# :class:`tensordict.nn.TensorDictSequential` we can seamlessly append a +# wide range of transforms to our collector. + +if n_steps_forward > 0: + multistep = MultiStep(n_steps=n_steps_forward, gamma=gamma) +else: + multistep = None + +collector = MultiaSyncDataCollector( + create_env_fn=[create_env_fn, create_env_fn], + policy=actor_model_explore, + total_frames=total_frames, + max_frames_per_traj=max_frames_per_traj, + frames_per_batch=frames_per_batch, + init_random_frames=init_random_frames, + reset_at_each_iter=False, + postproc=multistep, + split_trajs=True, + device=device, # device for execution + storing_device=device, # device where data will be stored and passed + update_at_each_batch=False, + exploration_mode="random", +) + +collector.set_seed(seed) + +############################################################################### +# Replay buffer +# ~~~~~~~~~~~~~ +# - # update weights of the inference policy - collector.update_policy_weights_() +replay_buffer = make_replay_buffer(buffer_size=buffer_size, batch_size=batch_size, prefetch=3) - if r0 is None: - r0 = tensordict["next", "reward"].mean().item() - pbar.update(tensordict.numel()) +############################################################################### +# Recorder +# ~~~~~~~~ - # extend the replay buffer with the new data - current_frames = tensordict.numel() - collected_frames += current_frames - replay_buffer.extend(tensordict.cpu()) +recorder = make_recorder(actor_model_explore, transform_state_dict) + +############################################################################### +# Optimizer +# ~~~~~~~~~ +# +# Finally, we will use the Adam optimizer for the policy and value network, +# with the same learning rate for both. 
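+#
+# .. note::
+#   Should different learning rates be needed for the policy and the value
+#   network, one option is to build two parameter groups. The snippet below is
+#   only a sketch: it assumes that the parameter names exposed by the loss
+#   module contain the substrings ``"actor"`` and ``"value"``, which may vary
+#   across versions.
+#
+#   .. code-block:: python
+#
+#      actor_params = [p for n, p in loss_module.named_parameters() if "actor" in n]
+#      value_params = [p for n, p in loss_module.named_parameters() if "value" in n]
+#      optimizer = optim.Adam(
+#          [{"params": actor_params, "lr": 1e-4}, {"params": value_params, "lr": 1e-3}],
+#          weight_decay=weight_decay,
+#      )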
+ +optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) +total_collection_steps = total_frames // frames_per_batch - # optimization steps - if collected_frames >= init_random_frames: - for _ in range(update_to_data): - # sample from replay buffer - sampled_tensordict = replay_buffer.sample(batch_size).clone() - - # Compute loss - loss_dict = loss_module(sampled_tensordict) - - # optimize - loss_val = sum( - value for key, value in loss_dict.items() if key.startswith("loss") - ) - loss_val.backward() - optimizer.step() - optimizer.zero_grad() - - # update priority - if prb: - replay_buffer.update_tensordict_priority(sampled_tensordict) - # update target network - target_net_updater.step() - - rewards.append( - ( - i, - tensordict["next", "reward"].mean().item() - / norm_factor_training - / frame_skip, +scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=total_collection_steps +) + +############################################################################### +# Time to train the policy +# ------------------------ +# +# The training loop is pretty straightforward now that we have built all the +# modules we need. +# + +rewards = [] +rewards_eval = [] + +# Main loop +norm_factor_training = ( + sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 +) + +collected_frames = 0 +pbar = tqdm.tqdm(total=total_frames) +r0 = None +for i, tensordict in enumerate(collector): + + # update weights of the inference policy + collector.update_policy_weights_() + + if r0 is None: + r0 = tensordict["next", "reward"].mean().item() + pbar.update(tensordict.numel()) + + # extend the replay buffer with the new data + current_frames = tensordict.numel() + collected_frames += current_frames + try: + replay_buffer.extend(tensordict.cpu()) + except Exception as err: + print("iteration", i) + print(replay_buffer._storage._storage) + print(tensordict) + raise err + + # optimization steps + if collected_frames >= init_random_frames: + for _ in range(update_to_data): + # sample from replay buffer + sampled_tensordict = replay_buffer.sample(batch_size).clone() + + # Compute loss + loss_dict = loss_module(sampled_tensordict) + + # optimize + loss_val = sum( + value for key, value in loss_dict.items() if key.startswith("loss") ) + loss_val.backward() + optimizer.step() + optimizer.zero_grad() + + # update priority + if prb: + replay_buffer.update_tensordict_priority(sampled_tensordict) + # update target network + target_net_updater.step() + + rewards.append( + ( + i, + tensordict["next", "reward"].mean().item() + / norm_factor_training + / frame_skip, ) - td_record = recorder(None) - if td_record is not None: - rewards_eval.append((i, td_record["r_evaluation"].item())) - if len(rewards_eval): - pbar.set_description( - f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}" - ) + ) + td_record = recorder(None) + if td_record is not None: + rewards_eval.append((i, td_record["r_evaluation"].item())) + if len(rewards_eval): + pbar.set_description( + f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}" + ) + + # update the exploration strategy + actor_model_explore.step(current_frames) + if collected_frames >= init_random_frames: + scheduler.step() + +collector.shutdown() +del collector - # update the exploration strategy - actor_model_explore.step(current_frames) - if collected_frames >= init_random_frames: - scheduler.step() - - collector.shutdown() - del 
collector - - ############################################################################### - # Experiment results - # ------------------ - # - # We make a simple plot of the average rewards during training. We can observe - # that our policy learned quite well to solve the task. - # - # **Note**: As already mentioned above, to get a more reasonable performance, - # use a greater value for ``total_frames`` e.g. 1M. - - plt.figure() - plt.plot(*zip(*rewards), label="training") - plt.plot(*zip(*rewards_eval), label="eval") - plt.legend() - plt.xlabel("iter") - plt.ylabel("reward") - plt.tight_layout() - - ############################################################################### - # Conclusion - # ---------- - # - # In this tutorial, we have learnt how to code a loss module in TorchRL given - # the concrete example of DDPG. - # - # The key takeaways are: - # - # - How to use the :class:`torchrl.objectives.LossModule` class to register components; - # - How to use (or not) a target network, and how to update its parameters; - # - How to create an optimizer associated with a loss module. +############################################################################### +# Experiment results +# ------------------ +# +# We make a simple plot of the average rewards during training. We can observe +# that our policy learned quite well to solve the task. +# +# **Note**: As already mentioned above, to get a more reasonable performance, +# use a greater value for ``total_frames`` e.g. 1M. + +plt.figure() +plt.plot(*zip(*rewards), label="training") +plt.plot(*zip(*rewards_eval), label="eval") +plt.legend() +plt.xlabel("iter") +plt.ylabel("reward") +plt.tight_layout() + +############################################################################### +# Conclusion +# ---------- +# +# In this tutorial, we have learnt how to code a loss module in TorchRL given +# the concrete example of DDPG. +# +# The key takeaways are: +# +# - How to use the :class:`torchrl.objectives.LossModule` class to register components; +# - How to use (or not) a target network, and how to update its parameters; +# - How to create an optimizer associated with a loss module. 
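+#
+# As a closing illustration of the target-network update mentioned above, here
+# is a hedged, framework-agnostic sketch of what a single ``SoftUpdate`` step
+# amounts to for every (parameter, target parameter) pair, ``eps`` being the
+# retention factor passed at construction time:
+#
+# .. code-block:: python
+#
+#    with torch.no_grad():
+#        for param, target_param in zip(params, target_params):
+#            target_param.data.mul_(eps).add_(param.data, alpha=1 - eps)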
# From 6d2ff4b9a9711b2499fcc18886463df16e1c5d2d Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 16:01:56 +0100 Subject: [PATCH 55/89] amend --- tutorials/sphinx-tutorials/coding_ddpg.py | 14 ++---- tutorials/sphinx-tutorials/coding_dqn.py | 57 +++++++++++++---------- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index dab9d740216..68e4a9e0bdb 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -67,7 +67,7 @@ from tensordict.nn import TensorDictModule from tensordict.tensordict import TensorDict, TensorDictBase from torch import nn, optim -from torchrl.collectors import MultiaSyncDataCollector, SyncDataCollector +from torchrl.collectors import MultiaSyncDataCollector from torchrl.data import CompositeSpec, TensorDictReplayBuffer from torchrl.data.postprocs import MultiStep from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler @@ -1010,7 +1010,9 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # ~~~~~~~~~~~~~ # -replay_buffer = make_replay_buffer(buffer_size=buffer_size, batch_size=batch_size, prefetch=3) +replay_buffer = make_replay_buffer( + buffer_size=buffer_size, batch_size=batch_size, prefetch=3 +) ############################################################################### # Recorder @@ -1063,13 +1065,7 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # extend the replay buffer with the new data current_frames = tensordict.numel() collected_frames += current_frames - try: - replay_buffer.extend(tensordict.cpu()) - except Exception as err: - print("iteration", i) - print(replay_buffer._storage._storage) - print(tensordict) - raise err + replay_buffer.extend(tensordict.cpu()) # optimization steps if collected_frames >= init_random_frames: diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 15d98ad092a..c54518fa547 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -610,6 +610,11 @@ def get_loss_module(actor, gamma): trainer.register_op("post_optim", target_net_updater.step) ############################################################################### +# .. note:: +# It is possible to link multiple optimizers to the trainer if needed. +# In this case, each optimizer will be tied to a field in the loss dictionary. +# Check the :class:`torchrl.trainers.OptimizerHook` to learn more. +# # Here we are, ready to train our algorithm! A simple call to # ``trainer.train()`` and we'll be getting our results logged in. # @@ -621,26 +626,28 @@ def get_loss_module(actor, gamma): def print_csv_files_in_folder(folder_path): """ - Find all CSV files in a folder and print the first 10 lines of each file. + Find all CSV files in a folder and return the first 10 lines of each file as a string. Args: folder_path (str): The relative path to the folder. Returns: - list: A list of all CSV files in the folder. + str: A string containing the first 10 lines of each CSV file in the folder. 
""" csv_files = [] + output_str = "" for file in os.listdir(folder_path): if file.endswith(".csv"): csv_files.append(os.path.join(folder_path, file)) for csv_file in csv_files: - print(f"File: {csv_file}") + output_str += f"File: {csv_file}\n" with open(csv_file, "r") as f: for i, line in enumerate(f): if i == 10: break - print(line.strip()) - print("\n") + output_str += line.strip() + "\n" + output_str += "\n" + return output_str print_csv_files_in_folder(exp_name) @@ -649,36 +656,38 @@ def print_csv_files_in_folder(folder_path): # Conclusion and possible improvements # ------------------------------------ # -# In this tutorial we have learnt: +# In this tutorial we have learned: # -# - How to train a policy that read pixel-based states, what transforms to -# include and how to normalize the data; -# - How to create a policy that picks up the action with the highest value -# with :class:`torchrl.modules.QValueNetwork`; +# - How to write a Trainer, including building its components and registering +# them in the trainer; +# - How to code a DQN algorithm, including how to create a policy that picks +# up the action with the highest value with +# :class:`torchrl.modules.QValueNetwork`; # - How to build a multiprocessed data collector; -# - How to train a DQN with TD(:math:`\lambda`) returns. # -# We have seen that using TD(:math:`\lambda`) greatly improved the performance -# of DQN. Other possible improvements could include: +# Possible improvements to this tutorial could include: # -# - Using the Multi-Step post-processing. Multi-step will project an action -# to the nth following step, and create a discounted sum of the rewards in -# between. This trick can make the algorithm noticebly less myopic. To use -# this, simply create the collector with +# - Using the :class:`torchrl.data.MultiStep` +# post-processing. Multi-step will project an action +# to the :math:`n^{th}` following step, and create a discounted sum of the +# rewards in between. This trick can make the algorithm noticeably less +# myopic (although the reward is then biased). To use this, simply +# create the collector with # -# from torchrl.data.postprocs.postprocs import MultiStep -# collector = CollectorClass(..., postproc=MultiStep(gamma, n)) +# >>> from torchrl.data.postprocs.postprocs import MultiStep +# >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n)) # # where ``n`` is the number of looking-forward steps. Pay attention to the # fact that the ``gamma`` factor has to be corrected by the number of # steps till the next observation when being passed to # ``vec_td_lambda_advantage_estimate``: # -# gamma = gamma ** tensordict["steps_to_next_obs"] +# >>> gamma = gamma ** tensordict["steps_to_next_obs"] +# # - A prioritized replay buffer could also be used. This will give a # higher priority to samples that have the worst value accuracy. -# - A distributional loss (see ``torchrl.objectives.DistributionalDQNLoss`` +# Learn more on the `replay buffer section `_ +# of the documentation. +# - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` # for more information). -# - More fancy exploration techniques, such as NoisyLinear layers and such -# (check ``torchrl.modules.NoisyLinear``, which is fully compatible with the -# ``MLP`` class used in our Dueling DQN). +# - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. 
From 1411cf48de6d6b2b6438ce3f8b0b654ba08b5581 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 29 Mar 2023 16:32:35 +0100 Subject: [PATCH 56/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index c54518fa547..6abdae9d60d 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -650,7 +650,7 @@ def print_csv_files_in_folder(folder_path): return output_str -print_csv_files_in_folder(exp_name) +print_csv_files_in_folder("csv_logs/" + exp_name) ############################################################################### # Conclusion and possible improvements From 259a1beb8d9799d117d035c4c57c230af1d3b702 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 30 Mar 2023 12:57:17 +0100 Subject: [PATCH 57/89] amend --- torchrl/collectors/collectors.py | 6 +- torchrl/data/replay_buffers/replay_buffers.py | 16 +-- torchrl/data/replay_buffers/storages.py | 37 +++++- torchrl/envs/transforms/transforms.py | 60 ++++++++-- .../modules/tensordict_module/exploration.py | 23 ++-- torchrl/objectives/ddpg.py | 6 - torchrl/objectives/utils.py | 9 +- torchrl/trainers/trainers.py | 4 +- tutorials/sphinx-tutorials/coding_ddpg.py | 109 ++++++++++-------- 9 files changed, 174 insertions(+), 96 deletions(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 7bd1f92a1d1..0c77db4ccb3 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -569,15 +569,18 @@ def __init__( traj_ids, ) + with torch.no_grad(): + self._tensordict_out = env.fake_tensordict() if ( hasattr(self.policy, "spec") and self.policy.spec is not None and all(v is not None for v in self.policy.spec.values()) and set(self.policy.spec.keys(True, True)) == set(self.policy.out_keys) + and any(key not in self._tensordict_out.keys(isinstance(key, tuple)) for key in self.policy.spec) ): # if policy spec is non-empty, all the values are not None and the keys # match the out_keys we assume the user has given all relevant information - self._tensordict_out = env.fake_tensordict().to_tensordict() + # the policy could have more keys than the env: self._tensordict_out.update(self.policy.spec.zero()) self._tensordict_out = ( self._tensordict_out.unsqueeze(-1) @@ -589,7 +592,6 @@ def __init__( # determine the relevant keys with which to pre-populate _tensordict_out. # See #505 for additional context. 
with torch.no_grad(): - self._tensordict_out = env.fake_tensordict() self._tensordict_out = self._tensordict_out.to(self.device) self._tensordict_out = self.policy(self._tensordict_out).unsqueeze(-1) self._tensordict_out = ( diff --git a/torchrl/data/replay_buffers/replay_buffers.py b/torchrl/data/replay_buffers/replay_buffers.py index eb1ed4dbe15..cf04db52ea7 100644 --- a/torchrl/data/replay_buffers/replay_buffers.py +++ b/torchrl/data/replay_buffers/replay_buffers.py @@ -11,7 +11,7 @@ import torch from tensordict.tensordict import LazyStackedTensorDict, TensorDict, TensorDictBase -from tensordict.utils import expand_right +from tensordict.utils import expand_as_right from torchrl.data.utils import DEVICE_TYPING @@ -755,19 +755,7 @@ def sample( data, info = super().sample(batch_size, return_info=True) if include_info in (True, None): for k, v in info.items(): - data.set(k, torch.tensor(v, device=data.device)) - if "_batch_size" in data.keys(): - # we need to reset the batch-size - shape = data.pop("_batch_size") - shape = shape[0] - shape = torch.Size([data.shape[0], *shape]) - # we may need to update some values in the data - for key, value in data.items(): - if value.ndim >= len(shape): - continue - value = expand_right(value, shape) - data.set(key, value) - data.batch_size = shape + data.set(k, expand_as_right(torch.tensor(v, device=data.device), data)) if return_info: return data, info return data diff --git a/torchrl/data/replay_buffers/storages.py b/torchrl/data/replay_buffers/storages.py index 7a789260e48..d96e2498f6b 100644 --- a/torchrl/data/replay_buffers/storages.py +++ b/torchrl/data/replay_buffers/storages.py @@ -14,6 +14,7 @@ from tensordict.memmap import MemmapTensor from tensordict.prototype import is_tensorclass from tensordict.tensordict import is_tensor_collection, TensorDict, TensorDictBase +from tensordict.utils import expand_right from torchrl._utils import _CKPT_BACKEND, VERBOSE from torchrl.data.replay_buffers.utils import INT_CLASSES @@ -423,10 +424,42 @@ def _mem_map_tensor_as_tensor(mem_map_tensor: MemmapTensor) -> torch.Tensor: return mem_map_tensor._tensor +def _reset_batch_size(x): + """Resets the batch size of a tensordict. + + In some cases we save the original shape of the tensordict as a tensor (or memmap tensor). + + This function will read that tensor, extract its items and reset the shape + of the tensordict to it. If items have an incompatible shape (e.g. "index") + they will be expanded to the right to match it. 
+ + """ + shape = x.pop("_batch_size", None) + if shape is not None: + # we need to reset the batch-size + if isinstance(shape, MemmapTensor): + shape = shape.as_tensor() + locked = x.is_locked + if locked: + x.unlock_() + shape = [s.item() for s in shape[0]] + shape = torch.Size([x.shape[0], *shape]) + # we may need to update some values in the data + for key, value in x.items(): + if value.ndim >= len(shape): + continue + value = expand_right(value, shape) + x.set(key, value) + x.batch_size = shape + if locked: + x.lock_() + return x + + def _collate_list_tensordict(x): out = torch.stack(x, 0) if isinstance(out, TensorDictBase): - return out.to_tensordict() + return _reset_batch_size(out.to_tensordict()) return out @@ -436,7 +469,7 @@ def _collate_list_tensors(*x): def _collate_contiguous(x): if isinstance(x, TensorDictBase): - return x.to_tensordict() + return _reset_batch_size(x).to_tensordict() return x.clone() diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py index 6a0dd6be2b8..08f9dfe5c46 100644 --- a/torchrl/envs/transforms/transforms.py +++ b/torchrl/envs/transforms/transforms.py @@ -2602,6 +2602,13 @@ class VecNorm(Transform): default: 0.99 eps (number, optional): lower bound of the running standard deviation (for numerical underflow). Default is 1e-4. + shapes (List[torch.Size], optional): if provided, represents the shape + of each in_keys. Its length must match the one of ``in_keys``. + Each shape must match the trailing dimension of the corresponding + entry. + If not, the feature dimensions of the entry (ie all dims that do + not belong to the tensordict batch-size) will be considered as + feature dimension. Examples: >>> from torchrl.envs.libs.gym import GymEnv @@ -2629,6 +2636,7 @@ def __init__( lock: mp.Lock = None, decay: float = 0.9999, eps: float = 1e-4, + shapes: List[torch.Size] = None, ) -> None: if lock is None: lock = mp.Lock() @@ -2656,8 +2664,14 @@ def __init__( self.lock = lock self.decay = decay + self.shapes = shapes self.eps = eps + def _key_str(self, key): + if not isinstance(key, str): + key = "_".join(key) + return key + def _call(self, tensordict: TensorDictBase) -> TensorDictBase: if self.lock is not None: self.lock.acquire() @@ -2681,17 +2695,44 @@ def _call(self, tensordict: TensorDictBase) -> TensorDictBase: forward = _call def _init(self, tensordict: TensorDictBase, key: str) -> None: - if self._td is None or key + "_sum" not in self._td.keys(): - td_view = tensordict.view(-1) - td_select = td_view[0] - d = {key + "_sum": torch.zeros_like(td_select.get(key))} - d.update({key + "_ssq": torch.zeros_like(td_select.get(key))}) + key_str = self._key_str(key) + if self._td is None or key_str + "_sum" not in self._td.keys(): + if key is not key_str and key_str in tensordict.keys(): + raise RuntimeError( + f"Conflicting key names: {key_str} from VecNorm and input tensordict keys." 
+ ) + if self.shapes is None: + td_view = tensordict.view(-1) + td_select = td_view[0] + item = td_select.get(key) + d = {key_str + "_sum": torch.zeros_like(item)} + d.update({key_str + "_ssq": torch.zeros_like(item)}) + else: + idx = 0 + for in_key in self.in_keys: + if in_key != key: + idx += 1 + else: + break + shape = self.shapes[idx] + item = tensordict.get(key) + d = { + key_str + + "_sum": torch.zeros(shape, device=item.device, dtype=item.dtype) + } + d.update( + { + key_str + + "_ssq": torch.zeros( + shape, device=item.device, dtype=item.dtype + ) + } + ) + d.update( { - key - + "_count": torch.zeros( - 1, device=td_select.get(key).device, dtype=torch.float - ) + key_str + + "_count": torch.zeros(1, device=item.device, dtype=torch.float) } ) if self._td is None: @@ -2702,6 +2743,7 @@ def _init(self, tensordict: TensorDictBase, key: str) -> None: pass def _update(self, key, value, N) -> torch.Tensor: + key = self._key_str(key) _sum = self._td.get(key + "_sum") _ssq = self._td.get(key + "_ssq") _count = self._td.get(key + "_count") diff --git a/torchrl/modules/tensordict_module/exploration.py b/torchrl/modules/tensordict_module/exploration.py index 4d8feaef8b8..8eb0bf6dc54 100644 --- a/torchrl/modules/tensordict_module/exploration.py +++ b/torchrl/modules/tensordict_module/exploration.py @@ -178,14 +178,21 @@ def __init__( self.register_buffer("std", torch.tensor([std])) self.register_buffer("sigma", torch.tensor([sigma_init])) self.action_key = action_key - self.spec = ( - spec - if spec is not None - else policy.spec - if hasattr(policy, "spec") - else None - ) + self.out_keys = list(self.td_module.out_keys) + if spec is not None: + if not isinstance(spec, CompositeSpec) and len(self.out_keys) == 1: + spec = CompositeSpec({self.out_keys[0]: spec}) + elif not isinstance(spec, CompositeSpec): + raise ValueError(f"Cannot infer which key the spec is made for, got spec={spec} and out_keys={self.out_keys}.") + self._spec = spec + elif hasattr(self.td_module, "_spec"): + self._spec = self.td_module._spec.clone() + else: + self._spec = CompositeSpec({key: None for key in policy.in_keys}) + self.safe = safe + if self.safe: + self.register_forward_hook(_forward_hook_safe_action) def step(self, frames: int = 1) -> None: """A step of sigma decay. @@ -341,7 +348,7 @@ def __init__( self.register_buffer("eps", torch.tensor([eps_init])) self.out_keys = list(self.td_module.out_keys) + self.ou.out_keys self._spec = CompositeSpec( - **self.td_module._spec, **{key: None for key in self.ou.out_keys} + **self.td_module._spec, **{key: None for key in self.ou.out_keys}, shape=self.td_module._spec.shape ) if len(set(self.out_keys)) != len(self.out_keys): raise RuntimeError(f"Got multiple identical output keys: {self.out_keys}") diff --git a/torchrl/objectives/ddpg.py b/torchrl/objectives/ddpg.py index c1cacd7349e..917f5df44c6 100644 --- a/torchrl/objectives/ddpg.py +++ b/torchrl/objectives/ddpg.py @@ -99,12 +99,6 @@ def forward(self, input_tensordict: TensorDictBase) -> TensorDict: a tuple of 2 tensors containing the DDPG loss. 
""" - if not input_tensordict.device == self.device: - raise RuntimeError( - f"Got device={input_tensordict.device} but " - f"actor_network.device={self.device} (self.device={self.device})" - ) - loss_value, td_error, pred_val, target_value = self._loss_value( input_tensordict, ) diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 9d393a51d05..8b72f1f6620 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -242,15 +242,18 @@ def __repr__(self) -> str: class SoftUpdate(TargetNetUpdater): - """A soft-update class for target network update in Double DQN/DDPG. + r"""A soft-update class for target network update in Double DQN/DDPG. This was proposed in "CONTINUOUS CONTROL WITH DEEP REINFORCEMENT LEARNING", https://arxiv.org/pdf/1509.02971.pdf Args: loss_module (DQNLoss or DDPGLoss): loss module where the target network should be updated. eps (scalar): epsilon in the update equation: - param = prev_param * eps + new_param * (1-eps) - default: 0.999 + .. math:: + + \theta_t = \theta_{t-1} * \epsilon + \theta_t * (1-\epsilon) + + Defaults to 0.999 """ def __init__( diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 52d58542442..ce3516f55db 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -1114,7 +1114,7 @@ class Recorder(TrainerHookBase): each iteration, otherwise the frame count can be underestimated. For logging, this parameter is important to normalize the reward. Finally, to compare different runs with different frame_skip, - one must normalize the frame count and rewards. Default is 1. + one must normalize the frame count and rewards. Defaults to ``1``. policy_exploration (ProbabilisticTDModule): a policy instance used for @@ -1151,7 +1151,7 @@ def __init__( *, record_interval: int, record_frames: int, - frame_skip: int, + frame_skip: int = 1, policy_exploration: TensorDictModule, environment: EnvBase = None, exploration_mode: str = "random", diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 68e4a9e0bdb..503a53d48d1 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -69,7 +69,6 @@ from torch import nn, optim from torchrl.collectors import MultiaSyncDataCollector from torchrl.data import CompositeSpec, TensorDictReplayBuffer -from torchrl.data.postprocs import MultiStep from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler from torchrl.data.replay_buffers.storages import LazyMemmapStorage from torchrl.envs import ( @@ -342,7 +341,6 @@ def _loss_value( tensordict, target_params=target_params ).squeeze(-1) - # td_error = pred_val - target_value loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) td_error = (pred_val - target_value).pow(2) @@ -790,50 +788,65 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # The backend can be gym or dm_control backend = "gym" -exp_name = "cheetah" - -# frame_skip batches multiple step together with a single action -# If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to -# be adjusted to have a consistent total number of frames collected across -# experiments. +############################################################################### +# .. note:: +# ``frame_skip`` batches multiple step together with a single action +# If > 1, the other frame counts (e.g. 
frames_per_batch, total_frames) need to +# be adjusted to have a consistent total number of frames collected across +# experiments. This is important as raising the frame-skip but keeping the +# total number of frames unchanged may seem like cheating: all things compared, +# a dataset of 10M elements collected with a frame-skip of 2 and another with +# a frame-skip of 1 actually have a ratio of interactions with the environment +# of 2:1! +# frame_skip = 2 from_pixels = False + +############################################################################### # Scaling the reward helps us control the signal magnitude for a more # efficient learning. reward_scaling = 5.0 +############################################################################### # Number of random steps used as for stats computation using ObservationNorm init_env_steps = 1000 +############################################################################### # Exploration: Number of frames before OU noise becomes null annealing_frames = 1000000 // frame_skip ############################################################################### # Collection # ~~~~~~~~~~ - +# # We will execute the policy on cuda if available device = ( torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") ) +############################################################################### # Number of environments in each data collector env_per_collector = 2 +############################################################################### # Total frames we will use during training. Scale up to 500K - 1M for a more # meaningful training total_frames = 10000 // frame_skip +############################################################################### # Number of frames returned by the collector at each iteration of the outer loop. -# We expect batches from the collector to have a shape [env_per_collector, frames_per_batch // env_per_collector] -frames_per_batch = env_per_collector * 1000 // frame_skip +# We expect batches from the collector to have a shape +# ``[env_per_collector, traj_len]`` where ``traj_len`` is the time dimension +# of the samples. TorchRL's datacollectors are given a certain number of +# environment and a number of frames to deliver in each batch. 
We can +# We can easily calculate how many frames we need to ask to the collectors: +traj_len = 50 # time length of the batches +frames_per_batch = env_per_collector * traj_len // frame_skip max_frames_per_traj = 1000 // frame_skip init_random_frames = 0 -# We'll be using the MultiStep class to have a less myopic representation of -# upcoming states -n_steps_forward = 3 -# record every 10 batch collected +############################################################################### +# We will be recording the performance every 10 batch collected record_interval = 10 ############################################################################### @@ -843,14 +856,22 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): lr = 5e-4 weight_decay = 0.0 # UTD: Number of iterations of the inner loop -update_to_data = 32 -batch_size = 128 +update_to_data = 4 + +############################################################################### +# Because we'll be sampling from a replay buffer that stores sub-trajectories +# of length ``traj_len``, we need to compute how large the batch-size +# is going to be based on the total number of elements we expect to find +# divided by the trajectory length: +batch_size = 128 // traj_len * frame_skip + ############################################################################### # Model # ~~~~~ gamma = 0.99 +lmbda = 0.2 tau = 0.005 # Decay factor for the target network # Network specs @@ -860,12 +881,11 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): ############################################################################### # Replay buffer # ~~~~~~~~~~~~~ - -# If True, a Prioritized replay buffer will be used -prb = True +# If ``prb=True``, a Prioritized replay buffer will be used +prb = False +############################################################################### # Number of frames stored in the buffer -traj_len_collector = frames_per_batch // env_per_collector -buffer_size = min(total_frames, 1_000_000 // traj_len_collector) +buffer_size = min(total_frames, 1_000_000 // traj_len) buffer_scratch_dir = "/tmp/" seed = 0 @@ -911,11 +931,20 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # loss_module = DDPGLoss(actor, qnet) # let's use the TD(lambda) estimator! -loss_module.make_value_estimator(ValueEstimators.TDLambda) -target_net_updater = SoftUpdate(loss_module, eps=0.98) +loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=lmbda) +target_net_updater = SoftUpdate(loss_module, eps=1 - tau) target_net_updater.init_() ############################################################################### +# .. note:: +# Off-policy usually dictates a TD(0) estimator. Here, we use a TD(:math:`\lambda`) +# estimator, which will introduce some bias as the trajectory that follows +# a certain state has been collected with an outdated policy. +# This trick, as the multi-step trick that can be used during data collection, +# are alternative versions of "hacks" that we usually find to work well in +# practice despite the fact that they introduce some bias in the return +# estimates. +# # The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` # exploration module: @@ -973,19 +1002,6 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # - the devices used to store data before the data is passed to the main # process. # -# Collectors also accept post-processing hooks. 
-# For instance, the :class:`torchrl.data.postprocs.MultiStep` class passed as -# ``postproc`` makes it so that the rewards of the ``n`` upcoming steps are -# summed (with some discount factor) and the next observation is changed to -# be the n-step forward observation. One could pass other transforms too: -# using :class:`tensordict.nn.TensorDictModule` and -# :class:`tensordict.nn.TensorDictSequential` we can seamlessly append a -# wide range of transforms to our collector. - -if n_steps_forward > 0: - multistep = MultiStep(n_steps=n_steps_forward, gamma=gamma) -else: - multistep = None collector = MultiaSyncDataCollector( create_env_fn=[create_env_fn, create_env_fn], @@ -995,7 +1011,6 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): frames_per_batch=frames_per_batch, init_random_frames=init_random_frames, reset_at_each_iter=False, - postproc=multistep, split_trajs=True, device=device, # device for execution storing_device=device, # device where data will be stored and passed @@ -1046,9 +1061,6 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): rewards_eval = [] # Main loop -norm_factor_training = ( - sum(gamma**i for i in range(n_steps_forward)) if n_steps_forward else 1 -) collected_frames = 0 pbar = tqdm.tqdm(total=total_frames) @@ -1071,15 +1083,13 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): if collected_frames >= init_random_frames: for _ in range(update_to_data): # sample from replay buffer - sampled_tensordict = replay_buffer.sample(batch_size).clone() + sampled_tensordict = replay_buffer.sample() # Compute loss loss_dict = loss_module(sampled_tensordict) # optimize - loss_val = sum( - value for key, value in loss_dict.items() if key.startswith("loss") - ) + loss_val = loss_dict["loss_actor"] + loss_dict["loss_value"] loss_val.backward() optimizer.step() optimizer.zero_grad() @@ -1093,9 +1103,7 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): rewards.append( ( i, - tensordict["next", "reward"].mean().item() - / norm_factor_training - / frame_skip, + tensordict["next", "reward"].mean().item() / frame_skip, ) ) td_record = recorder(None) @@ -1103,7 +1111,7 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): rewards_eval.append((i, td_record["r_evaluation"].item())) if len(rewards_eval): pbar.set_description( - f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}" + f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}, shape={sampled_tensordict.shape}" ) # update the exploration strategy @@ -1141,7 +1149,8 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # # The key takeaways are: # -# - How to use the :class:`torchrl.objectives.LossModule` class to register components; +# - How to use the :class:`torchrl.objectives.LossModule` class to code up a new +# loss component; # - How to use (or not) a target network, and how to update its parameters; # - How to create an optimizer associated with a loss module. 
# From bad0d6a22410e871af57d0c0f40c9e18a1365f2a Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 30 Mar 2023 16:10:03 +0100 Subject: [PATCH 58/89] init --- test/test_exploration.py | 201 +++++++++++++----- torchrl/collectors/collectors.py | 37 +++- torchrl/data/tensor_specs.py | 5 +- .../modules/tensordict_module/exploration.py | 66 +++++- 4 files changed, 234 insertions(+), 75 deletions(-) diff --git a/test/test_exploration.py b/test/test_exploration.py index 0ebe0a9d97d..103de211d6e 100644 --- a/test/test_exploration.py +++ b/test/test_exploration.py @@ -8,10 +8,14 @@ import pytest import torch from _utils_internal import get_available_devices +from mocking_classes import ContinuousActionVecMockEnv from scipy.stats import ttest_1samp from tensordict.tensordict import TensorDict from torch import nn + +from torchrl.collectors import SyncDataCollector from torchrl.data import BoundedTensorSpec, CompositeSpec +from torchrl.envs import SerialEnv from torchrl.envs.transforms.transforms import gSDENoise from torchrl.envs.utils import set_exploration_mode from torchrl.modules import SafeModule, SafeSequential @@ -21,7 +25,7 @@ NormalParamWrapper, ) from torchrl.modules.models.exploration import LazygSDEModule -from torchrl.modules.tensordict_module.actors import ProbabilisticActor +from torchrl.modules.tensordict_module.actors import Actor, ProbabilisticActor from torchrl.modules.tensordict_module.exploration import ( _OrnsteinUhlenbeckProcess, AdditiveGaussianWrapper, @@ -30,70 +34,122 @@ @pytest.mark.parametrize("device", get_available_devices()) -def test_ou(device, seed=0): - torch.manual_seed(seed) - td = TensorDict({"action": torch.randn(3) / 10}, batch_size=[], device=device) - ou = _OrnsteinUhlenbeckProcess(10.0, mu=2.0, x0=-4, sigma=0.1, sigma_min=0.01) +class TestOrnsteinUhlenbeckProcessWrapper: + def test_ou(self, device, seed=0): + torch.manual_seed(seed) + td = TensorDict({"action": torch.randn(3) / 10}, batch_size=[], device=device) + ou = _OrnsteinUhlenbeckProcess(10.0, mu=2.0, x0=-4, sigma=0.1, sigma_min=0.01) - tds = [] - for i in range(2000): - td = ou.add_sample(td) - tds.append(td.clone()) - td.set_("action", torch.randn(3) / 10) - if i % 1000 == 0: - td.zero_() + tds = [] + for i in range(2000): + td = ou.add_sample(td) + tds.append(td.clone()) + td.set_("action", torch.randn(3) / 10) + if i % 1000 == 0: + td.zero_() - tds = torch.stack(tds, 0) + tds = torch.stack(tds, 0) - tset, pval_acc = ttest_1samp(tds.get("action")[950:1000, 0].cpu().numpy(), 2.0) - tset, pval_reg = ttest_1samp(tds.get("action")[:50, 0].cpu().numpy(), 2.0) - assert pval_acc > 0.05 - assert pval_reg < 0.1 + tset, pval_acc = ttest_1samp(tds.get("action")[950:1000, 0].cpu().numpy(), 2.0) + tset, pval_reg = ttest_1samp(tds.get("action")[:50, 0].cpu().numpy(), 2.0) + assert pval_acc > 0.05 + assert pval_reg < 0.1 - tset, pval_acc = ttest_1samp(tds.get("action")[1950:2000, 0].cpu().numpy(), 2.0) - tset, pval_reg = ttest_1samp(tds.get("action")[1000:1050, 0].cpu().numpy(), 2.0) - assert pval_acc > 0.05 - assert pval_reg < 0.1 + tset, pval_acc = ttest_1samp(tds.get("action")[1950:2000, 0].cpu().numpy(), 2.0) + tset, pval_reg = ttest_1samp(tds.get("action")[1000:1050, 0].cpu().numpy(), 2.0) + assert pval_acc > 0.05 + assert pval_reg < 0.1 + def test_ou_wrapper(self, device, d_obs=4, d_act=6, batch=32, n_steps=100, seed=0): + torch.manual_seed(seed) + net = NormalParamWrapper(nn.Linear(d_obs, 2 * d_act)).to(device) + module = SafeModule(net, in_keys=["observation"], out_keys=["loc", "scale"]) + action_spec = 
BoundedTensorSpec(-torch.ones(d_act), torch.ones(d_act), (d_act,)) + policy = ProbabilisticActor( + spec=action_spec, + module=module, + in_keys=["loc", "scale"], + distribution_class=TanhNormal, + default_interaction_mode="random", + ).to(device) + exploratory_policy = OrnsteinUhlenbeckProcessWrapper(policy) -@pytest.mark.parametrize("device", get_available_devices()) -def test_ou_wrapper(device, d_obs=4, d_act=6, batch=32, n_steps=100, seed=0): - torch.manual_seed(seed) - net = NormalParamWrapper(nn.Linear(d_obs, 2 * d_act)).to(device) - module = SafeModule(net, in_keys=["observation"], out_keys=["loc", "scale"]) - action_spec = BoundedTensorSpec(-torch.ones(d_act), torch.ones(d_act), (d_act,)) - policy = ProbabilisticActor( - spec=action_spec, - module=module, - in_keys=["loc", "scale"], - distribution_class=TanhNormal, - default_interaction_mode="random", - ).to(device) - exploratory_policy = OrnsteinUhlenbeckProcessWrapper(policy) + tensordict = TensorDict( + batch_size=[batch], + source={ + "observation": torch.randn(batch, d_obs, device=device), + "step_count": torch.zeros(batch, device=device), + }, + device=device, + ) + out_noexp = [] + out = [] + for i in range(n_steps): + tensordict_noexp = policy(tensordict.clone()) + tensordict = exploratory_policy(tensordict.clone()) + if i == 0: + assert (tensordict[exploratory_policy.ou.steps_key] == 1).all() + else: + assert not (tensordict[exploratory_policy.ou.steps_key] == 1).all() - tensordict = TensorDict( - batch_size=[batch], - source={"observation": torch.randn(batch, d_obs, device=device)}, - device=device, - ) - out_noexp = [] - out = [] - for _ in range(n_steps): - tensordict_noexp = policy(tensordict.select("observation")) - tensordict = exploratory_policy(tensordict) - out.append(tensordict.clone()) - out_noexp.append(tensordict_noexp.clone()) - tensordict.set_("observation", torch.randn(batch, d_obs, device=device)) - out = torch.stack(out, 0) - out_noexp = torch.stack(out_noexp, 0) - assert (out_noexp.get("action") != out.get("action")).all() - assert (out.get("action") <= 1.0).all(), out.get("action").min() - assert (out.get("action") >= -1.0).all(), out.get("action").max() + out.append(tensordict.clone()) + out_noexp.append(tensordict_noexp.clone()) + tensordict.set_("observation", torch.randn(batch, d_obs, device=device)) + tensordict["step_count"] += 1 + out = torch.stack(out, 0) + out_noexp = torch.stack(out_noexp, 0) + assert (out_noexp.get("action") != out.get("action")).all() + assert (out.get("action") <= 1.0).all(), out.get("action").min() + assert (out.get("action") >= -1.0).all(), out.get("action").max() + + @pytest.mark.parametrize("parallel_spec", [True, False]) + @pytest.mark.parametrize("probabilistic", [True, False]) + def test_collector(self, device, parallel_spec, probabilistic, seed=0): + torch.manual_seed(seed) + env = SerialEnv(2, ContinuousActionVecMockEnv) + # the module must work with the action spec of a single env or a serial env + if parallel_spec: + action_spec = env.action_spec + else: + action_spec = ContinuousActionVecMockEnv().action_spec + d_act = action_spec.shape[-1] + if probabilistic: + net = NormalParamWrapper(nn.LazyLinear(2 * d_act)).to(device) + module = SafeModule( + net, + in_keys=["observation"], + out_keys=["loc", "scale"], + ) + policy = ProbabilisticActor( + module=module, + in_keys=["loc", "scale"], + distribution_class=TanhNormal, + default_interaction_mode="random", + spec=action_spec, + ).to(device) + else: + net = nn.LazyLinear(d_act).to(device) + policy = Actor( + net, 
in_keys=["observation"], out_keys=["action"], spec=action_spec + ) + + exploratory_policy = OrnsteinUhlenbeckProcessWrapper(policy) + exploratory_policy(env.reset()) + collector = SyncDataCollector( + create_env_fn=env, + policy=exploratory_policy, + frames_per_batch=100, + total_frames=1000, + ) + for _ in collector: + # check that we can run the policy + pass + return @pytest.mark.parametrize("device", get_available_devices()) -@pytest.mark.parametrize("spec_origin", ["spec", "policy", None]) class TestAdditiveGaussian: + @pytest.mark.parametrize("spec_origin", ["spec", "policy", None]) def test_additivegaussian_sd( self, device, @@ -167,6 +223,7 @@ def test_additivegaussian_sd( ) assert abs(noisy_action.std() - sigma_end) < 1e-1 + @pytest.mark.parametrize("spec_origin", ["spec", "policy", None]) def test_additivegaussian_wrapper( self, device, spec_origin, d_obs=4, d_act=6, batch=32, n_steps=100, seed=0 ): @@ -213,6 +270,42 @@ def test_additivegaussian_wrapper( if action_spec is not None: assert action_spec.is_in(out.get("action")) + @pytest.mark.parametrize("parallel_spec", [True, False]) + def test_collector(self, device, parallel_spec, seed=0): + torch.manual_seed(seed) + env = SerialEnv(2, ContinuousActionVecMockEnv) + # the module must work with the action spec of a single env or a serial env + if parallel_spec: + action_spec = env.action_spec + else: + action_spec = ContinuousActionVecMockEnv().action_spec + d_act = action_spec.shape[-1] + net = NormalParamWrapper(nn.LazyLinear(2 * d_act)).to(device) + module = SafeModule( + net, + in_keys=["observation"], + out_keys=["loc", "scale"], + ) + policy = ProbabilisticActor( + module=module, + in_keys=["loc", "scale"], + distribution_class=TanhNormal, + default_interaction_mode="random", + spec=action_spec, + ).to(device) + exploratory_policy = AdditiveGaussianWrapper(policy, safe=False) + exploratory_policy(env.reset()) + collector = SyncDataCollector( + create_env_fn=env, + policy=exploratory_policy, + frames_per_batch=100, + total_frames=1000, + ) + for _ in collector: + # check that we can run the policy + pass + return + @pytest.mark.parametrize("state_dim", [7]) @pytest.mark.parametrize("action_dim", [5, 11]) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 5865df5b18f..ea311794212 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -31,8 +31,8 @@ _check_for_faulty_process, accept_remote_rref_udf_invocation, prod, - VERBOSE, RL_WARNINGS, + VERBOSE, ) from torchrl.collectors.utils import split_trajectories from torchrl.data.tensor_specs import TensorSpec @@ -574,32 +574,51 @@ def __init__( traj_ids, ) + with torch.no_grad(): + self._tensordict_out = env.fake_tensordict() if ( hasattr(self.policy, "spec") and self.policy.spec is not None - and all(v is not None for v in self.policy.spec.values()) - and set(self.policy.spec.keys(True, True)) == set(self.policy.out_keys) + and all( + v is not None for v in self.policy.spec.values() + ) # if a spec is None, we don't know anything about it + # and set(self.policy.spec.keys(True, True)) == set(self.policy.out_keys) + and any( + key not in self._tensordict_out.keys(isinstance(key, tuple)) + for key in self.policy.spec + ) ): # if policy spec is non-empty, all the values are not None and the keys # match the out_keys we assume the user has given all relevant information - self._tensordict_out = env.fake_tensordict().to_tensordict() - self._tensordict_out.update(self.policy.spec.zero()) + # the policy could have more 
keys than the env: + for key, spec in self.policy.spec.items(): # this may break for nested keys + if key in self._tensordict_out.keys(): + continue + if spec.ndim < self._tensordict_out.ndim: + spec = spec.expand(self._tensordict_out.shape) + self._tensordict_out.set(key, spec.zero()) self._tensordict_out = ( self._tensordict_out.unsqueeze(-1) .expand(*env.batch_size, self.frames_per_batch) - .to_tensordict() + .clone() ) - else: + elif hasattr(self.policy, "spec") and self.policy.spec is not None: + # reach this if the policy has specs and they match with the fake tensordict + self._tensordict_out = ( + self._tensordict_out.unsqueeze(-1) + .expand(*env.batch_size, self.frames_per_batch) + .clone() + ) + elif not hasattr(self.policy, "spec") or self.policy.spec is None: # otherwise, we perform a small number of steps with the policy to # determine the relevant keys with which to pre-populate _tensordict_out. # See #505 for additional context. with torch.no_grad(): - self._tensordict_out = env.fake_tensordict() self._tensordict_out = self._tensordict_out.to(self.device) self._tensordict_out = self.policy(self._tensordict_out).unsqueeze(-1) self._tensordict_out = ( self._tensordict_out.expand(*env.batch_size, self.frames_per_batch) - .to_tensordict() + .clone() .zero_() ) # in addition to outputs of the policy, we add traj_ids and step_count to diff --git a/torchrl/data/tensor_specs.py b/torchrl/data/tensor_specs.py index e07796028b7..2a3996c8681 100644 --- a/torchrl/data/tensor_specs.py +++ b/torchrl/data/tensor_specs.py @@ -2437,7 +2437,10 @@ def clone(self) -> CompositeSpec: except RuntimeError: device = self._device return self.__class__( - {key: item.clone() for key, item in self.items()}, + { + key: item.clone() if item is not None else None + for key, item in self.items() + }, device=device, shape=self.shape, ) diff --git a/torchrl/modules/tensordict_module/exploration.py b/torchrl/modules/tensordict_module/exploration.py index 4d8feaef8b8..271ec60ff4f 100644 --- a/torchrl/modules/tensordict_module/exploration.py +++ b/torchrl/modules/tensordict_module/exploration.py @@ -2,7 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - +import warnings from typing import Optional, Union import numpy as np @@ -11,7 +11,11 @@ from tensordict.tensordict import TensorDictBase from tensordict.utils import expand_as_right -from torchrl.data.tensor_specs import CompositeSpec, TensorSpec +from torchrl.data.tensor_specs import ( + CompositeSpec, + TensorSpec, + UnboundedContinuousTensorSpec, +) from torchrl.envs.utils import exploration_mode from torchrl.modules.tensordict_module.common import _forward_hook_safe_action @@ -178,14 +182,19 @@ def __init__( self.register_buffer("std", torch.tensor([std])) self.register_buffer("sigma", torch.tensor([sigma_init])) self.action_key = action_key - self.spec = ( - spec - if spec is not None - else policy.spec - if hasattr(policy, "spec") - else None - ) + self.out_keys = list(self.td_module.out_keys) + if spec is not None: + if not isinstance(spec, CompositeSpec) and len(self.out_keys) >= 1: + spec = CompositeSpec({self.out_keys[0]: spec}) + self._spec = spec + elif hasattr(self.td_module, "_spec"): + self._spec = self.td_module._spec.clone() + else: + self._spec = CompositeSpec({key: None for key in policy.in_keys}) + self.safe = safe + if self.safe: + self.register_forward_hook(_forward_hook_safe_action) def step(self, frames: int = 1) -> None: """A step of sigma decay. 
@@ -340,8 +349,21 @@ def __init__( self.annealing_num_steps = annealing_num_steps self.register_buffer("eps", torch.tensor([eps_init])) self.out_keys = list(self.td_module.out_keys) + self.ou.out_keys + noise_key = self.ou.noise_key + steps_key = self.ou.steps_key + + ou_specs = { + noise_key: None, + steps_key: UnboundedContinuousTensorSpec( + shape=(*self.td_module._spec.shape, 1), + device=self.td_module._spec.device, + dtype=torch.int64, + ), + } self._spec = CompositeSpec( - **self.td_module._spec, **{key: None for key in self.ou.out_keys} + **self.td_module._spec, + **ou_specs, + shape=self.td_module._spec.shape, ) if len(set(self.out_keys)) != len(self.out_keys): raise RuntimeError(f"Got multiple identical output keys: {self.out_keys}") @@ -349,6 +371,10 @@ def __init__( if self.safe: self.register_forward_hook(_forward_hook_safe_action) + @property + def spec(self): + return self._spec + def step(self, frames: int = 1) -> None: """Updates the eps noise factor. @@ -375,6 +401,17 @@ def step(self, frames: int = 1) -> None: def forward(self, tensordict: TensorDictBase) -> TensorDictBase: tensordict = super().forward(tensordict) if exploration_mode() == "random" or exploration_mode() is None: + if "step_count" not in tensordict.keys(): + warnings.warn( + f"The tensordict passed to {self.__class__.__name__} appears to be " + f"missing the 'step_count' entry. This entry is used to " + f"reset the noise at the beginning of a trajectory, without it " + f"the behaviour of this exploration method is undefined. " + f"This is allowed for BC compatibility purposes but it will be deprecated soon! " + f"To create a 'step_count' entry, simply append a StepCounter " + f"transform to your environment with `env = TransformedEnv(env, StepCounter())`." + ) + tensordict.set("step_count", torch.ones(tensordict.shape)) tensordict = self.ou.add_sample(tensordict, self.eps.item()) return tensordict @@ -421,10 +458,13 @@ def noise_key(self): def steps_key(self): return self._steps_key # + str(id(self)) - def _make_noise_pair(self, tensordict: TensorDictBase) -> None: + def _make_noise_pair(self, tensordict: TensorDictBase, is_init=None) -> None: + if is_init is not None: + tensordict = tensordict.get_sub_tensordict(is_init.view(tensordict.shape)) tensordict.set( self.noise_key, torch.zeros(tensordict.get(self.key).shape, device=tensordict.device), + inplace=is_init is not None, ) tensordict.set( self.steps_key, @@ -433,6 +473,7 @@ def _make_noise_pair(self, tensordict: TensorDictBase) -> None: dtype=torch.long, device=tensordict.device, ), + inplace=is_init is not None, ) def add_sample( @@ -441,6 +482,9 @@ def add_sample( if self.noise_key not in tensordict.keys(): self._make_noise_pair(tensordict) + step_count = tensordict.get("step_count", None) + if step_count is not None and not step_count.all(): + self._make_noise_pair(tensordict, step_count == 0) prev_noise = tensordict.get(self.noise_key) prev_noise = prev_noise + self.x0 From 2fe0f82fe54f6f4c5abe866552665efbcd1608b6 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 30 Mar 2023 16:15:13 +0100 Subject: [PATCH 59/89] amend --- torchrl/modules/tensordict_module/common.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/torchrl/modules/tensordict_module/common.py b/torchrl/modules/tensordict_module/common.py index 0b12eaa2e82..0b540c2b55f 100644 --- a/torchrl/modules/tensordict_module/common.py +++ b/torchrl/modules/tensordict_module/common.py @@ -71,7 +71,14 @@ def _forward_hook_safe_action(module, tensordict_in, tensordict_out): 
for _spec, _key in zip(values, keys): if _spec is None: continue - if not _spec.is_in(tensordict_out.get(_key)): + item = tensordict_out.get(_key, None) + if item is None: + # this will happen when an exploration (e.g. OU) writes a key only + # during exploration, but is missing otherwise. + # it's fine since what we want here it to make sure that a key + # is within bounds if it is present + continue + if not _spec.is_in(item): try: tensordict_out.set_( _key, From 91fa500d98bf35517f873667eba6385633f11966 Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 30 Mar 2023 21:00:40 +0100 Subject: [PATCH 60/89] amend --- test/test_exploration.py | 21 +++++ .../modules/tensordict_module/exploration.py | 82 ++++++++++++++----- 2 files changed, 83 insertions(+), 20 deletions(-) diff --git a/test/test_exploration.py b/test/test_exploration.py index 103de211d6e..ef94cc7897d 100644 --- a/test/test_exploration.py +++ b/test/test_exploration.py @@ -29,10 +29,31 @@ from torchrl.modules.tensordict_module.exploration import ( _OrnsteinUhlenbeckProcess, AdditiveGaussianWrapper, + EGreedyWrapper, OrnsteinUhlenbeckProcessWrapper, ) +@pytest.mark.parametrize("eps_init", [0.0, 0.5, 1.0]) +class TestEGreedy: + def test_egreedy(self, eps_init): + torch.manual_seed(0) + spec = BoundedTensorSpec(1, 1, torch.Size([4])) + module = torch.nn.Linear(4, 4, bias=False) + policy = Actor(spec=spec, module=module) + explorative_policy = EGreedyWrapper(policy, eps_init=eps_init, eps_end=eps_init) + td = TensorDict({"observation": torch.zeros(10, 4)}, batch_size=[10]) + action = explorative_policy(td).get("action") + if eps_init == 0: + assert (action == 0).all() + elif eps_init == 1: + assert (action == 1).all() + else: + assert (action == 1).any() + assert (action == 0).any() + assert ((action == 1) | (action == 0)).all() + + @pytest.mark.parametrize("device", get_available_devices()) class TestOrnsteinUhlenbeckProcessWrapper: def test_ou(self, device, seed=0): diff --git a/torchrl/modules/tensordict_module/exploration.py b/torchrl/modules/tensordict_module/exploration.py index 271ec60ff4f..7e573f1cce5 100644 --- a/torchrl/modules/tensordict_module/exploration.py +++ b/torchrl/modules/tensordict_module/exploration.py @@ -58,7 +58,7 @@ class EGreedyWrapper(TensorDictModuleWrapper): >>> print(explorative_policy(td).get("action")) tensor([[ 0.0000, 0.0000, 0.0000, 0.0000], [ 0.0000, 0.0000, 0.0000, 0.0000], - [-0.6986, -0.9366, -0.5837, 0.8596], + [ 0.9055, -0.9277, -0.6295, -0.2532], [ 0.0000, 0.0000, 0.0000, 0.0000], [ 0.0000, 0.0000, 0.0000, 0.0000], [ 0.0000, 0.0000, 0.0000, 0.0000], @@ -86,13 +86,20 @@ def __init__( self.annealing_num_steps = annealing_num_steps self.register_buffer("eps", torch.tensor([eps_init])) self.action_key = action_key - self.spec = ( - spec - if spec is not None - else policy.spec - if hasattr(policy, "spec") - else None - ) + if spec is not None: + if not isinstance(spec, CompositeSpec) and len(self.out_keys) >= 1: + spec = CompositeSpec({action_key: spec}, shape=spec.shape[:-1]) + self._spec = spec + elif hasattr(self.td_module, "_spec"): + self._spec = self.td_module._spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None + elif hasattr(self.td_module, "spec"): + self._spec = self.td_module.spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None + else: + self._spec = CompositeSpec({key: None for key in policy.out_keys}) def step(self, frames: int = 1) -> None: """A step of epsilon decay. 
@@ -163,6 +170,7 @@ class AdditiveGaussianWrapper(TensorDictModuleWrapper): def __init__( self, policy: TensorDictModule, + *, sigma_init: float = 1.0, sigma_end: float = 0.1, annealing_num_steps: int = 1000, @@ -183,19 +191,33 @@ def __init__( self.register_buffer("sigma", torch.tensor([sigma_init])) self.action_key = action_key self.out_keys = list(self.td_module.out_keys) + if action_key not in self.out_keys: + raise RuntimeError( + f"The action key {action_key} was not found in the td_module out_keys {self.td_module.out_keys}." + ) if spec is not None: if not isinstance(spec, CompositeSpec) and len(self.out_keys) >= 1: - spec = CompositeSpec({self.out_keys[0]: spec}) + spec = CompositeSpec({action_key: spec}, shape=spec.shape[:-1]) self._spec = spec elif hasattr(self.td_module, "_spec"): self._spec = self.td_module._spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None + elif hasattr(self.td_module, "spec"): + self._spec = self.td_module.spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None else: - self._spec = CompositeSpec({key: None for key in policy.in_keys}) + self._spec = CompositeSpec({key: None for key in policy.out_keys}) self.safe = safe if self.safe: self.register_forward_hook(_forward_hook_safe_action) + @property + def spec(self): + return self._spec + def step(self, frames: int = 1) -> None: """A step of sigma decay. @@ -222,8 +244,7 @@ def _add_noise(self, action: torch.Tensor) -> torch.Tensor: ).to(action.device) action = action + noise * sigma spec = self.spec - if isinstance(spec, CompositeSpec): - spec = spec[self.action_key] + spec = spec[self.action_key] if spec is not None: action = spec.project(action) elif self.safe: @@ -283,8 +304,11 @@ class OrnsteinUhlenbeckProcessWrapper(TensorDictModuleWrapper): default: None n_steps_annealing (int): number of steps for the sigma annealing. default: 1000 - key (str): key of the action to be modified. + action_key (str): key of the action to be modified. default: "action" + spec (TensorSpec, optional): if provided, the sampled action will be + projected onto the valid action space once explored. If not provided, + the exploration wrapper will attempt to recover it from the policy. safe (bool): if ``True``, actions that are out of bounds given the action specs will be projected in the space given the :obj:`TensorSpec.project` heuristic. default: True @@ -315,6 +339,7 @@ class OrnsteinUhlenbeckProcessWrapper(TensorDictModuleWrapper): def __init__( self, policy: TensorDictModule, + *, eps_init: float = 1.0, eps_end: float = 0.1, annealing_num_steps: int = 1000, @@ -325,9 +350,16 @@ def __init__( x0: Optional[Union[torch.Tensor, np.ndarray]] = None, sigma_min: Optional[float] = None, n_steps_annealing: int = 1000, - key: str = "action", + action_key: str = "action", + spec: TensorSpec = None, safe: bool = True, + key: str = None, ): + if key is not None: + action_key = key + warnings.warn( + f"the 'key' keyword argument of {type(self)} has been renamed 'action_key'. The 'key' entry will be deprecated soon." 
+ ) super().__init__(policy) self.ou = _OrnsteinUhlenbeckProcess( theta=theta, @@ -337,7 +369,7 @@ def __init__( x0=x0, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing, - key=key, + key=action_key, ) self.register_buffer("eps_init", torch.tensor([eps_init])) self.register_buffer("eps_end", torch.tensor([eps_end])) @@ -360,11 +392,21 @@ def __init__( dtype=torch.int64, ), } - self._spec = CompositeSpec( - **self.td_module._spec, - **ou_specs, - shape=self.td_module._spec.shape, - ) + if spec is not None: + if not isinstance(spec, CompositeSpec) and len(self.out_keys) >= 1: + spec = CompositeSpec({action_key: spec}, shape=spec.shape[:-1]) + self._spec = spec + elif hasattr(self.td_module, "_spec"): + self._spec = self.td_module._spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None + elif hasattr(self.td_module, "spec"): + self._spec = self.td_module.spec.clone() + if action_key not in self._spec.keys(): + self._spec[action_key] = None + else: + self._spec = CompositeSpec({key: None for key in policy.out_keys}) + self._spec.update(ou_specs) if len(set(self.out_keys)) != len(self.out_keys): raise RuntimeError(f"Got multiple identical output keys: {self.out_keys}") self.safe = safe From c251d1abc8a752b405c42d3d2c2480953bfb8a31 Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 31 Mar 2023 08:40:42 +0100 Subject: [PATCH 61/89] bf --- test/test_collector.py | 2 +- test/test_exploration.py | 19 ++++++++++++++----- torchrl/collectors/collectors.py | 8 ++++---- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/test/test_collector.py b/test/test_collector.py index 07e1d591607..474cc819a60 100644 --- a/test/test_collector.py +++ b/test/test_collector.py @@ -914,9 +914,9 @@ def make_env(): @pytest.mark.parametrize( "collector_class", [ + SyncDataCollector, MultiaSyncDataCollector, MultiSyncDataCollector, - SyncDataCollector, ], ) @pytest.mark.parametrize("init_random_frames", [0, 50]) diff --git a/test/test_exploration.py b/test/test_exploration.py index ef94cc7897d..b19ce9eb3c3 100644 --- a/test/test_exploration.py +++ b/test/test_exploration.py @@ -110,13 +110,20 @@ def test_ou_wrapper(self, device, d_obs=4, d_act=6, batch=32, n_steps=100, seed= tensordict = exploratory_policy(tensordict.clone()) if i == 0: assert (tensordict[exploratory_policy.ou.steps_key] == 1).all() + elif i == n_steps // 2 + 1: + assert ( + tensordict[exploratory_policy.ou.steps_key][: batch // 2] == 1 + ).all() else: - assert not (tensordict[exploratory_policy.ou.steps_key] == 1).all() + assert not (tensordict[exploratory_policy.ou.steps_key] == 1).any() out.append(tensordict.clone()) out_noexp.append(tensordict_noexp.clone()) tensordict.set_("observation", torch.randn(batch, d_obs, device=device)) tensordict["step_count"] += 1 + if i == n_steps // 2: + tensordict["step_count"][: batch // 2] = 0 + out = torch.stack(out, 0) out_noexp = torch.stack(out_noexp, 0) assert (out_noexp.get("action") != out.get("action")).all() @@ -127,12 +134,12 @@ def test_ou_wrapper(self, device, d_obs=4, d_act=6, batch=32, n_steps=100, seed= @pytest.mark.parametrize("probabilistic", [True, False]) def test_collector(self, device, parallel_spec, probabilistic, seed=0): torch.manual_seed(seed) - env = SerialEnv(2, ContinuousActionVecMockEnv) + env = SerialEnv(2, ContinuousActionVecMockEnv, device=device) # the module must work with the action spec of a single env or a serial env if parallel_spec: action_spec = env.action_spec else: - action_spec = ContinuousActionVecMockEnv().action_spec + 
action_spec = ContinuousActionVecMockEnv(device=device).action_spec d_act = action_spec.shape[-1] if probabilistic: net = NormalParamWrapper(nn.LazyLinear(2 * d_act)).to(device) @@ -161,6 +168,7 @@ def test_collector(self, device, parallel_spec, probabilistic, seed=0): policy=exploratory_policy, frames_per_batch=100, total_frames=1000, + device=device, ) for _ in collector: # check that we can run the policy @@ -294,12 +302,12 @@ def test_additivegaussian_wrapper( @pytest.mark.parametrize("parallel_spec", [True, False]) def test_collector(self, device, parallel_spec, seed=0): torch.manual_seed(seed) - env = SerialEnv(2, ContinuousActionVecMockEnv) + env = SerialEnv(2, ContinuousActionVecMockEnv, device=device) # the module must work with the action spec of a single env or a serial env if parallel_spec: action_spec = env.action_spec else: - action_spec = ContinuousActionVecMockEnv().action_spec + action_spec = ContinuousActionVecMockEnv(device=device).action_spec d_act = action_spec.shape[-1] net = NormalParamWrapper(nn.LazyLinear(2 * d_act)).to(device) module = SafeModule( @@ -321,6 +329,7 @@ def test_collector(self, device, parallel_spec, seed=0): policy=exploratory_policy, frames_per_batch=100, total_frames=1000, + device=device, ) for _ in collector: # check that we can run the policy diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index ea311794212..72c0ab71b65 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -580,19 +580,19 @@ def __init__( hasattr(self.policy, "spec") and self.policy.spec is not None and all( - v is not None for v in self.policy.spec.values() + v is not None for v in self.policy.spec.values(True, True) ) # if a spec is None, we don't know anything about it # and set(self.policy.spec.keys(True, True)) == set(self.policy.out_keys) and any( key not in self._tensordict_out.keys(isinstance(key, tuple)) - for key in self.policy.spec + for key in self.policy.spec.keys(True, True) ) ): # if policy spec is non-empty, all the values are not None and the keys # match the out_keys we assume the user has given all relevant information # the policy could have more keys than the env: - for key, spec in self.policy.spec.items(): # this may break for nested keys - if key in self._tensordict_out.keys(): + for key, spec in self.policy.spec.items(True, True): + if key in self._tensordict_out.keys(isinstance(key, tuple)): continue if spec.ndim < self._tensordict_out.ndim: spec = spec.expand(self._tensordict_out.shape) From 495acff88e4e008ec738ddf48493d3f6d231974f Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 31 Mar 2023 11:49:24 +0100 Subject: [PATCH 62/89] bf --- test/test_collector.py | 2 +- test/test_exploration.py | 10 +++++-- torchrl/collectors/collectors.py | 14 ++++++++-- torchrl/modules/models/models.py | 46 ++++++++++++++++++-------------- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/test/test_collector.py b/test/test_collector.py index 474cc819a60..4dc92491fe7 100644 --- a/test/test_collector.py +++ b/test/test_collector.py @@ -920,7 +920,7 @@ def make_env(): ], ) @pytest.mark.parametrize("init_random_frames", [0, 50]) -@pytest.mark.parametrize("explicit_spec", [True, False]) +@pytest.mark.parametrize("explicit_spec", [False, True]) @pytest.mark.parametrize("split_trajs", [True, False]) def test_collector_output_keys( collector_class, init_random_frames, explicit_spec, split_trajs diff --git a/test/test_exploration.py b/test/test_exploration.py index b19ce9eb3c3..7543e172ff1 100644 --- 
a/test/test_exploration.py +++ b/test/test_exploration.py @@ -134,7 +134,10 @@ def test_ou_wrapper(self, device, d_obs=4, d_act=6, batch=32, n_steps=100, seed= @pytest.mark.parametrize("probabilistic", [True, False]) def test_collector(self, device, parallel_spec, probabilistic, seed=0): torch.manual_seed(seed) - env = SerialEnv(2, ContinuousActionVecMockEnv, device=device) + env = SerialEnv( + 2, + ContinuousActionVecMockEnv, + ) # the module must work with the action spec of a single env or a serial env if parallel_spec: action_spec = env.action_spec @@ -302,7 +305,10 @@ def test_additivegaussian_wrapper( @pytest.mark.parametrize("parallel_spec", [True, False]) def test_collector(self, device, parallel_spec, seed=0): torch.manual_seed(seed) - env = SerialEnv(2, ContinuousActionVecMockEnv, device=device) + env = SerialEnv( + 2, + ContinuousActionVecMockEnv, + ) # the module must work with the action spec of a single env or a serial env if parallel_spec: action_spec = env.action_spec diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 72c0ab71b65..b7762f12d3c 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -602,16 +602,26 @@ def __init__( .expand(*env.batch_size, self.frames_per_batch) .clone() ) - elif hasattr(self.policy, "spec") and self.policy.spec is not None: + elif ( + hasattr(self.policy, "spec") + and self.policy.spec is not None + and all(v is not None for v in self.policy.spec.values(True, True)) + and all( + key in self._tensordict_out.keys(isinstance(key, tuple)) + for key in self.policy.spec.keys(True, True) + ) + ): # reach this if the policy has specs and they match with the fake tensordict self._tensordict_out = ( self._tensordict_out.unsqueeze(-1) .expand(*env.batch_size, self.frames_per_batch) .clone() ) - elif not hasattr(self.policy, "spec") or self.policy.spec is None: + else: # otherwise, we perform a small number of steps with the policy to # determine the relevant keys with which to pre-populate _tensordict_out. + # This is the safest thing to do if the spec has None fields or if there is + # no spec at all. # See #505 for additional context. 
with torch.no_grad(): self._tensordict_out = self._tensordict_out.to(self.device) diff --git a/torchrl/modules/models/models.py b/torchrl/modules/models/models.py index 575c12daa74..494ba536e3d 100644 --- a/torchrl/modules/models/models.py +++ b/torchrl/modules/models/models.py @@ -670,7 +670,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def ddpg_init_last_layer( - last_layer: nn.Module, + module: nn.Sequential, scale: float = 6e-4, device: Optional[DEVICE_TYPING] = None, ) -> None: @@ -680,6 +680,12 @@ def ddpg_init_last_layer( https://arxiv.org/pdf/1509.02971.pdf """ + for last_layer in reversed(module): + if isinstance(last_layer, (nn.Linear, nn.Conv2d)): + break + else: + raise RuntimeError("Could not find a nn.Linear / nn.Conv2d to initialize.") + last_layer.weight.data.copy_( torch.rand_like(last_layer.weight.data, device=device) * scale - scale / 2 ) @@ -767,7 +773,7 @@ def __init__( mlp_net_default_kwargs.update(mlp_net_kwargs) self.convnet = ConvNet(device=device, **conv_net_default_kwargs) self.mlp = MLP(device=device, **mlp_net_default_kwargs) - ddpg_init_last_layer(self.mlp[-1], 6e-4, device=device) + ddpg_init_last_layer(self.mlp, 6e-4, device=device) def forward(self, observation: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: hidden = self.convnet(observation) @@ -816,7 +822,7 @@ def __init__( mlp_net_kwargs = mlp_net_kwargs if mlp_net_kwargs is not None else {} mlp_net_default_kwargs.update(mlp_net_kwargs) self.mlp = MLP(device=device, **mlp_net_default_kwargs) - ddpg_init_last_layer(self.mlp[-1], 6e-3, device=device) + ddpg_init_last_layer(self.mlp, 6e-3, device=device) def forward(self, observation: torch.Tensor) -> torch.Tensor: action = self.mlp(observation) @@ -897,7 +903,7 @@ def __init__( mlp_net_default_kwargs.update(mlp_net_kwargs) self.convnet = ConvNet(device=device, **conv_net_default_kwargs) self.mlp = MLP(device=device, **mlp_net_default_kwargs) - ddpg_init_last_layer(self.mlp[-1], 6e-4, device=device) + ddpg_init_last_layer(self.mlp, 6e-4, device=device) def forward(self, observation: torch.Tensor, action: torch.Tensor) -> torch.Tensor: hidden = torch.cat([self.convnet(observation), action], -1) @@ -917,23 +923,23 @@ class DdpgMlpQNet(nn.Module): Args: mlp_net_kwargs_net1 (dict, optional): kwargs for MLP. Default: { - 'in_features': None, - 'out_features': 400, - 'depth': 0, - 'num_cells': [], - 'activation_class': nn.ELU, - 'bias_last_layer': True, - 'activate_last_layer': True, - } + 'in_features': None, + 'out_features': 400, + 'depth': 0, + 'num_cells': [], + 'activation_class': nn.ELU, + 'bias_last_layer': True, + 'activate_last_layer': True, + } mlp_net_kwargs_net2 Default: { - 'in_features': None, - 'out_features': 1, - 'depth': 1, - 'num_cells': [300, ], - 'activation_class': nn.ELU, - 'bias_last_layer': True, - } + 'in_features': None, + 'out_features': 1, + 'depth': 1, + 'num_cells': [300, ], + 'activation_class': nn.ELU, + 'bias_last_layer': True, + } device (Optional[DEVICE_TYPING]): device to create the module on. 
""" @@ -973,7 +979,7 @@ def __init__( ) mlp2_net_default_kwargs.update(mlp_net_kwargs_net2) self.mlp2 = MLP(device=device, **mlp2_net_default_kwargs) - ddpg_init_last_layer(self.mlp2[-1], 6e-3, device=device) + ddpg_init_last_layer(self.mlp2, 6e-3, device=device) def forward(self, observation: torch.Tensor, action: torch.Tensor) -> torch.Tensor: value = self.mlp2(torch.cat([self.mlp1(observation), action], -1)) From 59da7a2732f040ef342cdd7fd0e4a8d89737c88b Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 31 Mar 2023 12:11:46 +0100 Subject: [PATCH 63/89] amend --- torchrl/collectors/collectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index b7762f12d3c..be81aa9c70c 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -594,7 +594,7 @@ def __init__( for key, spec in self.policy.spec.items(True, True): if key in self._tensordict_out.keys(isinstance(key, tuple)): continue - if spec.ndim < self._tensordict_out.ndim: + if self.policy.spec.ndim < self._tensordict_out.ndim: spec = spec.expand(self._tensordict_out.shape) self._tensordict_out.set(key, spec.zero()) self._tensordict_out = ( From 2498b5fbd1a3623ba0b985363ee37fcd79d51cad Mon Sep 17 00:00:00 2001 From: vmoens Date: Fri, 31 Mar 2023 13:44:29 +0100 Subject: [PATCH 64/89] amend --- torchrl/collectors/collectors.py | 7 +++-- torchrl/data/tensor_specs.py | 54 ++++++++++++++++---------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index be81aa9c70c..dd2505a78b3 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -591,11 +591,12 @@ def __init__( # if policy spec is non-empty, all the values are not None and the keys # match the out_keys we assume the user has given all relevant information # the policy could have more keys than the env: - for key, spec in self.policy.spec.items(True, True): + policy_spec = self.policy.spec + if policy_spec.ndim < self._tensordict_out.ndim: + policy_spec = policy_spec.expand(self._tensordict_out.shape) + for key, spec in policy_spec.items(True, True): if key in self._tensordict_out.keys(isinstance(key, tuple)): continue - if self.policy.spec.ndim < self._tensordict_out.ndim: - spec = spec.expand(self._tensordict_out.shape) self._tensordict_out.set(key, spec.zero()) self._tensordict_out = ( self._tensordict_out.unsqueeze(-1) diff --git a/torchrl/data/tensor_specs.py b/torchrl/data/tensor_specs.py index 2a3996c8681..73cc015d6e8 100644 --- a/torchrl/data/tensor_specs.py +++ b/torchrl/data/tensor_specs.py @@ -821,12 +821,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." 
) return self.__class__( n=shape[-1], shape=shape, device=self.device, dtype=self.dtype @@ -1065,12 +1065,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__( minimum=self.space.minimum.expand(shape).clone(), @@ -1260,12 +1260,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__(shape=shape, device=self.device, dtype=self.dtype) @@ -1347,12 +1347,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__(shape=shape, device=self.device, dtype=self.dtype) @@ -1537,12 +1537,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__( nvec=nvecs, shape=shape, device=self.device, dtype=self.dtype @@ -1676,12 +1676,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." 
) return self.__class__( n=self.space.n, shape=shape, device=self.device, dtype=self.dtype @@ -1762,12 +1762,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__( n=shape[-1], shape=shape, device=self.device, dtype=self.dtype @@ -1981,12 +1981,12 @@ def expand(self, *shape): shape = shape[0] if any(val < 0 for val in shape): raise ValueError( - f"{self.__class__.__name__}.extend does not support negative shapes." + f"{self.__class__.__name__}.expand does not support negative shapes." ) if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." ) return self.__class__( nvec=self.nvec, shape=shape, device=self.device, dtype=self.dtype @@ -2501,11 +2501,11 @@ def expand(self, *shape): if len(shape) == 1 and isinstance(shape[0], (tuple, list, torch.Size)): shape = shape[0] if any(val < 0 for val in shape): - raise ValueError("CompositeSpec.extend does not support negative shapes.") + raise ValueError("CompositeSpec.expand does not support negative shapes.") if any(s1 != s2 and s2 != 1 for s1, s2 in zip(shape[-self.ndim :], self.shape)): raise ValueError( - f"The last {self.ndim} of the extended shape must match the" - f"shape of the CompositeSpec in CompositeSpec.extend." + f"The last {self.ndim} of the expanded shape {shape} must match the" + f"shape of the {self.__class__.__name__} spec in expand()." 
) try: device = self.device From f1da0813d0245d1678eebb7d9309e279e45e380d Mon Sep 17 00:00:00 2001 From: vmoens Date: Sun, 2 Apr 2023 08:00:28 +0100 Subject: [PATCH 65/89] stateful functional modules --- torchrl/objectives/common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 9c37b1cbdca..5bca30cf5d1 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -99,9 +99,8 @@ def convert_to_functional( buffer_names = next(itertools.islice(zip(*module.named_buffers()), 1)) except StopIteration: buffer_names = () - params = make_functional(module, funs_to_decorate=funs_to_decorate) - functional_module = deepcopy(module) - repopulate_module(module, params) + params = make_functional(module, funs_to_decorate=funs_to_decorate, keep_params=True) + functional_module = module params_and_buffers = params # we transform the buffers in params to make sure they follow the device From 14e0a734af1777b9c07509e2dde0fb4a7b319e4c Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 08:00:44 +0100 Subject: [PATCH 66/89] amend --- torchrl/objectives/common.py | 4 +++- torchrl/objectives/utils.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 5bca30cf5d1..106ad36ef62 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -99,7 +99,9 @@ def convert_to_functional( buffer_names = next(itertools.islice(zip(*module.named_buffers()), 1)) except StopIteration: buffer_names = () - params = make_functional(module, funs_to_decorate=funs_to_decorate, keep_params=True) + params = make_functional( + module, funs_to_decorate=funs_to_decorate, keep_params=True + ) functional_module = module params_and_buffers = params diff --git a/torchrl/objectives/utils.py b/torchrl/objectives/utils.py index 8b72f1f6620..3af554935a9 100644 --- a/torchrl/objectives/utils.py +++ b/torchrl/objectives/utils.py @@ -267,7 +267,7 @@ def __init__( ], eps: float = 0.999, ): - if not (eps < 1.0 and eps > 0.0): + if not (eps <= 1.0 and eps >= 0.0): raise ValueError( f"Got eps = {eps} when it was supposed to be between 0 and 1." ) From 94ec94e998d58692efca65116b75dc4981250428 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 16:48:31 +0100 Subject: [PATCH 67/89] amend --- tutorials/sphinx-tutorials/coding_ddpg.py | 726 ++++++++++++---------- tutorials/sphinx-tutorials/coding_dqn.py | 45 +- 2 files changed, 419 insertions(+), 352 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 503a53d48d1..914186f4ed9 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -5,6 +5,7 @@ **Author**: `Vincent Moens `_ """ + ############################################################################## # TorchRL separates the training of RL algorithms in various pieces that will be # assembled in your training script: the environment, the data collection and @@ -47,50 +48,26 @@ # and the library features that are to be used in the context of # this algorithm. 
# -# Imports -# ------- +# Imports and setup +# ----------------- # # sphinx_gallery_start_ignore import warnings from typing import Tuple -from torchrl.objectives import LossModule - warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore -import numpy as np import torch.cuda import tqdm -from matplotlib import pyplot as plt -from tensordict.nn import TensorDictModule -from tensordict.tensordict import TensorDict, TensorDictBase -from torch import nn, optim -from torchrl.collectors import MultiaSyncDataCollector -from torchrl.data import CompositeSpec, TensorDictReplayBuffer -from torchrl.data.replay_buffers.samplers import PrioritizedSampler, RandomSampler -from torchrl.data.replay_buffers.storages import LazyMemmapStorage -from torchrl.envs import ( - CatTensors, - DoubleToFloat, - EnvCreator, - ObservationNorm, - ParallelEnv, -) -from torchrl.envs.libs.dm_control import DMControlEnv -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.transforms import RewardScaling, TransformedEnv -from torchrl.envs.utils import set_exploration_mode -from torchrl.modules import ( - Actor, - ActorCriticWrapper, - MLP, - OrnsteinUhlenbeckProcessWrapper, - ValueOperator, + + +############################################################################### +# We will execute the policy on cuda if available +device = ( + torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") ) -from torchrl.objectives.utils import distance_loss, SoftUpdate -from torchrl.trainers import Recorder ############################################################################### # torchrl :class:`torchrl.objectives.LossModule` @@ -179,6 +156,8 @@ # Later, we will see how the target parameters should be updated in torchrl. # +from tensordict.nn import TensorDictModule + def _init( self, @@ -190,7 +169,7 @@ def _init( self.convert_to_functional( actor_network, "actor_network", - create_target_params=False, + create_target_params=True, ) self.convert_to_functional( value_network, @@ -283,25 +262,21 @@ def make_value_estimator(self, value_type: ValueEstimators, **hyperparams): # For this, the :func:`torchrl.objectives.utils.hold_out_params` function # can be used. 
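###############################################################################
# Before looking at the implementation, here is a minimal, torch-only sketch
# (an illustration, not part of this patch) of the gradient flow we are after:
# with the value-network parameters detached (or held out), the actor loss
# back-propagates through the action, and hence through the actor weights,
# while leaving the critic untouched. The two tensors below are mere stand-ins
# for the critic parameters and the actor output.

import torch

critic_weight = torch.randn(4, requires_grad=True)  # stand-in for the critic parameters
action = torch.randn(4, requires_grad=True)  # stand-in for the actor output
q_value = (critic_weight.detach() * action).sum()  # a toy "Q(s, a)" with a frozen critic
(-q_value).backward()  # actor loss: maximise the value
assert critic_weight.grad is None  # no gradient reaches the critic...
assert action.grad is not None  # ...but the actor does get one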
-from torchrl.objectives.utils import hold_out_params - def _loss_actor( self, tensordict, ) -> torch.Tensor: - td_copy = tensordict.select(*self.actor_in_keys).detach() + td_copy = tensordict.select(*self.actor_in_keys) # Get an action from the actor network td_copy = self.actor_network( td_copy, - params=self.actor_network_params, ) # get the value associated with that action - with hold_out_params(self.value_network_params) as params: - td_copy = self.value_network( - td_copy, - params=params, - ) + td_copy = self.value_network( + td_copy, + params=self.value_network_params.detach(), + ) return -td_copy.get("state_action_value") @@ -313,12 +288,14 @@ def _loss_actor( # To do this, we will rely on the value estimator of our class: # +from torchrl.objectives.utils import distance_loss + def _loss_value( self, tensordict, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - td_copy = tensordict.detach() + td_copy = tensordict.clone() # V(s, a) self.value_network(td_copy, params=self.value_network_params) @@ -336,11 +313,11 @@ def _loss_value( batch_size=self.target_actor_network_params.batch_size, device=self.target_actor_network_params.device, ) - with set_exploration_mode("mode"): # we make sure that no exploration is performed - target_value = self.value_estimator.value_estimate( - tensordict, target_params=target_params - ).squeeze(-1) + target_value = self.value_estimator.value_estimate( + tensordict, target_params=target_params + ).squeeze(-1) + # Computes the value loss: L2, L1 or smooth L1 depending on self.loss_funtion loss_value = distance_loss(pred_val, target_value, loss_function=self.loss_funtion) td_error = (pred_val - target_value).pow(2) @@ -355,14 +332,10 @@ def _loss_value( # value and actor loss, collect the cost values and write them in a tensordict # delivered to the user. +from tensordict.tensordict import TensorDict, TensorDictBase -def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: - if not input_tensordict.device == self.device: - raise RuntimeError( - f"Got device={input_tensordict.device} but " - f"actor_network.device={self.device} (self.device={self.device})" - ) +def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: loss_value, td_error, pred_val, target_value = self.loss_value( input_tensordict, ) @@ -389,6 +362,9 @@ def _forward(self, input_tensordict: TensorDictBase) -> TensorDict: ) +from torchrl.objectives import LossModule + + class DDPGLoss(LossModule): default_value_estimator = default_value_estimator make_value_estimator = make_value_estimator @@ -439,11 +415,14 @@ class DDPGLoss(LossModule): # with either one of the two backends considered above (dm-control or gym). # +from torchrl.envs.libs.dm_control import DMControlEnv +from torchrl.envs.libs.gym import GymEnv + env_library = None env_name = None -def make_env(): +def make_env(from_pixels=False): """Create a base env.""" global env_library global env_name @@ -462,9 +441,9 @@ def make_env(): env_kwargs = { "device": device, - "frame_skip": frame_skip, "from_pixels": from_pixels, "pixels_only": from_pixels, + "frame_skip": 2, } env = env_library(*env_args, **env_kwargs) return env @@ -499,6 +478,17 @@ def make_env(): # take care of computing the normalizing constants later on. 
# +from torchrl.envs import ( + CatTensors, + DoubleToFloat, + EnvCreator, + ObservationNorm, + ParallelEnv, + RewardScaling, + StepCounter, + TransformedEnv, +) + def make_transformed_env( env, @@ -544,31 +534,9 @@ def make_transformed_env( ) ) - return env + env.append_transform(StepCounter(max_frames_per_traj)) - -############################################################################### -# Normalization of the observations -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# To compute the normalizing statistics, we run an arbitrary number of random -# steps in the environment and compute the mean and standard deviation of the -# collected observations. The :func:`ObservationNorm.init_stats()` method can -# be used for this purpose. To get the summary statistics, we create a dummy -# environment and run it for a given number of steps, collect data over a given -# number of steps and compute its summary statistics. -# - - -def get_env_stats(): - """Gets the stats of an environment.""" - proof_env = make_transformed_env(make_env()) - proof_env.set_seed(seed) - t = proof_env.transform[2] - t.init_stats(init_env_steps) - transform_state_dict = t.state_dict() - proof_env.close() - return transform_state_dict + return env ############################################################################### @@ -599,6 +567,7 @@ def get_env_stats(): def parallel_env_constructor( + env_per_collector, transform_state_dict, ): if env_per_collector == 1: @@ -627,6 +596,82 @@ def make_t_env(): return env +# The backend can be gym or dm_control +backend = "gym" + +############################################################################### +# .. note:: +# ``frame_skip`` batches multiple step together with a single action +# If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to +# be adjusted to have a consistent total number of frames collected across +# experiments. This is important as raising the frame-skip but keeping the +# total number of frames unchanged may seem like cheating: all things compared, +# a dataset of 10M elements collected with a frame-skip of 2 and another with +# a frame-skip of 1 actually have a ratio of interactions with the environment +# of 2:1! In a nutshell, one should be cautious about the frame-count of a +# training script when dealing with frame skipping as this may lead to +# biased comparisons between training strategies. +# + +############################################################################### +# Scaling the reward helps us control the signal magnitude for a more +# efficient learning. +reward_scaling = 5.0 + +############################################################################### +# We also define when a trajectory will be truncated. A thousand steps (500 if +# frame-skip = 2) is a good number to use for cheetah: + +max_frames_per_traj = 500 + +############################################################################### +# Normalization of the observations +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# To compute the normalizing statistics, we run an arbitrary number of random +# steps in the environment and compute the mean and standard deviation of the +# collected observations. The :func:`ObservationNorm.init_stats()` method can +# be used for this purpose. To get the summary statistics, we create a dummy +# environment and run it for a given number of steps, collect data over a given +# number of steps and compute its summary statistics. 
+# + + +def get_env_stats(): + """Gets the stats of an environment.""" + proof_env = make_transformed_env(make_env()) + t = proof_env.transform[2] + t.init_stats(init_env_steps) + transform_state_dict = t.state_dict() + proof_env.close() + return transform_state_dict + + +############################################################################### +# Normalization stats +# ~~~~~~~~~~~~~~~~~~~ +# Number of random steps used as for stats computation using ObservationNorm + +init_env_steps = 5000 + +transform_state_dict = get_env_stats() + +############################################################################### +# Number of environments in each data collector +env_per_collector = 4 + +############################################################################### +# We pass the stats computed earlier to normalize the output of our +# environment: + +parallel_env = parallel_env_constructor( + env_per_collector=env_per_collector, + transform_state_dict=transform_state_dict, +) + + +from torchrl.data import CompositeSpec + ############################################################################### # Building the model # ------------------ @@ -649,6 +694,10 @@ def make_t_env(): # that automatically sets the ``out_keys`` to ``"state_action_value`` for q-value # networks and ``state_value`` for other value networks. # +# TorchRL provides a built-in version of the DDPG networks as presented in the +# original paper. These can be found under :class:`torchrl.modules.DdpgMlpActor` +# and :class:`torchrl.modules.DdpgMlpQNet`. +# # Since we use lazy modules, it is necessary to materialize the lazy modules # before being able to move the policy from device to device and achieve other # operations. Hence, it is good practice to run the modules with a small @@ -656,6 +705,16 @@ def make_t_env(): # environment specs. 
# +from torchrl.modules import ( + ActorCriticWrapper, + DdpgMlpActor, + DdpgMlpQNet, + OrnsteinUhlenbeckProcessWrapper, + ProbabilisticActor, + TanhDelta, + ValueOperator, +) + def make_ddpg_actor( transform_state_dict, @@ -666,35 +725,29 @@ def make_ddpg_actor( proof_environment.transform[2].load_state_dict(transform_state_dict) env_specs = proof_environment.specs - in_features = env_specs["output_spec"]["observation"]["observation_vector"].shape[ - -1 - ] out_features = env_specs["input_spec"]["action"].shape[-1] - actor_net = MLP( - in_features=in_features, - out_features=out_features, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, - activate_last_layer=True, # with this option on, we use a Tanh map as a last layer, thereby constraining the action to the [-1; 1] domain + actor_net = DdpgMlpActor( + action_dim=out_features, ) + in_keys = ["observation_vector"] - out_keys = ["action"] + out_keys = ["param"] - actor = Actor( + actor = TensorDictModule( actor_net, in_keys=in_keys, out_keys=out_keys, + ) + + actor = ProbabilisticActor( + actor, + distribution_class=TanhDelta, + in_keys=["param"], spec=CompositeSpec(action=env_specs["input_spec"]["action"]), ).to(device) - q_net = MLP( - in_features=in_features - + out_features, # receives an action and an observation as input - out_features=1, - num_cells=[num_cells] * num_layers, - activation_class=nn.Tanh, - ) + q_net = DdpgMlpQNet() in_keys = in_keys + ["action"] qnet = ValueOperator( @@ -702,9 +755,113 @@ def make_ddpg_actor( module=q_net, ).to(device) + # init lazy moduless + qnet(actor(proof_environment.reset())) return actor, qnet +actor, qnet = make_ddpg_actor( + transform_state_dict=transform_state_dict, + device=device, +) + +############################################################################### +# Exploration +# ~~~~~~~~~~~ +# +# The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` +# exploration module, as suggesed in the original paper. +# Let's define the number of frames before OU noise reaches its minimum value +annealing_frames = 1_000_000 + +actor_model_explore = OrnsteinUhlenbeckProcessWrapper( + actor, + annealing_num_steps=annealing_frames, +).to(device) +if device == torch.device("cpu"): + actor_model_explore.share_memory() + + +############################################################################### +# Data collector +# -------------- +# +# TorchRL provides specialized classes to help you collect data by executing +# the policy in the environment. These "data collectors" iteratively compute +# the action to be executed at a given time, then execute a step in the +# environment and reset it when required. +# Data collectors are designed to help developers have a tight control +# on the number of frames per batch of data, on the (a)sync nature of this +# collection and on the resources allocated to the data collection (e.g. GPU, +# number of workers etc). +# +# Here we will use +# :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that +# will be executed in an async manner (i.e. data will be collected while +# the policy is being optimized). With the :class:`MultiaSyncDataCollector`, +# multiple workers are running rollouts separately. When a batch is asked, it +# is gathered from the first worker that can provide it. 
+# +# The parameters to specify are: +# +# - the list of environment creation functions, +# - the policy, +# - the total number of frames before the collector is considered empty, +# - the maximum number of frames per trajectory (useful for non-terminating +# environments, like dm_control ones). +# .. note:: +# The ``max_frames_per_traj`` passed to the collector will have the effect +# of registering a new :class:`torchrl.envs.StepCounter` transform +# with the environment used for inference. We can achieve the same result +# manually, as we do in this script. +# +# One should also pass: +# +# - the number of frames in each batch collected, +# - the number of random steps executed independently from the policy, +# - the devices used for policy execution +# - the devices used to store data before the data is passed to the main +# process. +# +# The total frames we will use during training should be around 1M. +total_frames = 10_000 # 1_000_000 + +############################################################################### +# The number of frames returned by the collector at each iteration of the outer +# loop is equal to the length of each sub-trajectories times the number of envs +# run in parallel in each collector. +# +# In other words, we expect batches from the collector to have a shape +# ``[env_per_collector, traj_len]`` where +# ``traj_len=frames_per_batch/env_per_collector``: +# +traj_len = 200 +frames_per_batch = env_per_collector * traj_len +init_random_frames = 5000 +num_collectors = 2 + +from torchrl.collectors import MultiaSyncDataCollector + +collector = MultiaSyncDataCollector( + create_env_fn=[ + parallel_env, + ] + * num_collectors, + policy=actor_model_explore, + total_frames=total_frames, + # max_frames_per_traj=max_frames_per_traj, # this is achieved by the env constructor + frames_per_batch=frames_per_batch, + init_random_frames=init_random_frames, + reset_at_each_iter=False, + split_trajs=False, + device=device, + # device for execution + storing_device=device, + # device where data will be stored and passed + update_at_each_batch=False, + exploration_mode="random", +) + ############################################################################### # Evaluator: building your recorder object # ---------------------------------------- @@ -716,25 +873,42 @@ def make_ddpg_actor( # from these simulations. 
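###############################################################################
# Conceptually, this is roughly equivalent to running a plain rollout with
# exploration switched off (a sketch, assuming an evaluation environment
# ``eval_env`` built with the helpers above):
#
# .. code-block:: python
#
#    from torchrl.envs.utils import set_exploration_mode
#
#    with set_exploration_mode("mode"), torch.no_grad():
#        eval_rollout = eval_env.rollout(max_steps=1000, policy=actor)
#    print(eval_rollout[("next", "reward")].sum())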
# # The following helper function builds this object: +from torchrl.trainers import Recorder -def make_recorder(actor_model_explore, transform_state_dict): +def make_recorder(actor_model_explore, transform_state_dict, record_interval): base_env = make_env() - recorder = make_transformed_env(base_env) - recorder.transform[2].init_stats(3) - recorder.transform[2].load_state_dict(transform_state_dict) + environment = make_transformed_env(base_env) + environment.transform[2].init_stats( + 3 + ) # must be instantiated to load the state dict + environment.transform[2].load_state_dict(transform_state_dict) recorder_obj = Recorder( record_frames=1000, - frame_skip=frame_skip, policy_exploration=actor_model_explore, - recorder=recorder, - exploration_mode="mean", + environment=environment, + exploration_mode="mode", record_interval=record_interval, ) return recorder_obj +############################################################################### +# We will be recording the performance every 10 batch collected +record_interval = 10 + +recorder = make_recorder( + actor_model_explore, transform_state_dict, record_interval=record_interval +) + +from torchrl.data.replay_buffers import ( + LazyMemmapStorage, + PrioritizedSampler, + RandomSampler, + TensorDictReplayBuffer, +) + ############################################################################### # Replay buffer # ------------- @@ -750,8 +924,10 @@ def make_recorder(actor_model_explore, transform_state_dict): # hyperparameters: # +from torchrl.envs import RandomCropTensorDict + -def make_replay_buffer(buffer_size, batch_size, prefetch=3): +def make_replay_buffer(buffer_size, batch_size, random_crop_len, prefetch=3, prb=False): if prb: sampler = PrioritizedSampler( max_capacity=buffer_size, @@ -764,176 +940,102 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): storage=LazyMemmapStorage( buffer_size, scratch_dir=buffer_scratch_dir, - device=device, ), batch_size=batch_size, sampler=sampler, pin_memory=False, prefetch=prefetch, + transform=RandomCropTensorDict(random_crop_len, sample_dim=1), ) return replay_buffer ############################################################################### -# Hyperparameters -# --------------- -# -# After having written our helper functions, it is time to set the -# experiment hyperparameters: +# We'll store the replay buffer in a temporary dirrectory on disk -############################################################################### -# Environment -# ~~~~~~~~~~~ +import tempfile -# The backend can be gym or dm_control -backend = "gym" +tmpdir = tempfile.TemporaryDirectory() +buffer_scratch_dir = tmpdir.name ############################################################################### -# .. note:: -# ``frame_skip`` batches multiple step together with a single action -# If > 1, the other frame counts (e.g. frames_per_batch, total_frames) need to -# be adjusted to have a consistent total number of frames collected across -# experiments. This is important as raising the frame-skip but keeping the -# total number of frames unchanged may seem like cheating: all things compared, -# a dataset of 10M elements collected with a frame-skip of 2 and another with -# a frame-skip of 1 actually have a ratio of interactions with the environment -# of 2:1! 
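###############################################################################
# The ``RandomCropTensorDict`` transform used above is worth a quick
# illustration (a sketch with hypothetical sizes, not code from the original
# script): the buffer stores full trajectories, and the transform crops each
# sampled trajectory along the time dimension so that the loss only ever sees
# short windows:
#
# .. code-block:: python
#
#    rb = make_replay_buffer(buffer_size=1000, batch_size=4, random_crop_len=25)
#    rb.extend(batch)        # ``batch`` of trajectories, shape [env_per_collector, traj_len]
#    subtrajs = rb.sample()  # shape [4, 25]: 4 trajectories cropped to 25 steps each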
+# Replay buffer storage and batch size +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # -frame_skip = 2 -from_pixels = False - -############################################################################### -# Scaling the reward helps us control the signal magnitude for a more -# efficient learning. -reward_scaling = 5.0 - -############################################################################### -# Number of random steps used as for stats computation using ObservationNorm -init_env_steps = 1000 - -############################################################################### -# Exploration: Number of frames before OU noise becomes null -annealing_frames = 1000000 // frame_skip - -############################################################################### -# Collection -# ~~~~~~~~~~ +# TorchRL replay buffer counts the number of elements along the first dimension. +# Since we'll be feeding trajectories to our buffer, we need to adapt the buffer +# size by dividing it by the length of the sub-trajectories yielded by our +# data collector. +# Regarding the batch-size, our sampling strategy will consist in sampling +# trajectories of length ``traj_len=200`` before selecting sub-trajecotries +# or length ``random_crop_len=25`` on which the loss will be computed. +# This strategy balances the choice of storing whole trajectories of a certain +# length with the need for providing sampels with a sufficient heterogeneity +# to our loss. The following figure shows the dataflow from a collector +# that gets 8 frames in each batch with 2 environments run in parallel, +# feeds them to a replay buffer that contains 1000 trajectories and +# samples sub-trajectories of 2 time steps each. # -# We will execute the policy on cuda if available -device = ( - torch.device("cpu") if torch.cuda.device_count() == 0 else torch.device("cuda:0") -) - -############################################################################### -# Number of environments in each data collector -env_per_collector = 2 - -############################################################################### -# Total frames we will use during training. Scale up to 500K - 1M for a more -# meaningful training -total_frames = 10000 // frame_skip - -############################################################################### -# Number of frames returned by the collector at each iteration of the outer loop. -# We expect batches from the collector to have a shape -# ``[env_per_collector, traj_len]`` where ``traj_len`` is the time dimension -# of the samples. TorchRL's datacollectors are given a certain number of -# environment and a number of frames to deliver in each batch. 
We can -# We can easily calculate how many frames we need to ask to the collectors: -traj_len = 50 # time length of the batches -frames_per_batch = env_per_collector * traj_len // frame_skip -max_frames_per_traj = 1000 // frame_skip -init_random_frames = 0 - -############################################################################### -# We will be recording the performance every 10 batch collected -record_interval = 10 - -############################################################################### -# Optimizer and optimization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ - -lr = 5e-4 -weight_decay = 0.0 -# UTD: Number of iterations of the inner loop -update_to_data = 4 - -############################################################################### -# Because we'll be sampling from a replay buffer that stores sub-trajectories -# of length ``traj_len``, we need to compute how large the batch-size -# is going to be based on the total number of elements we expect to find -# divided by the trajectory length: -batch_size = 128 // traj_len * frame_skip +# .. figure:: /_static/img/replaybuffer_traj.png +# :alt: Storign trajectories in the replay buffer +# +# Let's start with the number of frames stored in the buffer -############################################################################### -# Model -# ~~~~~ +def ceil_div(x, y): + return -x // (-y) -gamma = 0.99 -lmbda = 0.2 -tau = 0.005 # Decay factor for the target network -# Network specs -num_cells = 64 -num_layers = 2 +buffer_size = 1_000_000 +buffer_size = ceil_div(buffer_size, traj_len) ############################################################################### -# Replay buffer -# ~~~~~~~~~~~~~ -# If ``prb=True``, a Prioritized replay buffer will be used +# Prioritized replay buffer is disabled by default prb = False -############################################################################### -# Number of frames stored in the buffer -buffer_size = min(total_frames, 1_000_000 // traj_len) -buffer_scratch_dir = "/tmp/" - -seed = 0 ############################################################################### -# Initialization -# -------------- -# -# To initialize the experiment, we first acquire the observation statistics, -# then build the networks, wrap them in an exploration wrapper (following the -# seminal DDPG paper, we used an Ornstein-Uhlenbeck process to add noise to the -# sampled actions). - - -# Seeding -torch.manual_seed(seed) -np.random.seed(seed) +# We also need to define how many updates we'll be doing per batch of data +# collected. This is known as the update-to-data or UTD ratio: +update_to_data = 64 ############################################################################### -# Normalization stats -# ~~~~~~~~~~~~~~~~~~~ - -transform_state_dict = get_env_stats() +# We'll be feeding the loss with trajectories of length 25: +random_crop_len = 25 ############################################################################### -# Models: policy and q-value network -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In the original paper, the authors perform one update with a batch of 64 +# elements for each frame collected. Here, we reproduce the same ratio +# but while realizing several updates at each batch collection. 
We +# adapt our batch-size to achieve the same number of update-per-frame ratio: -actor, qnet = make_ddpg_actor( - transform_state_dict=transform_state_dict, - device=device, -) -if device == torch.device("cpu"): - actor.share_memory() +batch_size = ceil_div(64 * frames_per_batch, update_to_data * random_crop_len) +replay_buffer = make_replay_buffer( + buffer_size=buffer_size, + batch_size=batch_size, + random_crop_len=random_crop_len, + prefetch=3, + prb=prb, +) ############################################################################### -# Loss module -# ~~~~~~~~~~~ +# Loss module construction +# ------------------------ +# # We build our loss module with the actor and qnet we've just created. # Because we have target parameters to update, we _must_ create a target network # updater. # + +gamma = 0.99 +lmbda = 0.9 +tau = 0.001 # Decay factor for the target network + loss_module = DDPGLoss(actor, qnet) + +############################################################################### # let's use the TD(lambda) estimator! loss_module.make_value_estimator(ValueEstimators.TDLambda, gamma=gamma, lmbda=lmbda) -target_net_updater = SoftUpdate(loss_module, eps=1 - tau) -target_net_updater.init_() ############################################################################### # .. note:: @@ -945,109 +1047,37 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # practice despite the fact that they introduce some bias in the return # estimates. # -# The policy is wrapped in a :class:`torchrl.modules.OrnsteinUhlenbeckProcessWrapper` -# exploration module: - -actor_model_explore = OrnsteinUhlenbeckProcessWrapper( - actor, - annealing_num_steps=annealing_frames, -).to(device) -if device == torch.device("cpu"): - actor_model_explore.share_memory() - -############################################################################### -# Parallel environment creation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We pass the stats computed earlier to normalize the output of our -# environment: - -create_env_fn = parallel_env_constructor( - transform_state_dict=transform_state_dict, -) - -############################################################################### -# Data collector -# ~~~~~~~~~~~~~~ -# -# TorchRL provides specialized classes to help you collect data by executing -# the policy in the environment. These "data collectors" iteratively compute -# the action to be executed at a given time, then execute a step in the -# environment and reset it when required. -# Data collectors are designed to help developers have a tight control -# on the number of frames per batch of data, on the (a)sync nature of this -# collection and on the resources allocated to the data collection (e.g. GPU, -# number of workers etc). -# -# Here we will use -# :class:`torchrl.collectors.MultiaSyncDataCollector`, a data collector that -# will be executed in an async manner (i.e. data will be collected while -# the policy is being optimized). With the :class:`MultiaSyncDataCollector`, -# multiple workers are running rollouts separately. When a batch is asked, it -# is gathered from the first worker that can provide it. -# -# The parameters to specify are: -# -# - the list of environment creation functions, -# - the policy, -# - the total number of frames before the collector is considered empty, -# - the maximum number of frames per trajectory (useful for non-terminating -# environments, like dm_control ones). 
-# -# One should also pass: -# -# - the number of frames in each batch collected, -# - the number of random steps executed independently from the policy, -# - the devices used for policy execution -# - the devices used to store data before the data is passed to the main -# process. +# Target network updater +# ^^^^^^^^^^^^^^^^^^^^^^ # +# Target networks are a crucial part of off-policy RL algorithms. +# Updating the target network parameters is made easy thanks to the +# :class:`torchrl.objectives.HardUpdate` and :class:`torchrl.objectives.SoftUpdate` +# classes. They're built with the loss module as argument, and the update is +# achieved via a call to `updater.step()` at the appropriate location in the +# training loop. -collector = MultiaSyncDataCollector( - create_env_fn=[create_env_fn, create_env_fn], - policy=actor_model_explore, - total_frames=total_frames, - max_frames_per_traj=max_frames_per_traj, - frames_per_batch=frames_per_batch, - init_random_frames=init_random_frames, - reset_at_each_iter=False, - split_trajs=True, - device=device, # device for execution - storing_device=device, # device where data will be stored and passed - update_at_each_batch=False, - exploration_mode="random", -) - -collector.set_seed(seed) - -############################################################################### -# Replay buffer -# ~~~~~~~~~~~~~ -# - -replay_buffer = make_replay_buffer( - buffer_size=buffer_size, batch_size=batch_size, prefetch=3 -) - -############################################################################### -# Recorder -# ~~~~~~~~ +from torchrl.objectives.utils import SoftUpdate -recorder = make_recorder(actor_model_explore, transform_state_dict) +target_net_updater = SoftUpdate(loss_module, eps=1 - tau) +# This class will raise an error if `init_` is not called first. +target_net_updater.init_() ############################################################################### # Optimizer # ~~~~~~~~~ # -# Finally, we will use the Adam optimizer for the policy and value network, -# with the same learning rate for both. 
+# Finally, we will use the Adam optimizer for the policy and value network: -optimizer = optim.Adam(loss_module.parameters(), lr=lr, weight_decay=weight_decay) -total_collection_steps = total_frames // frames_per_batch +from torch import optim -scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( - optimizer, T_max=total_collection_steps +optimizer_actor = optim.Adam( + loss_module.actor_network_params.values(True, True), lr=1e-4, weight_decay=0.0 +) +optimizer_value = optim.Adam( + loss_module.value_network_params.values(True, True), lr=1e-3, weight_decay=1e-2 ) +total_collection_steps = total_frames // frames_per_batch ############################################################################### # Time to train the policy @@ -1083,16 +1113,27 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): if collected_frames >= init_random_frames: for _ in range(update_to_data): # sample from replay buffer - sampled_tensordict = replay_buffer.sample() + sampled_tensordict = replay_buffer.sample().to(device) # Compute loss loss_dict = loss_module(sampled_tensordict) # optimize - loss_val = loss_dict["loss_actor"] + loss_dict["loss_value"] - loss_val.backward() - optimizer.step() - optimizer.zero_grad() + loss_dict["loss_actor"].backward() + gn1 = torch.nn.utils.clip_grad_norm_( + loss_module.actor_network_params.values(True, True), 10.0 + ) + optimizer_actor.step() + optimizer_actor.zero_grad() + + loss_dict["loss_value"].backward() + gn2 = torch.nn.utils.clip_grad_norm_( + loss_module.value_network_params.values(True, True), 10.0 + ) + optimizer_value.step() + optimizer_value.zero_grad() + + gn = (gn1**2 + gn2**2) ** 0.5 # update priority if prb: @@ -1103,21 +1144,30 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): rewards.append( ( i, - tensordict["next", "reward"].mean().item() / frame_skip, + tensordict["next", "reward"].mean().item(), ) ) td_record = recorder(None) if td_record is not None: rewards_eval.append((i, td_record["r_evaluation"].item())) - if len(rewards_eval): + if len(rewards_eval) and collected_frames >= init_random_frames: + target_value = loss_dict["target_value"].item() + loss_value = loss_dict["loss_value"].item() + loss_actor = loss_dict["loss_actor"].item() + rn = sampled_tensordict["next", "reward"].mean().item() + rs = sampled_tensordict["next", "reward"].std().item() pbar.set_description( - f"reward: {rewards[-1][1]: 4.4f} (r0 = {r0: 4.4f}), reward eval: reward: {rewards_eval[-1][1]: 4.4f}, shape={sampled_tensordict.shape}" + f"reward: {rewards[-1][1]: 4.2f} (r0 = {r0: 4.2f}), " + f"reward eval: reward: {rewards_eval[-1][1]: 4.2f}, " + f"reward normalized={rn :4.2f}/{rs :4.2f}, " + f"grad norm={gn: 4.2f}, " + f"loss_value={loss_value: 4.2f}, " + f"loss_actor={loss_actor: 4.2f}, " + f"target value: {target_value: 4.2f}" ) # update the exploration strategy actor_model_explore.step(current_frames) - if collected_frames >= init_random_frames: - scheduler.step() collector.shutdown() del collector @@ -1132,6 +1182,8 @@ def make_replay_buffer(buffer_size, batch_size, prefetch=3): # **Note**: As already mentioned above, to get a more reasonable performance, # use a greater value for ``total_frames`` e.g. 1M. 
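###############################################################################
# The raw training rewards can be noisy; if desired, a simple moving average
# (a sketch, not part of the original training script) makes the trend easier
# to read before plotting:
#
# .. code-block:: python
#
#    import numpy as np
#
#    steps, values = zip(*rewards)
#    kernel = np.ones(10) / 10
#    smoothed = np.convolve(values, kernel, mode="valid")
#    plt.plot(steps[: len(smoothed)], smoothed, label="training (smoothed)")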
+from matplotlib import pyplot as plt + plt.figure() plt.plot(*zip(*rewards), label="training") plt.plot(*zip(*rewards_eval), label="eval") diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 6abdae9d60d..b82d4a9ab78 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -41,8 +41,7 @@ # estimated return; # - how to collect data from your environment efficiently and store them # in a replay buffer; -# - how to store trajectories (and not transitions) in your replay buffer), -# and how to estimate returns using TD(lambda); +# - how to use multi-step, a simple preprocessing step for off-policy algorithms; # - and finally how to evaluate your model. # # **Prerequisites**: We encourage you to get familiar with torchrl through the @@ -57,7 +56,7 @@ # On a high level, the algorithm is quite simple: Q-learning consists in # learning a table of state-action values in such a way that, when # encountering any particular state, we know which action to pick just by -# searching for the action with the highest value. This simple setting +# searching for the one with the highest value. This simple setting # requires the actions and states to be # discrete, otherwise a lookup table cannot be built. # @@ -84,21 +83,18 @@ # of this algorithm. # sphinx_gallery_start_ignore -import os -import uuid import warnings -from torchrl.objectives import DQNLoss, SoftUpdate -from torchrl.record.loggers.csv import CSVLogger -from torchrl.trainers import Recorder, ReplayBufferTrainer, Trainer, UpdateWeights - warnings.filterwarnings("ignore") # sphinx_gallery_end_ignore +import os +import uuid + import torch from torch import nn from torchrl.collectors import MultiaSyncDataCollector -from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer +from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter from torchrl.envs.libs.gym import GymEnv from torchrl.envs.transforms import ( @@ -112,6 +108,16 @@ ) from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor +from torchrl.objectives import DQNLoss, SoftUpdate +from torchrl.record.loggers.csv import CSVLogger +from torchrl.trainers import ( + LogReward, + Recorder, + ReplayBufferTrainer, + Trainer, + UpdateWeights, +) + def is_notebook() -> bool: try: @@ -244,7 +250,7 @@ def get_norm_stats(): obs_norm_sd = test_env.transform[-1].state_dict() # let's check that normalizing constants have a size of ``[C, 1, 1]`` where # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print(obs_norm_sd) + print("state dict of the observation norm:", obs_norm_sd) return obs_norm_sd @@ -392,6 +398,7 @@ def get_collector( device=device, storing_device=device, split_trajs=False, + postproc=MultiStep(5), ) return data_collector @@ -448,8 +455,6 @@ def get_loss_module(actor, gamma): ############################################################################### # DQN parameters # ~~~~~~~~~~~~~~ - -############################################################################### # gamma decay factor gamma = 0.99 @@ -459,9 +464,9 @@ def get_loss_module(actor, gamma): ############################################################################### # Smooth target network update decay parameter. 
-# This loosely corresponds to a 1/(1-tau) interval with hard target network +# This loosely corresponds to a 1/tau interval with hard target network # update -tau = 0.005 +tau = 0.02 ############################################################################### # Data collection and replay buffer @@ -595,6 +600,7 @@ def get_loss_module(actor, gamma): exploration_mode="mode", log_keys=[("next", "reward")], out_keys={("next", "reward"): "rewards"}, + log_pbar=True, ) recorder.register(trainer) @@ -609,6 +615,15 @@ def get_loss_module(actor, gamma): # trainer.register_op("post_optim", target_net_updater.step) +############################################################################### +# We can log the training rewards too. Note that this is of limited interest +# with CartPole, as rewards are always 1. The discounted sum of rewards is miximised +# not by getting higher rewards but by keeping the cart-pole alive for longer. +# This will be reflected by the `total_rewards` value displayed in the progress bar. +# +log_reward = LogReward() +log_reward.register(trainer) + ############################################################################### # .. note:: # It is possible to link multiple optimizers to the trainer if needed. From 3f16a495e4dc9143c02736ac2e31cd9c4a18b11d Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 16:56:11 +0100 Subject: [PATCH 68/89] amend --- torchrl/objectives/common.py | 3 +-- torchrl/trainers/trainers.py | 7 +++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 106ad36ef62..fca524eaa96 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -6,12 +6,11 @@ from __future__ import annotations import itertools -from copy import deepcopy from typing import Iterator, List, Optional, Tuple, Union import torch -from tensordict.nn import make_functional, repopulate_module, TensorDictModule +from tensordict.nn import make_functional, TensorDictModule from tensordict.tensordict import TensorDictBase from torch import nn, Tensor diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index ce3516f55db..3cf06a4da60 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -22,6 +22,7 @@ from torchrl._utils import _CKPT_BACKEND, KeyDependentDefaultDict, VERBOSE from torchrl.collectors.collectors import DataCollectorBase +from torchrl.collectors.utils import split_trajectories from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer from torchrl.data.utils import DEVICE_TYPING from torchrl.envs.common import EnvBase @@ -1198,6 +1199,7 @@ def __call__(self, batch: TensorDictBase) -> Dict: auto_cast_to_device=True, break_when_any_done=False, ).clone() + td_record = split_trajectories(td_record) if isinstance(self.policy_exploration, torch.nn.Module): self.policy_exploration.train() self.environment.train() @@ -1207,8 +1209,9 @@ def __call__(self, batch: TensorDictBase) -> Dict: for key in self.log_keys: value = td_record.get(key).float() if key == ("next", "reward"): - mean_value = value.mean() / self.frame_skip - total_value = value.sum() + mask = td_record["mask"] + mean_value = value[mask].mean() / self.frame_skip + total_value = value.sum(dim=td_record.ndim).mean() out[self.out_keys[key]] = mean_value out["total_" + self.out_keys[key]] = total_value continue From 833bf5865be2475d7b56388e53b3241ac1addd10 Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 17:26:31 +0100 Subject: [PATCH 
69/89] revert --- torchrl/objectives/common.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index fca524eaa96..be931e8c260 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -6,11 +6,12 @@ from __future__ import annotations import itertools +from copy import deepcopy from typing import Iterator, List, Optional, Tuple, Union import torch -from tensordict.nn import make_functional, TensorDictModule +from tensordict.nn import make_functional, repopulate_module, TensorDictModule from tensordict.tensordict import TensorDictBase from torch import nn, Tensor @@ -98,10 +99,13 @@ def convert_to_functional( buffer_names = next(itertools.islice(zip(*module.named_buffers()), 1)) except StopIteration: buffer_names = () - params = make_functional( - module, funs_to_decorate=funs_to_decorate, keep_params=True - ) - functional_module = module + params = make_functional(module, funs_to_decorate=funs_to_decorate) + functional_module = deepcopy(module) + repopulate_module(module, params) + # params = make_functional( + # module, funs_to_decorate=funs_to_decorate, keep_params=True + # ) + # functional_module = module params_and_buffers = params # we transform the buffers in params to make sure they follow the device From 094d49b2d63d7e3a634bf67c6cc5ea6e3c832a9c Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 17:56:02 +0100 Subject: [PATCH 70/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index b82d4a9ab78..34209e75b6e 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -398,7 +398,7 @@ def get_collector( device=device, storing_device=device, split_trajs=False, - postproc=MultiStep(5), + postproc=MultiStep(gamma=gamma, n_steps=5), ) return data_collector @@ -458,10 +458,6 @@ def get_loss_module(actor, gamma): # gamma decay factor gamma = 0.99 -############################################################################### -# lambda decay factor (see second the part with TD(:math:`\lambda`) -lmbda = 0.95 - ############################################################################### # Smooth target network update decay parameter. 
# This loosely corresponds to a 1/tau interval with hard target network From effa4fc0a14994f5d463ae93ac1880d82c8d29db Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 18:39:13 +0100 Subject: [PATCH 71/89] log_dir --- tutorials/sphinx-tutorials/coding_dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 34209e75b6e..458700a33d8 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -661,7 +661,7 @@ def print_csv_files_in_folder(folder_path): return output_str -print_csv_files_in_folder("csv_logs/" + exp_name) +print_csv_files_in_folder(logger.experiment.log_dir) ############################################################################### # Conclusion and possible improvements From 4afd785e04c1275209ed1c28445831145dce608f Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 21:22:53 +0100 Subject: [PATCH 72/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 458700a33d8..acc63f36ca1 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -582,7 +582,7 @@ def get_loss_module(actor, gamma): # can be cumbersome to implement. buffer_hook = ReplayBufferTrainer( get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), - flatten_tensordicts=False, + flatten_tensordicts=True, ) buffer_hook.register(trainer) weight_updater = UpdateWeights(collector, update_weights_interval=1) From e50f57809665a20bf37e7b5ce97511eab96ca77c Mon Sep 17 00:00:00 2001 From: vmoens Date: Mon, 3 Apr 2023 21:52:07 +0100 Subject: [PATCH 73/89] amend --- torchrl/trainers/trainers.py | 2 +- tutorials/sphinx-tutorials/coding_dqn.py | 1378 +++++++++++----------- 2 files changed, 686 insertions(+), 694 deletions(-) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 3cf06a4da60..04aef9d0aa2 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -1211,7 +1211,7 @@ def __call__(self, batch: TensorDictBase) -> Dict: if key == ("next", "reward"): mask = td_record["mask"] mean_value = value[mask].mean() / self.frame_skip - total_value = value.sum(dim=td_record.ndim).mean() + total_value = value.sum(dim=td_record.ndim - 1).mean() out[self.out_keys[key]] = mean_value out["total_" + self.out_keys[key]] = total_value continue diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index acc63f36ca1..956721e10b7 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -1,704 +1,696 @@ -# -*- coding: utf-8 -*- -""" -TorchRL trainer: A DQN example -============================== -**Author**: `Vincent Moens `_ - -""" - -############################################################################## -# TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle -# your training loop. The trainer executes a nested loop where the outer loop -# is the data collection and the inner loop consumes this data or some data -# retrieved from the replay buffer to train the model. -# At various points in this training loop, hooks can be attached and executed at -# given intervals. -# -# In this tutorial, we will be using the trainer class to train a DQN algorithm -# to solve the CartPole task from scratch. 
-# -# Main takeaways: -# -# - Building a trainer with its essential components: data collector, loss -# module, replay buffer and optimizer. -# - Adding hooks to a trainer, such as loggers, target network updaters and such. -# -# The trainer is fully customisable and offers a large set of functionalities. -# The tutorial is organised around its construction. -# We will be detailing how to build each of the components of the library first, -# and then put the pieces together using the :class:`torchrl.trainers.Trainer` -# class. -# -# Along the road, we will also focus on some other aspects of the library: -# -# - how to build an environment in TorchRL, including transforms (e.g. data -# normalization, frame concatenation, resizing and turning to grayscale) -# and parallel execution. Unlike what we did in the -# `DDPG tutorial `_, we -# will normalize the pixels and not the state vector. -# - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor -# that estimates the action values and picks up the action with the highest -# estimated return; -# - how to collect data from your environment efficiently and store them -# in a replay buffer; -# - how to use multi-step, a simple preprocessing step for off-policy algorithms; -# - and finally how to evaluate your model. -# -# **Prerequisites**: We encourage you to get familiar with torchrl through the -# `PPO tutorial `_ first. -# -# DQN -# --- -# -# DQN (`Deep Q-Learning `_) was -# the founding work in deep reinforcement learning. -# -# On a high level, the algorithm is quite simple: Q-learning consists in -# learning a table of state-action values in such a way that, when -# encountering any particular state, we know which action to pick just by -# searching for the one with the highest value. This simple setting -# requires the actions and states to be -# discrete, otherwise a lookup table cannot be built. -# -# DQN uses a neural network that encodes a map from the state-action space to -# a value (scalar) space, which amortizes the cost of storing and exploring all -# the possible state-action combinations: if a state has not been seen in the -# past, we can still pass it in conjunction with the various actions available -# through our neural network and get an interpolated value for each of the -# actions available. -# -# We will solve the classic control problem of the cart pole. From the -# Gymnasium doc from where this environment is retrieved: -# -# | A pole is attached by an un-actuated joint to a cart, which moves along a -# | frictionless track. The pendulum is placed upright on the cart and the goal -# | is to balance the pole by applying forces in the left and right direction -# | on the cart. -# -# .. figure:: /_static/img/cartpole_demo.gif -# :alt: Cart Pole -# -# We do not aim at giving a SOTA implementation of the algorithm, but rather -# to provide a high-level illustration of TorchRL features in the context -# of this algorithm. 
- -# sphinx_gallery_start_ignore -import warnings - -warnings.filterwarnings("ignore") -# sphinx_gallery_end_ignore - -import os -import uuid - -import torch -from torch import nn -from torchrl.collectors import MultiaSyncDataCollector -from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer -from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.transforms import ( - CatFrames, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, -) -from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor - -from torchrl.objectives import DQNLoss, SoftUpdate -from torchrl.record.loggers.csv import CSVLogger -from torchrl.trainers import ( - LogReward, - Recorder, - ReplayBufferTrainer, - Trainer, - UpdateWeights, -) - - -def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython +if __name__ == "__main__": + # -*- coding: utf-8 -*- + """ + TorchRL trainer: A DQN example + ============================== + **Author**: `Vincent Moens `_ + + """ + + ############################################################################## + # TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle + # your training loop. The trainer executes a nested loop where the outer loop + # is the data collection and the inner loop consumes this data or some data + # retrieved from the replay buffer to train the model. + # At various points in this training loop, hooks can be attached and executed at + # given intervals. + # + # In this tutorial, we will be using the trainer class to train a DQN algorithm + # to solve the CartPole task from scratch. + # + # Main takeaways: + # + # - Building a trainer with its essential components: data collector, loss + # module, replay buffer and optimizer. + # - Adding hooks to a trainer, such as loggers, target network updaters and such. + # + # The trainer is fully customisable and offers a large set of functionalities. + # The tutorial is organised around its construction. + # We will be detailing how to build each of the components of the library first, + # and then put the pieces together using the :class:`torchrl.trainers.Trainer` + # class. + # + # Along the road, we will also focus on some other aspects of the library: + # + # - how to build an environment in TorchRL, including transforms (e.g. data + # normalization, frame concatenation, resizing and turning to grayscale) + # and parallel execution. Unlike what we did in the + # `DDPG tutorial `_, we + # will normalize the pixels and not the state vector. + # - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor + # that estimates the action values and picks up the action with the highest + # estimated return; + # - how to collect data from your environment efficiently and store them + # in a replay buffer; + # - how to use multi-step, a simple preprocessing step for off-policy algorithms; + # - and finally how to evaluate your model. + # + # **Prerequisites**: We encourage you to get familiar with torchrl through the + # `PPO tutorial `_ first. + # + # DQN + # --- + # + # DQN (`Deep Q-Learning `_) was + # the founding work in deep reinforcement learning. 
+ # + # On a high level, the algorithm is quite simple: Q-learning consists in + # learning a table of state-action values in such a way that, when + # encountering any particular state, we know which action to pick just by + # searching for the one with the highest value. This simple setting + # requires the actions and states to be + # discrete, otherwise a lookup table cannot be built. + # + # DQN uses a neural network that encodes a map from the state-action space to + # a value (scalar) space, which amortizes the cost of storing and exploring all + # the possible state-action combinations: if a state has not been seen in the + # past, we can still pass it in conjunction with the various actions available + # through our neural network and get an interpolated value for each of the + # actions available. + # + # We will solve the classic control problem of the cart pole. From the + # Gymnasium doc from where this environment is retrieved: + # + # | A pole is attached by an un-actuated joint to a cart, which moves along a + # | frictionless track. The pendulum is placed upright on the cart and the goal + # | is to balance the pole by applying forces in the left and right direction + # | on the cart. + # + # .. figure:: /_static/img/cartpole_demo.gif + # :alt: Cart Pole + # + # We do not aim at giving a SOTA implementation of the algorithm, but rather + # to provide a high-level illustration of TorchRL features in the context + # of this algorithm. + + # sphinx_gallery_start_ignore + import warnings + + warnings.filterwarnings("ignore") + # sphinx_gallery_end_ignore + + import os + import uuid + + import torch + from torch import nn + from torchrl.collectors import MultiaSyncDataCollector + from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer + from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter + from torchrl.envs.libs.gym import GymEnv + from torchrl.envs.transforms import ( + CatFrames, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, + ) + from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor + + from torchrl.objectives import DQNLoss, SoftUpdate + from torchrl.record.loggers.csv import CSVLogger + from torchrl.trainers import ( + LogReward, + Recorder, + ReplayBufferTrainer, + Trainer, + UpdateWeights, + ) + + def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + ############################################################################### + # Let's get started with the various pieces we need for our algorithm: + # + # - An environment; + # - A policy (and related modules that we group under the "model" umbrella); + # - A data collector, which makes the policy play in the environment and + # delivers training data; + # - A replay buffer to store the training data; + # - A loss module, which computes the objective function to train our policy + # to maximise the return; + # - An optimizer, which performs parameter updates based on our loss. + # + # Additional modules include a logger, a recorder (executes the policy in + # "eval" mode) and a target network updater. 
With all these components into + # place, it is easy to see how one could misplace or misuse one component in + # the training script. The trainer is there to orchestrate everything for you! + # + # Building the environment + # ------------------------ + # + # First let's write a helper function that will output an environment. As usual, + # the "raw" environment may be too simple to be used in practice and we'll need + # some data transformation to expose its output to the policy. + # + # We will be using five transforms: + # + # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; + # - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 + # tensor in a floating point tensor in the ``[0, 1]`` space with shape + # ``[C, W, H]``; + # - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; + # - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; + # - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; + # - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of + # successive frames (``N=4``) in a single tensor along the channel dimension. + # This is useful as a single image does not carry information about the + # motion of the cartpole. Some memory about past observations and actions + # is needed, either via a recurrent neural network or using a stack of + # frames. + # - :class:`torchrl.envs.ObservationNorm` which will normalize our observations + # given some custom summary statistics. + # + # In practice, our environment builder has two arguments: + # + # - ``parallel``: determines whether multiple environments have to be run in + # parallel. We stack the transforms after the + # :class:`torchrl.envs.ParallelEnv` to take advantage + # of vectorization of the operations on device, although this would + # technically work with every single environment attached to its own set of + # transforms. + # - ``obs_norm_sd`` will contain the normalizing constants for + # the :class:`torchrl.envs.ObservationNorm` transform. + # + + def make_env( + parallel=False, + obs_norm_sd=None, + ): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) + ), + ) else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter - - -############################################################################### -# Let's get started with the various pieces we need for our algorithm: -# -# - An environment; -# - A policy (and related modules that we group under the "model" umbrella); -# - A data collector, which makes the policy play in the environment and -# delivers training data; -# - A replay buffer to store the training data; -# - A loss module, which computes the objective function to train our policy -# to maximise the return; -# - An optimizer, which performs parameter updates based on our loss. -# -# Additional modules include a logger, a recorder (executes the policy in -# "eval" mode) and a target network updater. With all these components into -# place, it is easy to see how one could misplace or misuse one component in -# the training script. The trainer is there to orchestrate everything for you! -# -# Building the environment -# ------------------------ -# -# First let's write a helper function that will output an environment. 
As usual, -# the "raw" environment may be too simple to be used in practice and we'll need -# some data transformation to expose its output to the policy. -# -# We will be using five transforms: -# -# - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; -# - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 -# tensor in a floating point tensor in the ``[0, 1]`` space with shape -# ``[C, W, H]``; -# - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; -# - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; -# - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; -# - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of -# successive frames (``N=4``) in a single tensor along the channel dimension. -# This is useful as a single image does not carry information about the -# motion of the cartpole. Some memory about past observations and actions -# is needed, either via a recurrent neural network or using a stack of -# frames. -# - :class:`torchrl.envs.ObservationNorm` which will normalize our observations -# given some custom summary statistics. -# -# In practice, our environment builder has two arguments: -# -# - ``parallel``: determines whether multiple environments have to be run in -# parallel. We stack the transforms after the -# :class:`torchrl.envs.ParallelEnv` to take advantage -# of vectorization of the operations on device, although this would -# technically work with every single environment attached to its own set of -# transforms. -# - ``obs_norm_sd`` will contain the normalizing constants for -# the :class:`torchrl.envs.ObservationNorm` transform. -# - - -def make_env( - parallel=False, - obs_norm_sd=None, -): - if obs_norm_sd is None: - obs_norm_sd = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) + base_env = GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) + + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), ), ) - else: - base_env = GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, + return env + + ############################################################################### + # Compute normalizing constants + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # To normalize images, we don't want to normalize each pixel independently + # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` + # shaped set of normalizing constants (loc and scale parameters). + # We will be using the ``reduce_dim`` argument + # of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which + # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that + # not all dimensions disappear in the process: + # + + def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) + ) + obs_norm_sd = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). 
+ print("state dict of the observation norm:", obs_norm_sd) + return obs_norm_sd + + ############################################################################### + # Building the model (Deep Q-network) + # ----------------------------------- + # + # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` + # object which is a simple CNN followed by a two-layer MLP. The only trick used + # here is that the action values (i.e. left and right action value) are + # computed using + # + # .. math:: + # + # val = b(obs) + v(obs) - \mathbb{E}[v(obs)] + # + # where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a + # :math:`\# obs \rightarrow num_actions` function. + # + # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, + # which will read the state-action + # values, pick up the one with the maximum value and write all those results + # in the input :class:`tensordict.TensorDict`. + # + + def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) + + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to( + device + ) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) + + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, ) - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **obs_norm_sd), - ), - ) - return env - - -############################################################################### -# Compute normalizing constants -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# To normalize images, we don't want to normalize each pixel independently -# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` -# shaped set of normalizing constants (loc and scale parameters). -# We will be using the ``reduce_dim`` argument -# of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which -# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that -# not all dimensions disappear in the process: -# - - -def get_norm_stats(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) + return actor, actor_explore + + ############################################################################### + # Collecting and storing data + # --------------------------- + # + # Replay buffers + # ~~~~~~~~~~~~~~ + # + # Replay buffers play a central role in off-policy RL algorithms such as DQN. + # They constitute the dataset we will be sampling from during training. 
+ # + # Here, we will use a regular sampling strategy, although a prioritized RB + # could improve the performance significantly. + # + # We place the storage on disk using + # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This + # storage is created in a lazy manner: it will only be instantiated once the + # first batch of data is passed to it. + # + # The only requirement of this storage is that the data passed to it at write + # time must always have the same shape. + + def get_replay_buffer(buffer_size, n_optim, batch_size): + replay_buffer = TensorDictReplayBuffer( + batch_size=batch_size, + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, + ) + return replay_buffer + + ############################################################################### + # Data collector + # ~~~~~~~~~~~~~~ + # + # As in `PPO ` and + # `DDPG `, we will be using + # a data collector as a dataloader in the outer loop. + # + # We choose the following configuration: we will be running a series of + # parallel environments synchronously in parallel in different collectors, + # themselves running in parallel but asynchronously. + # The advantage of this configuration is that we can balance the amount of + # compute that is executed in batch with what we want to be executed + # asynchronously. We encourage the reader to experiment how the collection + # speed is impacted by modifying the number of collectors (ie the number of + # environment constructors passed to the collector) and the number of + # environment executed in parallel in each collector (controlled by the + # ``num_workers`` hyperparameter). + # + # When building the collector, we can choose on which device we want the + # environment and policy to execute the operations through the ``device`` + # keyword argument. The ``storing_devices`` argument will modify the + # location of the data being collected: if the batches that we are gathering + # have a considerable size, we may want to store them on a different location + # than the device where the computation is happening. For asynchronous data + # collectors such as ours, different storing devices mean that the data that + # we collect won't sit on the same device each time, which is something that + # out training loop must account for. For simplicity, we set the devices to + # the same value for all sub-collectors. + + def get_collector( + obs_norm_sd, + num_collectors, + actor_explore, + frames_per_batch, + total_frames, + device, + ): + data_collector = MultiaSyncDataCollector( + [ + make_env(parallel=True, obs_norm_sd=obs_norm_sd), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + postproc=MultiStep(gamma=gamma, n_steps=5), + ) + return data_collector + + ############################################################################### + # Loss function + # ------------- + # + # Building our loss function is straightforward: we only need to provide + # the model and a bunch of hyperparameters to the DQNLoss class. + # + # Target parameters + # ~~~~~~~~~~~~~~~~~ + # + # Many off-policy RL algorithms use the concept of "target parameters" when it + # comes to estimate the value of the next state or state-action pair. 
+    # The target parameters are lagged copies of the model parameters. Because
+    # their predictions mismatch those of the current model configuration, they
+    # help learning by putting a pessimistic bound on the value being estimated.
+    # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous
+    # in similar algorithms.
+    #
+
+    def get_loss_module(actor, gamma):
+        loss_module = DQNLoss(actor, gamma=gamma, delay_value=True)
+        target_updater = SoftUpdate(loss_module)
+        return loss_module, target_updater
+
+    ###############################################################################
+    # Hyperparameters
+    # ---------------
+    #
+    # Let's start with our hyperparameters. The following settings should work well
+    # in practice, and the performance of the algorithm should hopefully not be
+    # too sensitive to slight variations of these.
+
+    device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu"
+
+    ###############################################################################
+    # Optimizer
+    # ~~~~~~~~~
+
+    # the learning rate of the optimizer
+    lr = 2e-3
+    # weight decay
+    wd = 1e-5
+    # the beta parameters of Adam
+    betas = (0.9, 0.999)
+    # Optimization steps per batch collected (aka UPD or updates per data)
+    n_optim = 8
+
+    ###############################################################################
+    # DQN parameters
+    # ~~~~~~~~~~~~~~
+    # the discount factor gamma
+    gamma = 0.99
+
+    ###############################################################################
+    # Smooth target network update decay parameter.
+    # This loosely corresponds to a hard target network update every 1/tau
+    # optimization steps.
+    tau = 0.02
+
+    ###############################################################################
+    # Data collection and replay buffer
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    # Values suitable for a proper training run are given in the inline comments
+    # next to the reduced values used here.
+    #
+    # Total frames collected in the environment. In other implementations, the
+    # user defines a maximum number of episodes.
+    # This is harder to do with our data collectors since they return batches
+    # of N collected frames, where N is a constant.
+    # However, one can easily get the same restriction on number of episodes by
+    # breaking the training loop when a certain number of
+    # episodes has been collected.
+    total_frames = 4096  # 500000
+
+    ###############################################################################
+    # Random frames used to initialize the replay buffer.
+    init_random_frames = 100  # 1000
+
+    ###############################################################################
+    # Frames in each batch collected.
+    frames_per_batch = 32  # 128
+
+    ###############################################################################
+    # Frames sampled from the replay buffer at each optimization step
+    batch_size = 32  # 256
+
+    ###############################################################################
+    # Size of the replay buffer in terms of frames
+    buffer_size = min(total_frames, 100000)
+
+    ###############################################################################
+    # Number of environments run in parallel in each data collector
+    num_workers = 2  # 8
+    num_collectors = 2  # 4
+
+    ###############################################################################
+    # Environment and exploration
+    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    #
+    # We set the initial and final value of the epsilon factor in Epsilon-greedy
+    # exploration.
+ # Since our policy is deterministic, exploration is crucial: without it, the + # only source of randomness would be the environment reset. + + eps_greedy_val = 0.1 + eps_greedy_val_env = 0.005 + + ############################################################################### + # To speed up learning, we set the bias of the last layer of our value network + # to a predefined value (this is not mandatory) + init_bias = 2.0 + + ############################################################################### + # .. note:: + # For fast rendering of the tutorial ``total_frames`` hyperparameter + # was set to a very low number. To get a reasonable performance, use a greater + # value e.g. 500000 + # + + ############################################################################### + # Building a Trainer + # ------------------ + # + # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the + # following keyword-only arguments: + # + # - ``collector`` + # - ``loss_module`` + # - ``optimizer`` + # - ``logger``: A logger can be + # - ``total_frames``: this parameter defines the lifespan of the trainer. + # - ``frame_skip``: when a frame-skip is used, the collector must be made + # aware of it in order to accurately count the number of frames + # collected etc. Making the trainer aware of this parameter is not + # mandatory but helps to have a fairer comparison between settings where + # the total number of frames (budget) is fixed but the frame-skip is + # variable. + + stats = get_norm_stats() + test_env = make_env(parallel=False, obs_norm_sd=stats) + # Get model + actor, actor_explore = make_model(test_env) + loss_module, target_net_updater = get_loss_module(actor, gamma) + target_net_updater.init_() + + collector = get_collector( + stats, num_collectors, actor_explore, frames_per_batch, total_frames, device ) - obs_norm_sd = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print("state dict of the observation norm:", obs_norm_sd) - return obs_norm_sd - - -############################################################################### -# Building the model (Deep Q-network) -# ----------------------------------- -# -# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` -# object which is a simple CNN followed by a two-layer MLP. The only trick used -# here is that the action values (i.e. left and right action value) are -# computed using -# -# .. math:: -# -# val = b(obs) + v(obs) - \mathbb{E}[v(obs)] -# -# where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a -# :math:`\# obs \rightarrow num_actions` function. -# -# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, -# which will read the state-action -# values, pick up the one with the maximum value and write all those results -# in the input :class:`tensordict.TensorDict`. 
-# - - -def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) - - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. - tensordict = dummy_env.fake_tensordict() - actor(tensordict) - - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, + optimizer = torch.optim.Adam( + loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas ) + exp_name = f"dqn_exp_{uuid.uuid1()}" + logger = CSVLogger(exp_name=exp_name, log_dir="./") - return actor, actor_explore - - -############################################################################### -# Collecting and storing data -# --------------------------- -# -# Replay buffers -# ~~~~~~~~~~~~~~ -# -# Replay buffers play a central role in off-policy RL algorithms such as DQN. -# They constitute the dataset we will be sampling from during training. -# -# Here, we will use a regular sampling strategy, although a prioritized RB -# could improve the performance significantly. -# -# We place the storage on disk using -# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This -# storage is created in a lazy manner: it will only be instantiated once the -# first batch of data is passed to it. -# -# The only requirement of this storage is that the data passed to it at write -# time must always have the same shape. - - -def get_replay_buffer(buffer_size, n_optim, batch_size): - replay_buffer = TensorDictReplayBuffer( - batch_size=batch_size, - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer - - -############################################################################### -# Data collector -# ~~~~~~~~~~~~~~ -# -# As in `PPO ` and -# `DDPG `, we will be using -# a data collector as a dataloader in the outer loop. -# -# We choose the following configuration: we will be running a series of -# parallel environments synchronously in parallel in different collectors, -# themselves running in parallel but asynchronously. -# The advantage of this configuration is that we can balance the amount of -# compute that is executed in batch with what we want to be executed -# asynchronously. We encourage the reader to experiment how the collection -# speed is impacted by modifying the number of collectors (ie the number of -# environment constructors passed to the collector) and the number of -# environment executed in parallel in each collector (controlled by the -# ``num_workers`` hyperparameter). -# -# When building the collector, we can choose on which device we want the -# environment and policy to execute the operations through the ``device`` -# keyword argument. 
The ``storing_devices`` argument will modify the -# location of the data being collected: if the batches that we are gathering -# have a considerable size, we may want to store them on a different location -# than the device where the computation is happening. For asynchronous data -# collectors such as ours, different storing devices mean that the data that -# we collect won't sit on the same device each time, which is something that -# out training loop must account for. For simplicity, we set the devices to -# the same value for all sub-collectors. - - -def get_collector( - obs_norm_sd, num_collectors, actor_explore, frames_per_batch, total_frames, device -): - data_collector = MultiaSyncDataCollector( - [ - make_env(parallel=True, obs_norm_sd=obs_norm_sd), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, + trainer = Trainer( + collector=collector, total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices - device=device, - storing_device=device, - split_trajs=False, - postproc=MultiStep(gamma=gamma, n_steps=5), + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=logger, + optim_steps_per_batch=n_optim, ) - return data_collector - - -############################################################################### -# Loss function -# ------------- -# -# Building our loss function is straightforward: we only need to provide -# the model and a bunch of hyperparameters to the DQNLoss class. -# -# Target parameters -# ~~~~~~~~~~~~~~~~~ -# -# Many off-policy RL algorithms use the concept of "target parameters" when it -# comes to estimate the value of the next state or state-action pair. -# The target parameters are lagged copies of the model parameters. Because -# their predictions mismatch those of the current model configuration, they -# help learning by putting a pessimistic bound on the value being estimated. -# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous -# in similar algorithms. -# - - -def get_loss_module(actor, gamma): - loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) - target_updater = SoftUpdate(loss_module) - return loss_module, target_updater - - -############################################################################### -# Hyperparameters -# --------------- -# -# Let's start with our hyperparameters. The following setting should work well -# in practice, and the performance of the algorithm should hopefully not be -# too sensitive to slight variations of these. - -device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - -############################################################################### -# Optimizer -# ~~~~~~~~~ - -# the learning rate of the optimizer -lr = 2e-3 -# weight decay -wd = 1e-5 -# the beta parameters of Adam -betas = (0.9, 0.999) -# Optimization steps per batch collected (aka UPD or updates per data) -n_optim = 8 - -############################################################################### -# DQN parameters -# ~~~~~~~~~~~~~~ -# gamma decay factor -gamma = 0.99 - -############################################################################### -# Smooth target network update decay parameter. 
-# This loosely corresponds to a 1/tau interval with hard target network -# update -tau = 0.02 - -############################################################################### -# Data collection and replay buffer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Values to be used for proper training have been commented. -# -# Total frames collected in the environment. In other implementations, the -# user defines a maximum number of episodes. -# This is harder to do with our data collectors since they return batches -# of N collected frames, where N is a constant. -# However, one can easily get the same restriction on number of episodes by -# breaking the training loop when a certain number -# episodes has been collected. -total_frames = 4096 # 500000 - -############################################################################### -# Random frames used to initialize the replay buffer. -init_random_frames = 100 # 1000 - -############################################################################### -# Frames in each batch collected. -frames_per_batch = 32 # 128 - -############################################################################### -# Frames sampled from the replay buffer at each optimization step -batch_size = 32 # 256 - -############################################################################### -# Size of the replay buffer in terms of frames -buffer_size = min(total_frames, 100000) - -############################################################################### -# Number of environments run in parallel in each data collector -num_workers = 2 # 8 -num_collectors = 2 # 4 - -############################################################################### -# Environment and exploration -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We set the initial and final value of the epsilon factor in Epsilon-greedy -# exploration. -# Since our policy is deterministic, exploration is crucial: without it, the -# only source of randomness would be the environment reset. - -eps_greedy_val = 0.1 -eps_greedy_val_env = 0.005 - -############################################################################### -# To speed up learning, we set the bias of the last layer of our value network -# to a predefined value (this is not mandatory) -init_bias = 2.0 - -############################################################################### -# .. note:: -# For fast rendering of the tutorial ``total_frames`` hyperparameter -# was set to a very low number. To get a reasonable performance, use a greater -# value e.g. 500000 -# - -############################################################################### -# Building a Trainer -# ------------------ -# -# TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the -# following keyword-only arguments: -# -# - ``collector`` -# - ``loss_module`` -# - ``optimizer`` -# - ``logger``: A logger can be -# - ``total_frames``: this parameter defines the lifespan of the trainer. -# - ``frame_skip``: when a frame-skip is used, the collector must be made -# aware of it in order to accurately count the number of frames -# collected etc. Making the trainer aware of this parameter is not -# mandatory but helps to have a fairer comparison between settings where -# the total number of frames (budget) is fixed but the frame-skip is -# variable. 
- -stats = get_norm_stats() -test_env = make_env(parallel=False, obs_norm_sd=stats) -# Get model -actor, actor_explore = make_model(test_env) -loss_module, target_net_updater = get_loss_module(actor, gamma) -target_net_updater.init_() - -collector = get_collector( - stats, num_collectors, actor_explore, frames_per_batch, total_frames, device -) -optimizer = torch.optim.Adam( - loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas -) -exp_name = f"dqn_exp_{uuid.uuid1()}" -logger = CSVLogger(exp_name=exp_name, log_dir="./") - -trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=logger, - optim_steps_per_batch=n_optim, -) - -############################################################################### -# Registering hooks -# ~~~~~~~~~~~~~~~~~ -# -# Registering hooks can be achieved in two separate ways: -# -# - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` -# method is the first choice. One just needs to provide the trainer as input -# and the hook will be registered with a default name at a default location. -# For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` -# requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which -# can be cumbersome to implement. -buffer_hook = ReplayBufferTrainer( - get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), - flatten_tensordicts=True, -) -buffer_hook.register(trainer) -weight_updater = UpdateWeights(collector, update_weights_interval=1) -weight_updater.register(trainer) -recorder = Recorder( - record_interval=100, # log every 100 optimization steps - record_frames=10_000, # maximum number of frames in the record - frame_skip=1, - policy_exploration=actor_explore, - environment=test_env, - exploration_mode="mode", - log_keys=[("next", "reward")], - out_keys={("next", "reward"): "rewards"}, - log_pbar=True, -) -recorder.register(trainer) - -############################################################################### -# - Any callable (including :class:`torchrl.trainers.TrainerHookBase` -# subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. -# In this case, a location must be explicitely passed (). This method gives -# more control over the location of the hook but it also requires more -# understanding of the Trainer mechanism. -# Check the `trainer documentation `_ -# for a detailed description of the trainer hooks. -# -trainer.register_op("post_optim", target_net_updater.step) - -############################################################################### -# We can log the training rewards too. Note that this is of limited interest -# with CartPole, as rewards are always 1. The discounted sum of rewards is miximised -# not by getting higher rewards but by keeping the cart-pole alive for longer. -# This will be reflected by the `total_rewards` value displayed in the progress bar. -# -log_reward = LogReward() -log_reward.register(trainer) - -############################################################################### -# .. note:: -# It is possible to link multiple optimizers to the trainer if needed. -# In this case, each optimizer will be tied to a field in the loss dictionary. -# Check the :class:`torchrl.trainers.OptimizerHook` to learn more. -# -# Here we are, ready to train our algorithm! A simple call to -# ``trainer.train()`` and we'll be getting our results logged in. 
-# -trainer.train() - -############################################################################### -# We can now quickly check the CSVs with the results. - - -def print_csv_files_in_folder(folder_path): - """ - Find all CSV files in a folder and return the first 10 lines of each file as a string. - - Args: - folder_path (str): The relative path to the folder. - Returns: - str: A string containing the first 10 lines of each CSV file in the folder. - """ - csv_files = [] - output_str = "" - for file in os.listdir(folder_path): - if file.endswith(".csv"): - csv_files.append(os.path.join(folder_path, file)) - for csv_file in csv_files: - output_str += f"File: {csv_file}\n" - with open(csv_file, "r") as f: - for i, line in enumerate(f): - if i == 10: - break - output_str += line.strip() + "\n" - output_str += "\n" - return output_str - - -print_csv_files_in_folder(logger.experiment.log_dir) - -############################################################################### -# Conclusion and possible improvements -# ------------------------------------ -# -# In this tutorial we have learned: -# -# - How to write a Trainer, including building its components and registering -# them in the trainer; -# - How to code a DQN algorithm, including how to create a policy that picks -# up the action with the highest value with -# :class:`torchrl.modules.QValueNetwork`; -# - How to build a multiprocessed data collector; -# -# Possible improvements to this tutorial could include: -# -# - Using the :class:`torchrl.data.MultiStep` -# post-processing. Multi-step will project an action -# to the :math:`n^{th}` following step, and create a discounted sum of the -# rewards in between. This trick can make the algorithm noticeably less -# myopic (although the reward is then biased). To use this, simply -# create the collector with -# -# >>> from torchrl.data.postprocs.postprocs import MultiStep -# >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n)) -# -# where ``n`` is the number of looking-forward steps. Pay attention to the -# fact that the ``gamma`` factor has to be corrected by the number of -# steps till the next observation when being passed to -# ``vec_td_lambda_advantage_estimate``: -# -# >>> gamma = gamma ** tensordict["steps_to_next_obs"] -# -# - A prioritized replay buffer could also be used. This will give a -# higher priority to samples that have the worst value accuracy. -# Learn more on the `replay buffer section `_ -# of the documentation. -# - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` -# for more information). -# - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. + ############################################################################### + # Registering hooks + # ~~~~~~~~~~~~~~~~~ + # + # Registering hooks can be achieved in two separate ways: + # + # - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` + # method is the first choice. One just needs to provide the trainer as input + # and the hook will be registered with a default name at a default location. + # For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` + # requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which + # can be cumbersome to implement. 
+    buffer_hook = ReplayBufferTrainer(
+        get_replay_buffer(buffer_size, n_optim, batch_size=batch_size),
+        flatten_tensordicts=True,
+    )
+    buffer_hook.register(trainer)
+    weight_updater = UpdateWeights(collector, update_weights_interval=1)
+    weight_updater.register(trainer)
+    recorder = Recorder(
+        record_interval=1,  # record (and log) every optimization step
+        record_frames=10_000,  # maximum number of frames in the record
+        frame_skip=1,
+        policy_exploration=actor_explore,
+        environment=test_env,
+        exploration_mode="mode",
+        log_keys=[("next", "reward")],
+        out_keys={("next", "reward"): "rewards"},
+        log_pbar=True,
+    )
+    recorder.register(trainer)
+
+    ###############################################################################
+    # - Any callable (including :class:`torchrl.trainers.TrainerHookBase`
+    #   subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`.
+    #   In this case, a location must be explicitly passed (here, ``"post_optim"``).
+    #   This method gives more control over the location of the hook but it also
+    #   requires more understanding of the Trainer mechanism.
+    #   Check the `trainer documentation `_
+    #   for a detailed description of the trainer hooks.
+    #
+    trainer.register_op("post_optim", target_net_updater.step)
+
+    ###############################################################################
+    # We can log the training rewards too. Note that this is of limited interest
+    # with CartPole, as rewards are always 1. The discounted sum of rewards is maximised
+    # not by getting higher rewards but by keeping the cart-pole alive for longer.
+    # This will be reflected by the ``total_rewards`` value displayed in the progress bar.
+    #
+    log_reward = LogReward(log_pbar=True)
+    log_reward.register(trainer)
+
+    ###############################################################################
+    # .. note::
+    #   It is possible to link multiple optimizers to the trainer if needed.
+    #   In this case, each optimizer will be tied to a field in the loss dictionary.
+    #   Check the :class:`torchrl.trainers.OptimizerHook` to learn more.
+    #
+    # Here we are, ready to train our algorithm! A simple call to
+    # ``trainer.train()`` and we'll be getting our results logged.
+    #
+    trainer.train()
+
+    ###############################################################################
+    # We can now quickly check the CSVs with the results.
+
+    def print_csv_files_in_folder(folder_path):
+        """
+        Find all CSV files in a folder and return the first 10 lines of each file as a string.
+
+        Args:
+            folder_path (str): The relative path to the folder.
+
+        Returns:
+            str: A string containing the first 10 lines of each CSV file in the folder.
+        """
+        csv_files = []
+        output_str = ""
+        for file in os.listdir(folder_path):
+            if file.endswith(".csv"):
+                csv_files.append(os.path.join(folder_path, file))
+        for csv_file in csv_files:
+            output_str += f"File: {csv_file}\n"
+            with open(csv_file, "r") as f:
+                for i, line in enumerate(f):
+                    if i == 10:
+                        break
+                    output_str += line.strip() + "\n"
+            output_str += "\n"
+        return output_str
+
+    print_csv_files_in_folder(logger.experiment.log_dir)
+
+    ###############################################################################
+    # Conclusion and possible improvements
+    # ------------------------------------
+    #
+    # In this tutorial we have learned:
+    #
+    # - How to write a Trainer, including building its components and registering
+    #   them in the trainer;
+    # - How to code a DQN algorithm, including how to create a policy that picks
+    #   the action with the highest value using
+    #   :class:`torchrl.modules.QValueActor`;
+    # - How to build a multiprocessed data collector;
+    #
+    # Possible improvements to this tutorial could include:
+    #
+    # - Using the :class:`torchrl.data.MultiStep`
+    #   post-processing. Multi-step will project an action
+    #   to the :math:`n^{th}` following step, and create a discounted sum of the
+    #   rewards in between. This trick can make the algorithm noticeably less
+    #   myopic (although the reward is then biased). To use this, simply
+    #   create the collector with
+    #
+    #   >>> from torchrl.data.postprocs.postprocs import MultiStep
+    #   >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n))
+    #
+    #   where ``n`` is the number of forward-looking steps. Pay attention to the
+    #   fact that the ``gamma`` factor has to be corrected by the number of
+    #   steps till the next observation when being passed to
+    #   ``vec_td_lambda_advantage_estimate``:
+    #
+    #   >>> gamma = gamma ** tensordict["steps_to_next_obs"]
+    #
+    # - A prioritized replay buffer could also be used. This will give a
+    #   higher priority to samples that have the worst value accuracy
+    #   (see the sketch after this list).
+    #   Learn more in the `replay buffer section `_
+    #   of the documentation.
+    # - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss`
+    #   for more information).
+    # - Fancier exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers.
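+    #
+    # As a sketch of the prioritized replay buffer mentioned above (the
+    # ``alpha``/``beta`` values below are only an illustration, and the sample
+    # priorities would still need to be refreshed from the loss values, e.g.
+    # through the ``update_priority`` hook of
+    # :class:`torchrl.trainers.ReplayBufferTrainer`):
+    #
+    #   >>> from torchrl.data.replay_buffers.samplers import PrioritizedSampler
+    #   >>> prb = TensorDictReplayBuffer(
+    #   ...     storage=LazyMemmapStorage(buffer_size),
+    #   ...     sampler=PrioritizedSampler(buffer_size, alpha=0.7, beta=0.5),
+    #   ...     batch_size=batch_size,
+    #   ... )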
From ac6c83b7e7fedfbdea342218ccb4b96e6cdf8687 Mon Sep 17 00:00:00 2001
From: vmoens 
Date: Tue, 4 Apr 2023 09:19:38 +0100
Subject: [PATCH 74/89] amend

---
 docs/source/_static/img/replaybuffer_traj.png | Bin 0 -> 252140 bytes
 torchrl/objectives/common.py | 3 +-
 torchrl/trainers/trainers.py | 15 +-
 tutorials/sphinx-tutorials/coding_dqn.py | 1377 +++++++++--------
 4 files changed, 704 insertions(+), 691 deletions(-)
 create mode 100644 docs/source/_static/img/replaybuffer_traj.png

diff --git a/docs/source/_static/img/replaybuffer_traj.png b/docs/source/_static/img/replaybuffer_traj.png
new file mode 100644
index 0000000000000000000000000000000000000000..64773ee8f784895bc0c3a6c8b4a390bbfa1ed7ee
Binary files /dev/null and b/docs/source/_static/img/replaybuffer_traj.png differ
z&nTS^H1F7BvWbbgq_vQL8Nwh9$|)%WCn|dgy{&!Naaqx5A@ihKxb+^s4E9S}*w9I{*06|E zeRH8|T@YTPm2M0M`KK#ov^!@#cC2e~u}T7}>^d@qISzzQukaMu8ez9FsA+{^CjlcH z9eHb8%e%(lRwLl3I)M@G*qJ`l5;hqaHNDLlD^1meI6wzU(BqiHOE3F;bu_bNc?B%6 zA9@tUr^<?185GH$@(S z-LWS>TL}d`^!>jR;Nbx>9AigJP>2#v{t0K%zNC98B_-u+pmg$ne(Dpm#6jcs0Gmmt zCg$^=ngVV!tin+tU&QWiqemRXl&;=uWiha2IB=h=@dt3rum_klD&Hpu2T!uDGQI`+ zI+qn7M1cxaHClOK8bu+4tYsb+MIN)`X^=rXR(ve?2x@&`vyIX=l=u*+Iy<+;DWWfj*JbpZsJS?>5Lxf(w(r?g<+i%C6cZm`JYw!t!yl1nR-#rIT>p~|ahH*3RN#EzBf$8|_V#9G>Zubi z6g3J{-xvv}o;CRlO)RbK9Jx$U8zFmSK(L3w$#RN!hAwhP`bd=j<86eR9oCCh`q&)8 z$w95cMDKI8AFzJFP{Lf9YQ#HMr=56*&1yfos`j4@>l+7qn^9>m}Yqz6yt( zEsXcW8O2N|jKR9P+3==QV^3Zyhf;A&gd6(wu|U=$7uKscHM2M+iDJ6B;by(*e%3X4 z;$M|MFkDRC*QB|#y2k71A|@Y_yx<+hJYhn&%feHZ(doB zeTq1|dc*zvc#NGj_V@jmx!k+Yd8tw}MSc~7j&RJqk%zH*_nsH`yG8vz^_IY!k7J~l z>>@&n1N5)N;3;^dYqip%ir;}IvrjO9QHRfE$aR11QzjhPmZjKEAcG;#MfE=0|6&8Y zXuYI-^y~FmbJ7-ns$Wh^eDG%{+HB}7gfHU1kckzZB36Xf_zj>>w?YOtkL=v{x$jh; z{9WI_>XBjfN>SPQ@V;_xZb>~Tqps5f-Gd8|s-PI{3sweCGbZ8kH_0hIs@GL|7t#@Q ziTQKUi+4x58AyE^w+s>pw!d1$B^;TLavb1I!{e#970=; ztgGFg0Hp*fLl3S?`9-8H|4wcN4CwLl3%#7itn=2D=epV| z88R)Q#4|MyX%FxJKD#k|cBp#4{P5F_zEvAIaY?z|@uHw5Y{N1>ify>7ihp4Z`usUN z6Om-ncx>?zltLUI=KITOoFWu*Xg^|A%2X56tVNVso|c{l458Pca*S^8=H$5_Yw1SS zb3!Q_I&{uO&6M_zQR8TQMm;)q1@>D7(o`&%%i=JxT`}A9%lsLjaUAA66{yIc=Qd8= zA-)_`EBK{WW(sdb01=K|UHM|J%B<27hC~$TSU?S1EQ5o8j2vL>S$*~cwH3iW;f>$U z6fB>=>S;!I=j-K5e<|yHRlr3UsKWwN%0V<#4Qt1RHic1jv3{G%nZC;#Au?4(`?IzL z!Puq@V+E2eyF67AyZBM7Gf*TF6rmTCuQV&YT8>iPq*c*KXs0mDNbi^<(=_w!G8DGC zSSUe{L#Xv7mlEhDtf!Ob|sx92gq4v0lCR=0%G2Y1GzhUZz&s*Ytmn z1kUkgZ|Vd<$_Nvu0IF#>tibH#(U>bJS(|=rGCT>-VSfMW<(iax(h|N!D`fR;I3|`& z#=SD@uVYPzw@R&?sz?WneA_l48$Mo&R(1q2U5?EMkJ=jR1|vCAf)0K9L;`bv1e9_Flkw zRRU-sKs=x#w{Ax=sskI4-5cxFL2(aS<{!mKGevbwSyOC0PBr;$Sft8;<9{)`4hiWF zyS_oE&_9^85-ORt%4%h%zZyotDO74H2g=o0+~gd}%4~xddiAxk;^O?A%H#dD;|%w| z)OP)_Z-4a5&KhbY);4QWU7^i$zZ--;2sT|2A(h^sz(8Vdn@J{Kx(4v|ZgmJY%Uh=H>n~HFhAzm7Me0i++j<+Z8Q{T|Sbt|ee%7)w zDqE8{7|@-ts_$B7R74M)|?UCqmM`- zHEY~pS1!vGBp`+OHHHo2o|!)N-CK|aDpy+epd`c8czz~ao_oE)wvqSm$>fXPyz6v- z8z?%aHCOhW;<~^Z#>>x7d^&j1El%nkerW-mO#FEs|8STsyp=6JSvH`qAg;AWPtphjbwi` zayxsh`#j%^zRuZYr6ZsM=@A_}$C?kt1EH%c$WyUbJXc!6hR(&c&h;it+ z3jTH3y?A#?Hu1Xj(#rFdoer#2pL?Df@j~*43&MAtTU#hWkIF^0kKS0*v%eGgIGSff zwA8J{dD6v-#A*ek#qmb5!p54^oa&s_;p=aLVZ~;&kT%{?6ya}R0fMakT@yqI0_yuJ zL=B7iyql~VX}{KRHUfU;$yvLIU-|ZP$mu%T&Uk(Yl=-!`8^-h_Q|r3q_BQ~Y0)$4r zRw%xw5n3oAKR+p^NFyNvxOO($~0l5EIJ$#y*i} z#P- zfv@S`1oMYxV^rfHFO!qkYI5do0Bo&%eL7`JvQY&{RWkj=5f~1Dgbj9VJ|f!-!}tdx zhD&}6kr_`{|H{vVcu7T#1fid1RYjC=xR=fMMMPLf$0yV}xIrmmd0*Z9XQMd)yDv(@ zxk;S565xtZC9!WwxWKVWZ-POGEO26G#&b?hD=*W+n61g8{I|&CAZS;6CJCdxyLRC=n`Ez7SP>nHQ0S#JZtG<&TJ8&bTygh0dd}XkS&BaUiy#}q&B49iwPsYC8+D0MD!?t21 zJ`vW2HLgxg&^Uow6!?y6!$ROg05#7}fE5GPiqUGU+A$0_@L*%o$Z8#NvNeoe21SYU zbEibsR}nJW%wOU5#mr8*1#x8L{M#>De?T?&hUOKbh~1B{FAn3sR!C}opmDPYSRL8t z1v91sK1|!606*P+=CWD=6?Pd1e@KDdk6&*4(`Dxt$Kdr47gbsAj?(T}zRStsBgBg4i!J^;9PeGWkWx$68<4++ua22L9333~0}lgf zz_-=doB-Kk^_$prpQkt!VXFZj(m*J1_rPLB<-R(`alse6lAF@Qq*Wyk&nO&$xIj29 zipUu&dho(zg2_`ejrYZb`TsI}uVKNLp|qmvSRZ-Yg^<3qxXyM2&L2AmXuE7R!!%;I z(-rN!t#AE|JC{g@{9AU3s+ht-(AleX{V0}9P5q{_2G6BQ=2H{i@LwdS*u>mOiU>$xKBlI zn+W~ydBq3Zl?7$L>xVdt5|_@l(56N+I{EmuFU*PBy7R3r6b%wWt?^`S8D$c%FGL7> z!6(8ROSWPE7I?^l`@X0dYHfHDQk1FKW7mKgkZAkf67?i6L{m(m>YRniN~O!#{@9T* zrqAa9+s-Xkp@bf}qXP&^G0E zp5K7!;o+&aFJ@N{&|raK-iHAms4S(I zWFXSu^+Hxl-}4h(_X2If;p|9!EkhPo15{>%`?xV`RK0`1sfGTgmdCTO7!V!gs1uhk z&ketEp8q?Ls&l4zu|n*3ti*@yRG*UPB2CRwq7f$(=+H1_&fs=_JZThl-d}WXqR`GO zQI7MHwycCbk42Y*bPj_Xg4%11hQ}@BA$KP~xG&OY{@Sd3)EZ*%w)ilhbtLwxulPX* 
znnXiH%2GDg?FRO{o@SNPO1m#jZIv(sefe)H?mFK7FM#C-0(+5kfx3x!%({X4gX}U# zR5v{_4Isat35o~0R4j^m+($sZS}&nhRkPi7fD?4NVxkuy-?eps39#IGINI_kWW?V0 zTX@?03TgfD_g#lggaY+a9HQX^rVtvu`mgf+0((!8|6wZ|Bfaz?FqrUo9&_+ze^hLD zfgDR<3bY*hsdD+z9Q^(Ukp<<$0$%ph1u125a)v5lNE*zqW}Y{O@e=rb3y1JpZG6Ji zGtnpVN4I)jk2STYao3qj{wT8nMYx>my>%Du*TpbF(WHa_up82fpT5EFQQ;9Cw|OM^ z!+g!y^_OFhChQme79174gB3;fYlDrsG+ys%TB>8W?KXE)Jq6a;VmY)|ac+Z8)N~Gr z>aJFLAT2B{qW`eg*{Ko3#OPT)!_N2f>hpSyzT%!0p25J#C=x#Zm2(i#TYY`VO4vNJPI`nxWP77fY<~_A?{!1Fr7~Z zr@r@<+rUSuWHe|p*OOfoo0(k_6YrXQz70{HHC9b`-k~Z->w6mg?A1O~1-E5cY!18z z_M4t&&xJ&jQ0e|we{}p;j_HhtWcw#%Jh!z=d}^`~_{_t-%Ir_iQNg`=&Uf2C!SWOP z+VhJ5*;?sqdz#E&t0yd~{h|Kpp@el`l!~CRJlWP~Ywn&SKluwxPlcK15N&GatTKQn zha0+ZefQX9JB)qp=OBer3rJl-tJ}9{m8`GYfVYv7zh@Fr@xt+$^S17l$?n?w+H|)= zPUU!Q6~@YC+^&2<-WQpU5~LG&Nfx1vvq`GaUxVbi0?kI-RjtE6svoB!2J5{}{ztwe zaftuVq2cQrhIfNwqJ8W`>u?5*L+f3bZe`Dlb_X3w(FSm!@z?HACSnWH1^;&f5eH@= z>jh#G%&i~(XC)ba(b^u3?r9PT#oa1vs7AfmZ@1+=gkvgk6HE)YC!FFKI#%3^-yRYW zqt1SsovJ*=I8%Bj^AIoqxE3lolFK?b4uB_&L>z~EdDwyAmH*+yyGwgOea;*lf2b#2 zTJLN5$(ArEv}y;+Hf0AiN`+j4su$5v&)(+USh3;uy!?)Nq@pu;Tk<;Qz&G9?14c1@ zRzNenB9}iF2aefL(_R^uM}wIVGi=V9D_-^h_0^;8-d1-DGjiOj!a}kz*T=4BLw=pQ*DMH++cDurdTreHiE?+V{#or1^XYSOC>MbLKBn zF;dM-{CCQIosQ0lhDLUSu}Ce4DQYFN3h3yf5p&iUILNZH;*v^0`uzPN{S1K|79|5W zoSMNDma`b-86c;i(8$x%*?DX9R1i+;O1rmyEuR)aNj3QL6Jo;QJB1pGiZCWuCm^v; zecrB0VTo>LePhoF{0;yviT|Ap@&!{Be<&qA-O$D++#U%Od?xg=tJz85soM-Z4NE$K zqX0u0h6VL)#EO!Ynn$4!2^-23nrP^x)JN zz_E@axcJ_gSy@t9LI2+I-QtA`t(TY+U&71odXWG6g6e}@X6rR2kSW^p6Bg>Vy^3xF z)bt>@90Znz60H++YzKblaeIhsOI!tvL#d@D*knaIAS!^AG=&M!W)%}KjWs6-j(<%L z5W*Me2yOh;M6FwOziXJ=wjOR=*lCJ5Nhn~+RoYr2E5adx?(pp`P)|G^*F@e!D8$eXchQ?}0TiW(1Ep_u zD#LTBQ+?0p3#diox?VS#WsvR0%b8MXL42&>tdtCRroYSS z*a;3{_<|4*)fuEUSm^c2FRC2t0Dsq$&+6~3y#fvnxAfMZL}ggH0;r}?24|Qc zjjB})!1X8@DuvKoW-#P&aV^?aNl@dfnv{)xsQ=u*BvNn*-38U==jWgH_b<|`3*4TN z{H=~NTB*6sop;xw_i5Og2Q*o`${VL&Gc&QrkcnSAoIoUJzyW%2}P%QcjOlw|VncHBEGJAqwo zCU%Yx>kAEW0ru1CVCyGl*6C5(87B~qJXf|AjcX5xEDx^v$QPdY7nn@-MLbKoE?%{X zcAqL4^tVuP3P>~nrj91~iVR&1Mt zJ{$I&{4hJUq~{Oj4)j*a~T z1uU<=-$>2J=f2zr2h}^gXeVanGXWyhMn!HnFLXJERP5?g5?L?#x(Ik0{R#WmuxP#= z>bE7|Ndd)95FKze$C993lNW2=&>`I_ZRn(|q|jvx z-NK+np)g^l3KO5Iz-2`wI*3ryT81HSn86u^l?F8L6`SbG5H7eJ-d;oEz)OH(nA7@# z1`FgS@$K>c3|T3PQ0Rn;p!-F|m<$j{gW<7C3ZYChvoVz51z}O(G>5HH{x*i9tV9lK zzHO)Q+vfS8ZoOFaNk%~ohJ8bW#qIjHSTfDj#6-2jHm7CuQxH$VePxLpL$909#9Yjs zDnAanwmtTL-S)r+s+8SZX~f@2MA>pwW!_!;C0F@ zlmRE7kM0BGp=}jY(H&ynbDt9aI6bMKcF;2kFo*u3{U|$pK^7mx*b1QuIno||FVY)6e2N@IBD#k zldDr*Ys+MD9^8BwA`Zc(RkBGg5wStQYfQrL@(Gci^d)L@f0?#j3i~6TCfX@~_?oB@Ol2)76uo3L04zr`2|lekPAGUAJ6P zLik<-l&7Cq)>y0qV?%t1GCvRs>>Lk=Ngu=AL08;;kslg>;QwKcD_IB&qH-DOE-D^2 zF9n0DU8rh~Qf&HNtII2Yx9Yd@@D3_MIkf-NBXG{EKBu9WCVhtKkXb-0cR$BJ9n1z5 zW^jXW*M+rtgYadxh1Jbg&dY5i9OyrMINz~4n>SLeXa=5PJsPEX1}^a!n1TEL+B?0L;rX8IWGk9sK=qKBv5v*^P4hVny;N{vWd5 z0<7va*dE5fKm?^lKt!ZlL`6!aJEgn3I|Tu0X(=gb=|)0YL`sA$E#2My%?3U9{=dtk z=R8~Bm-EiMX3d(Juny#pZaLL-=xK*<>Tx*V0nr=c!>_+1Fq;;?g+`ApueQD~8ZN~8 z7!c^%@FgA~GW`_D6mg#Hyl${|pm^_v#5nD}b#H;4s= zbeonYl<c@IDvR1*D6>IQ#5h9%6&K`KLhC2@ zzEh!Aeg)dOLv0=ulj7d+^_|U|KQ$ChXI&BMuOJG5tTKQyfM!4nD_7Wf ze}NP{@Wm}aD+gAGO>Jn6aGK}@C;lN`L-M`lKz|=@0}_j-xM-LFt{~)iB}L!hQQ~Dm z^5}!Z0R`DC*`!42@<+77-KvJS`k1E?CP+|+6!}~!A}h7>j+w+2 zWhG+BmM*$ix1%yNnE60`Ogaq-{~A0+ZFWG<1ywvf_+97BZ~_A5>b?;Wg9%G1T3tXd z(T=ZbnSc%q?!MN+OhTvZ^YU--peC{=S<<1aRwCXmI+v^ffkaw30AeiEcwPsh0dU#* zI3U*MT`exJ`;8qlGMMDNb>|`$SU(|b_$c?WADsIcP+T&akwpq`&%~K;bh^*{xC8Y# za#^y9l`UkY(w1d1WT6p+aRBj_oPIw2viYiO;jogGPnF}|-S6Gv>0^x9()8kM>X+0% z#xkVUEh>*#mL}4DGcY1{w|nKU-?D%;YxABCIf6k`a7DcA|M}Bu 
z?avFd3xUDG;u7wUSoqVVOG`?0XbI&TtRsO3jnx|1yDwIC#+fXW3j%Qkb^xDr5>sf> z0@Fz(yva+)jzeLyr%$CGD8HcC->*Vps25$ggU$(_chx{*dI<~;b|qLpZ9O1Mxfi^L z#z|*|3q5sdkdY1!8g$!>p)((;!UeIOQX>iKxhbO<+n=KyZNULOsVrHfCOaX1NSR`| z3QsxFq)!W!Rn4W3ms-P=f&$Zl(*604B9$sk2t+&xki&o&QPFr4ZL^7XR^_L{)9cut z&76m5*lMxYx^r8tcJAOt07MvFE-NY7a~UhCYrTmG1f|WSkK`HoSxH|2FmA=*?wvG+ zkFjc+4dtgr;{4QhAjBaj6l!16?!0xCtY-k81)tpaFr{iv(u>G0`Bc|lSwa{#qzv;d zG=l!)W{hoDR95o`hok~|TBv?}o9ucZ{podBe;!|#TKz-0$4x{Zbgk6V2l8!#e>3M+h5#OL4owICl$2y zbMgM0&qn!11pYnxAMBcp`W{8o2e&_`o)Gv}N5VgF59p3st6GTPW_h&NZ^QyjC%>(@ za3@NTpJm*JTCzZJ@pOgz06WN#N}3in>(Iqo_z_ys52E3X%NiQNi3$}qX+XylO+oS5 zxJCl`1!^Mt3-8G_kj_1G?tVzyn#H*nCVeCjT)lMfo`8G|0WeICrg@Z6p}G4oHTj+dKR&+qI|szFhF!EF_Lq2#d~l0vyH(i4^)OU zkYuqN6~KNX!|e=6&e6!pkW7*(q)?I^69G;?$iiFY7G&0utjB+^F6Jpf+)E)>u_G_Y zgTys39pcz9O*Dur0EG<`3x+Ey8e1=&<~$%5bSgA2cYEUAhUDxfb}A~$N*}|^#M64k z0EFMHxZU_TP6Y%=5RYJCkxPt(W537^&bZ%0VBBY_l%OJzJGGRgadnufRpL`*w)kig zke)vhmdFGk#V1lOMSynmT$?_{tn+usj>i`1q1Nbo>N0#|x36E~BG-mfyG}Wam7&%U z7(Huq8<6GR+ZR+L(KpSK#%*Px4(&k)qc$^!!phAJ*>^7GpK~2s`%Z@36{DRxp}#}; z#1boH&#aGlg>v~vlU{k5n++oC1{Z0dZgBYiwWidsb-6z{zb$cr8_>lm?RJlW45H2T z?z(*jX9B+(kUsA31+ITf(SJ9TtqiiowX<|3ylZrF~_1dp4HDDltzy9sO;C8`X z&mRl!N^_{p)RNJ*d3+L6XI{SE=qEm>}Hw0((n+;WAl=WwCz zVt)nVWx}De<8eTBr2S0+GVMf)svU^u!^d7LDqF*!VKK-#J&6Z@>r5uW>xP7CDb@Vr zQ~Ka&H!hMiT>~AUq%2NdLfya8LtZ}2Tlg%@f5i?e`ZlKtC}_pkoT7qj_7sl5T~ zW5OWmw7_HaH0eCOZ0PqB|;~?+TTdjytOt-Wrp%XISv@CAKyx^fWpRfy|vx>Z&b! znsAmhjV;vm`4bJ`pRGsI5YR&-&cltfgC>Ymwv4hQVutI_WO!Z0oKc5Dl*j7RGSEG< zOIz#n-2E@01*8_1DrUD5O_iqR<0M~%-B-P|Jfq-z1M{Hz$6jiP{eJ6@!#8iUZSu02 z1Axuh1}Ay#-*S3?CCw^dI!gdvd9xW5nmIXD-CDsJkG+a^U&WT4SkDmei%a(08>Dn4QdkI!Mmpl9N_bE%>OV|7 z&SM_)AyYORfUYC)a|WZVDX_C8$;mH4Mfr_S<>==}!zAH&=dZT3k(I(IN}ilz zWZU$CrVEQE9rf?yO6pGfr;ENVl&Jv2faQII4L6=9Ybm{VE$pWAt#tv1w3-g|3a(S- zR|-Yob>FRGk5)#%QlD_-!9XgLd0V*Lx;yG5k&*PjN40S;PBK)K?e){^xV@fB0jP89 zxs3O!n^);C{iZ|89U?f-VD5dE9YcVZ9JC&?-z0_i&uVlQES_#0vcaHpSsEz6d1gWX zK5)b=KQr^&Az5}#&ZD>vFpL5i4SrUBS9P>gc$YV%I|4|E$~C zePr(+4$Nb@4==_2NWEL`Jf85ruH8#{sPyZT};3xMb>^o1<}`YffB}o zjbP>ndN@(R)HWkU<@V19jJQr0>aWm9uh2`cOn(e{PApqE>i|=-#@73k8WqLG~SIb${Z7P&9MLXMFh<`jwkkE`9M@TU>hWRS%yWpM!D= zsw%T&p{5ARw;Yu_^sH4?i5;2aiQd$pZ1AmgLX{ho5cUi}!KtHA)PmMG#cxQa())J9 z-Dp8_pdL9ZfJRakg%X*wnnjAkH9HTfHx^HSM16;(G_*5#l`OfS+j-^67cZt15NHhu zf5b31-q6^A#ur(?mhHm5peWIT(Zim{NAqn4snm;L^HQbCI6)F7D1|$hYkZQ<8ZpBk zbqGtOX-m^99~zLHJ&7R2y(X_#a3A_nZ}@*4`ToUDb_;TwimtB5p${1CKeA6dp1QIn zPkggw#G=vrw_Z?g&C$2hgDlebevv!@sE^oI9z)BC;sMdo%3AkRu12OSUT^GW z-0JEwGAIdyU}g?{*DN)AZ1qW8hf*#ZVJc$Lu|p@A047ie@zdi444B|`5<(5*h#BQ! 
zH){u*9yCdYh7I&GAf=1eZ$nlPbyu)Isx3sjaO&t~<478TgH_e$={=nijgK#xkHF(|bFJUuI4>LhdgX$~up| z>jV7q6gOV!uJiU7Y^8>9O>mWcCVIp-j4watve{wF-X3RO8zZb@RhNQT6uLoZd3xds zm15y{_8~F#JEX^;1jd;{k(GcS@5|Vjw+;diO!J3(Lm#t>QL=kNt>W%I5|SuEa%fNH zXU4sPQO#dcA^3pMP;!zui1qz#wm7le*Vf!t0TWr#(YV6|{qr`9PI;vw4+$%|eQny~ zORB6c1*RJYG>6SKI0`%{tmhYACn{v#&uO?xzQ^mRG52LUm$ zs`Guy{ndu&tM4*DRl2`46U8l9*H_{8TZpcxr5mU}p1!xY1l!7L7QnIwy1tDbIw|rOKxj8 z@k8e6{djN3_AMvZ7Ulb)+O5G^vNWSD!2Ll*)2Q$U`?JH)bVD#37uxu=KBGH6>rORO zJqmPqzKxn}aUd*j){=s5Aqg-Sq>*XZ7o=|9m#+O{wv^CkiGh?&BNMCGml3m4r37+v zgIJ3EH~J45U42A_r}p`09w&}hStl66>OpmD%ibt6)3Gve zLtxnAruR#zc%I4St)O^gF@lmghrqzCsQ~^8z(&7;5zruf)bC$dVpE^7B21soX(10< zKwDFMBR|Ss!sIvltC%EHrbNWV(~na^=`=rW`$ksxg&|~lNAVz&24#%n7I0Y^y=}(Y zWZ3>|!SBg_3eMVnf%Ohe8q#k+fbj@tIXy*l!ysjrsX)KjhWiYW!pO3LaXd|P#Vfj< zHxM8zEmd0Hc3NhJTOQ*7(b*EQ?gis7z%OTHw1V?{^P-wq@DXiXI-PQISiJ-;3-%Kg zuomwc7&Qi6JMLb>Jsw1gi=ow2YEMLz+4jlmc;p-!^Y}f+EvvShu*iJVDHojECtEG$1W0;?6EQu5ifbHd zqf(g9xiQ0X{fd{Ztu5#~p^uOW8>N0x5i}G5b>V=T>Dv4J+l|j zKv6c78RoodWi3qB#+iq4rl$w$-fHa6kW2?)EkH3m-$`uhf*J-@fI35iDnh6>=br0K zXs4EJPi6%9v9y;(2kJG1Y*NaY4H*0_r*__u=sSTw^ca6!ZvlI%F5#mqRq7C5m9PT~ zA}^EHc5x)q}iSnZn7b#tmxWe+0`&NxkSTvB-Ox``WmCC`~#Giu1GH{}!9VUHS(v za$n~-s#88%uTWty6df)r$be;FNjY*~=*PE#^F~qS$I;C-9wo7HXL(;!Tia4u8kpz{ zO+6fLzE_o?w+@pFz-?^pyG)^&*Y3|ioeyG?4hS1A>lx?} zw-5Wl+wjw3L_z2ijDhAL%>?l$?Ci1asG=uCf;I^{iwDO<8kb<>2smsFen);?e#hR| z0=M+&8gy=Ulu*Gvt{QBrE1$iZ*oDZHaF%3|=h9W5I+?W5&h`l6o;(;IZI^NP`_MqH zZwfl+{v+WkOjbmJ`)%QA^BHJmKvpZUv1#c)_NarKR(eHj@r^%5!(;O)*NUR;mo%pb zAfhR5QtPpH*Gf6@Wt#cR%6_B$>q2}YUR0GOIuYw$dG=E6G^P54kK^VoRDi9>Lj9?K z$_r9=(nGQ{N5fOqZ)|c$tateO6m(M%>t%=*c`Ac??t?z7!iAsb6@Lp}cWy8GfETVw z5Zw;y1_xE$Vd}XzzRx>eDIK}FtE!Gb2Zi!-EDd@WF7+1dJFy*le!MJCpT46+xqIon zy9Y`U#@`SHg{7=^GwR_2C8RaPGq7_x&QE7-|Ce=ofBrc}v2~P106jku_H5^2e zoSb4&k-`sQ>cLNMSl3m7zJD(1jKmN`K=BDOqNlI4L-BZ@EAXsBC2wKD2ojl#G=Qc6 z$Ia1*q@<xau6&4|qq@&X*WLmUm>6K}loV)LA}cQ+ z?&|RT{iM9CEcr98sT`>bX70=0H}D~lgW0|n_V>;Jyx&9UP7|Nqy=;f*>uVl)gKTF2 zM}+l0P_MzX2I2#*mzWC3m_n1U>!yeu?doe_(Rs-JA~N}NZOd0e-Abh51sZYW^bw+8 zsa%AvFVJ)uI<|@RI_L~cTUCO*rFR~>bwJY#=nG-*;8p);_mGm3GPNhOLl0CS)p{&G znn!M@dwY<1fUXqPX5uXT?}^5s0YHb|I5mpGCy_rV$A>)zviXS3lxFB)*r9jU!ee!; zu8S$KUgHn$N7(8Yw|fk1tgL>n?m}ZSv(I72Z3qzAIFV}5pHplXNJFh>oglrzB=z^w z2X%AH12qYJ8>QssXSbak8gSqC^cXLtp~GM;;F@bJcehI)zhU z-_a1h$OtWcQzw-x8{aHRp2+>ruqnf+$H7jlUq4r$shs<;Qqme}?W_%||4mY6L zB$3wNGZ{Y`SIz;~(2o_}Yr$fl(m&-x`ZM()dbzZ9flgc1aypUWTDtbmD^ec&8#Cyi z2q;@V7%-?p0uVU-m{BT9)0kj%tv8p;AvT*?kFRxAS>xPZ~PjWo_G+ z`#gm8e7EE+Zs_1@T<3gkd`HaYK9{gWiesjc;sv@c#~V-eM;LNNCm(ZSjWmUZhGHRn zX?DM{;-Txz&uax{WkXqL&hg7eX&#k&T;#D4KR14k?QSQF{IxbKg(<;6AIcTN?dw42uF>PG1{YZda4YUF5sGMt6u3g-*GeVCk`oo<*V>yH^o zZ6?fS zfm>mpeqPGTrqQM&K>xXOv+>#$uL|6=_k_FxhuMTl_c4U8rro)>jOv2VpZ_`T(&Pl6VO58aeuCIjat5`)h`?N<#h+byLaJ}bw3fG z)X#ZVv6)}Le}5xr*I`Lh7gtgbBOi2QB9O&N8#Ko{!ka+L`$IPuLiDTw^XOaX1F&hH zUq3g(n76b=|F%6Y7#2<_5B#WWhwr)cQtCNPAtb$y=PjvL&lkw7e3Z%DSSB9Iaf$BQ1e)0!Pm(0|nv;XmU1%Bj zBKllf=z%LK4eO>&J1!-VSuf|yn}@$fM^}!xTM&7waq;m@x<0u5-Q5M2ll((m#RE&w z&&v!o;%*b}@I_Qk6dXjTZ@~FpSinNkqohZ`PmWb6Ce@N?sTVlTMjviJ=4#&_MPFuW zK2XF*q=2rWYcgD6siE<;Ea%Oe3zi=wJy_mUcq%6)omb33RqMUYvM9wUIg40(IXQ5} z+zt_r9nptNu3a>>7I_{ym@%F_9x(KbgZS&}NcUG7lMqllT6^=;e*Gq@Al zrcI0dzdxJA-Z4~;t^$tvF&mq-dL7so$C?5izsTLYcTF#uT-nIS!tYi|7vULY$ZgcqIa8eBt6GXWd9B^@WULDMExv zZ{3prqd0-;q^+I0r%Ix6khkds9&#+-1C#ap=?|*C=YJm>3YEJ7y>*Q&4a~mn?v?~7 zL=g`9!~v4@MfTkHZ8|op_|X{C(6`%$J)u2ZS5ZY%ON%VJJ*{`%6W^bvlb@g8s!|0e zSn-mTDRHzHp*JSg|O7ZVi~4AU{@hE8E;6ZO#IQmC{@vuS4D#1Yxn>g8#_uD3g}O-9XRnR z+lU=|&DSe{7@%h5h7?)hz zvwfF?7pu%-^{8-^(r-X#v=pi%dhO2+m%Nr+FCx#3Ux>TajxT#a`|^k!2W@~oYe>OS 
z5c}^rDn#g4_acb2|>^;qtPZW?GLCP>{4bn|hdWm8A4*INY-HKB_!aqwttdZo*A z@5m=?Z2dz+#zV_4hR~q^*>7-WNsVVa;}^DNo7C~=NAKsRB~4*t3qZx)f-d++J zSA0@YjG1L^lJmaBL}K6m-JS%VQv3JzkDSg7+)K)12YB2KJ3Dqt6ePt@V_+{VUQt9F9di*tYCrZ(plhm%(4A0yVGC!`9pbb8=mCZC?9qoi>Gu#SQiyJ;Z>mO zzo>KGHFNv(#eTI6cWa9Z9ks9MGuS-s+~oRG+?A5xbi;9r-*yi(@7vuDGRn?oY4NNl zt%0STZYcfbyi6AQPduy_44h7TY+m~cI%F!BnH0Ky5YKoJ`VO%esT(ucBApdc(Ey%= z4sY^7;JTVY3e2Q=z%G54AZT0GAMmIj@8IBICO>10Xm++zXn8vOvrUjx?k@)2eKPo3 zQm&6xjPRkTT%Qg5NPj|pDAEN@GMTk*Jm^~H7!Mr{N4|P1TcJY`H~S@lCZu#KQ^co49H30$jLo$l%0$I z&Du_o>iO{$jU@)%2mNpK=ka~AW7>*=JqyT2Bz`ETbzFm6yhB9v2?4DxR$vBu%&~vQ za8J+u{;W55E02Brx!<-&Gd>@p{HjWG=tGu=n=Wk*uZl6z#QfpuR}Wp!y5S1SJ^Saf zBu2%Qv@(}JACa78_*8^H4_IuU-fj^L{AVr5|4Y_%EBzI2p8s>-joiO+{rTs!AE0ew zLeEDS6I@|<8sTL@Vuo6A5#WZ0CrrpZ`x^c~zhq>&+WwvAqXRDt2$)zL9aRfyeRX=H zU^rd~`;z0%(g55qy<-LLTGRc!KWLqvYA(;2nEvN@f3AP4{0fMtApnx^|Kf0jUUZ z?3P|ImzJ47R4}9WXj7B|z#8liO_B4jm>&TtSGxk=ou{w0A2P4U+s=M?61@z|B(E0z zopEzcU`0w&zQYR5Ty!7y(HCNxkPm5*)E8g7g`7PshKK#O6$hu+w8>k9q4{Jc1{n;H zN^k4>R`eOVPzF#?P>A;8=+}ml!!V|ydF7~wLRjuOhOuyOwlL6AJpR$x=-G_ORHM@5 z{}TFDOE2WXh!VA{UBHA7ImsrC3FO5fa{eqeTJ8Zx}p-6NVgA!wpf23+*w~VZv;>VO5@typO<&YABLY3O`Zs|+hz{OHyMD0## z{xb;zOK-|+|J@bOK98|t%wUCOsREaD!Hr zjkN<|G!2eWD#0J=9T?(7E;P|~fd=;+9shl>NiIpHpBrGm+IG6}@xeV&(cThCDqd}7 zDb>5`t*jlgG|-#3_+vTFMDHRBOoZ)U<`KPy$^zqDyP=_*d1{+byUP4U35bu!%$W+xC#hQ!CE3teLG;Wx-W#p-#d z(>dFl9NP;MqmYKoDlRIzhD$D#BCJd}ACT^f^_&N0bScnL2w|D>65ae9QS%^3lKn3* z|7#&(@GKw7;G}&sdBVCaP8KRDCl~e!TIL9nH~BVYiSPoFaVqx~kB2RDX7GZz6i&+r z>np^;Gl)VuTEd{8yue5PzIdlA)k&?9Nw}`6h%PR@GAWiKEtu9rl~vB=5Y z(T1+wB*$#%4+zlxXGP@nSm)naGhkbuzTIwi|-YebeNQal5nbw-*B$G>$lR1ii@zz}29 zj)6#%ur!1QmTkHJN)+{yS_q`mVcd5h5a;MO&mt+$Mwrk-&Y*->?MQm3Ps zAh$`Jt-qFnB9ej^-P5O|*g0***Xh9=^@Fy8HN5Sec>;D+x1iQh?o(3s zdRiF>%&bWi*y3_`7B5P?gt12i1Ygsx%2N>_JWu&h9g2`q{`pq~W=T|JR=(@9ai81u zBolBBNPXV!C6x=7sO3B_&gHcW&aY-@kuXTF=hey1KebNOXO_ zx0aAFk{B4ATVqvN&45Q&*XWBmv||Yy1gHv}j=$69Q~6JzFBipDvb$Js?#wdAXDu%9 zUgUWTs~|%E#p#PppMw?Eo2?j?6NLW)47z7U`}0181nmzM zVulJ-l%A>j)O)Pv70yRfLD9NsKWchsO&5nb-8Q^tZX#U8o~T7NFG(GlnT?vY&u*o? zIBDzX7&1g9*^9|(thjVe8R0hBha^t%RhF)LEPu^7I50OG6&G10;ihItLEq#?pZUB5 zN4898^Ado|gij-rlsC-Ig?7kM(cpiuM7OtLoR;>;*78i^on3bpx_;{p#et#cwSU$f z`o+dhegq}Dv#?XJ+3%Oxex4uTc3h*Xsj)veqDWq)gtE zIOfj}=1qKQ^mMe~H&fs)jZAaLzU27q>kG8*zmXu}P(t(xtt;1Lxr%k2IA?VEW%N0W zpNgsz#xE>(ojQjccU&L-%QT^&-2I{a+R*63X?PE_n{;pV6?xOaZ)5P7)EbZAQ!@j` zi3}we4VSB7LEfURk?kb~EVa5sC1MA{+0UPU@9S-++YP-}hud~_FO;8xSSo_zbnkV5Cu;Bc?!mrnJb*FVnB{)767E1Wa9qmnq zI6FJTl&)iQ>W`zNnsIea$VKdd=dM@YdcFsK?;MUfwv6OEf;+`>7FH zk7GjJ? 
zB^e!;ajvzc1y8B_3DI+#!%_yPHE%*oeK#wyB^j>5&30SW{KGQay)s9xg~jq(E1siA z9-j|X4#6Q!{f$*nvR~gcbJfk!JV{^O?h~d*`m`0}s$(TY@6hwTI}+dp?GHE-^Gv|F z31{kxi-+ul7QWc$!cl^vrx$C}7hxt-W_?QT{J_}SPO`Ipr&JVI?Vcp9=XT_!T2ta* zEb#D+olZA|sk({Hn>81DVa!Kmo( z0QLg%Ei8LLK8jJRy4p+`#9-{w}SQXv!6Y`}m88gjr+T4}Z*TG$rpR z^S(2F#-noYI;!il|FDJRo&N(ZN}?*;U_5({QQc)pajv7aSHqYn9uMFC7n=U@`wC&r zcTmS*E;Zw;e?CJw_Pe7iD>WsPQBhHVGls8FDT9YWWt@XvD14)WKL6{Nd5n>Xjj!0f z!^43L5EvJ@I5|68+R{cYnyv7t-66V0S3~a@X`^}jzodf&2>QZsf~gHUtMZ8#ylvilNXNc?>wAahAb(h(gU z9qtDi?*7kV9D~Rl(D@bdk>7)Ib+755=2Z8yG?jYFNvqucg$8t1q;G%G#)M1N_=x=& zu5x?Ry*3rQjT4DeIhFmBwYqmA8DpoEA+5y=_U;)iW0HLMx1)kLxxbGcuh{P|j-4Kl z;mcQWALUzzhfjRB9uU%;_p<0`W&Lb)N<4gD0R!oH>pf3~E3LPxDk~My7nb%I!I;u2 zSRgxc{^c?4(a(ba<+Rtm26ribb#L`_Ys_H&MvU87PCnfnEBM=!UkCk2eiY@z*E2`0PE24=H{dsPt3odgRTLH2FrnAv&H`0NJ}!{sz!_TC6I4+bf8#& zpWu4rP0L)AzFUZ$ogK(Ol%r!@CJ|}I*<$d9h~W=hZ>{`kw;fsU>Zx|lpN z_Zs1!_AdbVKw{TgRw}cFs(c}ihJhYSP!c=6+xC4sm8YIsI*Y!o)-VH&?Dpg&K>e14^x?!xzF)S6T~R z`}(v|lo*EslLR0o2XP}HVwWCp?d5hhezar5)yJKkW<#TngXU-O+NCExa{BJ4AouR- z7e^~U`ZPbb#Ko0A52@ahh{W(=aUVh0OzIBoEBj2`lCMQNz3nfxZ#_T0`seddL@;u7 z`wTk2kB+XRg6W$b>{QJUl}fiSBbNGf0Q(+ol?4O506?{L3dts}(_bI$+X9qWJuM6VQJ$M>=h`mt za-?6tbv1YN7lZgko7JsK!n#C^TDMgOc$S6!x#w`!_=7vZRHarn&9+Va&TdEC>TR2n zrC-l&)&S*NzYceGv3$SNAf8ZRf8bId`;-uhMS0Gl@?z%sg2h~3f`ZEG8r z=Js0guTq-;UO&>Y`7yXlp{~?;Wb*@tD!Vb3{ejC@V@mrJk+>kM&b{p2$q=A4qU%#Z zmWQPJ{-adruJLQpFUd6odm>+7PY-+|5^k-gj6B~=TG;Vk-R-&Z8^C)$65G&p z?yLf^0HxG^bHr5T^;P9F9O%NYhV_T-V|uIpjd9ic;F)9D?ryy0bbWV){T=PZ0$THpgS$`6l9B|# zOeJ4meB(hbG;dSoX&#`fXliQOQ_b>}=tDG;5O**z5R95es}+WvO>YV0;8~vHDR7)K zF&@{DP*+3a4{QmW#6O`;2)*kM;gS$WjDs@!cZoRprCX+pnVKhn=Vkr)F+FB!Lt0*& z8bmaCbWY*vG`P>_{9Pz2CAOlZRN{+D1mhksr$<@3D$A z)m3E4#j&vqkf)kF?_;VnSH5cQ1~xtHQaKH=X?$cF%}i@acjCg&l@Z%p@=bln@cW6` zFwYD1TSynKP}0)EFDC~~EyH~dOFv#Rya34PF|}Ag(>7p>L9L;AU6Z`UD571@_-JQP zD7%!->Ty#VG6wA}c83K6uOF~!ff!Z8EY9tW@-|(OT*DM$cEYXs$s_BPUmXo)u3X9A zu=+K7(v6O8qH_CA7gQKCw3I?@3#%M3TJL4HvaAn#CZZG2*SspZ%ce$rgl^6~(NcCE zwx!Ll0pWt2wav$}n){@Ae5850sHn#-NLb^6t!<*Eb+Ie%RScxm=dv!H<#3ade19gF zeiofZ+s~OU)+7(ij%E*GJ&}_6a~@IDWHFd!FNm-3wDVA$gFnT!Qx?!ymtCT zUH_fuhBj*rY@D!|*^QmgeR`LEw_=0a^^z=4eElrDaej~s&sbJlPF``ZDgSb~-lb=W z7{-%dNJqLpa5XM0>~{M4({?`D&^!&%E&A{^0Cc4De`K;Eiep6)wu2%dU`>-oHJ-@s2T)~dNknv1 zat(y199C2Ba`%%XjLisI&eQ+_t&_iGH4fS9Mxt~FE(-NEL4pwvH>>`^z?Ahl`b!_i?ogHEQG)wA8TxdWRx9d{Js}!P$m{r_Y#L zBy+xy{&T_r_>oU+oEg?6VW0hK_WBcG?f7QRmY*{{XE$5aVRAKm@1q(^OkC%D^vPV= z_i3VU=_*S8Y4YbqxMn4y@a-%id~sS<^6X; zz#IY}gqnC&eYtuW zEK4I3NE^Ig!X`|GBTNxKVXs}NK!uZ%O{*9M!iME63oJBm1ANGNsjYQ2D?L+1G!Lp} zQ%r6~kcSI_$_;PTQj(8tSt@E8MimREnS@NWPFiOFl97vKo>@A&a6@|HaYi+alHikS z@)v1jBGBe|Id7vRg;Oir%c_c?o+^phF?i0?UIb}TZP74tCEtut<`(;mMRsjb^iSkM zaN+l=v$I7$v`D*UAC>dxhqi#>cV4xmr=^+;fXEPNmK^@aKgPC>>vkPDSc$)Vem1&Q z!!;Cff9p3A#R4;$KX!WL{b=H6U*gbHpgLUGLV>{;Xt`s2pf(1}QE7ZM##(~$D0EqW zLn4-Yb8T}5)5sZ7ULZpW6r^{JD7aJ3pjf2=WC}u71^a?99kL zOJ=#eRFT5zWlhlL8OUyF>JsbyM4ur|-Ci`KEHgYGHms?NF#VER)i4ohBT~AHHYqv@ zq0!LLvioO-iC9{HvKGIK*dQCgt$#~M*~Ig`Xh}468NZ&nrKvwymPSsPFq-N8je{(P z<lYBjmEEQ?KBBM3rU(Zf}%_5#QYkyKYD20Nf0r?Gq1MUHk={^?PaBbXYyq;%7P3Y= z46=$@?hq5^c1H1Yu`5!)Oui*nn7UX+E>uPi24WPSWN5Y5ttp~q2@O)Y-WlJ1XRP?* z1@}FiEy-I-{}`+6&=jgN-_plow9;*S+lrgHJk~MqCK-D!O@>%&&R?+0<}H&;nXfV< z;0Nt3G6fj5=4e6aFU+wwF^eHF6^rj@Ow|?D;yLL>F9@D!8h0t{)rgltkM_zpy<|AA z+5i8%DnE7?Xpt1&soJ-#&0gTe9F?Y=wMHOV4M?)=3}?P()pcbe_>{m)b{jxT0=kWA#+^f06XOj{cZA78ujM}u;3W%Ww)9AM z;K*BKzn281)_i24s0p_^YG0*38X(zCuaC`~_)$bhc>jB)H=q0zUuq98*^BTj0-xIf z&9ZX1UBpo;)Cdl|fF3akax5Rsca9*)b5VFkFQJl6B@D7Hzfw9{A`C{>yR6OP^_I(! 
zKJKgV<4w_sWJn0GUP%=Azur9lKfQ)z5P>#pN%WI0)(%k$?KqaVG|AO!=E}-6u)T=z)^fz&@ zEmrQ9D2-6ZAnaveTCt-BRJ`@Mss0g~TNxc1sfo5bg23lwLB6iR z)T0Oz@XBkRwYj-Lr@Ba|asG8iGp>CF(i!p-ej*p9|G>5kQOMU^6nV%np6Rg9e-h_S zt?qPMX=!O{>MM>Jg|mN4hEydRb6jK>5QQ6chxx&fjIsY-~P@XcBK91(AeM#&t}X z@!X*ykoU0i@+8rSfh=J4QX4kre2)6@LY7R%*(pbchLo2P*IvIbo*O9`?=@p}-x6@& z`d_J0G`ubDll72!TVrEdgXpj! zcS<26BO_v40=X2(Z=UB)(J(rA8v}4vG_&&)xvEQ3171V4V_|kWGqcQBhKc(JIk7Ej zfgC88^b$a1kDnX4wlg(q|TsGP$CwfNjoghrQ5(5%{=u!gN$g#GG*T^9Ho>|*E08s?EOTkTdWir4E(hLyo zBU4GnV{bCbZGYL?{%im;0OBa19ZG<=giD6~eZzqRcntYP*BphinW3e)tH6({H%Y8u zp5_#6br~Ps%yHWV(&m}o7gE^bck|Vhou?;&{DAyiF(#s|^DIZZ1`H|?R)*6<`L@VZ zIgkzN`q}w)kYZFChvzrodKLE3Z_%TJJ(2BtaqfD*^2{rXiw}2#8Cg$%WbVYtyCO3v zF)?@6nN|}{cLdxImwj2zc3~i(8=IMKZ)Q2gxg_x8ABs zP1Od~wcvw@>*p325)`y9t_O>R_5JlCv;u=UQ2WBIyc6VWSCH*P9kxpZ6 zcWlI?Sx%C!8)K`f>eaj zkbzW^4Qon%(4MK1Rt|43Yc2$UaZtP;?<_)&13gm9Hfw|)rAAVw?UlHpqp?@qE?n_UAPT2AaGYa)06#Nr`+S0qM zjBN-F7Alq>&H$Gz{rR&ED4Y@j9ReMpQhRm}k|)~<-WPX)xG_EwH^*bKiXB+Q1IRiW zH{VlAEG_~w$#M(YE3^F(9Q<71?j@=ZPvSxt2PYxZy#I}Lg9mt?JWV&|_c?5)yo8#T zUGb*0K#)E1lWhnx`& zAwhm`zlF617X<4}fyv3>f$>9HO@pqU38+EeXBkP}Ht;sdkWf4DwMO8;gfKlGL~kV# zWmo<$&T32*L3N0XQ@MqO) z>0&KL-i+vjo9yze?{}&#D^2?E$QQmD51si;Cou`&8UiKH26Uk4QRN-(_n!IVs8=+R zT7eJ4g$C_7Nc02!4q^Tr464IHbL1dVj5;g2D3U{iAVo2XlO!ZNJCM1M{V84mG(Rz5 z6;_8H16i_Y_E=~|bO6QVA`+nGRD5TH)B+5Dv5FyV529Kp zg)(|0EQW6?fT9jr$ne!Osr1MHN8MY7Ww{1zqlgPo0qGDG3=ruS2?^;Eq`SMN^8rOs zKt)Mu0cimN=~8KsE|FSvgLK0;H{g2TZ~xoJ{@ixp?ob0U937 z+)+VppKwy#KI+RQ)Tkp#P!-f1EcRT~N{P1{tiUVvld<{rWo#8M)`6z2)@WjLA3HT~ zL#PjUnUwcY=Gd>$=dcygJr z`Ke9FAXZ^kLBYYM-tnfM&vyCfaA@(YUrL1>@MKZ3)+g!rVhb=;8=RnB5Qw6plD)=> zLr)0GB!!BP)h^7U!$@AqCQJ;|#O^SPcWw<<&`YxKQ+X7E}60*N4< zVgSw`NdRzAK!uog4fiX+MtS|(*+~mVwYRl(sSU|daSS|7CU-hzpTr^}o zfVJ!9=S8^Xm0=Z+B^$^IfeoO=qf99V^E9)M?s~-uAJ50w*x24);q#16EF1^Ek$lk( z(in7ZhT34KAy=_l()RW(Sa;>|97q&HBwnz8xnpbQ^CB>gbu-iFA`uS=WF|DZK-lun z)b;A`>6tPG3go^jRgAP2QM@?{^CYMDO1#mma&mKXPs6C20CFo(etULSh(`L4cT026 z$g=IsQ*TZ#n!XGsO%z&+t9kuYeMFl1b&++rb)yAa-%~zMSX8U|*i4Qu%!W$a8ah zAFpKRUi)|-gg>0#xw#OQ%?vd6?P{Ef!_7@^W#Lv^lDKc8`-EGLtG`5r&vlw3EbKxpV2hz#6gIOVhy zC@j2KpLDSL)}0lnF`J(*X&QS0kl2~pC>he(yfZl1ro%$ zJ8qm7a%+x*yg<>A|FCPz;_j5+3VvO%%oAJlXbJsnJ!;ge5L`Rrb zsKy*+#8K3y$Cs}`rC6kD@K?b6r^reYxHOOd(9dp<=G>fEY6j_Z`WFzx>l%?!RaC#=#7 za?ZPfj4hh+jusGGP~L;1ake_~>WsfoCWf9fkAItJ6frUo2Bc>O3gG(!mb{O>ClRp^ zo`>|G|CHA`u1miQK@U#KgB^`N1HxjjA8HgrbfiJ(qpa z_vr#dnvAKX=nro}bqxeTIG7pvm|4U8)EWEw`uw%d<-m(%Fh>-=xHaFF(2GSz*~vUJrwZCDH=S2^%o=>Y z^F=+_>@r($^$t*V+_XS)umFRQOfe*P?oNYRB4njkkXWt6^=L5RU_5f;_xAjZckWT* zAfne5FdzAHN;H|+Gk$Tao7nrAaPY*YMdlR)|C6e>YVqqQ>)E0I*RS+UOMc3RxM#G? zu<#7?K*wHBzKj{hgMbUx*FrbmJxUWv#9t;?<_F<~Ivk?idm>T#>)iHnfLc;-5vs3< zSYP9D%2LhK1Bt}KMLYt@yHs@(n$-(3Rcd;rUihcN{e+z#ya^M|048bPBn)`P02DqHA!w+AKAgn5h6w`P z6+4hs^OuU=>V%l}PdVpj=!yvyoL(yzBuzNvI0ys&{QRUyH#i6KdJ&Qf;GG#;pyx%N z`s+PDQ20U&vsw-mC=&?UXQ1Zq1f%`-Cx-G)z;5i&R39qkSf|y)5vhP-9yyC8cEl%0 zXBvc_cQ<@$+d??Xe)!CUkeKdPboo#*sD2M4dY3WR38E~T=X6O=T#Oc7D9XuA@<=} zNHh(XSVofMN5cc1d25yOy<;6?HT-Y+To(V$JI)IEywQbGKU(x&2iM$fM1!8N}53v0iWj@~l+P+l#Et@#Qq(dpp z1V%lGd(=&a9j|%|%5J)#Tp_H0RQ`X`05L-!4dK1ilbcAX^n&4?Zslx+%3h7W+S-4z zZ!o8@D6(AB{7%ZR5dnpI>JcP3n2=r0hMHAYawzfBmgwe0xxHrN992b@_04c;jR-DI z&RQ8oGtUn>3tu1J3GH;&(dG?nz~0birftDfu~P7b{!QP5gyl++bi%YGwV@SWay5hb zN@ixgVcs)2TQmmDY!j8%z2>KoHV1YQlKvGe%0`H?x1gY$x!N2OgPe&M^fV{zK6sWW zfj}XyARi>CvrDHbbwqWV3;J2pCC_g6xS7FnNDCZh^+QlrV!P)8aATV)N6$~1%h7o! 
zx$0=zhsyFO>gGLGV9Y_gv8;F9_9!Y3eh_zk6eJ5b-yD@Lep(871-_E1cg_*;G3Xb7 z=mLh>1hNzp7ka%s+qrcr<_1TE_0rOB`yqbh!~~2}OEnv_a}otGwq#}oBDo*v2WHF; z1} zud3oX)`6^=8AO7V!Db+rSX~EIGlV^JH2`zqF;~0pUDKP%*vu}XP|wWqw)AGG%+?uQ z`?2lQy?{`}dzIzbE>G^3Z1XVifjj`nc_g9R4@c+}8JBu8y#(wBy-%gxXcY@;H3+ISmn!WPjmT5I;LknC!y-B#zO{crkBp^&!E>1Zasg;CDH% z@zv@TDA7XdC8ml~@|@@7$hsUEE};9W3{SJgL#B_iH*#KOF@TS!I`|i>rl#708ER1` zAO{xthaX89e|xCmu56bA*$(R{)6ISa)IXVk2b>vW)1Dplr_sHTVCDX%pLQzt$Az+N z8i4x)md-Y{RM=?XR`orXpXYy-{SK9rA=G)YYQ!|}CCU9CfBBeJ1BsE7Vn7+IcsEJ# zs}s3Uxoo3i9h?DT;TcTG!~g6Z^Ury76kw;-068fnjx@|twM^9HpimE>q)#g=*uC7K zA^=fio=`q!7|PeE#McV`!lLorRJg{=zuWGIX2MRo*j z^%CtWq+CGR-&)KyzG%-ex(9&Udoal6QZv?gU`g4-)n_9s+sOzFYj8cO2Rk=t3WcD- zUQp^}K#q~6!PJs8^F8vs%1a)rSp_*&_e)3aw~sDT2i6%v6^TFkg!5qwMdLFI(eomO z8XbBQ;0o!m>1Q*Z2I8487AZpw>Ey-mhDjXiN-jCaU6)ILJ^*q00#sy{F_&q#2Zf_7 z`-@T5mmxlmzkVI2sePDV$Tc_us=)C200fnAS7L(nx)XNaywBOTHA508AmL9R`XycF zn8WmyN>DmyJ_q$muHLbN31=@lq86zY%U<1=QOEw5`)`y@V$Gi$ww|2v1Hck*Tx!n6 zadq7m-nB6=71Q+jY<@q#lp_;jy_U$4z9IB;jf?s`zrar|OqahwW%RE9@dQZ~SPanZZ@0y+ z+bp=btT0o|$y}3K{4dLo$l2wj6!*kZqwml32q+Y$2lhb}f=;ediad|`1qd{%fn&nR zSKa%1z7`x7WVa!qmA3M`On`XCo3)eJ6!lCI+58A&;@izB_n6p{K3O=lHYG4gJ;EgB zolizbhrD3|Okf8r{KLKTn~swXH_mPF2HFbn{&*?qVV0^mqr?rH2hV5wpW+J(!`;4s z{73_3;bR_b56BK=n5adVwB*-*o-eb#8bk1L2!iT5z<_C>BVHZN{x@xcLazE)M3eLI zNRVmcnwmEZci%1N;{V`5%f|y-zrfSA*V(ym2VrFC`nFI#{UAd^RwZX!*UyKhEAY}m zDFa$;=@Jo@Y1Kc@S~ox|d9(VTYSezr?kiIA@T?NBIY9XZeVPS>1FjQRyc?_Qqiw%Z zM<5__d*iA$K6?JMA48-Z#pme!izi2iK;CZULsiKKW&wriz=loLFGNpXIQ&_t-noOs zxp#*lyX0_ryx1b@e%l9XsS1B?XxV4!X`-R3!bKXiipk+2I_x5v&&Y>$ivgo`OP-|J z-vD5bipt{vRW_L0VEu&OYll^^6v|{~9C)5kvt>4=;A=*}pd#z*CJ>fbL_^I~UneKi zwabG!q*G=D*~I^MS*^$gFh~qx7tMrO+Wv*g$}z^-D@GOe!B9^BQ9SwayuEm~IkBeJ zL==<~rD;ijnuPhXzx5U4?jQfv0-V$lws0pnaE5scTQvyvTAcV*Tv%Rr)EID_5 zKxg7{p%S&Sd@v|a#giMZxnurFnXKYtkoIWh!pNMX^5(tX{@_c6^fMp~>+g*kcJ;l* zc9P*H#mewh$g(m?7%pc>sMi5G*8Hf5Ovd$R5AVbGDFsVEw()|tYdSxa{}}jE>{bqC zlD|c2@7(y?$U!~A=}+Ek5MW~pooar6t#IdD{IE1$H}Rw;-yGu!W4=V-wAd|z-2ST< z{!vzuyAxW%p&}FV2?`P2q0s|mF7{~tEZmgp>gd2Xf<4)}uvu@&W@bhq_E=dZ<&9hA z$TOlfl3McQ)s5MpR10V&gF}Y7js&w^NS&8*8+{<8%dO?>ad2_z0to@hA}JYYK+wim zhZKj2xlgoyO=&b{&u1(yEPVVUv9cTrjz0JVKfqr&ZF}n15d%GSyPvP(!duv=fsxlya%W^YLN??4JeU_@7 zE$o^3Gz<&v0Dl0-&CP9>mN}!cTSXDXN1(0>Vl>7Q_tC5bSJYjXA&qq4LQ4T9F7=nC zKP&>}qJTDeYpzmx)eEeAM0e)~UKX^8r_4h!Xu4q4_o-bD6VTK_eaUy7Ef#XuRan3X z!vjc`2-%H)4RZE|kNbH#$xgEd70uxV&Jq(gnqn?AEo+U9lJ7~#joMlT9KhQ(Ze>HD zGPK^DI=!m&^+l-WjjlPTRjYSD8S;O4>DW_CIV`^ieG1{Iz?$nh5-6z2DBq5wbqr1$v*! z95pL5bYiU2I`WmZ3ahLXfPgcpZy-^GIL zU|t?UG5X)4yD7B);`5W^DpW%0^;tc^(DLs>E4zwS@%N#SLIRBLzCG;9cA53FFZYPs z1blBC1v;*s1&~i&#=(JBDyyI&uN(HIMS4frlLC(gW7bb7I^TxpRex!K9H33GuGfYp znHaWeLU}hMDI_(48iyK3=kc? zMtvTT&%=R-!gvb2|8rGu5nF!kGL?Y#pMRKI0KLCvlL~=mKgZgLJ|?t@x;LovmhKpX zR7im#^d8D1WQv!_g^!J7`!@-~#)ob~M2B|k!e?9-3ZAIX=tUpXiSA1Q&bFo*dkEDYq*Cc6AiNfYQOT|cVNr-&@# zFVw_hbS2Q1m)uBsRN7c$y8X41NmP`C&RsB+7SZcAZ|+lFO|bLU=?}8IM6dEa+8b%+ zf=>oUGpG()j=Yi%5naONq`vOlKoCQ9$rj`hT2t2FE1~U(25I!QZx>;S0?0+4q{=gS z)M|@7>M1?PT{1{Q;cuf4p~JUJAYD}R1|7{e+)N_^C}h_bh9W3W%;b0jZ8xr6j!BJrlxD%6hLMt||MH^mE;_UqK`OoNA6i8PW$z7- zy5H;`J)ly9=rq-b#dvRbjvp8$!S$^7kQwyOtx{Z4yQKR3ct@}5-Q0}DPqUSEuZE)u zyP4a2l(v{Bx#O`Y9iM}TsR`1KCokxSu?DD4cyt75r4*r-oA^a)FJwx_Cy#p^A;quw zAlHB#JCj(6*yIUbrbtSNNvR~YIqG_J4jqzmAPjh}o^sNaV9qI8Je`4X@tKA3Yl~=q zd*OgHI|jlm$25nw0hcJm*gb58{0LBA{ zoXkuUjrWqzM_|A5Kh>QA9F()*WnQjKUpMi+`>qtB^JwD1t5&?P=#{CLMlp0A&7+A{ zV=(3ZUcn+hE2oEteA>Va4ei(Cc7J_V=!XQ^g)2tm&kqzSoxT6K@%M;4)Q0Y7kTvUH zfEEj@ZrTx3T?{|WuW{(~Tl5!Q@&0(JDL=N!7^mjXbmv_c!x2!ry{S-*%*79GxMD;M zH8@0X#K|}{uVdWDztE9mq9Kl%c;QUdEyu)sR%azd68+)BT~t?Rg(7L)dM!y0|B-6! 
zr5D7bo_FPkt`lkw1U`SGfL-e)Q+J=O-Wpo64OGRnOw|^H4djg@9R*d#eA$CPtZ>|M zXupT@`(3gox`KVh@8S~$KI)0)O`IG1Vz$ErLil$1r6gaND*sem$e1E16zMfIwU=`4 zFXn&Sz?al_<*Mg6lSRZTyRv&i5Tz%kXEDEHr{dM=y3~2$`o{J<17xp+maXMX?~%T9 zQ;Oe_$0mCC{31VwS?aLuMGf?GmYds38QJ2)^8(&`v!ci1XqlXX!qU#Jf2a)*S1@_ZEddI-JKaD8$)DG_Yfyqd;O|eVPHTKBGHSe@Xu_{x z8~`2s*YAELd(PnW^z^$9eSUd*qb?Pe8=AkY*NGCy%h4$VQOCKh5h2!exbKL^F6n$s zY{%q0t+b!S%s1h&GMGm#?v;z@Vi>={qB`KiJMSS}@{r6N0vd{`sj0L_aa~^Nl(eg$ z&u9mm3Ljn7aO}RWqeR=r@49fiz<8i>!LS>q2lVvzX2?L#u1`4rZo5;F&~E(es>y-V zO#FD?bVvbhL1N&$2N%?4AFlaVm6YsI9j{VxS0z>t_U!fU)E2Dh(7bwgzVYkFEGhvnh-WJ`ug>&c~1ApNX{oJh>?kW(7tG#9tj0JH|NGTil_{s6R(dmTM)T5 zYGWD4`()bGV6OE?msb1Ww0J#d?+q$}J6j_H9jt-^8D3R4BbRM@DkMpiiAk_~z>```I|571UNeEK3?q$XEM>_ z{>`Mtdt_>0NsoOAouBgbHJV~t&z_s~AE-Kl$Lm_R{AyBT;}6@Wu{0c#Q9PMX5NAd$ zY}m%K((&JR7Zf>Z`OlD)$;BqPA~D=ejNb6|iBNct@_MD|KTFIb7|RV^KLik1?50`JYjxj(?BSVnLK;pb@o^GFPMLBZQ2PQRi@=?$5JI6ET$a7 z>7^Bz8aw&0j;v+Gc5{Bkr7YQT4cS))M#_U;MXp){GzW06Xa9eA=n)~p+F)$0t+ zvUj?>Arsb#H$M@C_d@j9l5V$a5QQ$&kXp;_ADXX*YP!(*Uq+{m+)t)#yX#>swnL(< zhFrw-4;trZ1`Xi}-dkNLHXOsE6*Z?}!%Bjjju+!1SV4GPEGg&cn~SQ{GblxwDGtp> zmWftM)Y}sF;VvC-ONinYb<;H&5M1@nFiY4b5VMa@AwPL z#;MPDk!a}Z*Wl+gqD-6e$-VDG0S4>W!ZPXIf{$fgj2ec1H#fcEYGRgW%ht2~ll;}h z`fwW!&a8bUeEjS88wJ zJ`gW)=gN8Z7DkPTh-S}qtBa2&ej!7$3qO#1K`ru3Xx&3Inf2MHhQdrsD#;JV8w z?9oU(DkFCByKgBU@bdCz9!<8eR4W%jr}TyS`OM5rUOqmh&jr~mswf_E=@8Thss!S= z()IUE8Qc-7Zwy`dJ|Rn%+HO|x_+Z`X>YmNMl^L_PH0o!UaiW<#DOZorR@V`1kt>%1 zhQy0x<$>GM)s648I;UZ@GE&KpGm8Pc^nT@QGYmBKICD;lOD{saR2JWe7ltxsf3nwu zwdoDJwPn{4b|SWevcSP1Egl!yau&(xhf;fYSYPc`uD!4{pKI+_klCP_K2ow8EX91 z6TmUA5QaxZRNxXCO9EFU(8G8O zxq9>Pu(VzGH`Voxn!+VXh@)zhCklEK8RL>dmrGNONK&5xQeY~e< zLDCJ&5PkCvN5b;Z?IUF@18Y`J$FY1jT5;nRFl8(E3N6{o>#|$*ba970M(!MWO)m+w zdUE)_eRrC?z2SQBXR?V**G<@wkIL(huvd}n96C`El^|h4v-D2ltWr0lMNt>$YuLa) zJIvMhx0d&Pn6+P3?~jZedrvi!Y|?^(DmUDmmmhaL4+~&*!tNirkJD;lG_p*@%4&D? z0`Z<(9n63XEwRISllD2oJ+&YHN8HuBZ|-QBo|x*MgjyDU_+JLDZZHR2di@90xXo8o zFF6aj{^UO!Zw@YzoYbf#Uq?lycYHhvS(_8t#Up`K3%JI#KS;`iMDWlWDT}Y?u5spf z_mLgdu7wu(mrX^rRPT&aq26O)2%|^5tsc}!>+XO!y&3BQY{D>LQ%6L|j{DsEPg{)jA8}x24m#FoE8W%c=nT@%3JH?rmr}kyOqgBeeyx2zkoIW2Jt(F+j^+j1=UI4n-?5Hu zH#8;BjYZ=^JO7RjJ6?)CM$w|vZpHjbf5Ia(@Qniw^6l{7!CW3XHaA?@$E}#pt>`Z( z-=O3;n?nU>{a?^)XV}i?st@$2IA#eZzsf@K2KDDV)_A^Fv90%?f1mzB`AmApW9GL? 
zkMXjxeJ^;ZD)p=5*rz{tJ%kY>L5!0_M7eRXhXP&*Fkxwbi%AIyd&t{#p#;ox}s~MwFp{_aSfyMPA)D$ zM{YWUu?qi=w2IuFV)3!aQ+Fe7#01!PvnrA5`gQNa&Hg%<$BzkcAAErYj674cjt4Z&W>LKB6zqnVO zS-f&=ZZVNTb>u7YbW;eBU!8KCA;h15dfnu0huop2EfL!S zCVRqVapr&ue`;483}pv9lnV%YS{z3Nu7SV*`7D3(nffdY!n;y_WD9UJOHz=ru~Xi4 znU2TP0YC(gwr7F&p=*q0g7=E0u(Js%G3_=%j+JnU?T6euf{BTG*c2W{hQnvgbp=ZR zHV(+_XW$z9f$xcVkpjQWk83OlZbVolIGputJ^20J`{7e?FlooReWbs5EE-%bEJ?iz zNT8s|y3BOW&CzNq!F>6~-eEPo@#?FUma3{f3e^m%3E3$vCeIh>VVK5BK=sc0__UoB zvy8~GI_E}x?XS~|z-c~(AFV#Cht(8oV@7})0V|GpjNEe^b2vB$*rxg1>I4q>d;(8^ zlV)YfG6k+EueGhGm`}a|5Ny>oS$uZCDUEMC9s46s;)elQ$AAnXGXze?>H*TMv?|Kg zi`CYqsX}Y^21g8-_Rdkg+5R3p*6OukK&;HKclzK!Prd<+oIn31tSb8;bjO!dOFE=S zE`;~kNK}qGARXMg$qM`cpuOnx09Qr7QRA4h2^j#A2opY(tt0Ejl8?LZxZpl0tyaBR zVFRC6ZX=MR6Hof5`DKM$yQT&fKh&kylt=`h&%Mn7KwZmr(Wgc7nZ3#ijdp=5YJl5( zmV|Rd@~!o0>?qj_r^6Gk2h2G_QeJSlvpreUzgC9JxH4 z+6Q=eU^@rC;l5Rw1s2z$%5&yhLX>Q(P97iF@!71^fE^U{|JvAHBLSEzIpf1FD3Iu- z+w*7s=L?zU1Pxg58;3Qs#6t^z%i_Jaf^MFrI55#XNQ4F7vlhOADh#xh`spZEbipcq3FnD@R-*Ewme_;I+ zU%vU+D~w)oH>2&%>wpB2)x_xx?zWDs@|+85KU7%EXDw!NgU&2zMDMQ_Qyt2Lj%+ruv?zy{uXTm+9V^CSY0$~)W$4+77}G1y;y zxI-|7127@*QTjn(KHyfR1U)DE0KC?aI)|g7HOW`EqaS5?Sw)qI0+=onbRDev4WI|` zf9)!~DjV-SfR|zC0|C0kM29c2?Cn)oP$)JRiEkelWu+(T#ig|!{ zs6R*Q;yF2d&l$!tu~%^`d2MH=r;%r#_n%!wo;k$bw58a2#W^{)OmXJ0Dw32`c8v#* ze$TySP8p>i0f5zeT?Oz%QW|4*5`Wb;{XO#EnmQ>58VZ#?7^GKmEFM+ayrAp?=zI{! z0k-Qj8iUu?)fxLvBj1Zl;QgfUSAI92&k!jzbN>vXap%@B&5gH#p#={UCF+U1maO!Y zilFZt%nq7^T3@bIs;=yHK;vK*oK}O(EBZHve|Q(dstn9A5QAXghD7i1DuOhU|5;;b z=G0Rr=IW6@flvj;yGgl%)k%y}GpP8wTMm?SD&H@49cGwr&6kOk5ayJM8%UYOvFg5BieqO5N+LTSmU5FkF zLR+^sV`zf*blePJ%V;Q2qcXx(h`JS{QY0c6lVYrOvXk1l;uax#S?8{Ku^(ajwS2Qe zj&bhOZeYGZtF+ZG>YI14VwU!M6|q)(-?7t6vIz>%slF`JP^a(nY7SF5P!@(u{g^2K z%zxyIsT$t*$8JBnmUl_1Zz;Ff3a`9@5rKek+3r#?^2lpA~V( z^g@+@I#{-JgMx<^GpRl9UL}FMwTS84HMrky_&8VS_Fj-!`rYFPt_#T|ZE+>f0j^cg zE|9JNwivfTc_~u!2jX2P&f5QknpS3xTb3tbcwyBdY%^l+EdShI&cWgb7t_V7AVU29 zIyTlHMZ>uvE_CMIK8Z$Mk>%NG$-3DZCT9tO;`e=qw&AabJ6*gq^J4^qTmAJp{)CjV z9v8s?t{mH_S+f!g26tPJ=1RB4GxB&U{uJBNH|F$fhe>=X2T=zjxXdgpc1rX7QYC;y zAGoKx3%{j5r%{sI$EeYFpLs06U(Lubao=Z9ML{Pc>YRM+ZdCEUD0kkTS)t3^k@9<- zG3k-#TmWUbOc0R z7r|jdl+Q2-+etoM7l*~x=AKN%AQnY!g)DpooW`epb1>#celj|qt9D^AOf=1}<-zYp zH>LiQM;tgkwW`d&sU`uj?keWtvul-uJ<)oPBl`B&)Ps2wmKMma7s5v4AY;jziHxC2 zs2J>_e`pBqcpW=ZjP|p`Hy|-M39eS-D?7BvEQW!V2}oKU)n|bAZ6I|ib9#* z3QvHoJXS{dgDb0}gH~81Vf}2e`6@qvptEmYI?t)qsX;zveTGxU|5Zj{t;fxv&e^vN zf=b&e0Z~<9y`rDiXL@vWOq&$1RQ|?`X*&(DbIUZIj5-e&Dbhg zJ+Ue%(^7gJf&blQ!CkAWP|l<8EL5+~*OpJUyEMTgu3->$Ah~96 zPDbikxaX3R`>W>Q@w{@vdYd)$R}{ZUWVFx2B&rw6D2h>c1QPz z8M-^VJFXJ_oiz&=t*`z1ba8&OzofUT)XLP5Q=MfURH@Zm^+;d;yfxi^*_|6Rv;w>H z6Y{l1rIQ)i;iHp0H)LQKGSlde2a4y|!8&1${I3?ka6L*!+I;q@)7g^$W*>iLCww}h zSBbrT1*1>?$$BxwFxdi!hg1(@d9AIcxGzc?2TzV^4jn9|p?1zwlHJq!;lpDw`*ad0 zxYLIUf7gqh?sVUwjoq1qb>sImuf%6z!fxda)u%Xycf?PA&_Tk)ghrl7WBf;55E-*B zvB~alh&lF0k9Dmh!^2l!y|V$TRP2X;ybwm;+p}mxKOCq$fGa6m?{I+~wIz)Ju(8Vd z?ht+y1NXsezyG2VclTW=k>hlI0wd}mDq&(_nSrPT_*jJP&0X))vZM2J2G9YchF*yq zUxc6%f3!nhR#v;rNy^%8u;>l&(zo+Xve?+z600Ac2%jC)+}+(hJy6vA(_oYI3h8bwO zbxG)L6zpX0k7K7?FEE?Hq&WJ(K2NCpDmZoyY$ALIEH|LGx^rMc%-cTl2#$q@+76e6 zXADL!&k}Yz0P+QBc5n_%N%J|dKv}MGarf7O1_px!tZsl2C)TTVpE5`JkoDU$2;tW* zc+9U<_bKtE!Fwi|a2R^{7A69=ZjB?2S7Rg&J+2gdM;nmZP|P@1B3mz-=pB5kX2yJc zt7;L*2`ly}KzN0c${e*xQoH9UIZ`i8ti=E zH$no6oRkT?gQzBBLcL>4uV*OE+a<@zR8~o=M=^q`%lqGd<%$W__t%1ng*Z%drhVxY zQ?JSh0e}5R-`INQP0*p|7^>@i&-+*XxR$F{@iZs(boq@oHIN{T}pvH*j zA6q4cEZLjxC2iVA7-NCf_~l1P4ud)0Bm*ck#kaB1(YFkc48~{C2Su_4_!rQ0Ug`L7 zyBrQf*w#txQk3gsYy5d~QL5nUhPVZs0zk?9`DJiMV0yshMtEGKZUM@vEm%IYyeXpE 
z`OA*Nde=Dq2|!fdo(a62;9f58HSBN{TXO5MkukQ?Hgb#Kf7ZzWgK{hO6wq#t{L;UB zk32tCFcK)yAI=a^c3&4%p}L)H*C1YtDWa7{b(H*AK?h7)-go?`er|@btwj>$PkP5|oBFYjP2-yTUPe}K?#jw>O>DJY<)*5cSsHU7GDg9g zx;4lI%vQd}7=zE=&u_V$7!bJ66eXjeoH??U2%-#ZhaSdo^S8&A{BP>2AyTv>%^!O7 zmf`MQNjJZCoDH(7wfyNCSdK(*dk8pJQ;V*>34Wd>$;yZ^*?j#w1?-vQ&X#QQ<3<&r z77U=&@nXE(IHGQ!^UeNyI9wAiOZW98By>HAEpV# zF~Oy)v7q4ytpH?275co;$p1joa;-7@K0gY%7{^((l>0`m^ z6M1>NbA@^NvlYVfHEm)aY(RT8BmwJY{OTS^NJQvhmRU+7R^KaY4GKommd<2}@Cj%# z`r$QV%N1m0Wz&xZC-d{!?--%olZEE+bxk{U*xS9N%4!fR-qz($Sf1_r%(d2|cRc{t zzn0<%>=w9il-L055Ji3;@Z}o$Ly&=L6Xa%h0ghH&^)|bSlan&h+wSh}E4jnK9Jg<6 zwriJIvqw!g(55|#eY^rGjWf^%>5AzeQxLi@HB?9L?Ls8QKHN#u#)7U}d(Qy}Sn5}H zv!m`aaj(f@{VByECo#w4v5~%KGXS0bkiLu+{Jn{(a3eyOtt>1a<1resW9lo#@_6r6 zFYd8KGaS#f9`n1;D~ksfOfeHmR`wbQsT33xZ5%HavXey4xPxO!yN zm{U;>1;%uTkyQ}czFLo=k(s}4-}rEUd&SDql5olcDwTnAcg#C1A3c28KBML79z-!O z)Ap$OWpLBWU`WUSrQN$RFR<>C0JuDR#zsa*pTq=K6u@~U!DD!4KS^7_K!8ENRgqF{ zw)G{pP_aa}lJ`CjPw7_q9qspYfT*L|&E9&kyxFiMpYp0~8$6Tk(J9!zsb@e1NTz=8O`yAO(MHFjB(1@7HT zjgLpx8S6)&0uqik(&qO~yBy#HG+*6!Oj>W?(~uCtsh~(^9ZL&~0P$pxGDM7ZLj+>U zs`a>)^!80xpKX{6rP9NK{FV?|V?;JI9t)pV8IXx&?(OY`>XzAQq{81<)*O8Gr{>F) z{Eds)$Pc4kHyF-WA+%ym-^YMxOO*7@6hZ>&{{y)isBQvT)Vyc*=g*6Cb7Q;bJ=g?S zn+~ZC)s=qjHU+!;{vzgx^dZ2mOfFqJsHFw|?k0jC%8zoleLgYW1vi5~kl!05{{coX zde88U&cPUBJGCCaIob_$;h;{`y4rLQo!6HP;x8{mls!5K#_wZY%3pE z3^+X0Ig5LGc+Ahw)6>(BE1AF$&ubJE`Ra^4*L@k;U?UcMak4djjrGjhzBpqO{A4hC z8N!#E&%W1q&7_qDO;y1~?$DV753J{N=p5Rltf;71`1&ofsAhX+^C7A6ypDyA&fuYg zkZb&v*?!{PqGa6xQqSTr?>MX=+&`l031od$pR3w_PU+n}?^;y#XOvNJ_iFt4e9<*~ z?{Su+ijv(yeg^BZY!;I@0obfmYRz1Tgs z#p>99=<(&vIM_59BQ(Pq?{#$XKbNo*C7+0JP}tAGX$uc2wpH zRQCq^bG;7x*6+1$NC#K$L85SNctBvuYV(&qxS4=DWG^&tt=Jt3C}EVIa=}1J2>{;= zVF<*Gq?74)sDVkP6`(fsON4-FeIghqhLkEZ*%paHH{p}j&X|Mw=+S4X`i=J>OLSPm zJ)!o-UQC?YdX~1nfx{U^;XxkJmTv<4LWEvJAxR=EGID!k1C05>!Hi&TZm#@94BWZz zBV|=p)QacmRVpef3X06EtckpH1N8&H)UPxQ{Q59ph^iWLE@W_#0Ge=j_2Sez29F1@ zn*lnf%;rUBERCu{YvH~vJ{Iw0AQ+kx^fnGHfvj5HWFnaY#2B;J3&d1BXcc%7PO%N7 z5+*l8<+kO$y`>o`@7oIv95t0XUSa$Vc~;&h_4x-wtsy2i_gL!z!=1u7;9HfvX8#4I zn1&t_^Fb??j|z#?eQ+9q$RkVYgd|+qHR~BuILY#?^x^z1kA(k)Q00Y83&VeG>W!fI z{Oay#=ed9-I$n+Ri5kdENql*ZjmT(PgE0ac)tNiNAHe0-lfS(I34db+J!CHxTL_Nt z>}3WX&n{zjpj61X_`S50o!Y>5k9O^so^p%|{xL7!eOcnL!O z@E5C0MB34OBM{1Y-xbEHBupbU=H?-YM~+2hNvg-fhjg|I*9CaF$ zXyme!L_atJ(RaZ40zGIA-sf7V-eWMwdT-=EMEEH|x|dUibe-ya{*HoUUWUfNQBv-4 zOI9LcmaUvpvJijz=JkX95tS!JfLHV`=SyNibc6}!_m%YkFPH)67Kg3NCmF2)7{53V zGScXz#o=-_*uWssVpklp*(YRqi+>fzCL84z@N8bXz!0Ebtqw&tZWBld}92(%{O$MQmEqfB?uQXS0trjf%n+4_ITVDa6_r)iQ5hZpszl zW{&`7^kcKl4J(42=UL3P+#9fwbE}WaexbncLInCZrRBREu+0@ZROesW%mbO>0@hXD zTf2Sior7G1{dzqLf`k1mW&XW~wy=*OXaaiAN_AH1y@ny(>gWHmOZT)nBsmtpeWn<_ zVOkth>c&%@B>Q~oU}v&Vf(R49I0PTfUH2_ghCMug`KC{zTE`z0;b6IkWwfjy; z2|3|SRS|;j2Pw{*rwRkYmM9^KXJjKGE6Oph4?dA{*j;^hfD;mGWO)pt;0-+qox+{#Ko1Ue3TS)c(??3xY_F$ zU3l(5d?t^QExC0S@(q+;e-@}Q5)-k;qcToM&Apxmzp|cChF+V)!hFm=tV#DDx5oMc z@@HT$d$3+H7l2S}>bRWq;X_%Lt%gdxewL`KSN7K|J>tyeDJ|od77{cl9e{V`aUREc zl8vpWeUx&CqI&1$%e?J>{d}sglO@spMjbSQvA%GTO@25TTbSuh8bN2tP?lEp*OIq| zvd%&Q?Ei`jcrj*Y%!9ax+2jV~#>_r|)hSjYAE0wDL9OCdQ)|D+gU4cdv53yoUS0f&47R ztgoO|wYrHU4wt=AYet(27vBIo55c!oCP-j_160-Q6piwda)X>#^!YPFc6Ys0iz;JV zU~3*O?s2;QU9n^R{-Yc1i12s+$~}NV^Ups-%M}9J-*02aD}%b_BUJayIE+^J#fwn34|CU zI91cm|55*&*8fta8r^oe@gZy@Fphy(7{>Sm|4WrX_W$(@HG%fimQzL^tXm5Z(sC77 z;m6&&_JuBK^w_E(MbvQ$%cy$V;s!Rt*npzpmCmu6oD|r%=Vyeiz5;LwaJy4;w`KTu z{vCBE?!X54`KPxb!E1AsmY?AU$=@3VH;*=9_}3B?fHQY&fi)ApG=uWFjq+#HkK^C0 zp+f#kZ}&cy5Ue_$4zAk@J?^&IZPJ*h)Ii`Tan%|-LY$n5KYb-S{QMV%IUh0_0R^6KAS||5sl$&kUT6>zYnrsd=?6JMIksDJKq4=b~&p{zXO!zg-{0{+u~d|`)x3mBv>r6>P=_Kx^$Q89|;Qq5oQh 
z(U`Od-hWHhp?h}rt~{y6_$+%orqk0Crsn(=pd?T45&r&v-1GnS0RDgb#k-Ve-}QC< z?_awd|99&G1B8S2MHo5<-`SUFJzZ~TznqX=f3(w0xyX4yTH?2LkzBP{LOa>zm)w=)5AR=dxqmI6(M-hP?5SEK z>*!sI^*xwXB@~jMVaD!Msts+TtX62qaf%8HO@xJygmfcF#QvUV0C~4xs*E*Mr}bfK zZqL2b1NnYNSmQ5MUIKbBz&5gR`kejQ8D0m#p4PvPfqpKbowjYoq4dKs{^_@bif+Pw zRA-szA*YdoRtd^51!&+#-L@Lc=Nnuasq}OmVSiaOw!Z7Ojf9Fq%S|&;=xBWbgG;KA zaHkBWmRficCv*c}msG>Fco3@VyrWu8m8ha1v;7BWH32#cy;`pgHQ~X(fD9iN%hS_t zgJzi7ebi#k_pg1|SFm<5j$i161`|Ul6`F%<_a`KNcFu^&{$7G+$=|^*e3W1p|FXb@?FVcPS4K3tvlI^I3B7 zw6n9rEO4qaSzYM~8mWQC62T3j#uws-ZPoHknxzgtv5uW!XgPZKO$QcD7B*k+gwd7CPET*XFFN zA=DXIyjfarCgs|VPKT;_8^8;@vCiqpYU-7rYJdOp@}~mLY(r+cJm@6z5BfQ_?_qmd z(5>$DSZ zopW6}`{q=w!Nt(GSy`9ReBaaA^hsy?O)ufkuuFx&f@iPiCC7ov3jauCy*1(@YyQgg z`fix*{qo@|N=Mc`F9DN5(*kHfc=vpoXi|E4IUL?9M3;d{tf*@y5S}uPD~(ha%=(@F z3FDqCOJtytNlrclz!F6N>keqD`Z(_&Yq5bMtRSH)7Jn z8g)!Lm{5xm<6wqAs(GHlKIJe^($XXK`MGH3Ps_rQk|(zf%3#u-g(b^ah{aV65;WhT zj}<*3Uz`=vn#a{B3kR2)7j87WBP&a21u>@V8F_h+n`S?URcPFn8y)hPdD`QiJ6Ry@ z!B06w-5bc5evIaDihJ%)cnOI^b5vC&# zr;-+ghu?gVUXzbI^sp{k%v(sQ+g64aADhOnPwqW7@vADVZH|WAGCHX$G+N}gPd5|} zR9h~&Fw95^eE;`%#{LTWQz1FIR$ivKBmo=xCZJ>rNh# z2jNLIWB5$aHP6D@TDh2=!K8&;5#eHPY)`Jsof1Fl!fR)A1GAO7b-S zF2a=;G<>T3m*q zCK(cDEIy^rnqHBRHzLw>vWROCY3=8L(G|5|OX*)LMTDDLOHfKVENl@%4eg%b_3Ncl zI?=Q^0H^5cx-D&$WBnS}b539%Sz0!5Bdw5&KR8*4EA z7#+>2)v^$p-9U$EXIED%T?g0l;b@hrK0`L|D@_p=wl|xC4z8*qYhS+ru@7WqWLObQ z%*-yXuCxi&uJF`~i~FY>pm(0>9%Ay}allUSeL89KD5UPuy`YvU3QEc?kIMN4=(6^K zSMG;JLB3q{#hXfV(Ldc_7dymwYXlb0I`Nj5mvie!DI!nu7b{{ztj1T}6sa7zSXn<- zMR=&C@(zRl@GwlL4oZIz@s;s*n)KZ|=gyw~aB8|uF++BBu*Aq8q00zjY~-3J?xeta zo*|#2+edR-QA)2V|8tdX8AGk{7g&DCA75=Gi;QDT!eEzjUz#!NQ~7J@5Y7-&YgJPs zQ&<}jQetE>IS$zXA zQf@x?=W|`xS?}{c=jiEC?xRKztiYU%Ajy;*dk>~y*ay2GwCy3J8Y_txm-P_J;8QWK z(rnb(`t<43?CdQ3{X&bl?9FUtEAsMDC&{<;F|v%qghO5x_DZt)y+*1k?%RJGo>zdIv$GAv>Yi0tIPqR4q?95;Ps>` z)ZuZAy@)O`DS*QpltuDpecx>$S-H=+B6PdA@K!QXoZ-pfEG;7m?VC@*gx^=Pl#1po z$4H=+N;e|)HhWMwlx)p0TqOW&o}1*s3wYSu z3o|!`9jV5+FFx|2KRheEZiRsLyQ>I-CM~EBu3nsF60MY6UQ8)cbw-i?`+zc+Mjbn- z{clKs_T01&C_Br0mK+);zth89qr~%{L@F)t-v#dUWMK?pql~hEge=%I-aI@$fv))< z^7FTkJdpm9w#?Az@NhLKOK@8j98$V-Px+AdvHV;}{@g`=mD|vuHUBO47@JL5&A)Xf zq=ZN+8YGEMirNtQgY%ddfvyjp6Pq;imQj`Fb?Dl`O?n9#Gf*b{TWNp$I&GME^V_ZQ zb$60f^!&aXl&nj|tw`UoPuR>VYivxj?NWhUg9ffa(v#Z7QnDLr?}UG(#$Vkw;{6w# zf2EU(HUS-k8QcmdDBj5Esd!2^9NZ1(hJh8sgY*5W3%gjwPlmdS+b-`hkT)pa`u_>v z{)OdH$HX^px_);{xL=r<%ICy%s9MjN3~5jxe3H)OqK}a21=vEbB=Xz0zx(;=$*O;_ zbl?EF{_pj*?IObNcmSI)xe2|zsx+auD(szg@6xW(s+6!=G-cs5j|DbwQH zHB5riF|P`9hb`_L?2f)TIi&xhRr?6ZI7Tk-%Zvs2`QS!$ZTyTja&3_6f>=0D(}PFzqT3j&rBt!pB((b;eeIEop{14h;wVYZI^mH*OQd>1D-%0m0z zNCYVlwD0PAtcxNvH zEonWTkAZ3<5MDB&#C@Nik9YlxydUk?dkjYKA4LpY%xk+lJAR+vBAz8Db$==MaP^)) zJRYLAv9KVxLi0ss_Qu;-0X8#Pz^k!y^Ij}E3;pA)FB1&0?>}|3wBp?;T zAX%=gruGt<@4pl?SO>wE^VU=vZB}d}dPWF7`}XG|Oa<`BvoKdbV?-LQ`p@ODfh<0) zgy?1eEg87r1Yt2-qh+EelMs45c>!rjk8d$&&&2efbS*6`Kv>AXw1m>d(h_Go6gZF7 zSe_811JwwXM!&rok zh^AeE!SmlQai_r2gAZm`YL!%9I=gnbIjNUN}L=O>ga-n-XV<2!{K=~)v$I?nYyqgJo3Cwy(9(T*Z%ps}hKOQbF z>dJ}m4MlsqHi*{(=)dvqzti5bC|0zx+IaDD^@LBB!*$zY{;>Ja1>JI^2((J3>HSHrXDK@Z$poj(G6#(nIT=Z zr$F?3a-;@M50_o~th~xmaj5W|e6$_Z87tjc0Iu2avHe<@EuFTLZ%{f^8|8Z&Bzv>& z2VO`prF-$i3Cemt=X`;N9NdWPWu;u zxDU3sAAUBY70??>Gfl1t)w!P&Be!e9)Ygec?vzzbeG+Az7XHk4$E4*|%Tp7A#>!-g zyHfp68xLT)m2b-LmEEm73AkRwcM`QJzh4Hwjz(Yj zZT(GA=pPCTgCNKo$WJGbYPGI`wmLFzXs1MMw1|a2zd?Ex>aVD9!5N08ju}_JLm2-t zbztSy6YAM6uV&r&g}BIPx?D6p9Jesd?=3m<(Mdg3dPwroEmCTflqe%rT`X-J){-~W z$Bvh5m2rg-E@yZ%R^F0Z%Y)0@3lD=MQXI7j z?QNzm|82K(BnNpYX`+o_zCd?`aP^2%<4YHp(Yft_))~m95)VDqRV#JI`F zEN2s(sAe_8LH(Mc3k+_5PxaYn=*;xW2_oXF&|X_E&jEfLO!ZYuE@=+NLXP~AQmHjx zzO)~@g 
z9t*8*9i5!QV?!UtyT|x3uZ_O1dKIV(2Uy?-Sg1k)5;~S3{y+8Tg9mY->{l*aG}@JU z{1Xjka!TS)YCX-_AK9DC4WzvLLW@?O0M0dITMJflZ#SrQ2D1t0UT&7sO`)U47v~IX zz67%@j+wuDW_EmOr$*=!tD_qSAcSQW5jXoK5A(wI|KD6&8mj+%gl*-CPo)s-&`QC) zs%Jjq8{mm1Ke`5{Po{w|^a;3a0OFJ=%mUh@bH8Ss6a%62J0*t0*)CYAkJJJl^G`Q#!Z)*byOH%%SkM{9n zTbo$HdHjS)jUPHVILP3M1qxW>l-LRigv&ia|JT>oPc#^aLvJkLos1ZArtvuyG&VkW z_<%YzVV5Fd<=owyQXFjRJ%n%^bBudhgVUG+B z7FAlFhYYtK3CW*(o3FmYB%BTBjGm>#Vq#Kz!;*Iip%ZG@pz(O1J z9jBJiS>6H~gX2*=Z1QW9KrGR7=8Uh?1;ZBwleX9c=+)r)Ub?ss9nGiWF;FR!Qo zri}WH`&H*CiO&>}7wUOe-SLmP$eVmc^g!#ob^=oDK@Q##Xh2#4N?f!M9CGYn-cK9O z7EO21P|eNFqf;sBfZOwwAQBo?d~b|TNEjI!YJ0iD&CMNV3ThcJxC$QpP=?Qow!8ox zy4irM0I&t1s=UZgNWKC{-uQ-`Jv3#wpx+JY=dS~SP&MM8yWjEq_iuwBTyD3Up-(2P zYrw2uqGD5?j!_TIYiMw{v$F$DyLH(9G6ot|!S>|BHqoo)NG{1Iu*ojcn(Qti{nvJ?9b10BymWCWrSBM#TWo$rsqDQ-a2n&004o<}OBM7^P%&jaelqueQ zNf}j@cmWH3&3{%43jAwo)SmFlUnvAu6%@Yt{&z>8LckrIwANku3LpUs)cNV<4KN~` z;>swgeV1MqcFZs%{wX!JmHSYp>oi+~j_sTDCOapd+P5p5`dqnpqChMO zBECOy716xl+eZ=qQ}2#5wxdhC56L2vlW#9v)rZjExxo_-BM*kX|D?4%=Xms_^@vP6 z?KPh4sT;->W6?c}sHU^5V298pWqDzuMrGEUqt8){7x_L(X`!KWJro3;gW)nQ>DUWM zupgkbZI>cd<~Pw*Fbok`fYGrZ8XG4KIiZOQketHhK~J6sj(~4KBON&QQz9@@4mRAp z>cW~+KWcm|We@v4V+$>x0iBEa22MFFY%Q_9j=X%&moEW`d%;0LgX0jqfrtPT@O$vR zgwCx)P}88D)pLPwvS!Qs1&y?3oILai;BCr9k`OV0F|QoAOzex4RtdhHq>9zIBQ zuC<``J$CR5q6z`%M26*Mq`udT2Hj^13ky3NnT@}|3^d3gQMlhFpTwKZ_rKqH>;N>w z=xmCBpFcJC(e5luLw!WQo%iY>BgcEOJhYM21X9RI+Mx28FAoh0b38w@_;wy?{yJ}N zlv3OOw@6k9GghEJ`(PJVPw{5pR18h;|C+ToRQ*%n(_df`i@ z$qIc}if}JX6$%Q5Gz^8q`~-cd#1v(CC1VQ;0Gw}bg7aCYo!H7^VKrXe{a4{3lr?Kg zjkbNZu9*uGA%6UttONc1@+DPCyn6u5!As;3AR9%s0zEi5IL9ZahGT5Ot>CCb7b~x< za1lzMsPh2L=cDh10!9(J1k)U_K1e!L!!>0Kn=plFME)$JfH z$A-ya;irGp-5@LnTUX19wMK}Bnkq-n@0K}T?2(6TL#EF`BNd(bsi2_zt+}=JaLHS< zqx?9KvP3l<-4Pf4|_1eVfYL=HY^-eE*_qSN$W7z!XZsqI%vR02G1PWv-Cp6 zswFu#FwAvyev#KR&^m;!ddZNSZmd1KscGsrM<&65cQ?CtB#J7@|DMS(X^gWv&g+Rp zhhjki%wXJiyGcxZ?fUinZ8cp&XflSJw>w^AW09O|b9eXnj}7becyCJW|oTk?+*;#h|tI@|>*7(Bh)8>;SB| zEc0?ge~?%6v_V-fM$o4@S0VjaeAxkrcH0axoyf3*VeD(yk6YNmV=H!K!FyJ|;MK@Q z(MkRJ~X4-Nt$H5}Q{C-0jndv>!JR#UzaioKy_yoDK-CE4AOsUV3PAoHBz zdu!Rd;}b8bXiS`9_-dX=AogaCSJZ)hp2V9hF4S0M@On1va!Ae?v6v!^Z%V318T26y zeKOhyE{^_Vp@hyhrJMH>ZuF`D!u=3Mz;Hl@eZ7$w^%2E;5Iba(`o}7)J-aL);^iX; zC??nTZ`(WV4fzS(1?B!*0sGUc=sz^N7eciEAVy5Fxb z)EMTHZFWfLG%~g3ZlzfY6!&fixHQu=JB^g{^Sy+%%)SsGRKWmdL*d9dOkXCn%XEdD za*BlDQEr9bMe{2t_}1jzZ$_!0Z}rb=BV{C|qd!Q=P>V_{z6u<7;|C9&*VS8gp+TKx z3%=+Gs_cO3<~?PH#JWK2Zo{K?vWN>HT>p`$5JgIa6b0tsaC6qBB=*~QQGrl^cDLag znwmOuc){`&Uz8N5F7~(EOxKxCLenFoy8lnQ5uxqGQS8mq)`=BUHq`U}9h|Y|*?yYxBus2#VzNqYP-(=O$7%M)%W8lnoXl5A?cu;~MgI z>#|_}0aVHC)t8q?XALU!!6_pS3(Hz=-dAMFFQ~^SVM$+lAUl9V#?)D^Zz=9$>HlBH z)T>`+Y)k3GiNqk}T@g@~u_!kQIN}YERRu_Mou@B@l68RR4<8x!a0~P##w>0W=n2n~ zM8EIW4v#y67>egEpodn_?;bOT&F&sOQGBAk5TTqWN77<9-Q-zhdwj65wzVY;E%e=k zf0{F*J&=X@6+us$Afc!-wBp#$2fo#YGwBE7w%7Xa%N#?pI_v;-$#y!QeHAnn?_+DU zMX9YXo5<|usa~Fg2-6W(OlqIwKrI+q15$={O-d7nQtxfPfNg=ZUJWq<{8I| zeT&xLHc%T6BhXMI*G&}{6@<@)(-?+4;ljx@KG()DUX$~Sm*{#OExtZEMt_70^HJR5E`D3*U&x8_ z*7kgnc?AU450AwZ2~mBY1LcJZ&2M^(f{YCCIfNKWVG*<(!$td5Aj*w~dKrvdEm0H! 
zXi8>6=W~f~R?wGqd@TI|K9A5Di@$_ueH)?g!h5vQ~JWDGTw69Dd2mwD>`|4ZC`M41O z6%IiZJFrrzS=`;-4KmybMj$06g&^n&JXs6MKnb*N9bJ^mQy$9W>4d@%z zLR3z|`Z++HKA0B^)CE{-lL`%Ys)J@W>T96S5b_enVAU; zyc#(NUWC>cyVT76c!O3@3c_?!v%J<+a)e^9+h3UW$l5uY`w*b|a*xdJY{?O!IdD~1 zRzjJ@U6w`|S68PEMjQYb3U*|Y#afe#msj#EyewzYCp%I95|9d6I(14Cxbsk?&z1=r zp^-BDIGAj*Z&5t>K^EmLRNDH*Z?mqeGhP%hp(wDA%6CfAmIrX&@DoW1&#L81kU7)( z1Kz2t>z7xx_6rzkyU;n#a%vj+_dUFJJAod00T%{BWGT?pQ?UBeTX`erV}>T0Jhhrb zneS;R@=$#r@>w)tpiFTWD{&#%7v3Aj5UH<;$s>)WNhIwiNk?O0=2efgKuc#lD+< zWChFndlVceXYmiOdjgpN`J#Nzq7~F+bmTj!a79e<{hRQ{?p*YAO$qUaBfE z(!5tG9`f85=1yNSVxl%2rng-Qi!&^ z>ntxa(`*q_IQI5oxV}PkMeFgNab^E&ap2HI%z>+f6 zBrhDQAH|F3jbz>FVRR_0oL$3TW%A(&{g>B@jg z%Yp$m=Q5KU%9a157@kflXoYTvwB0UvQFM z@v;DDHB2Qz({8tDx1`>T8aek&1&|9|R7Jw>J_f01z>!rR5Mq@1uJ;S`owob$QE5}D z!NFkv&xb+je;jeQGS_SQPKU5S&ss3{eFt7LaD#AB)0e<`LcNUmb_Np*2w@+U_)D@F z!p#W@P#;-@L|N{3+f9B#VZ_-x)2nW>SWV&ese_k7gl^2#K5nLn!vD7i_y602 zoft?y6R0XBn^Vg|YFYf3K0A;5)Vp*nE($5IYwdQ3l+BQLA8?@IwwvT~0t7%H)>WDi z(@$?LxeA>GvEOb7z5`#SzBmPgh(M17HJ+HFsWaK_HaQ9#vi34l!{U0&tIg9rc4UDu zZ3dwB!*3TyUaqaHOTvywuZp@2H*^(wDArn9E`K=3E>T2I0g_oCJLsBq^|H0Kb-I^0 zWNuVaqFvYj^JlWRL|4e^=_yPE)MANG0Q&>BDCqeeLASlpvKMtK)a(P-P2y{oDyyrZ z|2r+eye0@~|MYY-vB`CkPyZ)*dv&6?{1Kc zi3J!(gz|UqH`0U(R>Ri;yR9)*f)W7KL6Ao9W34aAT_F9zLl`9Y->buGSt~vVfUjr* zWx_USb$wsUUVD{8Jsy6Y2u8Nk-vuU!sB1Z4lA7j|!mIwDhlHh&zw0vu6Ukf_zQrkhd`cG@RF;kng#sM?&>JT&!?$NuCE4;xJgB@P77 z`7nphxq)%|mwnw6MMqvTF8mK!4EYkO+CQqJtX$H8@qmZ{?9=M{6XSf~$0kzmuI{@% zj2tNWSf7VHQc+H$%i(_>r<*{cZk6PvYV%br3=IHr!}@I|YVrCmja@-T{Z3-K%F6y> z&=;N^pC-G7lN#LT$5hF~<1@Bse!pN2-U@;Lg5`~|pv~kv3qm7LDRCv~Rm7htX43+J z!&e05GZLV?dT&QUpywu{ioqpAsA^Z$eQADX?nU(~ht~cdg1?Q0qCw$Vf`^w`8|w+c+?8WWJ&9hMxGV)uA#clp z5#F^AxVK0D1lJx#{ai}8&LmGZc01*lTt=#p!=ve^q-&(LLsJ?+VzIECg<~P_@90n&*Ne%F10IwjYu(^zVZTYGMmC8bZP zPy3wgeT%Nw3~zfZtA|jnF0Yg46grJd9{fpzoxCC{zG_NMl17DbryIpjJH9E z5(LU*M)T*Og5C0`y`)`)sLJ!dMnyJyxp_RNTPmA3>Ft>H#K6YU$8Jq)H8szD4v=xG zT1&U=eMz0i=hwUeuTJT<<>3AO}KW70> z2Q0+uH4fK5E(^>O44pSh&B5C~3FAF}yw~~W)(g)oYspQil_iz=nIRSJh--Tbr5QpG zhlT{VPhj0l7}l;uwLi6Wa1i4qGZr?w`bd(y-RE{fz4FZ*4Bkj-4n<#~G{CGLHi*PT z8F^?`qcgL#+i@*vgpuZ$C1)`#Dy^|4w;6!T$;>exKNFDeb1#9pyBlibjWhf~rm4TmHjSe?r1hn8TEAAZV6TJk5OgSDAe8 zK38`@isW`(yPI;VnVGVv#`tDZx(E-B zb)RV)eTm*;6Lo*PpWrN>#Q=nOnBeh{40$kg40NMi({o@L$?&(QovocQ^e^FclQO!)sHRD)Ly=6jMkx~)Gww4 zH%)!xVCRH9zaeNKDl!Y4a>L5>|jI-r%LALRZyrU97-r+ zjyx9m%m`Di`q412lOOS;3(tI?HqtW7lVv6yg!>?4&V3Vvm)heH5&m9UFl=Wd+4Fe&P-F$rF_&|v^ z>ei}mn;qFkFu9|xEqlcn=k10dO_HGCV3?=sgckmU8e$ggz{jzHPhAuMvMXTXmtta@ zoCf5uj#(p~!1yNv&(~c6E3uow_uevuL1G*tv~>*nE_B7<>Y-PKGLyRT+S)rI$(CAN zo3M%~EX9R_$9Llw-&saAfa?4H(^(?J-51|@S94F*Jg`zukBw+d*2Bp z;DJMuG=t3+)05KG@X$(S;^-s4Ag+z}IuDrKyM+g|28&j%>3NRHuKAQ`pY=+?yY1IR4pr>=KHqdoiBKp zys8S$n-kDYRE`enLgq{|JjF|Wq--1K#tF8sT)AzdN1ctMqg@gp2Sx7#h)(uQ2Ck=j zdnpF9f`nh*&zS?SMb3NGKZ-9#aA(m7Q8C-=3x-f@BTY+Zt#2#)W5wKV=ao5mk3RL9 zg{nKdgn1}+m&eMB#5FPkOeABT_1R2n_r&KHJWYF#`bd&DgCs6tvEgYN4C4!&tA)rO zLs_ZJ#7tS>SxVL=>bB=|t~$7=8u2mv-u*r7pPwe%rdJrIlo)Fp{XSLd{IOF36+?*+ z@R7>;Q$Ho($qW!kXE_oppHW9MR(;V^ws$a%pR0gWaS%bQ$D z--vTZPo&QA?z>suNPAiwR<j;Z4CCQ_8*Lj_+} zRhH;TMqeObr;nz~wcFavlH;$BT|IEuQj878Zi}k_n8lF0oD{3!M;VeO>B=deire0v zlToIOL=Of=eJrMC%;;vB{Qq6^^AkG{mwrMOQnmZAK5w{SLZc1&?%`qG*2FV zli&KdGr+=&C#ff$J#w&jXlryUz=B(}b~Q7rNAs$Bp9u^#vA@-`d&H4Ub{r=+x1SSV ze`BOSS4SOLwPs&8nyt;XLA(JeFhxGrwx%3jPp=L3}#>mx_TDtqG;08c8W|blrIiEas3mxKd zUj+t3Arskw32UAQN)VqtEnW$yKtmm7(yF$%iwi%0Qp&SZ8|~y=$naYlj|J?}M3ww} zi)8l0iy3RIoJ?F;d6%-`(;e&m26B0Yf^$GDOaPDCMH$FGxa zy=u&~QTTwjQIIbG?hezU9r?_Bwg*RO$gp7s?g72$iWu%fS#eqD<7QL~)&m47Pg=u` zL^9<#`tFtFbdY 
zL^A&h^1HjvC{PpJzIh#$a+#Uw_=gnTobZoJh8Ub`S8YC;EZvX%->-B@qk9QOU(oL) zTErA(R9DxXO*-_>1%%@THqT%_>@FBB;3n>7ZaB5cy>)x2A6V%rm(_f(dwJQ}cl@7{ zx$pHtcbRIPE&y|01wsau;EYSB8SSDkkuk`Z$c;_R^a<*K7LksA{%U=H-4{X6t{iQEIq*k& zlty#eq6<35rQQbzmvcQYDX}~M$Zhi|Y-K`_DLe0`LvcH%!>~xpo{I#_kCj`zRoI83 zf@P|d%pki1ch`ISY3nT&zmr}Whbwkj>asV;l2dDYdAJ2YS;Xi$1PH)zrFoRwGNl08 zAi&i%X%wF|UnJI@gua@4z9}j=@KgV}Z>dhe{qG}U9{7w0=@+(w$>H*oJ=Bb!9h$EIE9`0D#3 zUBNQyW%J8kj#pzTY&eASyx8Kkw-eW#_|i<;qhFo>@wjau*=@BDf9OtQv^cX{zbI}z z#aDi=x_BFf{UK5C3R2%prS?vQRJxdWjP~ygo6rUus}&fdD!XCoJg>K5W3kZna;zfu zVEys>IP300QF)0V`)m8o1inaa4}DRTo6!&aSX*S-aOv(7ylW#C6lNY;{K)?3E&agT z9o@YuEiBzVgv0CqO2ptJU&0a%{u-i;s+@iN-ifa91wGIi4O&lv>K3j4FPbd$=UFHFoW!sw9(6(CO~5 z*~y|Ut00$Hi{-3_b#6t*_UVv&&w47oNyTVJgrv*o__)Ak{XhE^e>zwX7p$LC**VsYkJNF0upMN(j5nf!c6eUXp?vVcfZL`ryUhMz0Ie_AQqIZt z!fUi?WyofR=X{>45jRJwjro|}eCRY3Y-Lr`KmAF8Q|dob=<7<>f;u7nz}Q%|acDid zbbO<=i4#|p)C-Ijy2GK~RL?Cm&m{A?#YE3~jZv0474f5KuR{JWp6lLcRGjSA`EkF~ z4x$cHG%cjH=;312dea2fTwhOL@HiL$Q4mSKaEFN$%2iK0!kfl;^H-<1bH*kGrAIzZA z-4d39_p$tnF+%r9Na8=fXFR*7$n}?gS^rm~(u*3L%?qOMQ~m<;)&@9n1A3_6`3Xv= zD|~GHOP-tE__wce3$ct0wZa*(TJZc*ovyRlW3;4S_-kkTn_qF)USe-7)XO@IO2FmU zd`~A!5w`hct+VYb?Qsy9!HC10r2AivVNDx=ur2$guIqZ437ZDWP47X#I-4-qa#*|t zv_Qi;uK5^5NV$bIH79k4Q%_n%&br#wstRL-=+|`ld|%qJ@jSlMgl!T!p>HO3zBgR0 z!Vq+Be9Zwx=}kmY#Yd53I;97fOI2$PqkajI?k!xCA`Vq#8=DKW(BQ|oxFJ%pzw@rjn$^g7$~nLlBSZs&1rQ++q~ukpEM zKM)iue3CCzDFUuiYxtvyXGkwmC@!bX6J2Jn4Z2}6;q0AuymxrXD%J=t?HFo2bItyi`k zMk#A8UDQ-r2=V$Lc*c3u@0r=MrWmJ*<$-_ghH{EqWvs(Rvgu*PW0kGeqL8jXDQIf6 z1|4RSj7+vDJYbIv3{ifl&S5*gpGZaArF$c-~BAR00+tb-Ad8q=y5 zeb1OlcK z^qK^I+tcrzaV<-A=C|}D%f&$^n;n&8K9r}frk2#gd6!c)_k|*JeV$f=iTlOmK(1vc zs8rX-`#lbn?mC_bl0djM2uW1?pztF<#D0@lIaTD`$!W*;+5t22oQhvHYh70SfH_3% zlFV)fsb{MT<+V0)_;vF=1t$y)Qnas{lExVD3zi2@=s;nZLvs{Q+Tif2EF_fK#YF9& zvl^ep$Ea=LpE5I9M5Lq%@0<1S7A4QTSdah7vvA-<+Su%Fzw>;zpg`Kj2JUZn(Q??6 zSt|SY4bR5clf{xUO#J7m*Dmj1ikx*PL{yS&_eiNdt4cbW|FEMaVY&nkAH4K|fk0;2 zhCl8n_#(c>iTut+aK-TG;9|q3&h-jNFHeE)v$-nU(vqebxPp~A<+UbJT6Q>sh7PA+ zO%1Z|&vpc66Yet!&XZOiUO2#K0H08Xj+=9nvF1`aLyi57x|rTHGMcU=YDy#71oh|X z9Bv2U*10baeiK6Q57Mkq3n4DpTI*-_nEp9-Y%tqFU0_ek*8R)DSC3OO*(Z$@nMZJB-3?qw+Ev9+QQYx%K1v=Z) zxAUzpFtFhqP|6lMA*kSqtyG!cK~Ammi>o0vpVb&V74D!%)BN~mWp1tNmgh?(nifif!NZ5u%@x>-T!et$lo!DOsR4_X~r8(SrcIasMg$It3I;E8m zZ}{VIKtFzCW1~M)ws1&k9ClX3eM(QhTAK7yN4+>l`(AgQzJY2cNA7Z(bnJM3>HO6- zU39~g@QqHUbb318lhY%4*n18xkFzu`89-^8^OWI&*wa6Ap>u(xzXSk44>HAdg#-t0 z^YvI0quwCpmd7zv-&kAwE+@3V>%^Br6A~1pmzNZK-REb2|6L4UNbXTFi13UI-F}yI z5o;1-*<*;m`Sw#u?-2o($EGVq*KdLvg?ZK9u`vygvkt#9fbZYYq@MU>VUc*v7mx-*`>{vF4+oAdln5YM72Z4`PSFAjVtz#xm*;w?tEW-!r>- zoxHUm9Jm%TxAoFCi?lUe_@*$^O*5bI_3lOU!q#hgqL1ibmZ>x75iB&gZ@RA3Jp?GQ zWMZdJF@*jsY4X|=b8}dq_8lj=Bl(&T7S^&^F`Is_N$#{XNo(aENAALv!$GLK+f@*) zs*~d1N|*S;--wc4hY5uUZ}P0d^FD)WKZM5ms=fn zhFw@TzAh8f5bfQt=0R)(LnJ^#?1;L1axJlsDwW~9*J|h=82B(Iq#kV=SI`zgc`3a? 
zNui+&NEsK7o(ECAz29s_t|l}L8LAJhhk=)FZx%T^*wcqmN>;#>6d*ow5K_IrXTuqOP)SmM@v3wd}rv^hdZ2uehOg{yx zNq)K9dFAw6T3WlogFmuOX=>3-FN+sPTnPpqYLt|KYZRoLX-E2u~y4rCi&PHOfNx4w$l`qUt3>K zC*cz{-@@>LwFi?hNYY?9#W=zwqG>t*B<-n(Cp3?fG=R}U4J|QN|F>m^=9_Yb9eR)c zoSo0!08e|EldgTYuzRq~g@I>;-lPA>@gXd<-lNv`>O4t@NR%(-D zQ&EVz+*i)IfF~p+*pJE-hIofV!1(^RC4*`2FN)H zFsYjIIx{~CJn$B}yrELrwoq=N{Umk5(E5G6A8BU2r@`Xz*NX&RM;!tXt$7~J{}B)W zMpQ?9)UUgGPCFH==`1?=vS%|~P{X0@o`%aq+ta5V^vr}6JVT^^WETGTXAF%CVrNFY z-*4a%H(zh1*Wxnn;;i{HB*WO=T(FcPf6o;Tqbi!G^e@ILGX2e!S1ke<^9yY^V&L1b z1Yu!b#*;0wRO#63hYQ_jHV~hN>uf)4>@s~0Rq;3g&|cM11Tf+XU4d@N6!)*G!dj!QUd7MC70tiuryGN&Azj__~f;s5p1TJg)E@g$V3ysML z2w>jnm=+Q4Y~|$Z%znxB;r!@JGKUy>%hUwM3<;mDVXs6?=a#Pvqvg;6nV8pyM`Y)_GWk-HQf z7e(Zr?X%$o`_@#S#NZ9jki?PSzHtEKg;>j18SzF z$A)osh^926$C#;e*vGsc3?HO8>EU!mBoGcqzZ$DroYC_H-fNXU9=-^E=Zk!ilkTFcW@kUt}C ze~b;(5y_`CD7(LYku=a}KIsi4Zn=qxDVH9zf#H;psR*lbUkkITQj3X>Z6M*!BUtA02~Ae`;ZEuGxjIj>ED`&vD$ixdHu^aj*zs6PE)bCGxL_ir466s3g`X_i1bMbbb{#2kM)NhZ z-umPa0$pRec>o#0m-+cOdsCx|WTlDxWhvY-L!=`O8}X^ILbO;%GjvbS1BTYTt6@z! zP0EIJCmjEZZFA@{I9V5OGQTxZulu|xvtD0di^SfZrqAIZ6OhL6^K#7|fs2uP{tt;+ zwTWZh=XRz;(IR|{OtN`L)JWLvpx>4LiqAjrI8Re0-C=Dcxr6R8rtS7@^Q`M3M432_ zR|<*644UWEHKP_PI8-oJ7+O~HBq^~|Xu$gfK1J$k37&Wr9Nz<_X`RoEthgWeNDPN@ zL|>?@L{LVyrSZpx^qQpMOy*!pam1zD#UBSW>%*&|G2LLfXC(OUIT}2BMja_{TZ$8gqcrxNF}E z|2?i3>g7)QeXiwwp)2r))@!3BnNHQs$;ru=xcV@K9NjKe*S&e(1_cRG_;$1=T6i&% zS;jXQROIfQuvoT?CUV5M?#dm~UnlX9(V3}YyldBui`aoD{)09F0Ratb5OGTsLs>1S zQdSzj(~;W<{0?EaowkR~^p9Zrl%tc{MN@SYmS6w!2TEFOe>pq(Bo{u?n2z~<7kkm7 zaEYf8l&^?+;=5-p2zBLl>^^9xj%{0Ub?%L{Z)R{w9AmEQI>%+U5|Mop=N(bGCXR-S~uIxg3R#rCM&jp6YPR+B zC;c0it2*ED`Gq8blHiCPkkKTz9gbV8{(QTmc>2!|PHg|Tl#&o~EI4?Hv9ve5Rj~R= z8+K$;fBR8Na>Ox4wq=Q0(5Xkm;-HkwO(3AzAgp#!kB;e>_}a8-t6d4%*~jM@a6&S} z427}TNA@!LuCMP)lMnUw-j~GgC~iiStm5C-Suyi%=XX<@5vETdh)w{fU9IxxF?Uxs zH(`($w*Q|Su$wv3lbJ{zdWL5lYQXc*)6-oBSRp?1$qt{9wnqvNg^Npi^_{os_$(Cq%f#1isXuUZDo3+vF5wEgu!s7HcKt=r)4@OSBgKA}2m@vZh&3wQH!Xb^Vjo_(tWk zWzt%s4GLT7o7cI+tLe?r$0sMI+YjyU&R6U>(lH6s!&ERuWpxDIm63j~VtX-69bUH*;~R-nlW$6*44}YD!?A zOcS6lD;r9VzGt08)vnUP?-qb)cE`vN*{gbclGHpX2SQJkF3Dq$*4q~S2ej=zpBUgyNP^h9H(VYWVo69DB)m5@Ars9 zTc-FZ4_IJ|#<_|;cm1E*l~gY`rgl;W3na>Na!;KxQ;JE>2$3hrWM_&R8Y@k!*cc6* z@RgNs!^B$B(kLwPPZA5xQ>*)Cfp*$V_+3d(g7~l!AB3ah7A&(U+x-R$Cn?;**%g-}MzN0IlH?l##l!__?|J*~pA|Q8Jl<^!BCwBG48a z;eXryRZdjMhcXx%s3U^Iq^{q8@=&MjhKPakslCl_%(ggle?FdLId$O z01xgQyJ&1?2*q~0+Hk&?#g!_MysTdY!$*!qGoxUoqHiPbw<4yTYzV3QYHI7RYWvkk zIe(Q0xiGdP7M725Lguu;IEps#$WP*>I%xFi@~XQt#w z4K!?`v<*xCNKC1A^opts20Yhy9G7Ax9tlZPz4 zE~yi%+-JUArI-i3>$PfIMg;En`FleAUnUHBFzC3;MlF6!NavxUi1^O~)XI&J8;@|RKr>>Wu%ynP}v zQ=GAAW8`emGW*p9{JvXjeVT*x5XuXC5RFyMAYd2-AD2v!u7G!puRANc%;Pi3H1+|rLy6AU8_BEvg^JkKi@gSYEYzc_UG`V% z*cq+}Jg`QM?OHhAq;{fNSXf}vsdch6#umNv%<5Yc8xPMhOnjLfzs^s3EBcp@-@^W| zZbOCQ6{iQIjC{&C5WR@`kB1WYYc76&P3-RMOnUZMc)m&Y(tvi0*d(p09IdLB9L%eln?2ZaR3CKq^vFs}_s+o`92^|H+aPzU zOmLdLK6#V@z2=LH%hq7d0L2~apGbVzf~PaL>@)e@kL?)nQd^ko_WA%TL8T_}o40r_ z7rNiOcklfCmbYQ_-tBY*E6Ydp53Ziv*E6B=ylOR!Qt5UB7PInB0~Y?H-;vEXWiC_T z&kn!13Y`1^)IO&+MT3EhC8Gn<+fHsaHhr%v9!5&4M-#rY<{};=S^DGuR74&pvTdNZ zSAGXaE+$}ZJ-4vF&!{6n(Xg07x5@^vHV}r^pHpZUI?NoyC~3nb3`*3;zRg%=@zMlf z2a7=fV(5sVe0(|Lz&T-540SgDjjx0%2i*h^;TE0;SJQ_{n zHq0huB*cbk*}LPggJ(&=;?CzA`8woEu-mKKHV@I>2q*U7W1^ZmOn7@Eu4CDq3Xuc1 z*Ry+ec19(Oh`^3m!qOOE5fNEES@`Moh2Xc~i|L=S1@TMrPF#7GXJz~O4MPcZXLAjb z7r!l+w#P13eL<5&_zo!t3bfWTLonvgA*}q&07CJD26-RVZDw~C;zRT#YXwe@W z1iEV6@yW?`JDz-%)wQ)e2XmfyJkjtuuFU??$;o^gsqFxh2ug5dgqR8p4D^vcrAQ7Q z1P1lpi=}lKQoD?>ag$MrgP%XXXAzEj!)B#$iSKjU;7;TwAjr?pD5(JC%myFS3Fd9) zcaH#PUiX2xTpIrTq<>sgKCK@yPw5fC-U+Det{u5~7#KXo&jRP>g98j7tL46QH-SxH 
zS~@>*5JB80By3yUy`neznX%E++n4^p|IBUUBqAZ0p?9@cx*op9B${}b)}G{&w|tB$ zcFhb0o*jyUAwqK)W3DqCS;Q^!>;xUfAdnoRm%<0$EVAwUC1U_iy<>aV>Yg!9C+m~5 zHzm_ZQG^2wS7H>tWoL;dCmHdf%LuZ5n$KWiGH5oqVf7ysQA6ZZ%&0mk|ul+dW#cd`W|YEh-;(E7egx z7^`PjCQ~u)WfDIc8rt$YJPbRrAl%DGV^=eYV@lkze|Ttq>ozTTKJh4BtGqO;|bc;)|I;t72bPMzRTFy8lx6N z1}xxCyw4ukOwqApa~%~BvR`^8FpKgy>)TyMtkOPk!?nwu-Q8wQ{)Xz>0?+*nN41zT z>!W2d%g{LEx)?er$Y-X|eS~NW=BF6ZeK~oa2g(x|3w{ zVYDyZ#@;(8&>*IRDqz;8NQ#@4UFAXedtkKSSdl@d6LRNr*~<^5coV{i74-0LsL_5g zS?rRei7%DwG)U9#*H)}Wou16f!I9yf+_{Xl|jRj>1O>9J~#P+;K8r&ra zL9az4Jz+1u`Q|0>bD}Ds5_m0Yp_wrfa|jqRS^*@40dIcSgSBseWe^BT2&2VgKk!Uq z2>WSeMCH=4q&uBdKa7l2*&6$vJb6=N&W4AUmO>LH9@A?Ay9W!+gW_hckSaz-6Yi4D z%LvL|1IMUeT-~etSeTf4Uw(PW(L{$1&I+ZE3e$5lQvG1ku2yok_}hAjfG|MT`CV;S z1B~T6pBi(-;fnTdgtZy8_e5nHmT9O*GulRC3E^8BE*zyeApc4MiC9%bKwjSf$e1ZE>0sf(3qG2ypZ&sY(>M7es7 z2nfNLWrl->9v(WX&a9lAnF8JOHXrD|s4-j`xJctPgQZEysDU~ZIL0b8Z_I92W|M=* z47NXR)B;orpHc|3^J6wqVqbSDl5rt!r<0L3RlSq?3w&tc^xX>>G-HqJg4n<49Bl}B z0#E!9lTta%um+pfrxY(L#&JdOi1r$=D3i*p4j0Bynu7OPNMe`K-1Z-UCyxWZKI;7W zGpyjALiUGaE}~!-%#w639k?}_%>KSnnmI8JuE*s*_QwKkVAC>kYAAUn>d=E2i^zE-@hY^?!f4=LHs(M!mHtBI)$n@ zq7Bo9?lrky6rUE4p!$K1tfDlqw0nf)jiCI51@kUC@=~)H86ASxJI>LQc(F|%@mG)8 z`$tHu#@9Y2b|ju_mX&76GY09H>@j6wHkJ{QYtef%AB%$~N*R5$cqQUFHF)3@X+{18 z5)y)xs=B|q3Q|)asOWLT<*Vct3|O2TtQV=RHTvH8FaoF1hr^t($=5K95MmR|`t7?9 z9Hr7VOcaJ>C(UjU&JezHp7np2;I<=akDstus8uEM>3Uq(P+wn)dj&WfItzqL$Z5pP zRpNG?H5adiknk43#O)Sibv%DfIsU^5TSJ~XfGoxY3x06wUmj=m1Q@<>c7bOR&Hd!u zW_a8%z#hsoHjV2Jej>^3CvkI?@EqZwYgy#NT1JB^-`;Y+;Bjwg^s4_Y43LB^?ydyB zuBd5fXej);Y=;-ah=ql9u)qImygc|E9R@DTP34LJ-8feX^FqPzDm;|%DI$WNY5uD0 z1<53WvPhMUa+M8kDk3}L3`Q9esL3v4Gui>tz6i)>?C|GlXq?kYP*xO6Mz#1HBlDAb zT?y%4vV3aCckLzOqv9-L$i2}pt3}jNY&c7(Ckm;P?%F()8dF5s5oUWG?k208*9-5H z-}G{!z0=c$ygHvGR+;4zdE#ZXmlVpKM6YxFw==wWK(G{LW@)KvT%jl}E#1}A!|5{o z^~vzW1dMhs;9Uhl<-hb0EUrWr1~rLY3|yLay@6<(9$*Kt{gz6(0Bl-pDCb*lF`5!@9*%kQ=4JVK0YJ)F zb6e}rpWtkSfUt$0Pj-5HQa3(!DBryF83)-ZYkz{Jvhb6nx9px@?oOuoTICy|t1`Ov zs^+7frWDHjlXf5=yc`Rzsgq*_!AuYqLo8O=zXvg7!%+kg9&!52)t<69o+VI^AVk|!gv@F30O86l=9_jnvc5694 z+wsB_u#gZedo;<20WKV5@4iyLp4?4vMux~mI=+4Y|E7Hb2fPF&O&WKIEf=xbce?+< zOeMV+mmrfF%hZGzoDc~x#VGE)d*PG-*-mvLiisoCZUEn{U_q@jUbk28(Id<`hngqA zmrntYet5E6)U!z6{RGloA#$PQ4Tl=YI-xG`pb89JFUfNPEWhQoWI2gRM4|wifGdum zPZahLFa)pA-lr@bp3o`*9akZA@7dl*QY&Ike*XM=)!WEr0)4Rc9y&rbcXgHih&IVz z?8rxm{L925zoqPe+Mm5{JA#j_6Y`Drc z)5W2EB)f(tAOe69FRyxZaI-$Z%Q-HrBYz%h7$Bj4SrRo1VEJh?4TCourDTE+Ajxc1GE7v!_-+E1X2pgLfIVVSq1tieHZ`~XK1HqArBhXC z`R#1}6(ohS8(2^M9EAhJzU8`KlRzRf4u*fq2*@+gk>!s2KLVg{u|9H3c=53w$r#Zd z)fdR^&rodRVj{p)z=-~NkJj-S@6x4zabNhEhhX~mQcB5{^>rv6@$;hO95`ZvDQ81N zq66%%WtjKIPlX^f=(16@_=ro6N`U68iIDWAb`a!n&G`IZ-5aY7*+T=`KybqkX0@N5 zi`8X4rMTa*13{eT{tJ#c+Gj;{eJEpqOdaU-6(Q_JZ(0#&(zHVvDm7GHd(JrlTZ5qk z2Gjs?L-*hTq*a=CGji=}WjQUUesjt2^QM1?0u_BybwzYEUXI9{ceoh7G|A1@#JD!5 zB`6>cfFCz%P!YYV53P*SQi%<6)0K(5Uv!52UfNseNpE+!1pE6jAs7(F*8@X*6{tI} zpiZ93p`7qNs-WdH*hr$0PlF9yw00`~b6Q3~PzGAbrK#A0DMpc*F0u`FADU7}RF@Hy zogEVR|AJ0JbO4=$M)m@zTfEhGWHbAV83Ze{e;XMa!y$S6!Ekiv=+UGf3K6j=byuHc znQjUVBwc6&^giTJ&sa$HYOlU&275;^?k2Q(S)maaqtE z(cZb~oX(4nkNVmOT@smfYI7c~**H`G1?o!`lJSw9Ik&6_X;G`aUaFT@63I;gc@!cG z`#5m?mDGWW3!M>3ZxE<9_UJ0D+1cj5>APP2owe1Prrm_5sBT}w%q)LB2q=tgf}1D5 zeJG=oXyqfbuHDJ8JDo}8iN<F_GisgJSgE$fm;Qcch7;nNk=TBy@NXXFrSLi z^aV_y3`DM&gv7-Bk9v^v$4if{IcV^Q47n*(5Qny0;~v(srt@K=!J#3$91cy9N0$UT z(waX*z-g=@qj5`?grynpMZO_V>ej99w%&bQ8Qn2A)4KFu{xA=SjZ9ii#pf4}ig| z+t_(c=}!W5T-P{5&<3sogp1+Ud*#DDI1_t;NILkn7%~XMF11+3!jVX-Tu^*#pVB6F zns%6hPZ*>2KBWyBz|!*3kcUz~W{VH8v=qy+ug(DwcKLhIot)903>#8=3;wkrXzK&+ zaZznl5`DhkVp*~J`}&}**g*GX#>k4Gl?>rM%?e9xAWw{bH|F4DXMYmD)T%P78{yZe 
zx8@^Voktee(R@+o^1vN|_7#vgQ{9qRb&aHwze zg%TZ%hVKOVP%h5%+Er;yzdF2x8V=P}#_WTif^fWTdi}gXtc3IQ{s8X9_im#5Ob3g5 zaIWoPDiA3e4qO{5W*xb#CO?#4*pKIrazTr8PS=~k&35d)Be1%8GBMcIV}XlgzsC)J z#*lL9XWkm}apIS`CQTg``1EpHpv#OMA5=jcjg8rj0I#3?Wbh0kjpHDP^kQ=^O|7@k z!~uLBf%lOO`+)lg@L{+4BKgQ>wk34Qjt^o)lBeR&tvl-(N7f>ar>~#l7~MVehcr^iCgq$cx8^K z5T2iemVpn0^r1498UAmj4v@j2MBcpvc_{8{pkINxrgc&R|9v66r6X73j9$#W^!#UjJ&^hY}sCZvZrymy1zE!gRX8qbV++>vw-4X$zrI zbFP$yk`V?}O*r;ay37FKhP3^Tk`JX)N-MO90!B=z3Xev#vEE&^A1o- zgrHXR3>5fIZu|`2A$Ak0S03n<|O)#$mnXbeAC_hI|3BoS-&@@3Yq6~v7$ zYc0Xl;rd$`e#9P?!~_V3!4BpdyeSJC#CNZtA{^8T^+Ohp*n|r)t#pMmasD-<9-tif z>sR#OwF=GtQyu=jcH;jr+W&i^rxNTt1E?;jZx>PNv>bNR<%{@I_EHN*UJ@lRZG>^J z{x4`8ApM8`{`cxMYa-TtWkjCPSO)ATf?b9sK#_V$XxJJ?7CJ7q*&|nLVv>Ia46C8> z=@=#p_2##=i5k`6*Tx)qf*xyS!y1~JICw8U{JhDhohESJ)b#rJ5Yig}4T1mD@GqkM zIW1(NZvPg+YnxqF#areVl+nYrhnFxZ7n$Y6&x9@lwz1Svqx6voXE!%3nx4)_+=7DS zu$to@3knuaJddQ>TLY$GPiLmnaLL14*Z1losRp}wI76gwipmcJrzR$Lyksk8$S-=6 z!Sknx34S(Lg$XGrhSz29-V7BBesV8a2Gb`^Ao@6iYk6jir%oI9^~@rpr^AGi@Gwix zl}1rRL&Jd%Vu+W>$%o!Q;3t>;Q%(e89yqC1RHz;p(cz;1H+5lh!j##qEd_5Yb zUa-I$qWTJ3crIp$*5p=HRPgLWQeIYb@%C$wL-haozSk~a+N&E4|IY~(`U_hKmwzCa z3u}jMi1daVK!>9P6BCRb#ryk~7nt-{%D4_d9rkjmB=Cd%CZR4XS-4X=KJsTn z1KVnuY#?OZrjO-BhvBm}QdiL=ji*U_egR;Ja?;T(+g*~79rS{o63jJAN}_xiFCdKT z+S#$Zk3_BDB!JjbqX6C_R8UFo@b<#N2?XKWNBmHp%~DG9;)25u7*~cjfk|^<>Kc*K zNA0iY^94doXo&%U91u_jGq&Zl8W>oC(>)QC$&UKMvp$!s}6A=lg{a4sP5nB5y9inUs6Bj>N zbhsR1UQ^5tCBRkSu3|O5LS0^omihrT+>6WzhAyv`HL)}LHBEgWbq8BV@v`EcKcv9W zFoTqW0^%o?{IbXrSs57{Qjnf6uEBeNsWo|P-kCZ>Y-E&YY%MF8s6)RT8Cs?_Y;{q2 zD8WrwW?uy5Bj#Lh#d`$doI$9o$I|j#7tJJ&D8Ib-iAn;}0?P_j8wZKVKN#{uGQdgi z60ZzGd=ixXk%XA|BDf7IQj5$2#2X_9C_(N5xafy&g1{wV3F%(Aef2IbdXFG~w=Lr{oi}C5@ z4}vq4wr42TgebISW;u1PtB?jrJlchNa;?&+4>IND?^uYx1MWe4&}9YrXk!i+sA>jf z%w@b)_&D0fT0A~dGV%cue`sAub1khCTGIYOJ70VEFAco^K%+@5p`fV=3uY{6#oo>i zX)xE#gnqeuXH<#Qi^&}ss%()1j@3YdR6`0OorST{o>1aOV8jM1w`btBnzoflMN*-E zwZZ;}E9NlfFy`CZvFHEdp40Z+J9+F=9w0c ztR5a#?z_<9ee(G2a$I2F%b4PCVfOv@V z!s{q;<)(xmaOxDr-MR&JxC;E}^7;2gMOLCEgH>>b_b&lVg6=JZa{anm*uMZVLlpNl zwCGr?zvFr*UkM`@Q)#c2)rax}M;~kuJf!c#18IUh8Yr&(gSk0@9_rHa2LOcx30%H` z%g?L>T?0l}IOC<2ww^RYV6C2*^l`)-Mk4zFOozAm8#0zwdBRo}7eS2VL-_`V_QCr! 
zCg>Rr;pKA~84Og&$rDA=ekkguUmM+J{J){8>By;%~ll&vuqg`K*z2!j&6kW!3Ua-ZJ&$Ik0vBlKY;;^*mur46<=sErsC2Lny zyFY3MX0E%3$T%H7yTu-;xsw~FcBWF)?XB*B2(&=pBZUF#F93xvjywl34r2G;d&5gx za8Agi@E>i~ELZ5#qck%5xAf3{%0Z??7JSmuucK0_`Xiqm2F9R|lyE6<)*2RTvyXtjua}G#bd{hM2QqIHdlx95Ou3)ezxoNXyL{QX z@Bm;G1Jiezz9Pjv#o?XH7J?6&pU6Yebl{J2${fUlAm9}8rm8#kUnjI(5UhofF69J) zox|od4sPprCk(d6}yVfGG2MY%A!+3XXnv zotmpuIj`)C(n_v(NJ%N1)l?_cILsr|6W2|PHPlYf=8myls^2BGcNAOa&+gyQG6O{= zb>ZL(of^B>+S;Q9D9zhJk=!wS>jk`ha8I@!{ zIq3Jv$8~2c+#lpCc2P;cCOsWNW@WLi0)jkn4j2Y9wuFr3;p+V$P5XQhCjvQkcUiZ$ zKc0frpu+MKDes4Mp#X~+4dCA@rkPg!4RBF7&@TZ0sDiq-p)P1#Fko)XYSxbv?ZhJ3 z$t<9!KC76RDLUp^3&YqV8OFhv&&5&_MIv{IG&FaDig^-$ zAHNDJepfm8a!N=l>=Q{G`S{y$gnIdGU~^Le(wKqDI_~nVpWY@j8e`ol8D1O|KN@I| z_-C^#ZiF$TpbCBepmouk>w0ffS>Z`rFANmu24T{b;`n2bM}X!QX?`JEORQ8`!_fsP zLOBREF&aR@17-+DvL0y`qTpQsip1{ZlaHI-%vGLF1eyOPN*dIWCiF(!n6h;S^!W@} z5(l%DS}fBCl|<=nMuo|*;Fz!-e&c%z zUq6uD32!?Ym$K7$rBV#WGJ>l^dX?#*YKSJ;#g)SsQMz;#m6>z%KOO)I3XS2@)PXvr zrio+W4Wt1}5!|)7_qJ}PwwaFj{7$1WZ8`YQ9VH!R{r#Akm^#l*Ok12jXCk%ZLzdg@ zI^j;cfoc`zLpat|ev#c28NA50b`TW77j_fbdE8Ot3l%l`oxEG@Br+j+TyWg1Ur@GBBUpY6H z6CNE*pB^=;#bS}yQy0ciqcWzVb&x*p9RRQS7wvNY_AmW!vC46|mo1`fMosVt;N;iJ z>lguu%&(J6ja*k+CPw#-bd`tino~w@8RtQBbMvn|)CO1aniGejMOw>NbK&S9kwD&ogZ|Mp;t9ouR*@_clWi~8B!2!PL7)Gol$UhCTE}J zOqg*pv#ck(V)JD-{IQO65yuEO1GOcQT5M!7+!fP9M$t&mL3BV`;Of9H_4RFq89k6^ zC?L8)-pO1zYVt?Nd$vD2U3P7s`*`#7hP1Twoz^_M7|GDEu)>^@p*mZRqi5G*hp1c^ zXdYNEoZoM+J1?wm+zp=6b^ZD1y}PUQ$veb}|W!LvyH4*2Ww4`7)`p2e-*!P9q0K=iYH8i_uoN z&FA|~h4d=U_Pa$5Dp35!iUCdpq`(fq8#JvmfCe@QXHo5Mmg6s8s6XJ1sj&&o>-^D~ zI8e|&Xoi+Qx3Vl2i{Q5Wrd?&IR--~-1-o<={c!Xz4l-ea!k(Wgo*q*}jGDPPqZZ}4 zFC)oQ`Rz@SWCs6_^GF%d0Zs)wZeMBx$O{8*nSeBQ?whV;QGIW1en+WQ_&()|hmDfU zUg$`a(IR@YI)<@YQSGVC7Sd>8UjNgNpz17V)ws3P?#^20)r2-N*)sL9Kf9Xb^Azjz zP0P0L>(!27G9bt26S+MMQh2GW)PKi~;|^f*(dASk4q9s>LHe%r$V*xW{EVv-Wf*E%})b1_S(HC=6U z1>#)Ey!74?=(=s4OfkQGP16+%rwz^EOa$zqV0>bk8moGPiTg`3(?;2N3L*>zd)+gc zUw!;ide6C_>;8xR7?>hfksCSB8DtQ!?lqWhn(^AGB@(h*+DCtX4bK#N-sL#{wb-k|X8k}X;&(|!l?c3VRoIKVmlzVV#zErxCREyH@ zsai3)ouv-;llN#mK9t(MrcPKF!M8n?pZ-AZ`)S=qIq4}A-+%VtC~5e>Ki9p3VBJy9 zbQCFL^|i`DbWR&lmFfvWk3+l z-qN!K*^P5aOd4^-dqKK|C$*kHLy{`a*x zIPd;74EygHVgl2Syn*p?E$c=Af-(;<`7tmtF-?Vl3hgE+8@{lxz)CJ>F*7`q>UqY? 
zY1)~<3BM-1mm_1EX8ODJ==}4mexdLyKFD!oC*SL(Q-V{!XzPWMc-fYJ4MDcf@^Sog)7e;-TE z28&6`*1KF{6!kk6nlD#eY-*}*Qo_)B(y;W~xH!a|n0~Iu$GXl}3#^yuvN|S~A&(HN zX`C6bJL{4l-YNbdsPaA=Th|?7Y-wp~_2i(@H?2d6!M%~l%Ci;FA!EmP0z0-39F%Kp zEobr^^7&8AhCW@$qgYiwR+rx~#(_?EgRS9@&nmg(=43q?an3brY?&P9_Gbh0mnhD$ z8(HRkJq_*nfdl>bk}Hisy~O6MQ)O*iwZE`HU~JYu8_rJuJ(khUy7vVe=%bgb4(=eP zzp6#KU#YQvf>-(;T2M;`#}~!cL^RWy&=NC-7jbBUZGsKh38akZu@n`|PAWp|AFW{K z(NKah+J{^G_~e4!%c_*>7pZO;Iz@#EK+^eG zX&+U00;y@c#z9vv1djzXXhZ2kvyavL_pm4IJ5!E$2Zco>o@=c{1v2+?|GiGR>jSZa zSDx6GeA%u%OD!&8o}W7!kDGyP3MArimb*E~Gz4V)VrsSApzL0mI;$ywWQn0r1cnuU z$RdNUwcE-RJ(^>ob~uwU>eXy_;V1;Bo4`O{R|5Y0obz0}GT&ieS6l9*8y>$~`p$wf zs4Qd5y+*f@2eEmna+Z@CgmrS;bpwQR^NeK&(e$dp;XOYkpJvW&ZEGvM?`~K>QZk+9 zHbC54-XBu9pMfTA&5|GUCb*5()VG|D^#J+qM4{GTfthKZsdd-rT)UKc?L)y`#c1S} zmbe={x;%$p7g5(-cSd+Qi3#c})yB8CDU?mInos*~da$d^lwcv^9X#_7Q@5?aRLpwf zY?m`?Eujs3Ju;jEAhn}Tv1Y1xGJkm8K7V(aq}eJ?$KAJFrv8->fw`$b=V}}K0GfqG z=P%4_kkX}19KpPhtvf7UUxnmPN1%S+>Sso@0*?5#21`$klSvrS-4-+Iv5Ppw(itqz zz)WZPOPnNKFflluv5wx|3Q?@M68-7@3Q4oKhR1&McBAlpL9n0cK-FBgk)F8lW?AT3 z+G&UBsbrA*GD(+xWPoejW#75%(jRdt*U_xrT6$r;`6xb&#sJP&$R8o>Tl+#Agpa(F zjI6w5+5zclGb8gk?fVum+>moE`nB+n3AGGtFl-5V8{M<>)d>r8MaKkg4+@G_%UeF~ zB56WbE(I(#WG>0{@mTTMnnn5)C@+XAS`78x=u>zvT+Shmg&Zd_Te^(sXeURgp)uZX z_EbYvS^nM<g$Ko zCpVdsc1~aMK0Gblsqxqf6CCyDvv@+5$r$M7uAG9C=$=-nhNqjhj<|a(FeFT3p+z-Z ztjwO-jAZ%8!tnJkX?B*YIxK8_h4WaW{?(CjIw2D&;tA((bQ;pR7R#{hO}i-horG== z=F<}JGoF{Sf%!((gw! zo-D-d4n5nDZnBj9n3zv+!^1{0-u6IMo8h2uUwlHbIZm&CP~KB zc;tI=0u@h>X$Q>ScN}AqL`6T~*9Y|^6p}#*+QXQ!a8yG{oij^#ivBlK4)*bnx-=e9 zC)``)uLsb6&*C-OZaIms*8M^f_`Tw>S!u7w#Y5 zYF4ID^C1cuBV#7(ko1{air4i$owT~15!8s+YOL|&VMpi0&X%sXCyejnzMDKMw_E?U z;)yqV<<51yYX<^aq&d@4R}>$ONoi`*#L|3>dDwWqDx|I--!~L{^sJ3#VxPKmDB)Tp z)sXLGIDto?0GsQ4(&$HAx;a;eEms_ITa)A20NU3y?_yZ1<{L}eQScGlj2=$bUi)8@ z1zBC^-TI>cQ6=x(df8#q0sYM9nyIPD7vl9l5q4_Mus*vTkJia`|Mxe`$*evTM5Y^S zKK+2Q*{u?Fh~1r-rsdJ@rLdrPH{V8Mki!13#VB6kxdTqw_ntLJ$K(wcMcJRG10R8*TTvxndR zB`l8AYsT#}FOV-P#9e4P6k64hD{;P^+25xn`LXSW+?%H}Q|Pe>I}Hu7kQOH|0m-Qr zA7Ot4@)D7*N1{Bxll>s^^klK90ky93R*hJya!`tGbEf)5ZJz(+A`pRy01S+DX7pVSe2 zjYGrAmX*UTJlbSflXt)pq5QZXrNe2|V_uMs5I0#Nfo00I62Od|!+V8{uGhRk{Y0Y3 zX>-nXGIDM?R)61|pSBWSFp3t(gE{h10a?JPuaa3>COTqUNNDHi*l}-97viF(oqC%& zjOT6`6;CSa0!8FIiFR9Yy0JswY~gNv=i}2E8Wnk~H988D;I6jgd=KlEr76iWTzR5^ zt&B4vqJXh9ILP;;>0>swKeJ$19ZY7{yIzh%K#qU)){xuW$R+nAG1qnzPKrwjIGq!uZ@)~#Ot%{vZsoL3}@=z;BQ#o zRgXMNVD6u^m6HQf!WIiex-oYX3LaL@OE=T$?SGk_!*-!lrIEUS^X?seH5K9rX?68T z<7lcM_=K(D9#@jyBI?+Lu-&774Czx}2Z6 zCsa1wZr^uUEa%SU^(8bP=r^0MGhh!0qpeYvm)AE*gSjIgV=xk#doRKUSvohuzJSop z&x)m2O8Kc-8{+yGM)FEReko{!M(fJ$vq55+{N2pIo&3a~Ch)7h6AxVtTz}Lik?>PW z$A#6!NCbT{d@|)l;lvBwACzmP_`3Uvnj9+q8OeqYPSK&ks*cijY?-4W#fS`UD~XyK z#>6IGHsWScnxWk*xj8vNwcG_h)F;=GK1D>{!@BvtSIV~jx00Fg(Vtq$!XrmjwdrI| z{GPfv7CE)H0K$@is7#oGh%9{gafq1TCTWR8N=@k2rMq{6CS?>X?E+Ia^EJl7?xEhk zL#6tvOPzm7$&l8a8YBBhS#PEH@Zb)iJESPsZY)3MVOS#-o9K8cp=*LSw@X8Ez|xCcokm{D z>!i^%fVE=cSTVQ#a!UVi2ub7gEp{#^f!Mi3-t82}!$rLq0TpYS_A*w66UMcT;Dwgp zh6l?HqGGLZx*C^``e9w|Xlcb#{MB82i@C+g)yG{g^A)qWfuKaEr@cB}Xp8F$D!PHr zaJQ;oQpJm;vqU%5p}@#~J0>PZqQ3$}&m9qrt{G-bn8QxLTb^ETC8T%*#?3@z1o;FC z*3FR{?gxQ5ER(})^Ne&Y-xf=Yr8UiM=xo-|2AdRZ{TGw3tEV{8{x{(3<45~r5|Owa zk&KxfPhQ$-)$mM32$E#YWA&lZ$fiB^;2Ch$oA3FQo4lr9#gCB%=U~bqI-Eotngo1T zOv;D8v00QDb?vy~M3U1d3Tb$PB_4FQ%q9j zt8JD6=9bN0+?C9%O^qHk9C)=<;>wqjAq@x5C9bz7nX!WES5#_WP5I*1!=1%iUUgzT^{O8$c*kibXlu@hxbOS z=H`Yp^^!!3SuBQiu|_!wq4o&yntOCxy4ZtjD`evK zpovnEEZ@Bo5{gd;WuH%0MAdaJS2Yw?%UlpE4ijFfHBa8Kfkr0erG zqm@a=30C<aVh&RdN955n>vxBzg$eB92FjlNlMkg= z6U8U+OZcRS4VjZ@8fjc_dQVh~o+cB`UI|o)kl^!(sZ}_5X-~|4g#Q=$68a?kphbl8 
zd*}6_dabWXwO(Te4EZA}By}FX#9wCw4C)o>Nk~GD14Iia)YM*VdEJB4>7`wWX;bfV zaj{dv#(bOfC zsmI;qTA93Xi0qnO0dEZUP*{m4^jwv4>5^!Mzb_tc5?Nyg;t%twO z;Rqzb8OUot1RU3|{ay~+VcyqUeveO{sF0+6wL6`3%#{r}JJd_KhqTB^bh3YT>UhWK zr%YgU&c`SY8qh1Jw&>qP?KdX#UK!qR`F*fH%ClKV!O#Ay7W#;xzX9&3L_PY;IF{(nh=Ll6p(b22IokPSny)+_W5z*uBdBJFicS?a)%|H}#aQ_DW!zBOZAr z{pJO4EgpBuBOgDHxK8TEJSG*nwI)s;3WJZNh?O3bi!(flAJ^SF*)DLEt-cR~cr1K2 zkaXANwb3CXJ|ZRH?*i~T_50T%;lJ^ctL>>^Nd5GoT0ds@5CRf(fvoP3@|Sw%4+kuK zvwdj6F2sX?{PpW+vrf6guR>HhjBkcU+?K3ACn#=wM{&f)Wg9LS_j_&1*p#b;bE*2u zWqwriPU?wp56THy1{U`7=F~;@8fV3$!qgu9%757mij=pxI(g>wm#kLis~f?hn}o?> z6Cdk;6y4+(HQ&Q5PHqmsaOLnP(o}C_kXgUX|HNOHJZ|l@a4vMy$TO>ZuErzdq!-89 z^JhH$)ZXp(fWWV*zl9kn@yKuFnHST14%U@F=!9xsty;29L-U1(DZ58qYmX|mQBWkEOo;#73l(B`+hxzQ1<&Yf z@0;JrZZeEZr;)`)M^l<%R;_Nny&n)xntsLY>}X+QTzLJGjgq!D8JU`e#oAy#rgAhi zI(gObcMYJV2yzzkHa)@#bt9@{_oly8Md@Os@9|4&9OE12mo7y(w;Stx#vN^cm%mt` z9j8I@y7y4%jQ$<2pv^DzudGvK&bul4U!GLrd+DPOgB^l?LnrFpYa+Qh=~n8Nxp{fF zI#}NZ1_lCMDOEkzeXE%u_&4zv#jKDDAfazvm|%lN){-m3w5ps zXM4KmdqCoNRrGUYw9d8SL(03cHT;uU7;ysz>%Z6Tm5l?5O2+dDaxenuqd-(~c;O*K zxc&8-y1NAqNH5>A39xy3+d4!bMzRbXOR$_TSG|z(*%k6QsF?~(#YVew`5*o4=hBl0 zEpE4hn`2ju4cuNpo7WT);`ob*)Aw!O^aE|mmTSiPzk)pVyv>HA%+K>XOv-Uke@-_B zblBg5lPW0o8(sq)X>2FnS6ls$qJ@Wk*X}Nj9~Ch@6Kl=nw(7mS<9S?S?(M{{TXW{L zw@I2&gsxp>gQ${mF*Q2D)*gh%#IWZ5OoQU8x4m06MHNAq-?2l*7i4Oy4d7o~Y<8d5 z;-rgFXP6~c1t>?(oIXue-LnO;>m1*-+=ky6b39FKIey)Ms4Eejn zF-*91>^Ex!zE~CxaSg=O?&_NZT7+}k=U^n1q!io!4`Q@LNa0kw;`-&=Vr2q5!v_!7 z%hyZvagUDk7$^-QeM(%?X;N&5D=hO07|i{7t#}lY6TXFPPZFN-P6^L zLDbGnw-#e=eJ_67JWX6R6E1)r=;-*Cu$IzrKG_G-BAmaK~U zq?`_($AOiu!Dx`JWu@?W){r?9^x&3Giz5-|geA4*RZHS=*T$uSM_mJKYYQyGT%4dOXje#N;0o{c_*@JO1 zxy`SuuD)bbW%E;wk}SNn@q)-pznTxGJYf553mcCF^fzYSOuH5#QvQLk`Tct(;#fE) zx{@aYGj(#`l~!#zHvml2`Cqz`tK&)35C{ zilK_bx4K_XFkn8}v^c{l!>MUTLlJUvNfuUsNF7DohBJMN^YfRlAbH42j2Cbf6wI1> zShsuM@%Xi&Stm@?K7^#ECIAq(A4}PqKu8V2uYZ#UU5sgGfDvuJw=3#M0zeZMJ+|4h zJuWbEhz1Scmm5gYVx?KCy*`V0XzCfdkT!TY?Q95p!T_|YXa=K#X!wX(d9BO;wJ znC?XtTKFy_+#t^>3V>1siIT1Vk2SlcLMPoeW2KmHmoU?D8;kpX7__%XBZ4tC?QJ`My4;jCZ`|gcUKY#ak@^`%y^`~3nEvHM&!ZN zdCF*7T(aZK)#~Df3c~ziIcbVEm_nxO6BnS(A+3S(DGN78M%;(Ou}Kcw#UJB6;t7LL z`Ln@qwFdHa<>QF^N!Ou$76D8kg3Yvm@2~+2>Jd^e_2vyLnL@OBlC7f{zdpc`dm9;8 zOOUA-CDSdpo2VJS^2EcQH>LE=Q;gm6JIrN#N6feg4%7M_MzWo#rm_25-{fDfhzK&U zi*Quu2vSjeqkEUm5*9$H<|?q82HjJ6b?dB&xp@|9v}i=m zhK~~#JUw#>w7vd%U>akmSln(1!V1d@_Hc0UTxub^am_v?t&Rpqh~LIvFZ!8GfU3HB zyqz{5Ay{YV2-Hl3S$PdWyB);z50FoEZ2+m$(sHuFv%$Q8V>ro)KLten95)|yJ$HH5 z{_)JV|AF`GDwj)&s9~u(_27yBUi(|HWURQ(yZT4O;QJpCD_?i#NzF zZFwQ;&_S##%AG`&{SRM$${R~5=7t!Ip9~zLg@mh(5(pyr?}LZ~U->T7z3CF$;ERoz z<6i|pHGy~_%QiRE9-vj+*P!<{NhIKW4crE>wGqYz12FCD?ZMzzVM3HYNLT(A%uzCX zX)MOJo%r8hwGJ9vo979;F$r?AH3yU1iTz8UA)l>;chPtcD#q%mloWElqO5~}uyUwW z-vBB$ePUa!4)Yo^Kfhzm;h{w!y+&hWWAQ6e3eVGp*2BX?@EHCe!5w9`zHvvU+>%S88qE!nn5c1Vz8@l%36NFmccl-K%J|IWx1nSc8;a^X6AfU`aC`nYj)~^zIOK@n zubZi$ubY|HZSa^_*tzKJLi?~3*D6=jK0ZkKYh9Y1WC#um(L>$%K83sd0e!*EUh?mv zLb4CL9&NB-+3qg(BDJnV34zlH4uA2o&$f>ezQ*B~N%*-izf2M7;TizYO5WwuM=}8y zCH!3A_2PG5NU5orwuqaaY|RMeRo!XpQ$y;6#DfHa-X4b;IW7oW>`l|^wUw33C+o}0 zY{a3(Umk;)0q)+uBGAfrj+$cGQoBL#Q=mfMYCRx0HaPe|dl;1Z1uCZ1VEzXpnc|AC zfrF+TCt<50`PnE6U9DqZFfi6ox(o2jpMiIKk6-7~P*pxL-QW4M3G8>8{P%vB*~j z;?R(=R{ZR#G&v+4e)XNXwHiPyGGD9Y9QZKUs!Y~PS@cY~*Ts1JIv|It)Jix-*Q)?X zW#JU?uDfP@!YarDsAx#i)t5!J?y3wdjT-_&X4|ef-#p4x{sUev@pE&q*)FQVxfvhM z2Z=9_o5>46I*+XZbg|Zjwwjn>q`dcNNO^6ZIvayy%cx+!c|{~>pGaE+u_bmdSKb#s zD(CD5n{U};|H=__Fp+O35JEgR=vx9+TR8|)oKs}W0LAaHxdxl8x#NJ!yZX15Rr@+G zqI+r3MHHh>Igdwk7lRy%s_kr$uSDv@wDyrli$y@|9rkjX%$r24Rn+qh}*}01F zQSY;=K`;+^@rtNVW^32fDXgU;_+dr6q(NU_@IP$lt3+M3$WF3`Se)ohaVW_xCI@~S 
zR;8u?uA-d5WPE9?fAVY@M+dp1S!O!;`vmj1*4WqqLx%+1tgw)RI-=FTz+}xbR6&nW!2Y}jP`AD!~tj<60s|t}xq)w95agjeZ z=<9iYyIbys&hIbzRt*z{{MmEuvGO8PNIv?rZ&30|&L7c;GR*n}89?+Ij8iIGYlPjs zJ1JS-qy0M@9rhWpbl}lf+0New!B<`*V;?PvmF`={OpBbxG#3;UR7#WQcbV>XwzbVW zQ1z%CJJd`Vb0|bHSv(v05Q~>TxNK9EB=ZoP^qywfr%!Knu(F92J2)@rIE8sM=pMJb z^%-(Hl@Dv-7Y=ML+wsAM)HrPnamq@#d|7oqRE4%~f}6GV8#2}|QI4OXgQjasOX-!* zM~xQS_(xRnRBn9b#C~li^Tk&XkALwlt&;v<(ld=1te^ej^43|2es_?fZ1Sg?ULx*z zs^J-^L#W&CheWG{JAxp4_{XRW_R<_{@H=6g#eA<6BMs#wmNdaOkQ!2Xz6? zpD!XKKB^c{eXaKc5{H*LW(_`_`2YR%f7k{}&`oT0t50f+&qtiQ1G0{4?mD(pGFX zVLQY2%@h~xVhGB`fJQwq#Ij0j+;0Ds00scnOH6Sc3p*0Coof!kh&lf2eN_+$n3e_I z?AUSB)YL5hB>ZHRs=&;MgSgpHE3PA^R3ffRly|?oqWxDshAh!nPQ3rd51W5c_+QsN z6fgZIYtk4l{aes%1GM4h-*8P|>t94&v9pyhwMWKb@H9YD1I-K6lRv@Vsol>oL2mxg z36VJJ996#WU~do6h`%00d|(d)_BZ86;BZ8I7NGa;109e2H0`K|fk7smgpSL4b1!W= zpHnLu7~F4UVTyy!h*HGz1CIjM%O)%_V<^*!(m#|8FGXx`_u-PV- z%sfP~7qkchX`W1k%-zP{hTF=1*;vyF7+*QFe+s9fk?*Fxw(h|2e&xyKvQobK^z}(e zvxoLiB6+d4@jd&?S`5eLz{W<yEUTD|#6{vV z6kYxmvRLq_k|=_C-QCNa=$H4k^L@?nf5cd?7+=Ydh?NXS>iK8&O%^_+tdF-5yIoh; z(SG?Qu2cZXKdmDihUt(}UFAV?k>UOM!~bMaGq&|zabM)dfMmEXkP>ZvlN^7K7R=7| z@W!X#r$?1VJ^3oeMI5tEYcor7cQ-;XfB2O-0vUv`W~U?|_qr?ka1Bl|1z(7~ZF=ppoP*p7-=tPUL9Kh7;Yw`$L0I#l z5Y!j}B|u_sy5f9K;Vn|gg<3(SZt9!!%hFKi1uVhxx2oXMS3oM^0egEo#r@3LFI~F5 z%UkKjvZ-*w((lv(D)XI`51)ukHCL%P(?@Hk=-eeX%owP&ii(Q6N858naVsnULqN?+ z^E@W}e8d}aZi^?$)4SKeW0R6qX95Qlz;r0Z)5w9e4a{KEruV?bOWy5D#I35 z8avbKoJQZ=Hh|*;>=0w^&`)U7|G1Fd_!qYh>KQMJ^jzx0|s^dFSZZlwX#T`a8#mkCi`DfLDBy+sSRi71&1itJz+CmqhpkRSAPZY)Slxw=(1_CF?>$|&65w`Sau?d>lG zrWMw-(Yr;9Cgf~g_FWTa@~3Ja;COsvHR=My#^LUDc}2yjUB{)qPV7*?E_yZ}t! zcJZB^tD~+J-eqq<+c-4M-aC}b)}N{ByGUEG0MDm5x%JR%cX$>`tfs!{eEbsX+<@st z`tzBD-R+!&I|&sjDa|Q0&*RsU$OK-y5S>R};{%iR0b%)tg_huEiZwbXpawOhb`8xT zk3JHpGCt68u3E@2g*`F(NWP=zhSmt`@Olj`Dml| zN$Se-GDr>suL*os$g$oFiXTnOmf_m^9-CG1{10rGwo3>xtWE~EgdN}D%S^o*1iLxd`0#4u!-V9*5F z=IeBOt?otbf!@7KVkNNg;v1TkZSBU~qii+~x%Q(4TVI#eVTaluVIb@*^nToxzrUq8 zkvlL^pJkUgN9q6YncfTD6U`NX}moW~w&tG8Q{ z1=HuTrlq!*F>?1)zLhRjJr=*Nn2ovzK zG?C85Xu9F8ikg}oRA5~-Cts;m*xud_qFR3P1n6mLe5q!FNpoU$iv!F}M+EQW9Ur@> z^H`2-A8t%CYL;_wbJHi<0WeFW!GIqOu}&wLap`{qWooLG{D?mY=^;{prZboWLqqNO za=A)ifZw^FZVV_RGY!RV-^V)(Gr~BVO%aS8jNlc)!B}q6PrZe%9*cDDu>amhqL3*+ zEi68|Bc?5XbAu_CCjT9wSKjMBgB@LMC@%y$wo54eGO(kqsi)l`%H1cgA-Izv&#CkO zwfELhS$$F0=mQ8SpcqK0geV{_Akr-*N=gfgpma*7C?MS>B_$2gk`GFYh;(qqB; zD1H6s98Rj3!osGG=w{yrW9E%F(8#=P_nI#P~0Yb zSh52f;7H(f5tZHb)0#o0-5POFUU_*8Elb~eWNA%OoD`BR?M>FuD3r-bUOah`lcbY= zC4coxQuYUUw7@{0HO5Fvh8JGrdHm|>nSqg}2zsresR??(X+4IGa}rz#%(Kk0%1Ez zeqqTQf4xZ>8sM#_%PLaH4vcy=^dcFsP71iVB8l$H&)pu$!* z6u?I)z0SY3Zg0YM%@ZbDa_tp`->9vvj(2%^EGZrH7Tl?=@7E;~(>mtSy8EoTX^NT9 z2}b|T^8WZasz0^%;H!GA!f*7+B702odTDW5FP-?rn@_g*Sa5UkwB+Q;=V$tl#D_*c z_rFgp_ZaF>p18VA5ac6lyFOu`dlbtRleL8L=QSu*^=iz=gJsXDwQd4IxsWADXc zg)DN0mPnve@igK_7blG^3Xdc>Gl7LAHGUD^q3`;2*?xaQ})Bw&+*Ee7y! 
z045Ul*WcsIR#nh+JBbX*OKLV}h2OkFNpc=5;%k|bHO{?LRzpKpzo~*Lo491t%k~5y zduo)tBEilo)|&?!D8zAyKgNRmnl}=NTD@O>+1X9R^SyhBK#a%ycB}^7jh+OMvH;+b z_SvbLR^QSi>(7%9m-yQcoCOUni9jlR=OCh|5fOWba_~PcKxktuJC*)~QHcif9(jX1 zYR?(o8&;1blat3a;sP{8n=H(^89rJI)^0q{Cj)aU8EtfWnh zKN=ONSNdKh+qYhlzRL_`-8)V9 zGNNk&`nH`f`Vyu_N$oV^d%x@|^UqYf8XG67+VqVB&LAD@OY1d-kAzag!b(TlgAVQZ zL$~%rE24^cZA`^RWRv6KiyPyKDs6yP77m0T=Zy62vX`TL#BwM>8PRvF=g}F%QEF5u zF}}bZj_u(<+J#?A@(6(Hsri3<+e^1sL}hXiW2ciAPqbgQMxI!eJ}r^a&UCUj6y^<$ zQFivPu9eN$nS$8eV`>g*0_0^f8Y=4|YuHc}we06Fc=v)9^2YACH&%^j6=CI2IW=kd zey9t_Z}#O*o0GDi>`CfRC2UUZh<@#u#mq)P6Ce8N64|&WdZ!0GDVPVv-1=&w9F8{~ znszs*`sR=|yeC^d&0FU6U6(`&4Ruvo7`~lSvLwt|l;!;&J${h?lw^*b_bLC_$1cTv zk*6iJ>r+!T`s;7nDAJ@M7KgtOPfORXq~+py;2r<^I#KhZ0>cE%P)t?|r=+HZp?)GE zxc>b8`$Yn_e`r`*l{FVzIt(C3w`Ki8Kbowt^+Cp0B7%05NAY@@3AL$&2x)-Cnyg+= znSZWEim!OXZjg3mf(^A~vozuH%KaU|-G*vHSIfRm(UV!^ZkYW?IPZ*<6rZ_p>{99t zn3D3~;N^ZN_fXtFWkzR1VJ)hJcjH*um+$y)2vm0C2-rw!zAqCW^mko=yd3y}DlVK8!@094IVt4h69ub7ET@G=)P+Z& zaWMX~%mn;gJPpdq@10UYlIyv&N{!Bk(T~!z)Yg0s=`fhYN(iVQTI*iwefdNy1aQ|o zjIn6ml}h<0)bH+$9=QdQv*#nPpfhzU$r?Si5;YR#2vVt_4#Ng_f45+Nr>6O0 zG(K*8-vE(d|2iT{UqLHGf1s_WVEo9_zIUNerc`8AZ7ZX7N=9>YVgQ-|S}2=vDJsid zV?T8H^{g0h(?jjCvzK_k04U*ux21|oynxv>cCZRmfj^n;?fhcz_w-pf@)sGlh?hZcDyzG)8NHudTq&kO0RR|F6O! zpA<7yfTpE>Rn@=AR_v%gn-2olY9fzLXfi9D-TAR$r+ZX#8?ea-_CegcsfHtU06ll=WgQXJ0BJRbB%s7Q|R;UXh1Fb z@(V?%26jgA`qTYL6%%7F&EO?#{q7f0kYn;xU7--ppjGJ&y-iqqpJVT*sgovLjZYPu zNIs~XaG0rw#b>45SL`TQm9(Cbe+e_VyEMSa5?5)9ZJkC@4EM=z^mM=ozLdmXDZ0h3de=18~vnYKN13 z{r%A>Yd1IK*Jl~|`6P6px>z=33W9C4VRRxDylo|FOx<;$Ep~8poRE+}z?ur`LmIh6 zE6{;G=Wwd*{0kQoO*055bX*8}Ba9$)bO+!n=OSD7_kr^p+a>e4Ls{Je?CudM{*C5s zkr@1p*(cOXCS{BiZ(q*mY*2agZz3%w~6kl9W|BXq(<6rrAN< zgbE27g#%S=sFOs3R=n0ux9~Wfjzg^wHq(C8J#2)JA zJ$$l!VLV3a$ps=RT2@-u{gYhtH=`Tmda<7Y+tg#4Tw&`$mGEg1j$1xZs=W;5(*Y;@h|FU6#8 z#lzX#gzWDH_>=x{C$e$>a3|_E`JtpGi2nY>_9iy}rm3$h9=cG%`}_SlM1fPF%m;a+ zg(o%I_7!T|lPbd6ptr33StHkjDq5Cyn5Eh7pJqDIov2nQlr`T{VELn4TbO-xLT zjZ^;M?8Nt)e6PdX^Q@b=NLeDCn5vsW-%5L*R*G`IQ0;PATvkfz?!C<1+z6LP^R=mN zFX@qz2syGseNQUKrJuvU-Y@H*dgFDGEWiEfQ!LF4&fLec>N@{4VuNz^>hy=oNMiku zqX*+8Ulb6C&*E^qOIu(wcou!2kqSU|J^~400AlcAhHdh*n$lYH#(PkKq$vJb52_{M zUhryWq!o}=6;)LwSs4LfTXX$?fbEpa%FnGUN5D{p2_yvJ6dQ0KIW`7Ex_O8~aWYfI zKV*cGG+4jmH)FQzvEL3*{I*%upsSFnTzZmPdV zVRr$MbyG!q0D%6|L+K?>k=1si62+;UwwGaCVS8w4!jYaH{i1;khq62iF51C-814 z#MlGvtmq1uY400vdRur_2^Rn$umG2d#K~%%**HLJcA+vricD9FIsU###SR>9)bj$5 zi$CTA{v1&qj`bU5XwkMw+M30vB2B3CUi{rJv31E$C?3cb3QP)g=P2QLRuTqis0A(kGuNn4|lbm(lAdW zzbPIA;Xd6U#JS9cxf);PijY z*x?wrJ1$dAU(|p9h*;Z~2G*@q&`k()t6=z6v??j2HnejWU52NxP7I##1U~~1bT{N+ zY~Mr)?#g_eWz)yM@u3Beu#PPzF|2Ol_nvd1;xk>%6JH#?P6Kg|q<%=hSoO9(rk{yW z`>ysObJ9Vu?Ha?wR065}j2)|9tkV>Cy%+dnN2Q^0$V~?BLix zwwAswRiCA+_kfYo_t8t~bTLa<{wRC?$@h;spN(Hu;iL+apuxPX2v)P&XfGj>a2&$0 zEs{BIvl#|T2CN9}_v+>1?6sZLNq>9ugLgpa4T_Y@Okht{-_F_hjP1P=KSdC*Sa9c} z?!^Ex1%PIdTpTW(d61TvYC{5&t?iqw+$`k$nh>!HkAfg71qF4@f&3X%kEEj39jj0M zu!D4K=Qdk-$-y){sHsS_XC*^0`Zb;l-$n5Um=kyoB6}YR&tQVyDM~%RPbvI61q#AC z!2{3)Z_83-+B-UqXnqEwstI#xo?_splEXzKoF_8I{%7-buTs^~8Ox^hFieb9|)x5B$%)XQb<7%@|6KJvo z$=F^PUD!=}hb~{tn1I|(~>|0_dDlWY6 z9FmPuvzBfDwsWxKb3uIvtOdRY%;A1!`d>T9sgaS_bJ+Vcdt|w+)L7G;QLmq31}PHyDnc|_es6PKYJ_EOk_%Sm;)c(QZof~b$AA1? 
z_`Ro?(t?ask#H6ZH*7}bk{$5-QBCGY+QLAp7S(2pk^OETeB%oTb=yF5@>A)(X5zGS zRs*hvM8fZ`@@vljzvcZJ@LgK*RG94D%nVXb@wmPGu4+@fU?o`87GhL8(C9IB{^?b- z|61j1JCn@+3-YJajfnLh3`~{&L`9`Si)EOm85@Aj!otgCm>ZZ7MzASR!TA4P;^~jM zPkI7WiKkQ`GbFFNtcqdfHOp6m1_FDxUHqM}%jI@bD4|$mnF3{y%;V$ZK5ZxFwh}`# zGnGtbI=O_L{QRcd?;?7Khxysqf{p0F3T}&+U4TVIee~AV-k$U{RsFX% zzTFQ7v$J}RT*!X7uY+w)VPWWVQtvwI%CokW@zS6=1QRz7vI`^QnYG52MVGxJS4$hO z=V1A6vWIfovF8VxS}}_Z$JGRIRf04WG?>#!gtOAXmQ7|P~u-#3$+cWBr0Pp5h z0k&hY_y&D~#iJ6oZ&wq5CUFXb{@EH455Ajl(Fs^W&T4tRr0g=d7SwP&4$y}|m|BNd z2NHlL;!27&TnR>N+WGf;nCMNUHMil?d(QA1W7}YEkOzmGf1I=dx>Ka9RK+8GVcL;O zh}#YncW~uJVfjJ0rLrd1%I3f9Bku9_S%HoxKM9_v1xqj=Sb~KNpEvYzXjbYRT=In2 zl!HW4Ui#4Ydk6=CcdmVtpiW9o%A_BNbIK`Tss%+^qr@j%$voFDfXZ00YJJ%^zFDd% zso=*~V)X+gQh>^q^A-qK_Wa#JBIFM!f|}+w1=fFYpOtFH`+NLg2c$Vnp8hr9mAXj0 zV2(T;WMC}D-ekPp=vQ=$1`7rnvbMq~8P%?Kyw_6zspi{|U!QrEU&#)>@9poGdU_p> zQU)0(Kjat}Ebhn`e>s!hb%)uO4xbcH6tbxjjTKubNl9-LiyP$AnUKF3mB9K}r=@Y> zRE%XOx23)U)pQc5rqd@~ruq8{m~_E`HmxokG8Hfhjcw!1354i$;aE<94iY;9@hkAK zPg0@>Ok4{|dXnUUM9E1lGPU%};*m;r#1uDBZ%sS5?D@f$!HSS5Sp5M_lO;tE?pJKG zzoRewkJW)t6JJINuNd&Rmz9)AUn|Y{8pSKuZ^E_sn3^lO0~bv>8J)4x@9FsZvS^*( zyliMe5;(p0fBD71fzq*Bm+;!6GmZ9TXsGUxx>rx{n!13i;V!83)?6j3>tzv~vG-Z* zFZ&+33Tynu4B#nWQ1pL(E>@u9ftl8|u(4XNa(lY=P3=Z&le=s_4|P<~#e{z7QNdf7 zyuo923pWrmQs)ge1A-_CSSC@e`UPqk7~rrWjXQmxR=y86LI$kp(o72>Z3gbDSs6SA ztnB?b2QktnOE4&bIQg73a0(Qh<)7pkR7hondb36?xPP3EVgUhUfP)pkltR-K@_grV;pB$W@aW5d7^&(1{2nLnG6PJcp9PV{wTH9rpEJkj1;H_svruK zVrRaZ_DQMuZHd2g<@7Il?Tr8QVCA`iT@jHV_kzjAUL|yULfDYI`4RnXY&5sT{i?@* z$4}u2QmNYWp4X`(Ut%`9{Czn&7yq%PPA>})s7C+q?{UblsW}Fd*NFHA4J8v*IkY$2;FwS;!=_l$ni0l<&B{! znb3z|c1tRaun&3Iii99@R@HvCQ)nKvxc0`5x69;CM1+8p$;(^1y6JcJB0b{Y8F*R% zoGONg<7SyU+p*QOzt0XrEoglcN>dkugZVnWmq2y*QeH4VTGTY8KH*`yCTjP=8z6V?93MOxwMjE|0t#V1@Oqkv2IlXo)eHb4sa{#9)ob|9uAC zCIR=B0G00>30fx+=Eq2h)Fjd=ERZg&HchSN3 z!z4tJ^<9jNDpnqWN(d*p%~0&kl&MblZs;H|y9i+Af)KZ?y0&{d&IDe$TpQWD+9?@- zsPNxDAB4MtZX)9m+wF8m@qfsiy)tqa5)DUB52NsHW-Hk z%`a*4zd67B*HcfGGD`qNf+gfeS~(lQ86c%onq;>9gNGm+i&3(TY*K8-#e&;Kn;Kvu z(_dYMRcMQeW?@lA#p%(K5?jns3w|4J~Jv68nMX@&4Ictjz`4xZ_tDv zmsMvLm)!omgSh%SN~T0hpGR# z;DbNV8aHQFP(RCnS@X-(oMEgedpd&m_?2{L8Gs6RQiz@npmDG8vNc6(FIcRK7>yvj z1pkmBjGHWlr9^JX+P{~bnnn%)Y4fh=bK(ECc=vCX%SSsp8LlF>yko<=v(Sh-kboA= z>Fhu5=2fYA6$O*N4O;Y`&E;oc2Phh&8^VS7gpYpIGc(DKIk{^LgS<`3xS9g!%N~HW z-GaYru7(XLtL%eJP#;(TWiuw3W@PWXum)g15R!Ru2hgHM+>gcqy*I^b;|By5K$eRyPu!M(3Cu zGWpvJZ;#vEj*CnnoAc5G=Z^reZUvFcX`B#PgHHik`egL16Lsja&6!z_ib) zsnI^XezuwM!l`nkyVM6}VB!Br{Z;n2COuvN^$h|7CW5PBd-8kO<9RdZ--(zp>>cZn zx;j*4Ja*VVG^aQmGKWrSs&istrm?GUEqaI9=KbOM1s^YOc zQdqUB>Gdx>2DKcAopeZ2PE7vECv5EO{JJ2)D^7J7{Z~{UFMv$O3L;yVnML728)A2VHpa-nG>(WSyWgV~9U zf&Od|1tIQKh=*bFJ+eBz%~USviu*ZKsE?mVG#wA`r)sNLb5a59ET<3V^^I*QHTXa@>Eh~Tetk&eB5au$-TPVrlv664SO5q5O~+bZ^mW}MXjU{; zOtzLjJ_Du?YPj5ba$p(k_w4PtSqqU-Cu=899v`*~9nKYT6Vuzi5gzwrt@-FDuc*i_ zBqW!3GADPUv>b(FC6{G!G&}ofn7R3mG$ktWp!^rPxSgT1$0e$mKJ7?Q*Fu0Cb**E+{= z_hnMr%=%}iJ#Ej^ME99z^Kp9kTEkLO^vW0#nV0vt3ITOFumyS}j1W0p3=MZKo~_!O zjqTi_(bUkun-X#X0Y7$`dxUU=YdYaC?>IEAq@c&|C zV;g1>GSM?H35#G3$_X7QbFG*3X(!p zH_ozUb2mjm^jhuw<;BBwE*dHgWp5f6UxD&XI0Pz zAklqXf!4y#@=Y2Y9AFQ8==uitDGXLnoI=TpnJC92NlF)GwzVr%nvwBG2RnZHOJKey zm%v{RP+t`fbX}ll9We4}k4t3p$v&A^;x^-lqatX3G&VKqILt&Oo*d4>lx-X9eFGW} zhrQ*1IOdz&=Yyu#<$+B=a@7YrW-5GC?LKRkq1B3>C(bHexq(cer2;N22ZQ9A_5 zjoYO~H@V_dTineG7ObCR5aH!`f3Iz}K10y?oCKd#_{Ng9#0^_{^XW$dp(NR=SFnbQ zxW#!^zS#wvS4%yqxSCyApC4+rH^trita>5#eyWrL(Ss0YAxfe*H*1{tYT8@F4;Oc; zkht!5IU^nlh$*u<{T>$?k~&;eMOHX$&caMPKCTh@K%00tC3Li~b}YD(QfqI&O&N5u zw4teUxMZ$H*qB(x81@bKD06vj4dBAz3LE3ahx%cv%#{{>`8x0s6f_B^rl;!a1WI<5Rp1B%Nf$o(VAlXUB{wd&2;w0xxfURRoGLbcT%p`76H 
zYLeYp1!^m5tjcyPBLA3maC|dHHPQJH#k^|0F}A2(2>E*E`*K zZQ9E1WSTf0=Z^9dyU~jEeLvPP*1poOW5-)VV+nmfe)#oipFmRI?mX5ipjQ*KH@h@n z?zTJKHP@c#X2)h;wLMwV_v!*+N{7>3C%E&I>*l{!7;dpD1>y4^yB-XU3BtZGR8(m> z)g*MhZijWSI+V}v#Pd0kqGp@ZBr(K!(|h1Y$H9c1oX{G7Ygp4^Pd0})YejX+sKZiY z#6#~fL0+f5mRCB?dow$uSgfr3MXcI2&jVdJkT7N>?LvEtzuQ`4V$5qEYj@52AP=V9 zkVCs8Wh^idc9bW3{i=!OsPA;`?1?Vh>WA~WW%&>0t{zP9E$?vIZ{l|FIlO#1DBh`|uGa=X|)k z4goFqISxWNOMGXbIO6;2NM*Wm<7-VVtzke|evf~R=W~?CK(Nh>x}6*m3hqAJCuT67 z8ZG=Fq)(>^|k?<<-^k%W)94 zuE69rwhLlnVfmEWRAV#{^P2ca@@g3@l}}xm2)5khdx?C;STg#l!(b^5-fgJ%;vj7J zvH6#xgL#QY1L_dg4d6FePgL%VI+?FG!_CvJ<$!2<9hBl>KLg6f)J%QYC;O6Dur+^* zH5#}GTLdc$3&jE5+vacIzHKe_rB|lYArM^*6{_GOoNn!_HVrogvChxWZxoGw$lmNJ zOd9C#SJTm1+a`hMik8_~9RWq0FRn)wt@sE;?K?&0YoDMP9YACB622c|*lKQme){Kl z0r+txiIQb46EibGxK$@7%>e?0`yE*1@t~F0(D1+pjfH)BYc#Mo2+L%2G*HD6-g@+` z5;o$Q<4CpB{rx#8w0!bI-v?RGU*N2BDb)B6V?x*{^RJ0!uy)qp<9ce8(O=+WAx#4Z zeh`=;j{o=D(Bts`IUq%O=+{&XgU@vff%p{!6sh*Mwnr#=eTarl<7~8z5FlZ%?i7R) z^e@US99mV{Fv~5NWR##*AC_=DtZB;&j#-qhyA!u={1xCDQU813a7QWrr4>+Bzt-hP zzjh#;KkmE))8E38cn=5Rj%o()_0x~k$jAPF|Mh?FOZ3l#k~mqYZoxbQjDbJh0wIDW z4yJ5fp8;oYZ?A8IZrlLcoScccueaBOiftT5q>JIKM=5#2!}GPfu{Hq+(nvOfB}woe z9keC$dcG-DqVZHiBdf;HHs-|by>r$BM25X4JgeYH zA>3J?hNQ^qS={9$LSId1_V@2!78tV0dVGE$%fk}I`uo;5wkpdhi6Pd_J!fYub@q`5 zV0-}0wwE@4$`uXB+HUSBz{~b8Dv)s#D1MOJ|3IImgV6mLW{W#@^UI<%#XEy?07>>3 z$t4P!mS||YS=!t8_qyCAsIxTw1zCLUv(QWChK4Qlm~86Gtq9V}*BAY`w@UL=t# zPhX!;YPU$LmeQf7smVy}S7tu;Q;)rYrpKtH=t9(dVM%E~Iq zO5&rvUT=Me*l%5Fam#Td33Aek3UYHcAt7W{l~#VA+`ZdHCrRjAj|BGy?(*=~nV+SH zb)B!K$z)Jr#|Xu_i|#GqhqYPC3|C%XFy%OpJKEmctFpmW^4L>!VxcR2ef!9*-rUsG zhHqo^YrYoCj>F{oZ5UF4ZL&u{uI~e-4RPhx-iYgP=UAOL;nWLg)9qjdZ->w(=WSJ< zC({cB>ak`Wa{E|P)H{(F79ltZHfs5Oehs>7=UkqXUNBa9LFxq*MpM(=o!3aC5?&#p zv6JxH9vXpC!a55esI3}V>j^+)}y+*S|a6=*93gNk-3g)q&f@hf!A$pB_sKB__BclLj<0u zXt;24lpmkHcvEDY&OqDZqS0^&O=3GMGxLM|YiC3N0jkj|9w=zB{)`?%?#faM|F<)h zqd4#Ru_T2S2&sC8<chW{BlN-2_TDp1|?7lR4uk6f2^YjIbiCkiDzUvOng z{boDDoB9YNRb^JW%^8D~66ha$j{s~-v^Qu;k5fpJpO1^`3!*wZJ2y9V9!5Pm(;_{T zKLo702haJpz2qn#v|bDPHr=RDy0C54V{|W}EV^|DX19~cMe|;O?QB{M_Ts=_QEtyom{=5ph5aknb@xSeVmtxs>7hi>HvDi&w&{f(Nd&a!DFL-w%TqaHL z<6`{RpEjo#f(Owy%2s9wFiB!sV5L36Ac!|26ZR!2So!_Tlr)U-GWgHxRm|%Qtr(b& zFnck^IP6Qiw;WpQYA?>!Q~+jvJq<)u_F`!$AP7D4h|}4}_n;bh$L1i}{Yx5-6I6e$ zh@dGfEhUx{^8a8%a9GiDw!?rE##xSl?ydA{X`+4D@N9j#tBY-tN$i{G>vakj(36zZ zt3^7vx)T3_CyE4?*Uz4u3G8QQU}+X&JEtk~JdE9vNdr)M5;}b^68SY~v8eL0*B3|r zpmVg zvpP8NkWs)Z>kWi)l1a1L&In$1yp*f!0$N6&R>TJbjX2Sh3taD~q=UgrnI2yuQELC8 zAj1(x9A!Ky2DeU!bJ?z@t6-wgL&8Qi!9?>Cji$yTkYxtpxY36!jWrll|#`sDtE00Yy`v2aUB>K*}RT%s}eLHVlsL2(ZtIPn?YR0PE8gF!C!>U>v z+3y$IIwP?uIwb_Wl3MZXzNHq?K3H=Fm3r18F)=aIwr?Y>Q<2dqT#LuB9don7nCN6O zMYhf`M11c`motVfyz3CQF^pMeTWfnqv6hFX>Azk)cMmxaLti^D?RIy8@Yy{4sNyXS z{s!HGoi8gZD>jkef7+azSIq1-Dgn={jVMdiBaGr3D-UJ@N+YR`!moJ?qHjl!On?8H zz@ORVhu+yq&U`&QbUu9Gb|$S*tz1i6L&Nu^WWeM?RrYd7((Ht@#$Wj{9I71+N*PXb9>+UV|bebHF4oH`j(Q+BNRHtErm_kXapg=Lq#atnzGzA{&sf3wf37U0E5aap4<$VxOv#-GI(GG;&KJ?47)X zFrtx5jOTe6Vo&#+6h{Q+d>xxJ>__YoH^8+GS29^OXq(t;0$gpYwaxJrV23V3afhGY znB1$Nzc;qx)!#tst{~&;`h+3E@5VvVYbjPDymZH0LF5h?djns^ln&;$Kw!D`fj1## z!s2yksGEB{=<87(MD1`B_z{O*=&jVcC4Yw`;dsBJw0ofTfX8W?$MiJjKBSXCc#YR@ z`FhhwwZ@Zjz?)eU4DR?~bR_u^9vc85n5}b4`vXis3qDvbc=Y<_NGaK=(Vl;Lkz_)b z1UmSXu$yiwA)<*OCJ1)^&6sM#aIG~UA2Xet@cG7O!bz_1cK3_9Q@2mDJiYkSJNv_O z@Yr0jPLjz2s8xa(=LM>>$?VTj^}ylV>YDGRsnN)#?K2G&^4QtlPR=9mxG;6aXV4|i z9#4!6y;Yh7JlS88ES6BMnovJJxr$h+>LLyaD8y@=E2Prqi6-ju)(~r)F z@Fn!e|4!Xug`R#$vHm-R{r~(nMr(Wf`3=Z);UoC*>c7Llzbj_KK5~K!Ho+UDM!#mz P-zNRwv3Q>7GmrlV6Gz-% literal 0 HcmV?d00001 diff --git a/torchrl/objectives/common.py b/torchrl/objectives/common.py index 
be931e8c260..5c4bc835e5c 100644 --- a/torchrl/objectives/common.py +++ b/torchrl/objectives/common.py @@ -284,7 +284,8 @@ def _target_param_getter(self, network_name): value_to_set = getattr( self, "_sep_".join(["_target_" + network_name, *key]) ) - target_params.set(key, value_to_set) + # _set is faster bc is bypasses the checks + target_params._set(key, value_to_set) return target_params else: params = getattr(self, param_name) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 04aef9d0aa2..d8bce805487 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -110,9 +110,11 @@ class Trainer: displayed using tqdm. If tqdm is not installed, this option won't have any effect. Default is :obj:`True` seed (int, optional): Seed to be used for the collector, pytorch and - numpy. Default is 42. + numpy. Default is ``None``. save_trainer_interval (int, optional): How often the trainer should be - saved to disk. Default is 10000. + saved to disk, in frame count. Default is 10000. + log_interval (int, optional): How often the values should be logged, + in frame count. Default is 10000. save_trainer_file (path, optional): path where to save the trainer. Default is None (no saving) """ @@ -124,7 +126,6 @@ def __new__(cls, *args, **kwargs): cls._collected_frames: int = 0 cls._last_log: Dict[str, Any] = {} cls._last_save: int = 0 - cls._log_interval: int = 10000 cls.collected_frames = 0 cls._app_state = None return super().__new__(cls) @@ -142,8 +143,9 @@ def __init__( clip_grad_norm: bool = True, clip_norm: float = None, progress_bar: bool = True, - seed: int = 42, + seed: int = None, save_trainer_interval: int = 10000, + log_interval: int=10000, save_trainer_file: Optional[Union[str, pathlib.Path]] = None, ) -> None: @@ -154,9 +156,12 @@ def __init__( self.optimizer = optimizer self.logger = logger + self._log_interval = log_interval + # seeding self.seed = seed - self.set_seed() + if seed is not None: + self.set_seed() # constants self.optim_steps_per_batch = optim_steps_per_batch diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 956721e10b7..3583aaf01e8 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -1,696 +1,703 @@ -if __name__ == "__main__": - # -*- coding: utf-8 -*- - """ - TorchRL trainer: A DQN example - ============================== - **Author**: `Vincent Moens `_ - - """ - - ############################################################################## - # TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle - # your training loop. The trainer executes a nested loop where the outer loop - # is the data collection and the inner loop consumes this data or some data - # retrieved from the replay buffer to train the model. - # At various points in this training loop, hooks can be attached and executed at - # given intervals. - # - # In this tutorial, we will be using the trainer class to train a DQN algorithm - # to solve the CartPole task from scratch. - # - # Main takeaways: - # - # - Building a trainer with its essential components: data collector, loss - # module, replay buffer and optimizer. - # - Adding hooks to a trainer, such as loggers, target network updaters and such. - # - # The trainer is fully customisable and offers a large set of functionalities. - # The tutorial is organised around its construction. 
- # We will be detailing how to build each of the components of the library first, - # and then put the pieces together using the :class:`torchrl.trainers.Trainer` - # class. - # - # Along the road, we will also focus on some other aspects of the library: - # - # - how to build an environment in TorchRL, including transforms (e.g. data - # normalization, frame concatenation, resizing and turning to grayscale) - # and parallel execution. Unlike what we did in the - # `DDPG tutorial `_, we - # will normalize the pixels and not the state vector. - # - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor - # that estimates the action values and picks up the action with the highest - # estimated return; - # - how to collect data from your environment efficiently and store them - # in a replay buffer; - # - how to use multi-step, a simple preprocessing step for off-policy algorithms; - # - and finally how to evaluate your model. - # - # **Prerequisites**: We encourage you to get familiar with torchrl through the - # `PPO tutorial `_ first. - # - # DQN - # --- - # - # DQN (`Deep Q-Learning `_) was - # the founding work in deep reinforcement learning. - # - # On a high level, the algorithm is quite simple: Q-learning consists in - # learning a table of state-action values in such a way that, when - # encountering any particular state, we know which action to pick just by - # searching for the one with the highest value. This simple setting - # requires the actions and states to be - # discrete, otherwise a lookup table cannot be built. - # - # DQN uses a neural network that encodes a map from the state-action space to - # a value (scalar) space, which amortizes the cost of storing and exploring all - # the possible state-action combinations: if a state has not been seen in the - # past, we can still pass it in conjunction with the various actions available - # through our neural network and get an interpolated value for each of the - # actions available. - # - # We will solve the classic control problem of the cart pole. From the - # Gymnasium doc from where this environment is retrieved: - # - # | A pole is attached by an un-actuated joint to a cart, which moves along a - # | frictionless track. The pendulum is placed upright on the cart and the goal - # | is to balance the pole by applying forces in the left and right direction - # | on the cart. - # - # .. figure:: /_static/img/cartpole_demo.gif - # :alt: Cart Pole - # - # We do not aim at giving a SOTA implementation of the algorithm, but rather - # to provide a high-level illustration of TorchRL features in the context - # of this algorithm. 
- - # sphinx_gallery_start_ignore - import warnings - - warnings.filterwarnings("ignore") - # sphinx_gallery_end_ignore - - import os - import uuid - - import torch - from torch import nn - from torchrl.collectors import MultiaSyncDataCollector - from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer - from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter - from torchrl.envs.libs.gym import GymEnv - from torchrl.envs.transforms import ( - CatFrames, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, - ) - from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor - - from torchrl.objectives import DQNLoss, SoftUpdate - from torchrl.record.loggers.csv import CSVLogger - from torchrl.trainers import ( - LogReward, - Recorder, - ReplayBufferTrainer, - Trainer, - UpdateWeights, - ) - - def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython - else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter - - ############################################################################### - # Let's get started with the various pieces we need for our algorithm: - # - # - An environment; - # - A policy (and related modules that we group under the "model" umbrella); - # - A data collector, which makes the policy play in the environment and - # delivers training data; - # - A replay buffer to store the training data; - # - A loss module, which computes the objective function to train our policy - # to maximise the return; - # - An optimizer, which performs parameter updates based on our loss. - # - # Additional modules include a logger, a recorder (executes the policy in - # "eval" mode) and a target network updater. With all these components into - # place, it is easy to see how one could misplace or misuse one component in - # the training script. The trainer is there to orchestrate everything for you! - # - # Building the environment - # ------------------------ - # - # First let's write a helper function that will output an environment. As usual, - # the "raw" environment may be too simple to be used in practice and we'll need - # some data transformation to expose its output to the policy. - # - # We will be using five transforms: - # - # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; - # - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 - # tensor in a floating point tensor in the ``[0, 1]`` space with shape - # ``[C, W, H]``; - # - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; - # - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; - # - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; - # - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of - # successive frames (``N=4``) in a single tensor along the channel dimension. - # This is useful as a single image does not carry information about the - # motion of the cartpole. Some memory about past observations and actions - # is needed, either via a recurrent neural network or using a stack of - # frames. - # - :class:`torchrl.envs.ObservationNorm` which will normalize our observations - # given some custom summary statistics. 
- # - # In practice, our environment builder has two arguments: - # - # - ``parallel``: determines whether multiple environments have to be run in - # parallel. We stack the transforms after the - # :class:`torchrl.envs.ParallelEnv` to take advantage - # of vectorization of the operations on device, although this would - # technically work with every single environment attached to its own set of - # transforms. - # - ``obs_norm_sd`` will contain the normalizing constants for - # the :class:`torchrl.envs.ObservationNorm` transform. - # - - def make_env( - parallel=False, - obs_norm_sd=None, - ): - if obs_norm_sd is None: - obs_norm_sd = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) - ), - ) +# -*- coding: utf-8 -*- +""" +TorchRL trainer: A DQN example +============================== +**Author**: `Vincent Moens `_ + +""" + +############################################################################## +# TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle +# your training loop. The trainer executes a nested loop where the outer loop +# is the data collection and the inner loop consumes this data or some data +# retrieved from the replay buffer to train the model. +# At various points in this training loop, hooks can be attached and executed at +# given intervals. +# +# In this tutorial, we will be using the trainer class to train a DQN algorithm +# to solve the CartPole task from scratch. +# +# Main takeaways: +# +# - Building a trainer with its essential components: data collector, loss +# module, replay buffer and optimizer. +# - Adding hooks to a trainer, such as loggers, target network updaters and such. +# +# The trainer is fully customisable and offers a large set of functionalities. +# The tutorial is organised around its construction. +# We will be detailing how to build each of the components of the library first, +# and then put the pieces together using the :class:`torchrl.trainers.Trainer` +# class. +# +# Along the road, we will also focus on some other aspects of the library: +# +# - how to build an environment in TorchRL, including transforms (e.g. data +# normalization, frame concatenation, resizing and turning to grayscale) +# and parallel execution. Unlike what we did in the +# `DDPG tutorial `_, we +# will normalize the pixels and not the state vector. +# - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor +# that estimates the action values and picks up the action with the highest +# estimated return; +# - how to collect data from your environment efficiently and store them +# in a replay buffer; +# - how to use multi-step, a simple preprocessing step for off-policy algorithms; +# - and finally how to evaluate your model. +# +# **Prerequisites**: We encourage you to get familiar with torchrl through the +# `PPO tutorial `_ first. +# +# DQN +# --- +# +# DQN (`Deep Q-Learning `_) was +# the founding work in deep reinforcement learning. +# +# On a high level, the algorithm is quite simple: Q-learning consists in +# learning a table of state-action values in such a way that, when +# encountering any particular state, we know which action to pick just by +# searching for the one with the highest value. This simple setting +# requires the actions and states to be +# discrete, otherwise a lookup table cannot be built. 
+# +# DQN uses a neural network that encodes a map from the state-action space to +# a value (scalar) space, which amortizes the cost of storing and exploring all +# the possible state-action combinations: if a state has not been seen in the +# past, we can still pass it in conjunction with the various actions available +# through our neural network and get an interpolated value for each of the +# actions available. +# +# We will solve the classic control problem of the cart pole. From the +# Gymnasium doc from where this environment is retrieved: +# +# | A pole is attached by an un-actuated joint to a cart, which moves along a +# | frictionless track. The pendulum is placed upright on the cart and the goal +# | is to balance the pole by applying forces in the left and right direction +# | on the cart. +# +# .. figure:: /_static/img/cartpole_demo.gif +# :alt: Cart Pole +# +# We do not aim at giving a SOTA implementation of the algorithm, but rather +# to provide a high-level illustration of TorchRL features in the context +# of this algorithm. + +# sphinx_gallery_start_ignore +import warnings + +warnings.filterwarnings("ignore") +# sphinx_gallery_end_ignore + +import os +import uuid + +import torch +from torch import nn +from torchrl.collectors import MultiaSyncDataCollector +from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer +from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter +from torchrl.envs.libs.gym import GymEnv +from torchrl.envs.transforms import ( + CatFrames, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, +) +from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor + +from torchrl.objectives import DQNLoss, SoftUpdate +from torchrl.record.loggers.csv import CSVLogger +from torchrl.trainers import ( + LogReward, + Recorder, + ReplayBufferTrainer, + Trainer, + UpdateWeights, +) + +def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython else: - base_env = GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) - - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **obs_norm_sd), + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + +############################################################################### +# Let's get started with the various pieces we need for our algorithm: +# +# - An environment; +# - A policy (and related modules that we group under the "model" umbrella); +# - A data collector, which makes the policy play in the environment and +# delivers training data; +# - A replay buffer to store the training data; +# - A loss module, which computes the objective function to train our policy +# to maximise the return; +# - An optimizer, which performs parameter updates based on our loss. +# +# Additional modules include a logger, a recorder (executes the policy in +# "eval" mode) and a target network updater. With all these components into +# place, it is easy to see how one could misplace or misuse one component in +# the training script. 
The trainer is there to orchestrate everything for you! +# +# Building the environment +# ------------------------ +# +# First let's write a helper function that will output an environment. As usual, +# the "raw" environment may be too simple to be used in practice and we'll need +# some data transformation to expose its output to the policy. +# +# We will be using five transforms: +# +# - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; +# - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 +# tensor in a floating point tensor in the ``[0, 1]`` space with shape +# ``[C, W, H]``; +# - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; +# - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; +# - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; +# - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of +# successive frames (``N=4``) in a single tensor along the channel dimension. +# This is useful as a single image does not carry information about the +# motion of the cartpole. Some memory about past observations and actions +# is needed, either via a recurrent neural network or using a stack of +# frames. +# - :class:`torchrl.envs.ObservationNorm` which will normalize our observations +# given some custom summary statistics. +# +# In practice, our environment builder has two arguments: +# +# - ``parallel``: determines whether multiple environments have to be run in +# parallel. We stack the transforms after the +# :class:`torchrl.envs.ParallelEnv` to take advantage +# of vectorization of the operations on device, although this would +# technically work with every single environment attached to its own set of +# transforms. +# - ``obs_norm_sd`` will contain the normalizing constants for +# the :class:`torchrl.envs.ObservationNorm` transform. +# + +def make_env( + parallel=False, + obs_norm_sd=None, +): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) ), ) - return env - - ############################################################################### - # Compute normalizing constants - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # To normalize images, we don't want to normalize each pixel independently - # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` - # shaped set of normalizing constants (loc and scale parameters). - # We will be using the ``reduce_dim`` argument - # of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which - # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that - # not all dimensions disappear in the process: - # - - def get_norm_stats(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) - ) - obs_norm_sd = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). 
- print("state dict of the observation norm:", obs_norm_sd) - return obs_norm_sd - - ############################################################################### - # Building the model (Deep Q-network) - # ----------------------------------- - # - # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` - # object which is a simple CNN followed by a two-layer MLP. The only trick used - # here is that the action values (i.e. left and right action value) are - # computed using - # - # .. math:: - # - # val = b(obs) + v(obs) - \mathbb{E}[v(obs)] - # - # where :math:`b` is a :math:`\# obs \rightarrow 1` function and :math:`v` is a - # :math:`\# obs \rightarrow num_actions` function. - # - # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, - # which will read the state-action - # values, pick up the one with the maximum value and write all those results - # in the input :class:`tensordict.TensorDict`. - # - - def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) - - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to( - device - ) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. - tensordict = dummy_env.fake_tensordict() - actor(tensordict) - - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, - ) - - return actor, actor_explore - - ############################################################################### - # Collecting and storing data - # --------------------------- - # - # Replay buffers - # ~~~~~~~~~~~~~~ - # - # Replay buffers play a central role in off-policy RL algorithms such as DQN. - # They constitute the dataset we will be sampling from during training. - # - # Here, we will use a regular sampling strategy, although a prioritized RB - # could improve the performance significantly. - # - # We place the storage on disk using - # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This - # storage is created in a lazy manner: it will only be instantiated once the - # first batch of data is passed to it. - # - # The only requirement of this storage is that the data passed to it at write - # time must always have the same shape. - - def get_replay_buffer(buffer_size, n_optim, batch_size): - replay_buffer = TensorDictReplayBuffer( - batch_size=batch_size, - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer - - ############################################################################### - # Data collector - # ~~~~~~~~~~~~~~ - # - # As in `PPO ` and - # `DDPG `, we will be using - # a data collector as a dataloader in the outer loop. 
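To make the "dataloader" analogy concrete, here is a minimal, self-contained sketch of iterating over a collector (it uses a single-process :class:`torchrl.collectors.SyncDataCollector` and a ``RandomPolicy`` standing in for the trained actor instead of the multi-collector setup built in this tutorial; the frame counts are arbitrary)::

    from torchrl.collectors import SyncDataCollector
    from torchrl.collectors.collectors import RandomPolicy
    from torchrl.envs.libs.gym import GymEnv

    env = GymEnv("CartPole-v1")
    collector = SyncDataCollector(
        env,
        RandomPolicy(env.action_spec),
        frames_per_batch=32,  # number of frames delivered at each iteration
        total_frames=320,     # stop after 10 batches
    )
    for batch in collector:
        # each ``batch`` is a TensorDict holding ``frames_per_batch`` collected frames
        print(batch.batch_size)
    collector.shutdown()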
- # - # We choose the following configuration: we will be running a series of - # parallel environments synchronously in parallel in different collectors, - # themselves running in parallel but asynchronously. - # The advantage of this configuration is that we can balance the amount of - # compute that is executed in batch with what we want to be executed - # asynchronously. We encourage the reader to experiment how the collection - # speed is impacted by modifying the number of collectors (ie the number of - # environment constructors passed to the collector) and the number of - # environment executed in parallel in each collector (controlled by the - # ``num_workers`` hyperparameter). - # - # When building the collector, we can choose on which device we want the - # environment and policy to execute the operations through the ``device`` - # keyword argument. The ``storing_devices`` argument will modify the - # location of the data being collected: if the batches that we are gathering - # have a considerable size, we may want to store them on a different location - # than the device where the computation is happening. For asynchronous data - # collectors such as ours, different storing devices mean that the data that - # we collect won't sit on the same device each time, which is something that - # out training loop must account for. For simplicity, we set the devices to - # the same value for all sub-collectors. - - def get_collector( - obs_norm_sd, - num_collectors, - actor_explore, - frames_per_batch, - total_frames, - device, - ): - data_collector = MultiaSyncDataCollector( - [ - make_env(parallel=True, obs_norm_sd=obs_norm_sd), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices + else: + base_env = GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, device=device, - storing_device=device, - split_trajs=False, - postproc=MultiStep(gamma=gamma, n_steps=5), ) - return data_collector - - ############################################################################### - # Loss function - # ------------- - # - # Building our loss function is straightforward: we only need to provide - # the model and a bunch of hyperparameters to the DQNLoss class. - # - # Target parameters - # ~~~~~~~~~~~~~~~~~ - # - # Many off-policy RL algorithms use the concept of "target parameters" when it - # comes to estimate the value of the next state or state-action pair. - # The target parameters are lagged copies of the model parameters. Because - # their predictions mismatch those of the current model configuration, they - # help learning by putting a pessimistic bound on the value being estimated. - # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous - # in similar algorithms. - # - - def get_loss_module(actor, gamma): - loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) - target_updater = SoftUpdate(loss_module) - return loss_module, target_updater - - ############################################################################### - # Hyperparameters - # --------------- - # - # Let's start with our hyperparameters. The following setting should work well - # in practice, and the performance of the algorithm should hopefully not be - # too sensitive to slight variations of these. 
- - device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - - ############################################################################### - # Optimizer - # ~~~~~~~~~ - - # the learning rate of the optimizer - lr = 2e-3 - # weight decay - wd = 1e-5 - # the beta parameters of Adam - betas = (0.9, 0.999) - # Optimization steps per batch collected (aka UPD or updates per data) - n_optim = 8 - - ############################################################################### - # DQN parameters - # ~~~~~~~~~~~~~~ - # gamma decay factor - gamma = 0.99 - - ############################################################################### - # Smooth target network update decay parameter. - # This loosely corresponds to a 1/tau interval with hard target network - # update - tau = 0.02 - - ############################################################################### - # Data collection and replay buffer - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # Values to be used for proper training have been commented. - # - # Total frames collected in the environment. In other implementations, the - # user defines a maximum number of episodes. - # This is harder to do with our data collectors since they return batches - # of N collected frames, where N is a constant. - # However, one can easily get the same restriction on number of episodes by - # breaking the training loop when a certain number - # episodes has been collected. - total_frames = 4096 # 500000 - - ############################################################################### - # Random frames used to initialize the replay buffer. - init_random_frames = 100 # 1000 - - ############################################################################### - # Frames in each batch collected. - frames_per_batch = 32 # 128 - - ############################################################################### - # Frames sampled from the replay buffer at each optimization step - batch_size = 32 # 256 - - ############################################################################### - # Size of the replay buffer in terms of frames - buffer_size = min(total_frames, 100000) - - ############################################################################### - # Number of environments run in parallel in each data collector - num_workers = 2 # 8 - num_collectors = 2 # 4 - - ############################################################################### - # Environment and exploration - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # We set the initial and final value of the epsilon factor in Epsilon-greedy - # exploration. - # Since our policy is deterministic, exploration is crucial: without it, the - # only source of randomness would be the environment reset. - - eps_greedy_val = 0.1 - eps_greedy_val_env = 0.005 - - ############################################################################### - # To speed up learning, we set the bias of the last layer of our value network - # to a predefined value (this is not mandatory) - init_bias = 2.0 - - ############################################################################### - # .. note:: - # For fast rendering of the tutorial ``total_frames`` hyperparameter - # was set to a very low number. To get a reasonable performance, use a greater - # value e.g. 
500000 - # - - ############################################################################### - # Building a Trainer - # ------------------ - # - # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the - # following keyword-only arguments: - # - # - ``collector`` - # - ``loss_module`` - # - ``optimizer`` - # - ``logger``: A logger can be - # - ``total_frames``: this parameter defines the lifespan of the trainer. - # - ``frame_skip``: when a frame-skip is used, the collector must be made - # aware of it in order to accurately count the number of frames - # collected etc. Making the trainer aware of this parameter is not - # mandatory but helps to have a fairer comparison between settings where - # the total number of frames (budget) is fixed but the frame-skip is - # variable. - - stats = get_norm_stats() - test_env = make_env(parallel=False, obs_norm_sd=stats) - # Get model - actor, actor_explore = make_model(test_env) - loss_module, target_net_updater = get_loss_module(actor, gamma) - target_net_updater.init_() - - collector = get_collector( - stats, num_collectors, actor_explore, frames_per_batch, total_frames, device + + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), + ), ) - optimizer = torch.optim.Adam( - loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas + return env + +############################################################################### +# Compute normalizing constants +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# To normalize images, we don't want to normalize each pixel independently +# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` +# shaped set of normalizing constants (loc and scale parameters). +# We will be using the ``reduce_dim`` argument +# of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which +# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that +# not all dimensions disappear in the process: +# + +def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) ) - exp_name = f"dqn_exp_{uuid.uuid1()}" - logger = CSVLogger(exp_name=exp_name, log_dir="./") - - trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=logger, - optim_steps_per_batch=n_optim, + obs_norm_sd = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). + print("state dict of the observation norm:", obs_norm_sd) + return obs_norm_sd + +############################################################################### +# Building the model (Deep Q-network) +# ----------------------------------- +# +# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` +# object which is a simple CNN followed by a two-layer MLP. The only trick used +# here is that the action values (i.e. left and right action value) are +# computed using +# +# .. 
math:: +# +# val = b(obs) + v(obs) - \mathbb{E}[v(obs)] +# +# where :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a +# :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for +# :math:`n = \# obs` and :math:`m = \# actions`. +# +# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, +# which will read the state-action +# values, pick up the one with the maximum value and write all those results +# in the input :class:`tensordict.TensorDict`. +# + +def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) + + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to( + device + ) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) + + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, ) - ############################################################################### - # Registering hooks - # ~~~~~~~~~~~~~~~~~ - # - # Registering hooks can be achieved in two separate ways: - # - # - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` - # method is the first choice. One just needs to provide the trainer as input - # and the hook will be registered with a default name at a default location. - # For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` - # requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which - # can be cumbersome to implement. - buffer_hook = ReplayBufferTrainer( - get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), - flatten_tensordicts=True, + return actor, actor_explore + +############################################################################### +# Collecting and storing data +# --------------------------- +# +# Replay buffers +# ~~~~~~~~~~~~~~ +# +# Replay buffers play a central role in off-policy RL algorithms such as DQN. +# They constitute the dataset we will be sampling from during training. +# +# Here, we will use a regular sampling strategy, although a prioritized RB +# could improve the performance significantly. +# +# We place the storage on disk using +# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This +# storage is created in a lazy manner: it will only be instantiated once the +# first batch of data is passed to it. +# +# The only requirement of this storage is that the data passed to it at write +# time must always have the same shape. 
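As a minimal, standalone illustration of this constraint (separate from the ``get_replay_buffer`` helper below; the keys and shapes used here are arbitrary), the structure of the first batch written to the buffer is the one the memory-mapped storage will expect for every subsequent write::

    import torch
    from tensordict.tensordict import TensorDict
    from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

    rb = TensorDictReplayBuffer(
        storage=LazyMemmapStorage(1000),  # instantiated lazily, on the first write
        batch_size=32,
    )
    fake_data = TensorDict(
        {"pixels": torch.zeros(128, 4, 64, 64), "reward": torch.zeros(128, 1)},
        batch_size=[128],
    )
    rb.extend(fake_data)  # this first call fixes the expected keys and shapes
    sample = rb.sample()  # a TensorDict with batch_size [32]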
+ +def get_replay_buffer(buffer_size, n_optim, batch_size): + replay_buffer = TensorDictReplayBuffer( + batch_size=batch_size, + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, ) - buffer_hook.register(trainer) - weight_updater = UpdateWeights(collector, update_weights_interval=1) - weight_updater.register(trainer) - recorder = Recorder( - record_interval=1, # log every 100 optimization steps - record_frames=10_000, # maximum number of frames in the record - frame_skip=1, - policy_exploration=actor_explore, - environment=test_env, - exploration_mode="mode", - log_keys=[("next", "reward")], - out_keys={("next", "reward"): "rewards"}, - log_pbar=True, + return replay_buffer + +############################################################################### +# Data collector +# ~~~~~~~~~~~~~~ +# +# As in `PPO ` and +# `DDPG `, we will be using +# a data collector as a dataloader in the outer loop. +# +# We choose the following configuration: we will be running a series of +# parallel environments synchronously in parallel in different collectors, +# themselves running in parallel but asynchronously. +# The advantage of this configuration is that we can balance the amount of +# compute that is executed in batch with what we want to be executed +# asynchronously. We encourage the reader to experiment how the collection +# speed is impacted by modifying the number of collectors (ie the number of +# environment constructors passed to the collector) and the number of +# environment executed in parallel in each collector (controlled by the +# ``num_workers`` hyperparameter). +# +# When building the collector, we can choose on which device we want the +# environment and policy to execute the operations through the ``device`` +# keyword argument. The ``storing_devices`` argument will modify the +# location of the data being collected: if the batches that we are gathering +# have a considerable size, we may want to store them on a different location +# than the device where the computation is happening. For asynchronous data +# collectors such as ours, different storing devices mean that the data that +# we collect won't sit on the same device each time, which is something that +# out training loop must account for. For simplicity, we set the devices to +# the same value for all sub-collectors. + +def get_collector( + obs_norm_sd, + num_collectors, + actor_explore, + frames_per_batch, + total_frames, + device, +): + data_collector = MultiaSyncDataCollector( + [ + make_env(parallel=True, obs_norm_sd=obs_norm_sd), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + postproc=MultiStep(gamma=gamma, n_steps=5), ) - recorder.register(trainer) - - ############################################################################### - # - Any callable (including :class:`torchrl.trainers.TrainerHookBase` - # subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. - # In this case, a location must be explicitely passed (). This method gives - # more control over the location of the hook but it also requires more - # understanding of the Trainer mechanism. 
- # Check the `trainer documentation `_ - # for a detailed description of the trainer hooks. - # - trainer.register_op("post_optim", target_net_updater.step) - - ############################################################################### - # We can log the training rewards too. Note that this is of limited interest - # with CartPole, as rewards are always 1. The discounted sum of rewards is miximised - # not by getting higher rewards but by keeping the cart-pole alive for longer. - # This will be reflected by the `total_rewards` value displayed in the progress bar. - # - log_reward = LogReward(log_pbar=True) - log_reward.register(trainer) - - ############################################################################### - # .. note:: - # It is possible to link multiple optimizers to the trainer if needed. - # In this case, each optimizer will be tied to a field in the loss dictionary. - # Check the :class:`torchrl.trainers.OptimizerHook` to learn more. - # - # Here we are, ready to train our algorithm! A simple call to - # ``trainer.train()`` and we'll be getting our results logged in. - # - trainer.train() - - ############################################################################### - # We can now quickly check the CSVs with the results. - - def print_csv_files_in_folder(folder_path): - """ - Find all CSV files in a folder and return the first 10 lines of each file as a string. - - Args: - folder_path (str): The relative path to the folder. - - Returns: - str: A string containing the first 10 lines of each CSV file in the folder. - """ - csv_files = [] - output_str = "" - for file in os.listdir(folder_path): - if file.endswith(".csv"): - csv_files.append(os.path.join(folder_path, file)) - for csv_file in csv_files: - output_str += f"File: {csv_file}\n" - with open(csv_file, "r") as f: - for i, line in enumerate(f): - if i == 10: - break - output_str += line.strip() + "\n" - output_str += "\n" - return output_str - - print_csv_files_in_folder(logger.experiment.log_dir) - - ############################################################################### - # Conclusion and possible improvements - # ------------------------------------ - # - # In this tutorial we have learned: - # - # - How to write a Trainer, including building its components and registering - # them in the trainer; - # - How to code a DQN algorithm, including how to create a policy that picks - # up the action with the highest value with - # :class:`torchrl.modules.QValueNetwork`; - # - How to build a multiprocessed data collector; - # - # Possible improvements to this tutorial could include: - # - # - Using the :class:`torchrl.data.MultiStep` - # post-processing. Multi-step will project an action - # to the :math:`n^{th}` following step, and create a discounted sum of the - # rewards in between. This trick can make the algorithm noticeably less - # myopic (although the reward is then biased). To use this, simply - # create the collector with - # - # >>> from torchrl.data.postprocs.postprocs import MultiStep - # >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n)) - # - # where ``n`` is the number of looking-forward steps. Pay attention to the - # fact that the ``gamma`` factor has to be corrected by the number of - # steps till the next observation when being passed to - # ``vec_td_lambda_advantage_estimate``: - # - # >>> gamma = gamma ** tensordict["steps_to_next_obs"] - # - # - A prioritized replay buffer could also be used. 
This will give a - # higher priority to samples that have the worst value accuracy. - # Learn more on the `replay buffer section `_ - # of the documentation. - # - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` - # for more information). - # - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. + return data_collector + +############################################################################### +# Loss function +# ------------- +# +# Building our loss function is straightforward: we only need to provide +# the model and a bunch of hyperparameters to the DQNLoss class. +# +# Target parameters +# ~~~~~~~~~~~~~~~~~ +# +# Many off-policy RL algorithms use the concept of "target parameters" when it +# comes to estimate the value of the next state or state-action pair. +# The target parameters are lagged copies of the model parameters. Because +# their predictions mismatch those of the current model configuration, they +# help learning by putting a pessimistic bound on the value being estimated. +# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous +# in similar algorithms. +# + +def get_loss_module(actor, gamma): + loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) + target_updater = SoftUpdate(loss_module) + return loss_module, target_updater + +############################################################################### +# Hyperparameters +# --------------- +# +# Let's start with our hyperparameters. The following setting should work well +# in practice, and the performance of the algorithm should hopefully not be +# too sensitive to slight variations of these. + +device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" + +############################################################################### +# Optimizer +# ~~~~~~~~~ + +# the learning rate of the optimizer +lr = 2e-3 +# weight decay +wd = 1e-5 +# the beta parameters of Adam +betas = (0.9, 0.999) +# Optimization steps per batch collected (aka UPD or updates per data) +n_optim = 8 + +############################################################################### +# DQN parameters +# ~~~~~~~~~~~~~~ +# gamma decay factor +gamma = 0.99 + +############################################################################### +# Smooth target network update decay parameter. +# This loosely corresponds to a 1/tau interval with hard target network +# update +tau = 0.02 + +############################################################################### +# Data collection and replay buffer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Values to be used for proper training have been commented. +# +# Total frames collected in the environment. In other implementations, the +# user defines a maximum number of episodes. +# This is harder to do with our data collectors since they return batches +# of N collected frames, where N is a constant. +# However, one can easily get the same restriction on number of episodes by +# breaking the training loop when a certain number +# episodes has been collected. +total_frames = 10_000 # 500000 + +############################################################################### +# Random frames used to initialize the replay buffer. +init_random_frames = 100 # 1000 + +############################################################################### +# Frames in each batch collected. 
+frames_per_batch = 32 # 128 + +############################################################################### +# Frames sampled from the replay buffer at each optimization step +batch_size = 32 # 256 + +############################################################################### +# Size of the replay buffer in terms of frames +buffer_size = min(total_frames, 100000) + +############################################################################### +# Number of environments run in parallel in each data collector +num_workers = 2 # 8 +num_collectors = 2 # 4 + +############################################################################### +# Environment and exploration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We set the initial and final value of the epsilon factor in Epsilon-greedy +# exploration. +# Since our policy is deterministic, exploration is crucial: without it, the +# only source of randomness would be the environment reset. + +eps_greedy_val = 0.1 +eps_greedy_val_env = 0.005 + +############################################################################### +# To speed up learning, we set the bias of the last layer of our value network +# to a predefined value (this is not mandatory) +init_bias = 2.0 + +############################################################################### +# .. note:: +# For fast rendering of the tutorial ``total_frames`` hyperparameter +# was set to a very low number. To get a reasonable performance, use a greater +# value e.g. 500000 +# + +############################################################################### +# Building a Trainer +# ------------------ +# +# TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the +# following keyword-only arguments: +# +# - ``collector`` +# - ``loss_module`` +# - ``optimizer`` +# - ``logger``: A logger can be +# - ``total_frames``: this parameter defines the lifespan of the trainer. +# - ``frame_skip``: when a frame-skip is used, the collector must be made +# aware of it in order to accurately count the number of frames +# collected etc. Making the trainer aware of this parameter is not +# mandatory but helps to have a fairer comparison between settings where +# the total number of frames (budget) is fixed but the frame-skip is +# variable. + +stats = get_norm_stats() +test_env = make_env(parallel=False, obs_norm_sd=stats) +# Get model +actor, actor_explore = make_model(test_env) +loss_module, target_net_updater = get_loss_module(actor, gamma) +target_net_updater.init_() + +collector = get_collector( + stats, num_collectors, actor_explore, frames_per_batch, total_frames, device +) +optimizer = torch.optim.Adam( + loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas +) +exp_name = f"dqn_exp_{uuid.uuid1()}" +logger = CSVLogger(exp_name=exp_name, log_dir="./") + +############################################################################### +# We can control how often the scalars should be logged. 
Here we set this +# to a low value as our training loop is short: + +log_interval = 500 + +trainer = Trainer( + collector=collector, + total_frames=total_frames, + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=logger, + optim_steps_per_batch=n_optim, + log_interval = log_interval, +) + +############################################################################### +# Registering hooks +# ~~~~~~~~~~~~~~~~~ +# +# Registering hooks can be achieved in two separate ways: +# +# - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` +# method is the first choice. One just needs to provide the trainer as input +# and the hook will be registered with a default name at a default location. +# For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` +# requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which +# can be cumbersome to implement. +buffer_hook = ReplayBufferTrainer( + get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), + flatten_tensordicts=True, +) +buffer_hook.register(trainer) +weight_updater = UpdateWeights(collector, update_weights_interval=1) +weight_updater.register(trainer) +recorder = Recorder( + record_interval=100, # log every 100 optimization steps + record_frames=1000, # maximum number of frames in the record + frame_skip=1, + policy_exploration=actor_explore, + environment=test_env, + exploration_mode="mode", + log_keys=[("next", "reward")], + out_keys={("next", "reward"): "rewards"}, + log_pbar=True, +) +recorder.register(trainer) + +############################################################################### +# - Any callable (including :class:`torchrl.trainers.TrainerHookBase` +# subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. +# In this case, a location must be explicitely passed (). This method gives +# more control over the location of the hook but it also requires more +# understanding of the Trainer mechanism. +# Check the `trainer documentation `_ +# for a detailed description of the trainer hooks. +# +trainer.register_op("post_optim", target_net_updater.step) + +############################################################################### +# We can log the training rewards too. Note that this is of limited interest +# with CartPole, as rewards are always 1. The discounted sum of rewards is miximised +# not by getting higher rewards but by keeping the cart-pole alive for longer. +# This will be reflected by the `total_rewards` value displayed in the progress bar. +# +log_reward = LogReward(log_pbar=True) +log_reward.register(trainer) + +############################################################################### +# .. note:: +# It is possible to link multiple optimizers to the trainer if needed. +# In this case, each optimizer will be tied to a field in the loss dictionary. +# Check the :class:`torchrl.trainers.OptimizerHook` to learn more. +# +# Here we are, ready to train our algorithm! A simple call to +# ``trainer.train()`` and we'll be getting our results logged in. +# +trainer.train() + +############################################################################### +# We can now quickly check the CSVs with the results. + +def print_csv_files_in_folder(folder_path): + """ + Find all CSV files in a folder and return the first 10 lines of each file as a string. + + Args: + folder_path (str): The relative path to the folder. 
+ + Returns: + str: A string containing the first 10 lines of each CSV file in the folder. + """ + csv_files = [] + output_str = "" + for file in os.listdir(folder_path): + if file.endswith(".csv"): + csv_files.append(os.path.join(folder_path, file)) + for csv_file in csv_files: + output_str += f"File: {csv_file}\n" + with open(csv_file, "r") as f: + for i, line in enumerate(f): + if i == 10: + break + output_str += line.strip() + "\n" + output_str += "\n" + return output_str + +print_csv_files_in_folder(logger.experiment.log_dir) + +############################################################################### +# Conclusion and possible improvements +# ------------------------------------ +# +# In this tutorial we have learned: +# +# - How to write a Trainer, including building its components and registering +# them in the trainer; +# - How to code a DQN algorithm, including how to create a policy that picks +# up the action with the highest value with +# :class:`torchrl.modules.QValueNetwork`; +# - How to build a multiprocessed data collector; +# +# Possible improvements to this tutorial could include: +# +# - Using the :class:`torchrl.data.MultiStep` +# post-processing. Multi-step will project an action +# to the :math:`n^{th}` following step, and create a discounted sum of the +# rewards in between. This trick can make the algorithm noticeably less +# myopic (although the reward is then biased). To use this, simply +# create the collector with +# +# >>> from torchrl.data.postprocs.postprocs import MultiStep +# >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n)) +# +# where ``n`` is the number of looking-forward steps. Pay attention to the +# fact that the ``gamma`` factor has to be corrected by the number of +# steps till the next observation when being passed to +# ``vec_td_lambda_advantage_estimate``: +# +# >>> gamma = gamma ** tensordict["steps_to_next_obs"] +# +# - A prioritized replay buffer could also be used. This will give a +# higher priority to samples that have the worst value accuracy. +# Learn more on the `replay buffer section `_ +# of the documentation. +# - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` +# for more information). +# - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. 
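As a hedged sketch of the prioritized replay buffer improvement listed above (``get_prioritized_replay_buffer`` is a hypothetical helper name, and the ``alpha``/``beta`` values are illustrative, not tuned), the ``get_replay_buffer`` function could be swapped for something along these lines::

    from torchrl.data import LazyMemmapStorage, TensorDictPrioritizedReplayBuffer

    def get_prioritized_replay_buffer(buffer_size, n_optim, batch_size):
        # drop-in replacement for ``get_replay_buffer``: sampling is biased
        # towards transitions with a high priority; the trainer's
        # ``update_priority`` hook (registered by ReplayBufferTrainer) can then
        # refresh those priorities from the TD-error written by the loss.
        return TensorDictPrioritizedReplayBuffer(
            alpha=0.7,  # how strongly priorities bias the sampling
            beta=0.5,   # importance-sampling correction exponent
            storage=LazyMemmapStorage(buffer_size),
            batch_size=batch_size,
            prefetch=n_optim,
        )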
From 7180e6c91bc6767fa4dda7938e410c826147589f Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 09:21:27 +0100 Subject: [PATCH 75/89] amend --- torchrl/trainers/trainers.py | 2 +- tutorials/sphinx-tutorials/coding_dqn.py | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index d8bce805487..69cc22fd672 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -145,7 +145,7 @@ def __init__( progress_bar: bool = True, seed: int = None, save_trainer_interval: int = 10000, - log_interval: int=10000, + log_interval: int = 10000, save_trainer_file: Optional[Union[str, pathlib.Path]] = None, ) -> None: diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 3583aaf01e8..3a87e6cba88 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -118,6 +118,7 @@ UpdateWeights, ) + def is_notebook() -> bool: try: shell = get_ipython().__class__.__name__ @@ -130,6 +131,7 @@ def is_notebook() -> bool: except NameError: return False # Probably standard Python interpreter + ############################################################################### # Let's get started with the various pieces we need for our algorithm: # @@ -184,6 +186,7 @@ def is_notebook() -> bool: # the :class:`torchrl.envs.ObservationNorm` transform. # + def make_env( parallel=False, obs_norm_sd=None, @@ -224,6 +227,7 @@ def make_env( ) return env + ############################################################################### # Compute normalizing constants # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -237,6 +241,7 @@ def make_env( # not all dimensions disappear in the process: # + def get_norm_stats(): test_env = make_env() test_env.transform[-1].init_stats( @@ -248,6 +253,7 @@ def get_norm_stats(): print("state dict of the observation norm:", obs_norm_sd) return obs_norm_sd + ############################################################################### # Building the model (Deep Q-network) # ----------------------------------- @@ -271,6 +277,7 @@ def get_norm_stats(): # in the input :class:`tensordict.TensorDict`. # + def make_model(dummy_env): cnn_kwargs = { "num_cells": [32, 64, 64], @@ -295,9 +302,7 @@ def make_model(dummy_env): ).to(device) net.value[-1].bias.data.fill_(init_bias) - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to( - device - ) + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) # init actor: because the model is composed of lazy conv/linear layers, # we must pass a fake batch of data through it to instantiate them. tensordict = dummy_env.fake_tensordict() @@ -313,6 +318,7 @@ def make_model(dummy_env): return actor, actor_explore + ############################################################################### # Collecting and storing data # --------------------------- @@ -334,6 +340,7 @@ def make_model(dummy_env): # The only requirement of this storage is that the data passed to it at write # time must always have the same shape. 
+ def get_replay_buffer(buffer_size, n_optim, batch_size): replay_buffer = TensorDictReplayBuffer( batch_size=batch_size, @@ -342,6 +349,7 @@ def get_replay_buffer(buffer_size, n_optim, batch_size): ) return replay_buffer + ############################################################################### # Data collector # ~~~~~~~~~~~~~~ @@ -372,6 +380,7 @@ def get_replay_buffer(buffer_size, n_optim, batch_size): # out training loop must account for. For simplicity, we set the devices to # the same value for all sub-collectors. + def get_collector( obs_norm_sd, num_collectors, @@ -399,6 +408,7 @@ def get_collector( ) return data_collector + ############################################################################### # Loss function # ------------- @@ -418,11 +428,13 @@ def get_collector( # in similar algorithms. # + def get_loss_module(actor, gamma): loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) target_updater = SoftUpdate(loss_module) return loss_module, target_updater + ############################################################################### # Hyperparameters # --------------- @@ -566,7 +578,7 @@ def get_loss_module(actor, gamma): optimizer=optimizer, logger=logger, optim_steps_per_batch=n_optim, - log_interval = log_interval, + log_interval=log_interval, ) ############################################################################### @@ -635,6 +647,7 @@ def get_loss_module(actor, gamma): ############################################################################### # We can now quickly check the CSVs with the results. + def print_csv_files_in_folder(folder_path): """ Find all CSV files in a folder and return the first 10 lines of each file as a string. @@ -660,6 +673,7 @@ def print_csv_files_in_folder(folder_path): output_str += "\n" return output_str + print_csv_files_in_folder(logger.experiment.log_dir) ############################################################################### From 6223494262353d3c8920733aa98cf9f0c2de54a9 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 12:36:38 +0100 Subject: [PATCH 76/89] init --- test/test_collector.py | 34 ++++++++++++++++++++++++++++++++ torchrl/collectors/collectors.py | 23 ++++++++++++++++----- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/test/test_collector.py b/test/test_collector.py index 4dc92491fe7..dd78d68869b 100644 --- a/test/test_collector.py +++ b/test/test_collector.py @@ -1293,6 +1293,40 @@ def env_fn(seed): assert trajectory_ids[trajectory_ids_mask].numel() < frames_per_batch +def test_maxframes_error(): + env = TransformedEnv(CountingEnv(), StepCounter(2)) + _ = SyncDataCollector( + env, RandomPolicy(env.action_spec), total_frames=10_000, frames_per_batch=1000 + ) + with pytest.raises(ValueError): + _ = SyncDataCollector( + env, + RandomPolicy(env.action_spec), + total_frames=10_000, + frames_per_batch=1000, + max_frames_per_traj=2, + ) + + +def test_reset_heterogeneous_envs(): + env1 = lambda: TransformedEnv(CountingEnv(), StepCounter(2)) + env2 = lambda: TransformedEnv(CountingEnv(), StepCounter(3)) + env = SerialEnv(2, [env1, env2]) + c = SyncDataCollector( + env, RandomPolicy(env.action_spec), total_frames=10_000, frames_per_batch=1000 + ) + for data in c: # noqa: B007 + break + assert ( + data[0]["next", "truncated"].squeeze() + == torch.tensor([False, True]).repeat(250)[:500] + ).all() + assert ( + data[1]["next", "truncated"].squeeze() + == torch.tensor([False, False, True]).repeat(168)[:500] + ).all() + + if __name__ == "__main__": args, unknown = 
argparse.ArgumentParser().parse_known_args() pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index dd2505a78b3..98f39912c89 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -534,6 +534,18 @@ def __init__( self.env: EnvBase = self.env.to(self.device) self.max_frames_per_traj = max_frames_per_traj if self.max_frames_per_traj > 0: + # let's check that there is no StepCounter yet + for key in self.env.output_spec.keys(True, True): + if isinstance(key, str): + key = (key,) + if "truncated" in key: + raise ValueError( + "A 'truncated' key is already present in the environment " + "and the 'max_frames_per_traj' argument may conflict with " + "a 'StepCounter' that has already been set. " + "Possible solutions: Set max_frames_per_traj to 0 or " + "remove the StepCounter limit from the environment transforms." + ) env = self.env = TransformedEnv( self.env, StepCounter(max_steps=self.max_frames_per_traj) ) @@ -759,11 +771,12 @@ def _step_and_maybe_reset(self) -> None: _reset = None td_reset = None td_reset = self.env.reset(td_reset) - self._tensordict.update(td_reset, inplace=True) - done = self._tensordict.get("done") - if (_reset is None and done.any()) or ( - _reset is not None and done[_reset].any() - ): + reset_idx = done_or_terminated.squeeze(-1) + self._tensordict.get_sub_tensordict(reset_idx).update( + td_reset[reset_idx], inplace=True + ) + done = self._tensordict[reset_idx].get("done") + if (_reset is None and done.any()) or (_reset is not None and done.any()): raise RuntimeError( f"Env {self.env} was done after reset on specified '_reset' dimensions. This is (currently) not allowed." ) From b0d9629d3446c7277785d100fd65902deb374e75 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 13:19:42 +0100 Subject: [PATCH 77/89] empty commit From 822f518263646c0c608d4a57874631338d556c72 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 13:26:29 +0100 Subject: [PATCH 78/89] amend --- torchrl/collectors/collectors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index 98f39912c89..ca479cb2aa5 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -771,7 +771,9 @@ def _step_and_maybe_reset(self) -> None: _reset = None td_reset = None td_reset = self.env.reset(td_reset) - reset_idx = done_or_terminated.squeeze(-1) + reset_idx = done_or_terminated + while reset_idx.ndim > self._tensordict.ndim: + reset_idx = reset_idx.any(-1) self._tensordict.get_sub_tensordict(reset_idx).update( td_reset[reset_idx], inplace=True ) From 4ad5fb9c4b154bc93c82a9325a794d6820ae40e7 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 13:29:34 +0100 Subject: [PATCH 79/89] amend --- torchrl/collectors/collectors.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py index ca479cb2aa5..de84a5f4dc6 100644 --- a/torchrl/collectors/collectors.py +++ b/torchrl/collectors/collectors.py @@ -771,21 +771,18 @@ def _step_and_maybe_reset(self) -> None: _reset = None td_reset = None td_reset = self.env.reset(td_reset) - reset_idx = done_or_terminated - while reset_idx.ndim > self._tensordict.ndim: - reset_idx = reset_idx.any(-1) - self._tensordict.get_sub_tensordict(reset_idx).update( - td_reset[reset_idx], inplace=True + traj_done_or_terminated = 
done_or_terminated.sum( + tuple(range(self._tensordict.batch_dims, done_or_terminated.ndim)), + dtype=torch.bool, + ) + self._tensordict.get_sub_tensordict(traj_done_or_terminated).update( + td_reset[traj_done_or_terminated], inplace=True ) - done = self._tensordict[reset_idx].get("done") + done = self._tensordict[traj_done_or_terminated].get("done") if (_reset is None and done.any()) or (_reset is not None and done.any()): raise RuntimeError( f"Env {self.env} was done after reset on specified '_reset' dimensions. This is (currently) not allowed." ) - traj_done_or_terminated = done_or_terminated.sum( - tuple(range(self._tensordict.batch_dims, done_or_terminated.ndim)), - dtype=torch.bool, - ) traj_ids[traj_done_or_terminated] = traj_ids.max() + torch.arange( 1, traj_done_or_terminated.sum() + 1, device=traj_ids.device ) From ff54f0a2df063c1ac3e1cec9f6413292f1e7eed7 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 14:45:11 +0100 Subject: [PATCH 80/89] amend --- docs/source/reference/modules.rst | 2 +- torchrl/trainers/trainers.py | 11 +++++ tutorials/sphinx-tutorials/coding_dqn.py | 63 +++++++++++------------- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/docs/source/reference/modules.rst b/docs/source/reference/modules.rst index 7a52329e02f..fb1eebf6b89 100644 --- a/docs/source/reference/modules.rst +++ b/docs/source/reference/modules.rst @@ -32,7 +32,7 @@ TensorDict modules Hooks ----- -.. currentmodule:: torchrl.modules.tensordict_module.actors +.. currentmodule:: torchrl.modules .. autosummary:: :toctree: generated/ diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 69cc22fd672..070679acd52 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -71,6 +71,17 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None: @abc.abstractmethod def register(self, trainer: Trainer, name: str): + """Registers the hook in the trainer at a default location. + + Args: + trainer (Trainer): the trainer where the hook must be registered. + name (str): the name of the hook. + + .. note:: + To register the hook at another location than the default, use + :meth:`torchrl.trainers.Trainer.register_op`. + + """ raise NotImplementedError diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 3a87e6cba88..47268647e71 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -83,6 +83,7 @@ # of this algorithm. 
# sphinx_gallery_start_ignore +import tempfile import warnings warnings.filterwarnings("ignore") @@ -159,19 +160,19 @@ def is_notebook() -> bool: # We will be using five transforms: # # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; -# - :class:`torchrl.envs.ToTensorImage` will convert a ``[W, H, C]`` uint8 +# - :class:`torchrl.envs.transforms.ToTensorImage` will convert a ``[W, H, C]`` uint8 # tensor in a floating point tensor in the ``[0, 1]`` space with shape # ``[C, W, H]``; -# - :class:`torchrl.envs.RewardScaling` to reduce the scale of the return; -# - :class:`torchrl.envs.GrayScale` will turn our image into grayscale; -# - :class:`torchrl.envs.Resize` will resize the image in a 64x64 format; -# - :class:`torchrl.envs.CatFrames` will concatenate an arbitrary number of +# - :class:`torchrl.envs.transforms.RewardScaling` to reduce the scale of the return; +# - :class:`torchrl.envs.transforms.GrayScale` will turn our image into grayscale; +# - :class:`torchrl.envs.transforms.Resize` will resize the image in a 64x64 format; +# - :class:`torchrl.envs.transforms.CatFrames` will concatenate an arbitrary number of # successive frames (``N=4``) in a single tensor along the channel dimension. # This is useful as a single image does not carry information about the # motion of the cartpole. Some memory about past observations and actions # is needed, either via a recurrent neural network or using a stack of # frames. -# - :class:`torchrl.envs.ObservationNorm` which will normalize our observations +# - :class:`torchrl.envs.transforms.ObservationNorm` which will normalize our observations # given some custom summary statistics. # # In practice, our environment builder has two arguments: @@ -265,9 +266,10 @@ def get_norm_stats(): # # .. math:: # -# val = b(obs) + v(obs) - \mathbb{E}[v(obs)] +# \mathbb{v} = b(obs) + v(obs) - \mathbb{E}[v(obs)] # -# where :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a +# where :math:`\mathbb{v}` is our vector of action values, +# :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a # :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for # :math:`n = \# obs` and :math:`m = \# actions`. # @@ -354,8 +356,8 @@ def get_replay_buffer(buffer_size, n_optim, batch_size): # Data collector # ~~~~~~~~~~~~~~ # -# As in `PPO ` and -# `DDPG `, we will be using +# As in `PPO `_ and +# `DDPG `_, we will be using # a data collector as a dataloader in the outer loop. # # We choose the following configuration: we will be running a series of @@ -473,7 +475,9 @@ def get_loss_module(actor, gamma): ############################################################################### # Data collection and replay buffer # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Values to be used for proper training have been commented. +# +# .. note:: +# Values to be used for proper training have been commented. # # Total frames collected in the environment. In other implementations, the # user defines a maximum number of episodes. 
@@ -562,7 +566,9 @@ def get_loss_module(actor, gamma): loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas ) exp_name = f"dqn_exp_{uuid.uuid1()}" -logger = CSVLogger(exp_name=exp_name, log_dir="./") +tmpdir = tempfile.TemporaryDirectory() +logger = CSVLogger(exp_name=exp_name, log_dir=tmpdir.name) +warnings.warn(f"log dir: {logger.experiment.log_dir}") ############################################################################### # We can control how often the scalars should be logged. Here we set this @@ -616,7 +622,7 @@ def get_loss_module(actor, gamma): ############################################################################### # - Any callable (including :class:`torchrl.trainers.TrainerHookBase` # subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. -# In this case, a location must be explicitely passed (). This method gives +# In this case, a location must be explicitly passed (). This method gives # more control over the location of the hook but it also requires more # understanding of the Trainer mechanism. # Check the `trainer documentation `_ @@ -626,9 +632,11 @@ def get_loss_module(actor, gamma): ############################################################################### # We can log the training rewards too. Note that this is of limited interest -# with CartPole, as rewards are always 1. The discounted sum of rewards is miximised -# not by getting higher rewards but by keeping the cart-pole alive for longer. -# This will be reflected by the `total_rewards` value displayed in the progress bar. +# with CartPole, as rewards are always 1. The discounted sum of rewards is +# maximised not by getting higher rewards but by keeping the cart-pole alive +# for longer. +# This will be reflected by the `total_rewards` value displayed in the +# progress bar. # log_reward = LogReward(log_pbar=True) log_reward.register(trainer) @@ -636,7 +644,8 @@ def get_loss_module(actor, gamma): ############################################################################### # .. note:: # It is possible to link multiple optimizers to the trainer if needed. -# In this case, each optimizer will be tied to a field in the loss dictionary. +# In this case, each optimizer will be tied to a field in the loss +# dictionary. # Check the :class:`torchrl.trainers.OptimizerHook` to learn more. # # Here we are, ready to train our algorithm! A simple call to @@ -691,26 +700,10 @@ def print_csv_files_in_folder(folder_path): # # Possible improvements to this tutorial could include: # -# - Using the :class:`torchrl.data.MultiStep` -# post-processing. Multi-step will project an action -# to the :math:`n^{th}` following step, and create a discounted sum of the -# rewards in between. This trick can make the algorithm noticeably less -# myopic (although the reward is then biased). To use this, simply -# create the collector with -# -# >>> from torchrl.data.postprocs.postprocs import MultiStep -# >>> collector = CollectorClass(..., postproc=MultiStep(gamma, n)) -# -# where ``n`` is the number of looking-forward steps. Pay attention to the -# fact that the ``gamma`` factor has to be corrected by the number of -# steps till the next observation when being passed to -# ``vec_td_lambda_advantage_estimate``: -# -# >>> gamma = gamma ** tensordict["steps_to_next_obs"] -# # - A prioritized replay buffer could also be used. This will give a # higher priority to samples that have the worst value accuracy. 
-# Learn more on the `replay buffer section `_ +# Learn more on the +# `replay buffer section `_ # of the documentation. # - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` # for more information). From d978824d85465ebfac3baab81028f09eb3b41315 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 15:33:24 +0100 Subject: [PATCH 81/89] amend --- torchrl/record/loggers/csv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchrl/record/loggers/csv.py b/torchrl/record/loggers/csv.py index 69120bf1110..90aa41a742e 100644 --- a/torchrl/record/loggers/csv.py +++ b/torchrl/record/loggers/csv.py @@ -74,6 +74,7 @@ def __init__(self, exp_name: str, log_dir: Optional[str] = None) -> None: super().__init__(exp_name=exp_name, log_dir=log_dir) self._has_imported_moviepy = False + print(f"self.log_dir: {self.experiment.log_dir}") def _create_experiment(self) -> "CSVExperiment": """Creates a CSV experiment.""" From cf1ba97f74b768c76e5eb6cc7f730a929c088214 Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 16:43:57 +0100 Subject: [PATCH 82/89] amend --- docs/source/reference/trainers.rst | 2 +- torchrl/modules/__init__.py | 2 ++ torchrl/modules/tensordict_module/__init__.py | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/trainers.rst b/docs/source/reference/trainers.rst index a0c0056f2f7..d14cfae12ee 100644 --- a/docs/source/reference/trainers.rst +++ b/docs/source/reference/trainers.rst @@ -73,7 +73,7 @@ Hooks can be split into 3 categories: **data processing** (:obj:`"batch_process" - **Data processing** hooks update a tensordict of data. Hooks :obj:`__call__` method should accept a :obj:`TensorDict` object as input and update it given some strategy. Examples of such hooks include Replay Buffer extension (:obj:`ReplayBufferTrainer.extend`), data normalization (including normalization - constants update), data subsampling (:doc:`BatchSubSampler`) and such. + constants update), data subsampling (:class:`torchrl.trainers.BatchSubSampler`) and such. - **Logging** hooks take a batch of data presented as a :obj:`TensorDict` and write in the logger some information retrieved from that data. 
Examples include the :obj:`Recorder` hook, the reward diff --git a/torchrl/modules/__init__.py b/torchrl/modules/__init__.py index 5a3f4fdbb2b..7c26b7b1b8f 100644 --- a/torchrl/modules/__init__.py +++ b/torchrl/modules/__init__.py @@ -41,10 +41,12 @@ ActorValueOperator, AdditiveGaussianWrapper, DistributionalQValueActor, + DistributionalQValueHook, EGreedyWrapper, OrnsteinUhlenbeckProcessWrapper, ProbabilisticActor, QValueActor, + QValueHook, SafeModule, SafeProbabilisticModule, SafeProbabilisticTensorDictSequential, diff --git a/torchrl/modules/tensordict_module/__init__.py b/torchrl/modules/tensordict_module/__init__.py index 6686eb6b602..d74634c153a 100644 --- a/torchrl/modules/tensordict_module/__init__.py +++ b/torchrl/modules/tensordict_module/__init__.py @@ -9,8 +9,10 @@ ActorCriticWrapper, ActorValueOperator, DistributionalQValueActor, + DistributionalQValueHook, ProbabilisticActor, QValueActor, + QValueHook, ValueOperator, ) from .common import SafeModule From 0e4e6b4bfbaf69a530e54179090421c44196aa0a Mon Sep 17 00:00:00 2001 From: vmoens Date: Tue, 4 Apr 2023 18:30:51 +0100 Subject: [PATCH 83/89] theme --- docs/source/_static/js/theme.js | 3824 +------------------------------ 1 file changed, 2 insertions(+), 3822 deletions(-) diff --git a/docs/source/_static/js/theme.js b/docs/source/_static/js/theme.js index 219443ee11e..297154d9ed7 100644 --- a/docs/source/_static/js/theme.js +++ b/docs/source/_static/js/theme.js @@ -692,7 +692,7 @@ window.sideMenus = { } }; -},{}],11:[function(require,module,exports){ +},{}],"pytorch-sphinx-theme":[function(require,module,exports){ var jQuery = (typeof(window) != 'undefined') ? window.jQuery : require('jquery'); // Sphinx theme nav state @@ -1125,3824 +1125,4 @@ $(window).scroll(function () { }); -},{"jquery":"jquery"}],"pytorch-sphinx-theme":[function(require,module,exports){ -require=(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i wait) { - if (timeout) { - clearTimeout(timeout); - timeout = null; - } - previous = now; - result = func.apply(context, args); - if (!timeout) context = args = null; - } else if (!timeout && options.trailing !== false) { - timeout = setTimeout(later, remaining); - } - return result; - }; - }, - - closest: function (el, selector) { - var matchesFn; - - // find vendor prefix - ['matches','webkitMatchesSelector','mozMatchesSelector','msMatchesSelector','oMatchesSelector'].some(function(fn) { - if (typeof document.body[fn] == 'function') { - matchesFn = fn; - return true; - } - return false; - }); - - var parent; - - // traverse parents - while (el) { - parent = el.parentElement; - if (parent && parent[matchesFn](selector)) { - return parent; - } - el = parent; - } - - return null; - }, - - // Modified from https://stackoverflow.com/a/18953277 - offset: function(elem) { - if (!elem) { - return; - } - - rect = elem.getBoundingClientRect(); - - // Make sure element is not hidden (display: none) or disconnected - if (rect.width || rect.height || elem.getClientRects().length) { - var doc = elem.ownerDocument; - var docElem = doc.documentElement; - - return { - top: rect.top + window.pageYOffset - docElem.clientTop, - left: rect.left + window.pageXOffset - 
docElem.clientLeft - }; - } - }, - - headersHeight: function() { - if (document.getElementById("pytorch-left-menu").classList.contains("make-fixed")) { - return document.getElementById("pytorch-page-level-bar").offsetHeight; - } else { - return document.getElementById("header-holder").offsetHeight + - document.getElementById("pytorch-page-level-bar").offsetHeight; - } - }, - - windowHeight: function() { - return window.innerHeight || - document.documentElement.clientHeight || - document.body.clientHeight; - } -} - -},{}],2:[function(require,module,exports){ -var cookieBanner = { - init: function() { - cookieBanner.bind(); - - var cookieExists = cookieBanner.cookieExists(); - - if (!cookieExists) { - cookieBanner.setCookie(); - cookieBanner.showCookieNotice(); - } - }, - - bind: function() { - $(".close-button").on("click", cookieBanner.hideCookieNotice); - }, - - cookieExists: function() { - var cookie = localStorage.getItem("returningPytorchUser"); - - if (cookie) { - return true; - } else { - return false; - } - }, - - setCookie: function() { - localStorage.setItem("returningPytorchUser", true); - }, - - showCookieNotice: function() { - $(".cookie-banner-wrapper").addClass("is-visible"); - }, - - hideCookieNotice: function() { - $(".cookie-banner-wrapper").removeClass("is-visible"); - } -}; - -$(function() { - cookieBanner.init(); -}); - -},{}],3:[function(require,module,exports){ -window.filterTags = { - bind: function() { - var options = { - valueNames: [{ data: ["tags"] }], - page: "6", - pagination: true - }; - - var tutorialList = new List("tutorial-cards", options); - - function filterSelectedTags(cardTags, selectedTags) { - return cardTags.some(function(tag) { - return selectedTags.some(function(selectedTag) { - return selectedTag == tag; - }); - }); - } - - function updateList() { - var selectedTags = []; - - $(".selected").each(function() { - selectedTags.push($(this).data("tag")); - }); - - tutorialList.filter(function(item) { - var cardTags; - - if (item.values().tags == null) { - cardTags = [""]; - } else { - cardTags = item.values().tags.split(","); - } - - if (selectedTags.length == 0) { - return true; - } else { - return filterSelectedTags(cardTags, selectedTags); - } - }); - } - - $(".filter-btn").on("click", function() { - if ($(this).data("tag") == "all") { - $(this).addClass("all-tag-selected"); - $(".filter").removeClass("selected"); - } else { - $(this).toggleClass("selected"); - $("[data-tag='all']").removeClass("all-tag-selected"); - } - - // If no tags are selected then highlight the 'All' tag - - if (!$(".selected")[0]) { - $("[data-tag='all']").addClass("all-tag-selected"); - } - - updateList(); - }); - } -}; - -},{}],4:[function(require,module,exports){ -// Modified from https://stackoverflow.com/a/32396543 -window.highlightNavigation = { - navigationListItems: document.querySelectorAll("#pytorch-right-menu li"), - sections: document.querySelectorAll(".pytorch-article .section"), - sectionIdTonavigationLink: {}, - - bind: function() { - if (!sideMenus.displayRightMenu) { - return; - }; - - for (var i = 0; i < highlightNavigation.sections.length; i++) { - var id = highlightNavigation.sections[i].id; - highlightNavigation.sectionIdTonavigationLink[id] = - document.querySelectorAll('#pytorch-right-menu li a[href="#' + id + '"]')[0]; - } - - $(window).scroll(utilities.throttle(highlightNavigation.highlight, 100)); - }, - - highlight: function() { - var rightMenu = document.getElementById("pytorch-right-menu"); - - // If right menu is not on the screen don't bother 
- if (rightMenu.offsetWidth === 0 && rightMenu.offsetHeight === 0) { - return; - } - - var scrollPosition = utilities.scrollTop(); - var OFFSET_TOP_PADDING = 25; - var offset = document.getElementById("header-holder").offsetHeight + - document.getElementById("pytorch-page-level-bar").offsetHeight + - OFFSET_TOP_PADDING; - - var sections = highlightNavigation.sections; - - for (var i = (sections.length - 1); i >= 0; i--) { - var currentSection = sections[i]; - var sectionTop = utilities.offset(currentSection).top; - - if (scrollPosition >= sectionTop - offset) { - var navigationLink = highlightNavigation.sectionIdTonavigationLink[currentSection.id]; - var navigationListItem = utilities.closest(navigationLink, "li"); - - if (navigationListItem && !navigationListItem.classList.contains("active")) { - for (var i = 0; i < highlightNavigation.navigationListItems.length; i++) { - var el = highlightNavigation.navigationListItems[i]; - if (el.classList.contains("active")) { - el.classList.remove("active"); - } - } - - navigationListItem.classList.add("active"); - - // Scroll to active item. Not a requested feature but we could revive it. Needs work. - - // var menuTop = $("#pytorch-right-menu").position().top; - // var itemTop = navigationListItem.getBoundingClientRect().top; - // var TOP_PADDING = 20 - // var newActiveTop = $("#pytorch-side-scroll-right").scrollTop() + itemTop - menuTop - TOP_PADDING; - - // $("#pytorch-side-scroll-right").animate({ - // scrollTop: newActiveTop - // }, 100); - } - - break; - } - } - } -}; - -},{}],5:[function(require,module,exports){ -window.mainMenuDropdown = { - bind: function() { - $("[data-toggle='ecosystem-dropdown']").on("click", function() { - toggleDropdown($(this).attr("data-toggle")); - }); - - $("[data-toggle='resources-dropdown']").on("click", function() { - toggleDropdown($(this).attr("data-toggle")); - }); - - function toggleDropdown(menuToggle) { - var showMenuClass = "show-menu"; - var menuClass = "." 
+ menuToggle + "-menu"; - - if ($(menuClass).hasClass(showMenuClass)) { - $(menuClass).removeClass(showMenuClass); - } else { - $("[data-toggle=" + menuToggle + "].show-menu").removeClass( - showMenuClass - ); - $(menuClass).addClass(showMenuClass); - } - } - } -}; - -},{}],6:[function(require,module,exports){ -window.mobileMenu = { - bind: function() { - $("[data-behavior='open-mobile-menu']").on('click', function(e) { - e.preventDefault(); - $(".mobile-main-menu").addClass("open"); - $("body").addClass('no-scroll'); - - mobileMenu.listenForResize(); - }); - - $("[data-behavior='close-mobile-menu']").on('click', function(e) { - e.preventDefault(); - mobileMenu.close(); - }); - }, - - listenForResize: function() { - $(window).on('resize.ForMobileMenu', function() { - if ($(this).width() > 768) { - mobileMenu.close(); - } - }); - }, - - close: function() { - $(".mobile-main-menu").removeClass("open"); - $("body").removeClass('no-scroll'); - $(window).off('resize.ForMobileMenu'); - } -}; - -},{}],7:[function(require,module,exports){ -window.mobileTOC = { - bind: function() { - $("[data-behavior='toggle-table-of-contents']").on("click", function(e) { - e.preventDefault(); - - var $parent = $(this).parent(); - - if ($parent.hasClass("is-open")) { - $parent.removeClass("is-open"); - $(".pytorch-left-menu").slideUp(200, function() { - $(this).css({display: ""}); - }); - } else { - $parent.addClass("is-open"); - $(".pytorch-left-menu").slideDown(200); - } - }); - } -} - -},{}],8:[function(require,module,exports){ -window.pytorchAnchors = { - bind: function() { - // Replace Sphinx-generated anchors with anchorjs ones - $(".headerlink").text(""); - - window.anchors.add(".pytorch-article .headerlink"); - - $(".anchorjs-link").each(function() { - var $headerLink = $(this).closest(".headerlink"); - var href = $headerLink.attr("href"); - var clone = this.outerHTML; - - $clone = $(clone).attr("href", href); - $headerLink.before($clone); - $headerLink.remove(); - }); - } -}; - -},{}],9:[function(require,module,exports){ -// Modified from https://stackoverflow.com/a/13067009 -// Going for a JS solution to scrolling to an anchor so we can benefit from -// less hacky css and smooth scrolling. - -window.scrollToAnchor = { - bind: function() { - var document = window.document; - var history = window.history; - var location = window.location - var HISTORY_SUPPORT = !!(history && history.pushState); - - var anchorScrolls = { - ANCHOR_REGEX: /^#[^ ]+$/, - offsetHeightPx: function() { - var OFFSET_HEIGHT_PADDING = 20; - // TODO: this is a little janky. We should try to not rely on JS for this - return utilities.headersHeight() + OFFSET_HEIGHT_PADDING; - }, - - /** - * Establish events, and fix initial scroll position if a hash is provided. - */ - init: function() { - this.scrollToCurrent(); - // This interferes with clicks below it, causing a double fire - // $(window).on('hashchange', $.proxy(this, 'scrollToCurrent')); - $('body').on('click', 'a', $.proxy(this, 'delegateAnchors')); - $('body').on('click', '#pytorch-right-menu li span', $.proxy(this, 'delegateSpans')); - }, - - /** - * Return the offset amount to deduct from the normal scroll position. - * Modify as appropriate to allow for dynamic calculations - */ - getFixedOffset: function() { - return this.offsetHeightPx(); - }, - - /** - * If the provided href is an anchor which resolves to an element on the - * page, scroll to it. - * @param {String} href - * @return {Boolean} - Was the href an anchor. 
- */ - scrollIfAnchor: function(href, pushToHistory) { - var match, anchorOffset; - - if(!this.ANCHOR_REGEX.test(href)) { - return false; - } - - match = document.getElementById(href.slice(1)); - - if(match) { - var anchorOffset = $(match).offset().top - this.getFixedOffset(); - - $('html, body').scrollTop(anchorOffset); - - // Add the state to history as-per normal anchor links - if(HISTORY_SUPPORT && pushToHistory) { - history.pushState({}, document.title, location.pathname + href); - } - } - - return !!match; - }, - - /** - * Attempt to scroll to the current location's hash. - */ - scrollToCurrent: function(e) { - if(this.scrollIfAnchor(window.location.hash) && e) { - e.preventDefault(); - } - }, - - delegateSpans: function(e) { - var elem = utilities.closest(e.target, "a"); - - if(this.scrollIfAnchor(elem.getAttribute('href'), true)) { - e.preventDefault(); - } - }, - - /** - * If the click event's target was an anchor, fix the scroll position. - */ - delegateAnchors: function(e) { - var elem = e.target; - - if(this.scrollIfAnchor(elem.getAttribute('href'), true)) { - e.preventDefault(); - } - } - }; - - $(document).ready($.proxy(anchorScrolls, 'init')); - } -}; - -},{}],10:[function(require,module,exports){ -window.sideMenus = { - rightMenuIsOnScreen: function() { - return document.getElementById("pytorch-content-right").offsetParent !== null; - }, - - isFixedToBottom: false, - - bind: function() { - sideMenus.handleLeftMenu(); - - var rightMenuLinks = document.querySelectorAll("#pytorch-right-menu li"); - var rightMenuHasLinks = rightMenuLinks.length > 1; - - if (!rightMenuHasLinks) { - for (var i = 0; i < rightMenuLinks.length; i++) { - rightMenuLinks[i].style.display = "none"; - } - } - - if (rightMenuHasLinks) { - // Don't show the Shortcuts menu title text unless there are menu items - document.getElementById("pytorch-shortcuts-wrapper").style.display = "block"; - - // We are hiding the titles of the pages in the right side menu but there are a few - // pages that include other pages in the right side menu (see 'torch.nn' in the docs) - // so if we exclude those it looks confusing. 
Here we add a 'title-link' class to these - // links so we can exclude them from normal right side menu link operations - var titleLinks = document.querySelectorAll( - "#pytorch-right-menu #pytorch-side-scroll-right \ - > ul > li > a.reference.internal" - ); - - for (var i = 0; i < titleLinks.length; i++) { - var link = titleLinks[i]; - - link.classList.add("title-link"); - - if ( - link.nextElementSibling && - link.nextElementSibling.tagName === "UL" && - link.nextElementSibling.children.length > 0 - ) { - link.classList.add("has-children"); - } - } - - // Add + expansion signifiers to normal right menu links that have sub menus - var menuLinks = document.querySelectorAll( - "#pytorch-right-menu ul li ul li a.reference.internal" - ); - - for (var i = 0; i < menuLinks.length; i++) { - if ( - menuLinks[i].nextElementSibling && - menuLinks[i].nextElementSibling.tagName === "UL" - ) { - menuLinks[i].classList.add("not-expanded"); - } - } - - // If a hash is present on page load recursively expand menu items leading to selected item - var linkWithHash = - document.querySelector( - "#pytorch-right-menu a[href=\"" + window.location.hash + "\"]" - ); - - if (linkWithHash) { - // Expand immediate sibling list if present - if ( - linkWithHash.nextElementSibling && - linkWithHash.nextElementSibling.tagName === "UL" && - linkWithHash.nextElementSibling.children.length > 0 - ) { - linkWithHash.nextElementSibling.style.display = "block"; - linkWithHash.classList.add("expanded"); - } - - // Expand ancestor lists if any - sideMenus.expandClosestUnexpandedParentList(linkWithHash); - } - - // Bind click events on right menu links - $("#pytorch-right-menu a.reference.internal").on("click", function() { - if (this.classList.contains("expanded")) { - this.nextElementSibling.style.display = "none"; - this.classList.remove("expanded"); - this.classList.add("not-expanded"); - } else if (this.classList.contains("not-expanded")) { - this.nextElementSibling.style.display = "block"; - this.classList.remove("not-expanded"); - this.classList.add("expanded"); - } - }); - - sideMenus.handleRightMenu(); - } - - $(window).on('resize scroll', function(e) { - sideMenus.handleNavBar(); - - sideMenus.handleLeftMenu(); - - if (sideMenus.rightMenuIsOnScreen()) { - sideMenus.handleRightMenu(); - } - }); - }, - - leftMenuIsFixed: function() { - return document.getElementById("pytorch-left-menu").classList.contains("make-fixed"); - }, - - handleNavBar: function() { - var mainHeaderHeight = document.getElementById('header-holder').offsetHeight; - - // If we are scrolled past the main navigation header fix the sub menu bar to top of page - if (utilities.scrollTop() >= mainHeaderHeight) { - document.getElementById("pytorch-left-menu").classList.add("make-fixed"); - document.getElementById("pytorch-page-level-bar").classList.add("left-menu-is-fixed"); - } else { - document.getElementById("pytorch-left-menu").classList.remove("make-fixed"); - document.getElementById("pytorch-page-level-bar").classList.remove("left-menu-is-fixed"); - } - }, - - expandClosestUnexpandedParentList: function (el) { - var closestParentList = utilities.closest(el, "ul"); - - if (closestParentList) { - var closestParentLink = closestParentList.previousElementSibling; - var closestParentLinkExists = closestParentLink && - closestParentLink.tagName === "A" && - closestParentLink.classList.contains("reference"); - - if (closestParentLinkExists) { - // Don't add expansion class to any title links - if (closestParentLink.classList.contains("title-link")) { - 
return; - } - - closestParentList.style.display = "block"; - closestParentLink.classList.remove("not-expanded"); - closestParentLink.classList.add("expanded"); - sideMenus.expandClosestUnexpandedParentList(closestParentLink); - } - } - }, - - handleLeftMenu: function () { - var windowHeight = utilities.windowHeight(); - var topOfFooterRelativeToWindow = document.getElementById("docs-tutorials-resources").getBoundingClientRect().top; - - if (topOfFooterRelativeToWindow >= windowHeight) { - document.getElementById("pytorch-left-menu").style.height = "100%"; - } else { - var howManyPixelsOfTheFooterAreInTheWindow = windowHeight - topOfFooterRelativeToWindow; - var leftMenuDifference = howManyPixelsOfTheFooterAreInTheWindow; - document.getElementById("pytorch-left-menu").style.height = (windowHeight - leftMenuDifference) + "px"; - } - }, - - handleRightMenu: function() { - var rightMenuWrapper = document.getElementById("pytorch-content-right"); - var rightMenu = document.getElementById("pytorch-right-menu"); - var rightMenuList = rightMenu.getElementsByTagName("ul")[0]; - var article = document.getElementById("pytorch-article"); - var articleHeight = article.offsetHeight; - var articleBottom = utilities.offset(article).top + articleHeight; - var mainHeaderHeight = document.getElementById('header-holder').offsetHeight; - - if (utilities.scrollTop() < mainHeaderHeight) { - rightMenuWrapper.style.height = "100%"; - rightMenu.style.top = 0; - rightMenu.classList.remove("scrolling-fixed"); - rightMenu.classList.remove("scrolling-absolute"); - } else { - if (rightMenu.classList.contains("scrolling-fixed")) { - var rightMenuBottom = - utilities.offset(rightMenuList).top + rightMenuList.offsetHeight; - - if (rightMenuBottom >= articleBottom) { - rightMenuWrapper.style.height = articleHeight + mainHeaderHeight + "px"; - rightMenu.style.top = utilities.scrollTop() - mainHeaderHeight + "px"; - rightMenu.classList.add("scrolling-absolute"); - rightMenu.classList.remove("scrolling-fixed"); - } - } else { - rightMenuWrapper.style.height = articleHeight + mainHeaderHeight + "px"; - rightMenu.style.top = - articleBottom - mainHeaderHeight - rightMenuList.offsetHeight + "px"; - rightMenu.classList.add("scrolling-absolute"); - } - - if (utilities.scrollTop() < articleBottom - rightMenuList.offsetHeight) { - rightMenuWrapper.style.height = "100%"; - rightMenu.style.top = ""; - rightMenu.classList.remove("scrolling-absolute"); - rightMenu.classList.add("scrolling-fixed"); - } - } - - var rightMenuSideScroll = document.getElementById("pytorch-side-scroll-right"); - var sideScrollFromWindowTop = rightMenuSideScroll.getBoundingClientRect().top; - - rightMenuSideScroll.style.height = utilities.windowHeight() - sideScrollFromWindowTop + "px"; - } -}; - -},{}],11:[function(require,module,exports){ -var jQuery = (typeof(window) != 'undefined') ? window.jQuery : require('jquery'); - -// Sphinx theme nav state -function ThemeNav () { - - var nav = { - navBar: null, - win: null, - winScroll: false, - winResize: false, - linkScroll: false, - winPosition: 0, - winHeight: null, - docHeight: null, - isRunning: false - }; - - nav.enable = function (withStickyNav) { - var self = this; - - // TODO this can likely be removed once the theme javascript is broken - // out from the RTD assets. This just ensures old projects that are - // calling `enable()` get the sticky menu on by default. All other cals - // to `enable` should include an argument for enabling the sticky menu. 
- if (typeof(withStickyNav) == 'undefined') { - withStickyNav = true; - } - - if (self.isRunning) { - // Only allow enabling nav logic once - return; - } - - self.isRunning = true; - jQuery(function ($) { - self.init($); - - self.reset(); - self.win.on('hashchange', self.reset); - - if (withStickyNav) { - // Set scroll monitor - self.win.on('scroll', function () { - if (!self.linkScroll) { - if (!self.winScroll) { - self.winScroll = true; - requestAnimationFrame(function() { self.onScroll(); }); - } - } - }); - } - - // Set resize monitor - self.win.on('resize', function () { - if (!self.winResize) { - self.winResize = true; - requestAnimationFrame(function() { self.onResize(); }); - } - }); - - self.onResize(); - }); - - }; - - // TODO remove this with a split in theme and Read the Docs JS logic as - // well, it's only here to support 0.3.0 installs of our theme. - nav.enableSticky = function() { - this.enable(true); - }; - - nav.init = function ($) { - var doc = $(document), - self = this; - - this.navBar = $('div.pytorch-side-scroll:first'); - this.win = $(window); - - // Set up javascript UX bits - $(document) - // Shift nav in mobile when clicking the menu. - .on('click', "[data-toggle='pytorch-left-menu-nav-top']", function() { - $("[data-toggle='wy-nav-shift']").toggleClass("shift"); - $("[data-toggle='rst-versions']").toggleClass("shift"); - }) - - // Nav menu link click operations - .on('click', ".pytorch-menu-vertical .current ul li a", function() { - var target = $(this); - // Close menu when you click a link. - $("[data-toggle='wy-nav-shift']").removeClass("shift"); - $("[data-toggle='rst-versions']").toggleClass("shift"); - // Handle dynamic display of l3 and l4 nav lists - self.toggleCurrent(target); - self.hashChange(); - }) - .on('click', "[data-toggle='rst-current-version']", function() { - $("[data-toggle='rst-versions']").toggleClass("shift-up"); - }) - - // Make tables responsive - $("table.docutils:not(.field-list,.footnote,.citation)") - .wrap("

"); - - // Add extra class to responsive tables that contain - // footnotes or citations so that we can target them for styling - $("table.docutils.footnote") - .wrap("
"); - $("table.docutils.citation") - .wrap("
"); - - // Add expand links to all parents of nested ul - $('.pytorch-menu-vertical ul').not('.simple').siblings('a').each(function () { - var link = $(this); - expand = $(''); - expand.on('click', function (ev) { - self.toggleCurrent(link); - ev.stopPropagation(); - return false; - }); - link.prepend(expand); - }); - }; - - nav.reset = function () { - // Get anchor from URL and open up nested nav - var anchor = encodeURI(window.location.hash) || '#'; - - try { - var vmenu = $('.pytorch-menu-vertical'); - var link = vmenu.find('[href="' + anchor + '"]'); - if (link.length === 0) { - // this link was not found in the sidebar. - // Find associated id element, then its closest section - // in the document and try with that one. - var id_elt = $('.document [id="' + anchor.substring(1) + '"]'); - var closest_section = id_elt.closest('div.section'); - link = vmenu.find('[href="#' + closest_section.attr("id") + '"]'); - if (link.length === 0) { - // still not found in the sidebar. fall back to main section - link = vmenu.find('[href="#"]'); - } - } - // If we found a matching link then reset current and re-apply - // otherwise retain the existing match - if (link.length > 0) { - $('.pytorch-menu-vertical .current').removeClass('current'); - link.addClass('current'); - link.closest('li.toctree-l1').addClass('current'); - link.closest('li.toctree-l1').parent().addClass('current'); - link.closest('li.toctree-l1').addClass('current'); - link.closest('li.toctree-l2').addClass('current'); - link.closest('li.toctree-l3').addClass('current'); - link.closest('li.toctree-l4').addClass('current'); - } - } - catch (err) { - console.log("Error expanding nav for anchor", err); - } - - }; - - nav.onScroll = function () { - this.winScroll = false; - var newWinPosition = this.win.scrollTop(), - winBottom = newWinPosition + this.winHeight, - navPosition = this.navBar.scrollTop(), - newNavPosition = navPosition + (newWinPosition - this.winPosition); - if (newWinPosition < 0 || winBottom > this.docHeight) { - return; - } - this.navBar.scrollTop(newNavPosition); - this.winPosition = newWinPosition; - }; - - nav.onResize = function () { - this.winResize = false; - this.winHeight = this.win.height(); - this.docHeight = $(document).height(); - }; - - nav.hashChange = function () { - this.linkScroll = true; - this.win.one('hashchange', function () { - this.linkScroll = false; - }); - }; - - nav.toggleCurrent = function (elem) { - var parent_li = elem.closest('li'); - parent_li.siblings('li.current').removeClass('current'); - parent_li.siblings().find('li.current').removeClass('current'); - parent_li.find('> ul li.current').removeClass('current'); - parent_li.toggleClass('current'); - } - - return nav; -}; - -module.exports.ThemeNav = ThemeNav(); - -if (typeof(window) != 'undefined') { - window.SphinxRtdTheme = { - Navigation: module.exports.ThemeNav, - // TODO remove this once static assets are split up between the theme - // and Read the Docs. For now, this patches 0.3.0 to be backwards - // compatible with a pre-0.3.0 layout.html - StickyNav: module.exports.ThemeNav, - }; -} - - -// requestAnimationFrame polyfill by Erik Möller. 
fixes from Paul Irish and Tino Zijdel -// https://gist.github.com/paulirish/1579671 -// MIT license - -(function() { - var lastTime = 0; - var vendors = ['ms', 'moz', 'webkit', 'o']; - for(var x = 0; x < vendors.length && !window.requestAnimationFrame; ++x) { - window.requestAnimationFrame = window[vendors[x]+'RequestAnimationFrame']; - window.cancelAnimationFrame = window[vendors[x]+'CancelAnimationFrame'] - || window[vendors[x]+'CancelRequestAnimationFrame']; - } - - if (!window.requestAnimationFrame) - window.requestAnimationFrame = function(callback, element) { - var currTime = new Date().getTime(); - var timeToCall = Math.max(0, 16 - (currTime - lastTime)); - var id = window.setTimeout(function() { callback(currTime + timeToCall); }, - timeToCall); - lastTime = currTime + timeToCall; - return id; - }; - - if (!window.cancelAnimationFrame) - window.cancelAnimationFrame = function(id) { - clearTimeout(id); - }; -}()); - -$(".sphx-glr-thumbcontainer").removeAttr("tooltip"); -$("table").removeAttr("border"); - -// This code replaces the default sphinx gallery download buttons -// with the 3 download buttons at the top of the page - -var downloadNote = $(".sphx-glr-download-link-note.admonition.note"); -if (downloadNote.length >= 1) { - var tutorialUrlArray = $("#tutorial-type").text().split('/'); - tutorialUrlArray[0] = tutorialUrlArray[0] + "/sphinx-tutorials" - - var githubLink = "https://github.com/pytorch/rl/blob/main/" + tutorialUrlArray.join("/") + ".py", - notebookLink = $(".reference.download")[1].href, - notebookDownloadPath = notebookLink.split('_downloads')[1], - colabLink = "https://colab.research.google.com/github/pytorch/rl/blob/gh-pages/_downloads" + notebookDownloadPath; - - $("#google-colab-link").wrap("
"); - $("#download-notebook-link").wrap(""); - $("#github-view-link").wrap(""); -} else { - $(".pytorch-call-to-action-links").hide(); -} - -//This code handles the Expand/Hide toggle for the Docs/Tutorials left nav items - -$(document).ready(function() { - var caption = "#pytorch-left-menu p.caption"; - var collapseAdded = $(this).not("checked"); - $(caption).each(function () { - var menuName = this.innerText.replace(/[^\w\s]/gi, "").trim(); - $(this).find("span").addClass("checked"); - if (collapsedSections.includes(menuName) == true && collapseAdded && sessionStorage.getItem(menuName) !== "expand" || sessionStorage.getItem(menuName) == "collapse") { - $(this.firstChild).after("[ + ]"); - $(this.firstChild).after("[ - ]"); - $(this).next("ul").hide(); - } else if (collapsedSections.includes(menuName) == false && collapseAdded || sessionStorage.getItem(menuName) == "expand") { - $(this.firstChild).after("[ + ]"); - $(this.firstChild).after("[ - ]"); - } - }); - - $(".expand-menu").on("click", function () { - $(this).prev(".hide-menu").toggle(); - $(this).parent().next("ul").toggle(); - var menuName = $(this).parent().text().replace(/[^\w\s]/gi, "").trim(); - if (sessionStorage.getItem(menuName) == "collapse") { - sessionStorage.removeItem(menuName); - } - sessionStorage.setItem(menuName, "expand"); - toggleList(this); - }); - - $(".hide-menu").on("click", function () { - $(this).next(".expand-menu").toggle(); - $(this).parent().next("ul").toggle(); - var menuName = $(this).parent().text().replace(/[^\w\s]/gi, "").trim(); - if (sessionStorage.getItem(menuName) == "expand") { - sessionStorage.removeItem(menuName); - } - sessionStorage.setItem(menuName, "collapse"); - toggleList(this); - }); - - function toggleList(menuCommand) { - $(menuCommand).toggle(); - } -}); - -// Build an array from each tag that's present - -var tagList = $(".tutorials-card-container").map(function() { - return $(this).data("tags").split(",").map(function(item) { - return item.trim(); - }); -}).get(); - -function unique(value, index, self) { - return self.indexOf(value) == index && value != "" - } - -// Only return unique tags - -var tags = tagList.sort().filter(unique); - -// Add filter buttons to the top of the page for each tag - -function createTagMenu() { - tags.forEach(function(item){ - $(".tutorial-filter-menu").append("
" + item + "
") - }) -}; - -createTagMenu(); - -// Remove hyphens if they are present in the filter buttons - -$(".tags").each(function(){ - var tags = $(this).text().split(","); - tags.forEach(function(tag, i ) { - tags[i] = tags[i].replace(/-/, ' ') - }) - $(this).html(tags.join(", ")); -}); - -// Remove hyphens if they are present in the card body - -$(".tutorial-filter").each(function(){ - var tag = $(this).text(); - $(this).html(tag.replace(/-/, ' ')) -}) - -// Remove any empty p tags that Sphinx adds - -$("#tutorial-cards p").each(function(index, item) { - if(!$(item).text().trim()) { - $(item).remove(); - } -}); - -// Jump back to top on pagination click - -$(document).on("click", ".page", function() { - $('html, body').animate( - {scrollTop: $("#dropdown-filter-tags").position().top}, - 'slow' - ); -}); - -var link = $("a[href='intermediate/speech_command_recognition_with_torchaudio.html']"); - -if (link.text() == "SyntaxError") { - console.log("There is an issue with the intermediate/speech_command_recognition_with_torchaudio.html menu item."); - link.text("Speech Command Recognition with torchaudio"); -} - -$(".stars-outer > i").hover(function() { - $(this).prevAll().addBack().toggleClass("fas star-fill"); -}); - -$(".stars-outer > i").on("click", function() { - $(this).prevAll().each(function() { - $(this).addBack().addClass("fas star-fill"); - }); - - $(".stars-outer > i").each(function() { - $(this).unbind("mouseenter mouseleave").css({ - "pointer-events": "none" - }); - }); -}) - -$("#pytorch-side-scroll-right li a").on("click", function (e) { - var href = $(this).attr("href"); - $('html, body').stop().animate({ - scrollTop: $(href).offset().top - 100 - }, 850); - e.preventDefault; -}); - -var lastId, - topMenu = $("#pytorch-side-scroll-right"), - topMenuHeight = topMenu.outerHeight() + 1, - // All sidenav items - menuItems = topMenu.find("a"), - // Anchors for menu items - scrollItems = menuItems.map(function () { - var item = $(this).attr("href"); - if (item.length) { - return item; - } - }); - -$(window).scroll(function () { - var fromTop = $(this).scrollTop() + topMenuHeight; - var article = ".section"; - - $(article).each(function (i) { - var offsetScroll = $(this).offset().top - $(window).scrollTop(); - if ( - offsetScroll <= topMenuHeight + 200 && - offsetScroll >= topMenuHeight - 200 && - scrollItems[i] == "#" + $(this).attr("id") && - $(".hidden:visible") - ) { - $(menuItems).removeClass("side-scroll-highlight"); - $(menuItems[i]).addClass("side-scroll-highlight"); - } - }); -}); - - -},{"jquery":"jquery"}],"pytorch-sphinx-theme":[function(require,module,exports){ -require=(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i wait) { - if (timeout) { - clearTimeout(timeout); - timeout = null; - } - previous = now; - result = func.apply(context, args); - if (!timeout) context = args = null; - } else if (!timeout && options.trailing !== false) { - timeout = setTimeout(later, remaining); - } - return result; - }; - }, - - closest: function (el, selector) { - var matchesFn; - - // find vendor prefix - ['matches','webkitMatchesSelector','mozMatchesSelector','msMatchesSelector','oMatchesSelector'].some(function(fn) { - if (typeof 
" + item + "
") - }) -}; - -createTagMenu(); - -// Remove hyphens if they are present in the filter buttons - -$(".tags").each(function(){ - var tags = $(this).text().split(","); - tags.forEach(function(tag, i ) { - tags[i] = tags[i].replace(/-/, ' ') - }) - $(this).html(tags.join(", ")); -}); - -// Remove hyphens if they are present in the card body - -$(".tutorial-filter").each(function(){ - var tag = $(this).text(); - $(this).html(tag.replace(/-/, ' ')) -}) - -// Remove any empty p tags that Sphinx adds - -$("#tutorial-cards p").each(function(index, item) { - if(!$(item).text().trim()) { - $(item).remove(); - } -}); - -// Jump back to top on pagination click - -$(document).on("click", ".page", function() { - $('html, body').animate( - {scrollTop: $("#dropdown-filter-tags").position().top}, - 'slow' - ); -}); - -var link = $("a[href='intermediate/speech_command_recognition_with_torchaudio.html']"); - -if (link.text() == "SyntaxError") { - console.log("There is an issue with the intermediate/speech_command_recognition_with_torchaudio.html menu item."); - link.text("Speech Command Recognition with torchaudio"); -} - -$(".stars-outer > i").hover(function() { - $(this).prevAll().addBack().toggleClass("fas star-fill"); -}); - -$(".stars-outer > i").on("click", function() { - $(this).prevAll().each(function() { - $(this).addBack().addClass("fas star-fill"); - }); - - $(".stars-outer > i").each(function() { - $(this).unbind("mouseenter mouseleave").css({ - "pointer-events": "none" - }); - }); -}) - -$("#pytorch-side-scroll-right li a").on("click", function (e) { - var href = $(this).attr("href"); - $('html, body').stop().animate({ - scrollTop: $(href).offset().top - 100 - }, 850); - e.preventDefault; -}); - -var lastId, - topMenu = $("#pytorch-side-scroll-right"), - topMenuHeight = topMenu.outerHeight() + 1, - // All sidenav items - menuItems = topMenu.find("a"), - // Anchors for menu items - scrollItems = menuItems.map(function () { - var item = $(this).attr("href"); - if (item.length) { - return item; - } - }); - -$(window).scroll(function () { - var fromTop = $(this).scrollTop() + topMenuHeight; - var article = ".section"; - - $(article).each(function (i) { - var offsetScroll = $(this).offset().top - $(window).scrollTop(); - if ( - offsetScroll <= topMenuHeight + 200 && - offsetScroll >= topMenuHeight - 200 && - scrollItems[i] == "#" + $(this).attr("id") && - $(".hidden:visible") - ) { - $(menuItems).removeClass("side-scroll-highlight"); - $(menuItems[i]).addClass("side-scroll-highlight"); - } - }); -}); - - -},{"jquery":"jquery"}],"pytorch-sphinx-theme":[function(require,module,exports){ -require=(function(){function r(e,n,t){function o(i,f){if(!n[i]){if(!e[i]){var c="function"==typeof require&&require;if(!f&&c)return c(i,!0);if(u)return u(i,!0);var a=new Error("Cannot find module '"+i+"'");throw a.code="MODULE_NOT_FOUND",a}var p=n[i]={exports:{}};e[i][0].call(p.exports,function(r){var n=e[i][1][r];return o(n||r)},p,p.exports,r,e,n,t)}return n[i].exports}for(var u="function"==typeof require&&require,i=0;i wait) { - if (timeout) { - clearTimeout(timeout); - timeout = null; - } - previous = now; - result = func.apply(context, args); - if (!timeout) context = args = null; - } else if (!timeout && options.trailing !== false) { - timeout = setTimeout(later, remaining); - } - return result; - }; - }, - - closest: function (el, selector) { - var matchesFn; - - // find vendor prefix - ['matches','webkitMatchesSelector','mozMatchesSelector','msMatchesSelector','oMatchesSelector'].some(function(fn) { - if (typeof 
document.body[fn] == 'function') { - matchesFn = fn; - return true; - } - return false; - }); - - var parent; - - // traverse parents - while (el) { - parent = el.parentElement; - if (parent && parent[matchesFn](selector)) { - return parent; - } - el = parent; - } - - return null; - }, - - // Modified from https://stackoverflow.com/a/18953277 - offset: function(elem) { - if (!elem) { - return; - } - - rect = elem.getBoundingClientRect(); - - // Make sure element is not hidden (display: none) or disconnected - if (rect.width || rect.height || elem.getClientRects().length) { - var doc = elem.ownerDocument; - var docElem = doc.documentElement; - - return { - top: rect.top + window.pageYOffset - docElem.clientTop, - left: rect.left + window.pageXOffset - docElem.clientLeft - }; - } - }, - - headersHeight: function() { - if (document.getElementById("pytorch-left-menu").classList.contains("make-fixed")) { - return document.getElementById("pytorch-page-level-bar").offsetHeight; - } else { - return document.getElementById("header-holder").offsetHeight + - document.getElementById("pytorch-page-level-bar").offsetHeight; - } - }, - - windowHeight: function() { - return window.innerHeight || - document.documentElement.clientHeight || - document.body.clientHeight; - } - } - - },{}],2:[function(require,module,exports){ - var cookieBanner = { - init: function() { - cookieBanner.bind(); - - var cookieExists = cookieBanner.cookieExists(); - - if (!cookieExists) { - cookieBanner.setCookie(); - cookieBanner.showCookieNotice(); - } - }, - - bind: function() { - $(".close-button").on("click", cookieBanner.hideCookieNotice); - }, - - cookieExists: function() { - var cookie = localStorage.getItem("returningPytorchUser"); - - if (cookie) { - return true; - } else { - return false; - } - }, - - setCookie: function() { - localStorage.setItem("returningPytorchUser", true); - }, - - showCookieNotice: function() { - $(".cookie-banner-wrapper").addClass("is-visible"); - }, - - hideCookieNotice: function() { - $(".cookie-banner-wrapper").removeClass("is-visible"); - } - }; - - $(function() { - cookieBanner.init(); - }); - - },{}],3:[function(require,module,exports){ - window.filterTags = { - bind: function() { - var options = { - valueNames: [{ data: ["tags"] }], - page: "6", - pagination: true - }; - - var tutorialList = new List("tutorial-cards", options); - - function filterSelectedTags(cardTags, selectedTags) { - return cardTags.some(function(tag) { - return selectedTags.some(function(selectedTag) { - return selectedTag == tag; - }); - }); - } - - function updateList() { - var selectedTags = []; - - $(".selected").each(function() { - selectedTags.push($(this).data("tag")); - }); - - tutorialList.filter(function(item) { - var cardTags; - - if (item.values().tags == null) { - cardTags = [""]; - } else { - cardTags = item.values().tags.split(","); - } - - if (selectedTags.length == 0) { - return true; - } else { - return filterSelectedTags(cardTags, selectedTags); - } - }); - } - - $(".filter-btn").on("click", function() { - if ($(this).data("tag") == "all") { - $(this).addClass("all-tag-selected"); - $(".filter").removeClass("selected"); - } else { - $(this).toggleClass("selected"); - $("[data-tag='all']").removeClass("all-tag-selected"); - } - - // If no tags are selected then highlight the 'All' tag - - if (!$(".selected")[0]) { - $("[data-tag='all']").addClass("all-tag-selected"); - } - - updateList(); - }); - } - }; - - },{}],4:[function(require,module,exports){ - // Modified from 
https://stackoverflow.com/a/32396543 - window.highlightNavigation = { - navigationListItems: document.querySelectorAll("#pytorch-right-menu li"), - sections: document.querySelectorAll(".pytorch-article .section"), - sectionIdTonavigationLink: {}, - - bind: function() { - if (!sideMenus.displayRightMenu) { - return; - }; - - for (var i = 0; i < highlightNavigation.sections.length; i++) { - var id = highlightNavigation.sections[i].id; - highlightNavigation.sectionIdTonavigationLink[id] = - document.querySelectorAll('#pytorch-right-menu li a[href="#' + id + '"]')[0]; - } - - $(window).scroll(utilities.throttle(highlightNavigation.highlight, 100)); - }, - - highlight: function() { - var rightMenu = document.getElementById("pytorch-right-menu"); - - // If right menu is not on the screen don't bother - if (rightMenu.offsetWidth === 0 && rightMenu.offsetHeight === 0) { - return; - } - - var scrollPosition = utilities.scrollTop(); - var OFFSET_TOP_PADDING = 25; - var offset = document.getElementById("header-holder").offsetHeight + - document.getElementById("pytorch-page-level-bar").offsetHeight + - OFFSET_TOP_PADDING; - - var sections = highlightNavigation.sections; - - for (var i = (sections.length - 1); i >= 0; i--) { - var currentSection = sections[i]; - var sectionTop = utilities.offset(currentSection).top; - - if (scrollPosition >= sectionTop - offset) { - var navigationLink = highlightNavigation.sectionIdTonavigationLink[currentSection.id]; - var navigationListItem = utilities.closest(navigationLink, "li"); - - if (navigationListItem && !navigationListItem.classList.contains("active")) { - for (var i = 0; i < highlightNavigation.navigationListItems.length; i++) { - var el = highlightNavigation.navigationListItems[i]; - if (el.classList.contains("active")) { - el.classList.remove("active"); - } - } - - navigationListItem.classList.add("active"); - - // Scroll to active item. Not a requested feature but we could revive it. Needs work. - - // var menuTop = $("#pytorch-right-menu").position().top; - // var itemTop = navigationListItem.getBoundingClientRect().top; - // var TOP_PADDING = 20 - // var newActiveTop = $("#pytorch-side-scroll-right").scrollTop() + itemTop - menuTop - TOP_PADDING; - - // $("#pytorch-side-scroll-right").animate({ - // scrollTop: newActiveTop - // }, 100); - } - - break; - } - } - } - }; - - },{}],5:[function(require,module,exports){ - window.mainMenuDropdown = { - bind: function() { - $("[data-toggle='ecosystem-dropdown']").on("click", function() { - toggleDropdown($(this).attr("data-toggle")); - }); - - $("[data-toggle='resources-dropdown']").on("click", function() { - toggleDropdown($(this).attr("data-toggle")); - }); - - function toggleDropdown(menuToggle) { - var showMenuClass = "show-menu"; - var menuClass = "." 
+ menuToggle + "-menu"; - - if ($(menuClass).hasClass(showMenuClass)) { - $(menuClass).removeClass(showMenuClass); - } else { - $("[data-toggle=" + menuToggle + "].show-menu").removeClass( - showMenuClass - ); - $(menuClass).addClass(showMenuClass); - } - } - } - }; - - },{}],6:[function(require,module,exports){ - window.mobileMenu = { - bind: function() { - $("[data-behavior='open-mobile-menu']").on('click', function(e) { - e.preventDefault(); - $(".mobile-main-menu").addClass("open"); - $("body").addClass('no-scroll'); - - mobileMenu.listenForResize(); - }); - - $("[data-behavior='close-mobile-menu']").on('click', function(e) { - e.preventDefault(); - mobileMenu.close(); - }); - }, - - listenForResize: function() { - $(window).on('resize.ForMobileMenu', function() { - if ($(this).width() > 768) { - mobileMenu.close(); - } - }); - }, - - close: function() { - $(".mobile-main-menu").removeClass("open"); - $("body").removeClass('no-scroll'); - $(window).off('resize.ForMobileMenu'); - } - }; - - },{}],7:[function(require,module,exports){ - window.mobileTOC = { - bind: function() { - $("[data-behavior='toggle-table-of-contents']").on("click", function(e) { - e.preventDefault(); - - var $parent = $(this).parent(); - - if ($parent.hasClass("is-open")) { - $parent.removeClass("is-open"); - $(".pytorch-left-menu").slideUp(200, function() { - $(this).css({display: ""}); - }); - } else { - $parent.addClass("is-open"); - $(".pytorch-left-menu").slideDown(200); - } - }); - } - } - - },{}],8:[function(require,module,exports){ - window.pytorchAnchors = { - bind: function() { - // Replace Sphinx-generated anchors with anchorjs ones - $(".headerlink").text(""); - - window.anchors.add(".pytorch-article .headerlink"); - - $(".anchorjs-link").each(function() { - var $headerLink = $(this).closest(".headerlink"); - var href = $headerLink.attr("href"); - var clone = this.outerHTML; - - $clone = $(clone).attr("href", href); - $headerLink.before($clone); - $headerLink.remove(); - }); - } - }; - - },{}],9:[function(require,module,exports){ - // Modified from https://stackoverflow.com/a/13067009 - // Going for a JS solution to scrolling to an anchor so we can benefit from - // less hacky css and smooth scrolling. - - window.scrollToAnchor = { - bind: function() { - var document = window.document; - var history = window.history; - var location = window.location - var HISTORY_SUPPORT = !!(history && history.pushState); - - var anchorScrolls = { - ANCHOR_REGEX: /^#[^ ]+$/, - offsetHeightPx: function() { - var OFFSET_HEIGHT_PADDING = 20; - // TODO: this is a little janky. We should try to not rely on JS for this - return utilities.headersHeight() + OFFSET_HEIGHT_PADDING; - }, - - /** - * Establish events, and fix initial scroll position if a hash is provided. - */ - init: function() { - this.scrollToCurrent(); - // This interferes with clicks below it, causing a double fire - // $(window).on('hashchange', $.proxy(this, 'scrollToCurrent')); - $('body').on('click', 'a', $.proxy(this, 'delegateAnchors')); - $('body').on('click', '#pytorch-right-menu li span', $.proxy(this, 'delegateSpans')); - }, - - /** - * Return the offset amount to deduct from the normal scroll position. - * Modify as appropriate to allow for dynamic calculations - */ - getFixedOffset: function() { - return this.offsetHeightPx(); - }, - - /** - * If the provided href is an anchor which resolves to an element on the - * page, scroll to it. - * @param {String} href - * @return {Boolean} - Was the href an anchor. 
- */ - scrollIfAnchor: function(href, pushToHistory) { - var match, anchorOffset; - - if(!this.ANCHOR_REGEX.test(href)) { - return false; - } - - match = document.getElementById(href.slice(1)); - - if(match) { - var anchorOffset = $(match).offset().top - this.getFixedOffset(); - - $('html, body').scrollTop(anchorOffset); - - // Add the state to history as-per normal anchor links - if(HISTORY_SUPPORT && pushToHistory) { - history.pushState({}, document.title, location.pathname + href); - } - } - - return !!match; - }, - - /** - * Attempt to scroll to the current location's hash. - */ - scrollToCurrent: function(e) { - if(this.scrollIfAnchor(window.location.hash) && e) { - e.preventDefault(); - } - }, - - delegateSpans: function(e) { - var elem = utilities.closest(e.target, "a"); - - if(this.scrollIfAnchor(elem.getAttribute('href'), true)) { - e.preventDefault(); - } - }, - - /** - * If the click event's target was an anchor, fix the scroll position. - */ - delegateAnchors: function(e) { - var elem = e.target; - - if(this.scrollIfAnchor(elem.getAttribute('href'), true)) { - e.preventDefault(); - } - } - }; - - $(document).ready($.proxy(anchorScrolls, 'init')); - } - }; - - },{}],10:[function(require,module,exports){ - window.sideMenus = { - rightMenuIsOnScreen: function() { - return document.getElementById("pytorch-content-right").offsetParent !== null; - }, - - isFixedToBottom: false, - - bind: function() { - sideMenus.handleLeftMenu(); - - var rightMenuLinks = document.querySelectorAll("#pytorch-right-menu li"); - var rightMenuHasLinks = rightMenuLinks.length > 1; - - if (!rightMenuHasLinks) { - for (var i = 0; i < rightMenuLinks.length; i++) { - rightMenuLinks[i].style.display = "none"; - } - } - - if (rightMenuHasLinks) { - // Don't show the Shortcuts menu title text unless there are menu items - document.getElementById("pytorch-shortcuts-wrapper").style.display = "block"; - - // We are hiding the titles of the pages in the right side menu but there are a few - // pages that include other pages in the right side menu (see 'torch.nn' in the docs) - // so if we exclude those it looks confusing. 
Here we add a 'title-link' class to these - // links so we can exclude them from normal right side menu link operations - var titleLinks = document.querySelectorAll( - "#pytorch-right-menu #pytorch-side-scroll-right \ - > ul > li > a.reference.internal" - ); - - for (var i = 0; i < titleLinks.length; i++) { - var link = titleLinks[i]; - - link.classList.add("title-link"); - - if ( - link.nextElementSibling && - link.nextElementSibling.tagName === "UL" && - link.nextElementSibling.children.length > 0 - ) { - link.classList.add("has-children"); - } - } - - // Add + expansion signifiers to normal right menu links that have sub menus - var menuLinks = document.querySelectorAll( - "#pytorch-right-menu ul li ul li a.reference.internal" - ); - - for (var i = 0; i < menuLinks.length; i++) { - if ( - menuLinks[i].nextElementSibling && - menuLinks[i].nextElementSibling.tagName === "UL" - ) { - menuLinks[i].classList.add("not-expanded"); - } - } - - // If a hash is present on page load recursively expand menu items leading to selected item - var linkWithHash = - document.querySelector( - "#pytorch-right-menu a[href=\"" + window.location.hash + "\"]" - ); - - if (linkWithHash) { - // Expand immediate sibling list if present - if ( - linkWithHash.nextElementSibling && - linkWithHash.nextElementSibling.tagName === "UL" && - linkWithHash.nextElementSibling.children.length > 0 - ) { - linkWithHash.nextElementSibling.style.display = "block"; - linkWithHash.classList.add("expanded"); - } - - // Expand ancestor lists if any - sideMenus.expandClosestUnexpandedParentList(linkWithHash); - } - - // Bind click events on right menu links - $("#pytorch-right-menu a.reference.internal").on("click", function() { - if (this.classList.contains("expanded")) { - this.nextElementSibling.style.display = "none"; - this.classList.remove("expanded"); - this.classList.add("not-expanded"); - } else if (this.classList.contains("not-expanded")) { - this.nextElementSibling.style.display = "block"; - this.classList.remove("not-expanded"); - this.classList.add("expanded"); - } - }); - - sideMenus.handleRightMenu(); - } - - $(window).on('resize scroll', function(e) { - sideMenus.handleNavBar(); - - sideMenus.handleLeftMenu(); - - if (sideMenus.rightMenuIsOnScreen()) { - sideMenus.handleRightMenu(); - } - }); - }, - - leftMenuIsFixed: function() { - return document.getElementById("pytorch-left-menu").classList.contains("make-fixed"); - }, - - handleNavBar: function() { - var mainHeaderHeight = document.getElementById('header-holder').offsetHeight; - - // If we are scrolled past the main navigation header fix the sub menu bar to top of page - if (utilities.scrollTop() >= mainHeaderHeight) { - document.getElementById("pytorch-left-menu").classList.add("make-fixed"); - document.getElementById("pytorch-page-level-bar").classList.add("left-menu-is-fixed"); - } else { - document.getElementById("pytorch-left-menu").classList.remove("make-fixed"); - document.getElementById("pytorch-page-level-bar").classList.remove("left-menu-is-fixed"); - } - }, - - expandClosestUnexpandedParentList: function (el) { - var closestParentList = utilities.closest(el, "ul"); - - if (closestParentList) { - var closestParentLink = closestParentList.previousElementSibling; - var closestParentLinkExists = closestParentLink && - closestParentLink.tagName === "A" && - closestParentLink.classList.contains("reference"); - - if (closestParentLinkExists) { - // Don't add expansion class to any title links - if (closestParentLink.classList.contains("title-link")) { - 
return; - } - - closestParentList.style.display = "block"; - closestParentLink.classList.remove("not-expanded"); - closestParentLink.classList.add("expanded"); - sideMenus.expandClosestUnexpandedParentList(closestParentLink); - } - } - }, - - handleLeftMenu: function () { - var windowHeight = utilities.windowHeight(); - var topOfFooterRelativeToWindow = document.getElementById("docs-tutorials-resources").getBoundingClientRect().top; - - if (topOfFooterRelativeToWindow >= windowHeight) { - document.getElementById("pytorch-left-menu").style.height = "100%"; - } else { - var howManyPixelsOfTheFooterAreInTheWindow = windowHeight - topOfFooterRelativeToWindow; - var leftMenuDifference = howManyPixelsOfTheFooterAreInTheWindow; - document.getElementById("pytorch-left-menu").style.height = (windowHeight - leftMenuDifference) + "px"; - } - }, - - handleRightMenu: function() { - var rightMenuWrapper = document.getElementById("pytorch-content-right"); - var rightMenu = document.getElementById("pytorch-right-menu"); - var rightMenuList = rightMenu.getElementsByTagName("ul")[0]; - var article = document.getElementById("pytorch-article"); - var articleHeight = article.offsetHeight; - var articleBottom = utilities.offset(article).top + articleHeight; - var mainHeaderHeight = document.getElementById('header-holder').offsetHeight; - - if (utilities.scrollTop() < mainHeaderHeight) { - rightMenuWrapper.style.height = "100%"; - rightMenu.style.top = 0; - rightMenu.classList.remove("scrolling-fixed"); - rightMenu.classList.remove("scrolling-absolute"); - } else { - if (rightMenu.classList.contains("scrolling-fixed")) { - var rightMenuBottom = - utilities.offset(rightMenuList).top + rightMenuList.offsetHeight; - - if (rightMenuBottom >= articleBottom) { - rightMenuWrapper.style.height = articleHeight + mainHeaderHeight + "px"; - rightMenu.style.top = utilities.scrollTop() - mainHeaderHeight + "px"; - rightMenu.classList.add("scrolling-absolute"); - rightMenu.classList.remove("scrolling-fixed"); - } - } else { - rightMenuWrapper.style.height = articleHeight + mainHeaderHeight + "px"; - rightMenu.style.top = - articleBottom - mainHeaderHeight - rightMenuList.offsetHeight + "px"; - rightMenu.classList.add("scrolling-absolute"); - } - - if (utilities.scrollTop() < articleBottom - rightMenuList.offsetHeight) { - rightMenuWrapper.style.height = "100%"; - rightMenu.style.top = ""; - rightMenu.classList.remove("scrolling-absolute"); - rightMenu.classList.add("scrolling-fixed"); - } - } - - var rightMenuSideScroll = document.getElementById("pytorch-side-scroll-right"); - var sideScrollFromWindowTop = rightMenuSideScroll.getBoundingClientRect().top; - - rightMenuSideScroll.style.height = utilities.windowHeight() - sideScrollFromWindowTop + "px"; - } - }; - - },{}],11:[function(require,module,exports){ - var jQuery = (typeof(window) != 'undefined') ? window.jQuery : require('jquery'); - - // Sphinx theme nav state - function ThemeNav () { - - var nav = { - navBar: null, - win: null, - winScroll: false, - winResize: false, - linkScroll: false, - winPosition: 0, - winHeight: null, - docHeight: null, - isRunning: false - }; - - nav.enable = function (withStickyNav) { - var self = this; - - // TODO this can likely be removed once the theme javascript is broken - // out from the RTD assets. This just ensures old projects that are - // calling `enable()` get the sticky menu on by default. All other cals - // to `enable` should include an argument for enabling the sticky menu. 
") - }) - }; - - createTagMenu(); - - // Remove hyphens if they are present in the filter buttons - - $(".tags").each(function(){ - var tags = $(this).text().split(","); - tags.forEach(function(tag, i ) { - tags[i] = tags[i].replace(/-/, ' ') - }) - $(this).html(tags.join(", ")); - }); - - // Remove hyphens if they are present in the card body - - $(".tutorial-filter").each(function(){ - var tag = $(this).text(); - $(this).html(tag.replace(/-/, ' ')) - }) - - // Remove any empty p tags that Sphinx adds - - $("#tutorial-cards p").each(function(index, item) { - if(!$(item).text().trim()) { - $(item).remove(); - } - }); - - // Jump back to top on pagination click - - $(document).on("click", ".page", function() { - $('html, body').animate( - {scrollTop: $("#dropdown-filter-tags").position().top}, - 'slow' - ); - }); - - var link = $("a[href='intermediate/speech_command_recognition_with_torchaudio.html']"); - - if (link.text() == "SyntaxError") { - console.log("There is an issue with the intermediate/speech_command_recognition_with_torchaudio.html menu item."); - link.text("Speech Command Recognition with torchaudio"); - } - - $(".stars-outer > i").hover(function() { - $(this).prevAll().addBack().toggleClass("fas star-fill"); - }); - - $(".stars-outer > i").on("click", function() { - $(this).prevAll().each(function() { - $(this).addBack().addClass("fas star-fill"); - }); - - $(".stars-outer > i").each(function() { - $(this).unbind("mouseenter mouseleave").css({ - "pointer-events": "none" - }); - }); - }) - - $("#pytorch-side-scroll-right li a").on("click", function (e) { - var href = $(this).attr("href"); - $('html, body').stop().animate({ - scrollTop: $(href).offset().top - 100 - }, 850); - e.preventDefault; - }); - - var lastId, - topMenu = $("#pytorch-side-scroll-right"), - topMenuHeight = topMenu.outerHeight() + 1, - // All sidenav items - menuItems = topMenu.find("a"), - // Anchors for menu items - scrollItems = menuItems.map(function () { - var item = $(this).attr("href"); - if (item.length) { - return item; - } - }); - - $(window).scroll(function () { - var fromTop = $(this).scrollTop() + topMenuHeight; - var article = ".section"; - - $(article).each(function (i) { - var offsetScroll = $(this).offset().top - $(window).scrollTop(); - if ( - offsetScroll <= topMenuHeight + 200 && - offsetScroll >= topMenuHeight - 200 && - scrollItems[i] == "#" + $(this).attr("id") && - $(".hidden:visible") - ) { - $(menuItems).removeClass("side-scroll-highlight"); - $(menuItems[i]).addClass("side-scroll-highlight"); - } - }); - }); - - },{"jquery":"jquery"}]},{},[1,2,3,4,5,6,7,8,9,10,11]); - -},{"jquery":"jquery"}]},{},[1,2,3,4,5,6,7,8,9,10,11]); - -},{"jquery":"jquery"}]},{},[1,2,3,4,5,6,7,8,9,10,11]); - -},{"jquery":"jquery"}]},{},[1,2,3,4,5,6,7,8,9,10,11]); +},{"jquery":"jquery"}]},{},[1,2,3,4,5,6,7,8,9,10,"pytorch-sphinx-theme"]); From 66da33630d2cc5de794fe1f823f819c6d42b4cc5 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 5 Apr 2023 16:17:49 +0100 Subject: [PATCH 84/89] amend --- torchrl/modules/tensordict_module/actors.py | 143 +- torchrl/objectives/dqn.py | 2 + torchrl/objectives/sac.py | 2 + torchrl/objectives/value/advantages.py | 8 + torchrl/trainers/trainers.py | 5 +- tutorials/sphinx-tutorials/coding_dqn.py | 1399 ++++++++++--------- 6 files changed, 796 insertions(+), 763 deletions(-) diff --git a/torchrl/modules/tensordict_module/actors.py b/torchrl/modules/tensordict_module/actors.py index 635fc90ca21..7b9b8ef53a1 100644 --- a/torchrl/modules/tensordict_module/actors.py +++ 
b/torchrl/modules/tensordict_module/actors.py @@ -715,7 +715,8 @@ def __init__( class ActorValueOperator(SafeSequential): """Actor-value operator. - This class wraps together an actor and a value model that share a common observation embedding network: + This class wraps together an actor and a value model that share a common + observation embedding network: .. aafig:: :aspect: 60 @@ -723,22 +724,30 @@ class ActorValueOperator(SafeSequential): :proportional: :textual: - +-------------+ - |"Observation"| - +-------------+ - | - v - +--------------+ - |"hidden state"| - +--------------+ - | | | - v | v - actor | critic - | | | - v | v - +--------+|+-------+ - |"action"|||"value"| - +--------+|+-------+ + +---------------+ + |Observation (s)| + +---------------+ + | + v + common + | + v + +------------------+ + | Hidden state | + +------------------+ + | | + v v + actor critic + | | + v v + +-------------+ +------------+ + |Action (a(s))| |Value (V(s))| + +-------------+ +------------+ + + .. note:: + For a similar class that returns an action and a Quality value :math:`Q(s, a)` + see :class:`~.ActorCriticOperator`. For a version without common embeddig + refet to :class:`~.ActorCriticWrapper`. To facilitate the workflow, this class comes with a get_policy_operator() and get_value_operator() methods, which will both return a stand-alone TDModule with the dedicated functionality. @@ -755,17 +764,13 @@ class ActorValueOperator(SafeSequential): >>> import torch >>> from tensordict import TensorDict >>> from torchrl.modules import ProbabilisticActor, SafeModule - >>> from torchrl.data import UnboundedContinuousTensorSpec, BoundedTensorSpec >>> from torchrl.modules import ValueOperator, TanhNormal, ActorValueOperator, NormalParamWrapper - >>> spec_hidden = UnboundedContinuousTensorSpec(4) >>> module_hidden = torch.nn.Linear(4, 4) >>> td_module_hidden = SafeModule( ... module=module_hidden, - ... spec=spec_hidden, ... in_keys=["observation"], ... out_keys=["hidden"], ... ) - >>> spec_action = BoundedTensorSpec(-1, 1, torch.Size([8])) >>> module_action = TensorDictModule( ... NormalParamWrapper(torch.nn.Linear(4, 8)), ... in_keys=["hidden"], @@ -773,7 +778,6 @@ class ActorValueOperator(SafeSequential): ... ) >>> td_module_action = ProbabilisticActor( ... module=module_action, - ... spec=spec_action, ... in_keys=["loc", "scale"], ... out_keys=["action"], ... distribution_class=TanhNormal, @@ -854,7 +858,8 @@ def get_value_operator(self) -> SafeSequential: class ActorCriticOperator(ActorValueOperator): """Actor-critic operator. - This class wraps together an actor and a value model that share a common observation embedding network: + This class wraps together an actor and a value model that share a common + observation embedding network: .. aafig:: :aspect: 60 @@ -862,51 +867,58 @@ class ActorCriticOperator(ActorValueOperator): :proportional: :textual: - +-----------+ - |Observation| - +-----------+ - | - v - actor - | - v - +------+ - |action| --> critic - +------+ | - v - +-----+ - |value| - +-----+ + +---------------+ + |Observation (s)| + +---------------+ + | + v + common + | + v + +------------------+ + | Hidden state | + +------------------+ + | | + v v + actor ------> critic + | | + v v + +-------------+ +----------------+ + |Action (a(s))| |Quality (Q(s,a))| + +-------------+ +----------------+ + + .. note:: + For a similar class that returns an action and a state-value :math:`V(s)` + see :class:`~.ActorValueOperator`. 
+ To facilitate the workflow, this class comes with a get_policy_operator() method, which will both return a stand-alone TDModule with the dedicated functionality. The get_critic_operator will return the parent object, as the value is computed based on the policy output. Args: - common_operator (TensorDictModule): a common operator that reads observations and produces a hidden variable - policy_operator (TensorDictModule): a policy operator that reads the hidden variable and returns an action - value_operator (TensorDictModule): a value operator, that reads the hidden variable and returns a value + common_operator (TensorDictModule): a common operator that reads + observations and produces a hidden variable + policy_operator (TensorDictModule): a policy operator that reads the + hidden variable and returns an action + value_operator (TensorDictModule): a value operator, that reads the + hidden variable and returns a value Examples: >>> import torch >>> from tensordict import TensorDict >>> from torchrl.modules import ProbabilisticActor - >>> from torchrl.data import UnboundedContinuousTensorSpec, BoundedTensorSpec >>> from torchrl.modules import ValueOperator, TanhNormal, ActorCriticOperator, NormalParamWrapper, MLP - >>> spec_hidden = UnboundedContinuousTensorSpec(4) >>> module_hidden = torch.nn.Linear(4, 4) >>> td_module_hidden = SafeModule( ... module=module_hidden, - ... spec=spec_hidden, ... in_keys=["observation"], ... out_keys=["hidden"], ... ) - >>> spec_action = BoundedTensorSpec(-1, 1, torch.Size([8])) >>> module_action = NormalParamWrapper(torch.nn.Linear(4, 8)) >>> module_action = TensorDictModule(module_action, in_keys=["hidden"], out_keys=["loc", "scale"]) >>> td_module_action = ProbabilisticActor( ... module=module_action, - ... spec=spec_action, ... in_keys=["loc", "scale"], ... out_keys=["action"], ... distribution_class=TanhNormal, @@ -964,8 +976,17 @@ class ActorCriticOperator(ActorValueOperator): """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, + common_operator: TensorDictModule, + policy_operator: TensorDictModule, + value_operator: TensorDictModule, + ): + super().__init__( + common_operator, + policy_operator, + value_operator, + ) if self[2].out_keys[0] == "state_value": raise RuntimeError( "Value out_key is state_value, which may lead to errors in downstream usages" @@ -998,17 +1019,18 @@ class ActorCriticWrapper(SafeSequential): :proportional: :textual: - +-----------+ - |Observation| - +-----------+ - | | | - v | v - actor | critic - | | | - v | v - +------+|+-------+ - |action||| value | - +------+|+-------+ + +---------------+ + |Observation (s)| + +---------------+ + | | | + v | v + actor | critic + | | | + v | v + +-------------+|+------------+ + |Action (a(s))|||Value (V(s))| + +-------------+|+------------+ + To facilitate the workflow, this class comes with a get_policy_operator() and get_value_operator() methods, which will both return a stand-alone TDModule with the dedicated functionality. @@ -1021,7 +1043,6 @@ class ActorCriticWrapper(SafeSequential): >>> import torch >>> from tensordict import TensorDict >>> from tensordict.nn import TensorDictModule - >>> from torchrl.data import UnboundedContinuousTensorSpec, BoundedTensorSpec >>> from torchrl.modules import ( ... ActorCriticWrapper, ... ProbabilisticActor, @@ -1029,7 +1050,6 @@ class ActorCriticWrapper(SafeSequential): ... TanhNormal, ... ValueOperator, ... 
) - >>> action_spec = BoundedTensorSpec(-1, 1, torch.Size([8])) >>> action_module = TensorDictModule( ... NormalParamWrapper(torch.nn.Linear(4, 8)), ... in_keys=["observation"], @@ -1037,7 +1057,6 @@ class ActorCriticWrapper(SafeSequential): ... ) >>> td_module_action = ProbabilisticActor( ... module=action_module, - ... spec=action_spec, ... in_keys=["loc", "scale"], ... distribution_class=TanhNormal, ... return_log_prob=True, diff --git a/torchrl/objectives/dqn.py b/torchrl/objectives/dqn.py index e584b894ed7..70957785fa7 100644 --- a/torchrl/objectives/dqn.py +++ b/torchrl/objectives/dqn.py @@ -189,10 +189,12 @@ class DistributionalDQNLoss(LossModule): value_network (DistributionalQValueActor or nn.Module): the distributional Q value operator. gamma (scalar): a discount factor for return computation. + .. note:: Unlike :class:`DQNLoss`, this class does not currently support custom value functions. The next value estimation is always bootstrapped. + delay_value (bool): whether to duplicate the value network into a new target value network to create double DQN """ diff --git a/torchrl/objectives/sac.py b/torchrl/objectives/sac.py index 8177c2f393c..521777fec60 100644 --- a/torchrl/objectives/sac.py +++ b/torchrl/objectives/sac.py @@ -49,9 +49,11 @@ class SACLoss(LossModule): This module typically outputs a ``"state_action_value"`` entry. value_network (TensorDictModule, optional): V(s) parametric model. This module typically outputs a ``"state_value"`` entry. + .. note:: If not provided, the second version of SAC is assumed, where only the Q-Value network is needed. + num_qvalue_nets (integer, optional): number of Q-Value networks used. Defaults to ``2``. priority_key (str, optional): tensordict key where to write the diff --git a/torchrl/objectives/value/advantages.py b/torchrl/objectives/value/advantages.py index 14799118990..e6e42fef55f 100644 --- a/torchrl/objectives/value/advantages.py +++ b/torchrl/objectives/value/advantages.py @@ -132,10 +132,12 @@ class TD0Estimator(ValueEstimatorBase): before the TD is computed. differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: The proper way to make the function call non-differentiable is to decorate it in a `torch.no_grad()` context manager/decorator or pass detached parameters for functional modules. + advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. @@ -319,10 +321,12 @@ class TD1Estimator(ValueEstimatorBase): before the TD is computed. differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: The proper way to make the function call non-differentiable is to decorate it in a `torch.no_grad()` context manager/decorator or pass detached parameters for functional modules. + advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. @@ -506,10 +510,12 @@ class TDLambdaEstimator(ValueEstimatorBase): before the TD is computed. differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. 
note:: The proper way to make the function call non-differentiable is to decorate it in a `torch.no_grad()` context manager/decorator or pass detached parameters for functional modules. + vectorized (bool, optional): whether to use the vectorized version of the lambda return. Default is `True`. advantage_key (str or tuple of str, optional): the key of the advantage entry. @@ -710,10 +716,12 @@ class GAE(ValueEstimatorBase): Default is ``False``. differentiable (bool, optional): if ``True``, gradients are propagated through the computation of the value function. Default is ``False``. + .. note:: The proper way to make the function call non-differentiable is to decorate it in a `torch.no_grad()` context manager/decorator or pass detached parameters for functional modules. + advantage_key (str or tuple of str, optional): the key of the advantage entry. Defaults to "advantage". value_target_key (str or tuple of str, optional): the key of the advantage entry. diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 070679acd52..4a04acd4c98 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -438,7 +438,6 @@ def train(self): for batch in self.collector: batch = self._process_batch_hook(batch) - self._pre_steps_log_hook(batch) current_frames = ( batch.get(("collector", "mask"), torch.tensor(batch.numel())) .sum() @@ -446,6 +445,7 @@ def train(self): * self.frame_skip ) self.collected_frames += current_frames + self._pre_steps_log_hook(batch) if self.collected_frames > self.collector.init_random_frames: self.optim_steps(batch) @@ -506,7 +506,7 @@ def _log(self, log_pbar=False, **kwargs) -> None: collected_frames = self.collected_frames for key, item in kwargs.items(): self._log_dict[key].append(item) - + print(f"collected_frames {collected_frames}, self._last_log.get({key}, 0) {self._last_log.get(key, 0)}, self._log_interval {self._log_interval}") if (collected_frames - self._last_log.get(key, 0)) > self._log_interval: self._last_log[key] = collected_frames _log = True @@ -514,6 +514,7 @@ def _log(self, log_pbar=False, **kwargs) -> None: _log = False method = LOGGER_METHODS.get(key, "log_scalar") if _log and self.logger is not None: + print("logging!", key, self.logger.experiment.log_dir) getattr(self.logger, method)(key, item, step=collected_frames) if method == "log_scalar" and self.progress_bar and log_pbar: if isinstance(item, torch.Tensor): diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 47268647e71..3d0eef5adc9 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -1,710 +1,711 @@ -# -*- coding: utf-8 -*- -""" -TorchRL trainer: A DQN example -============================== -**Author**: `Vincent Moens `_ - -""" - -############################################################################## -# TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle -# your training loop. The trainer executes a nested loop where the outer loop -# is the data collection and the inner loop consumes this data or some data -# retrieved from the replay buffer to train the model. -# At various points in this training loop, hooks can be attached and executed at -# given intervals. -# -# In this tutorial, we will be using the trainer class to train a DQN algorithm -# to solve the CartPole task from scratch. 
-# -# Main takeaways: -# -# - Building a trainer with its essential components: data collector, loss -# module, replay buffer and optimizer. -# - Adding hooks to a trainer, such as loggers, target network updaters and such. -# -# The trainer is fully customisable and offers a large set of functionalities. -# The tutorial is organised around its construction. -# We will be detailing how to build each of the components of the library first, -# and then put the pieces together using the :class:`torchrl.trainers.Trainer` -# class. -# -# Along the road, we will also focus on some other aspects of the library: -# -# - how to build an environment in TorchRL, including transforms (e.g. data -# normalization, frame concatenation, resizing and turning to grayscale) -# and parallel execution. Unlike what we did in the -# `DDPG tutorial `_, we -# will normalize the pixels and not the state vector. -# - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor -# that estimates the action values and picks up the action with the highest -# estimated return; -# - how to collect data from your environment efficiently and store them -# in a replay buffer; -# - how to use multi-step, a simple preprocessing step for off-policy algorithms; -# - and finally how to evaluate your model. -# -# **Prerequisites**: We encourage you to get familiar with torchrl through the -# `PPO tutorial `_ first. -# -# DQN -# --- -# -# DQN (`Deep Q-Learning `_) was -# the founding work in deep reinforcement learning. -# -# On a high level, the algorithm is quite simple: Q-learning consists in -# learning a table of state-action values in such a way that, when -# encountering any particular state, we know which action to pick just by -# searching for the one with the highest value. This simple setting -# requires the actions and states to be -# discrete, otherwise a lookup table cannot be built. -# -# DQN uses a neural network that encodes a map from the state-action space to -# a value (scalar) space, which amortizes the cost of storing and exploring all -# the possible state-action combinations: if a state has not been seen in the -# past, we can still pass it in conjunction with the various actions available -# through our neural network and get an interpolated value for each of the -# actions available. -# -# We will solve the classic control problem of the cart pole. From the -# Gymnasium doc from where this environment is retrieved: -# -# | A pole is attached by an un-actuated joint to a cart, which moves along a -# | frictionless track. The pendulum is placed upright on the cart and the goal -# | is to balance the pole by applying forces in the left and right direction -# | on the cart. -# -# .. figure:: /_static/img/cartpole_demo.gif -# :alt: Cart Pole -# -# We do not aim at giving a SOTA implementation of the algorithm, but rather -# to provide a high-level illustration of TorchRL features in the context -# of this algorithm. 
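###############################################################################
# As a minimal sketch of the idea described above (plain PyTorch, with made-up
# sizes rather than the tutorial's actual model), a Q-network maps an
# observation to one value per action, and the greedy policy simply picks the
# argmax:

import torch
from torch import nn

num_obs, num_actions = 4, 2  # hypothetical CartPole-like dimensions
q_net = nn.Sequential(nn.Linear(num_obs, 64), nn.ReLU(), nn.Linear(64, num_actions))

obs = torch.randn(num_obs)              # a fake observation
action_values = q_net(obs)              # one value per available action
greedy_action = action_values.argmax()  # pick the action with the highest value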
- -# sphinx_gallery_start_ignore -import tempfile -import warnings - -warnings.filterwarnings("ignore") -# sphinx_gallery_end_ignore - -import os -import uuid - -import torch -from torch import nn -from torchrl.collectors import MultiaSyncDataCollector -from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer -from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter -from torchrl.envs.libs.gym import GymEnv -from torchrl.envs.transforms import ( - CatFrames, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, -) -from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor - -from torchrl.objectives import DQNLoss, SoftUpdate -from torchrl.record.loggers.csv import CSVLogger -from torchrl.trainers import ( - LogReward, - Recorder, - ReplayBufferTrainer, - Trainer, - UpdateWeights, -) - - -def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython +if __name__ == "__main__": + # -*- coding: utf-8 -*- + """ + TorchRL trainer: A DQN example + ============================== + **Author**: `Vincent Moens `_ + + """ + + ############################################################################## + # TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle + # your training loop. The trainer executes a nested loop where the outer loop + # is the data collection and the inner loop consumes this data or some data + # retrieved from the replay buffer to train the model. + # At various points in this training loop, hooks can be attached and executed at + # given intervals. + # + # In this tutorial, we will be using the trainer class to train a DQN algorithm + # to solve the CartPole task from scratch. + # + # Main takeaways: + # + # - Building a trainer with its essential components: data collector, loss + # module, replay buffer and optimizer. + # - Adding hooks to a trainer, such as loggers, target network updaters and such. + # + # The trainer is fully customisable and offers a large set of functionalities. + # The tutorial is organised around its construction. + # We will be detailing how to build each of the components of the library first, + # and then put the pieces together using the :class:`torchrl.trainers.Trainer` + # class. + # + # Along the road, we will also focus on some other aspects of the library: + # + # - how to build an environment in TorchRL, including transforms (e.g. data + # normalization, frame concatenation, resizing and turning to grayscale) + # and parallel execution. Unlike what we did in the + # `DDPG tutorial `_, we + # will normalize the pixels and not the state vector. + # - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor + # that estimates the action values and picks up the action with the highest + # estimated return; + # - how to collect data from your environment efficiently and store them + # in a replay buffer; + # - how to use multi-step, a simple preprocessing step for off-policy algorithms; + # - and finally how to evaluate your model. + # + # **Prerequisites**: We encourage you to get familiar with torchrl through the + # `PPO tutorial `_ first. + # + # DQN + # --- + # + # DQN (`Deep Q-Learning `_) was + # the founding work in deep reinforcement learning. 
+ # + # On a high level, the algorithm is quite simple: Q-learning consists in + # learning a table of state-action values in such a way that, when + # encountering any particular state, we know which action to pick just by + # searching for the one with the highest value. This simple setting + # requires the actions and states to be + # discrete, otherwise a lookup table cannot be built. + # + # DQN uses a neural network that encodes a map from the state-action space to + # a value (scalar) space, which amortizes the cost of storing and exploring all + # the possible state-action combinations: if a state has not been seen in the + # past, we can still pass it in conjunction with the various actions available + # through our neural network and get an interpolated value for each of the + # actions available. + # + # We will solve the classic control problem of the cart pole. From the + # Gymnasium doc from where this environment is retrieved: + # + # | A pole is attached by an un-actuated joint to a cart, which moves along a + # | frictionless track. The pendulum is placed upright on the cart and the goal + # | is to balance the pole by applying forces in the left and right direction + # | on the cart. + # + # .. figure:: /_static/img/cartpole_demo.gif + # :alt: Cart Pole + # + # We do not aim at giving a SOTA implementation of the algorithm, but rather + # to provide a high-level illustration of TorchRL features in the context + # of this algorithm. + + # sphinx_gallery_start_ignore + import tempfile + import warnings + + warnings.filterwarnings("ignore") + # sphinx_gallery_end_ignore + + import os + import uuid + + import torch + from torch import nn + from torchrl.collectors import MultiaSyncDataCollector + from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer + from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter + from torchrl.envs.libs.gym import GymEnv + from torchrl.envs.transforms import ( + CatFrames, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, + ) + from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor + + from torchrl.objectives import DQNLoss, SoftUpdate + from torchrl.record.loggers.csv import CSVLogger + from torchrl.trainers import ( + LogReward, + Recorder, + ReplayBufferTrainer, + Trainer, + UpdateWeights, + ) + + + def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython + else: + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + + ############################################################################### + # Let's get started with the various pieces we need for our algorithm: + # + # - An environment; + # - A policy (and related modules that we group under the "model" umbrella); + # - A data collector, which makes the policy play in the environment and + # delivers training data; + # - A replay buffer to store the training data; + # - A loss module, which computes the objective function to train our policy + # to maximise the return; + # - An optimizer, which performs parameter updates based on our loss. + # + # Additional modules include a logger, a recorder (executes the policy in + # "eval" mode) and a target network updater. 
With all these components into + # place, it is easy to see how one could misplace or misuse one component in + # the training script. The trainer is there to orchestrate everything for you! + # + # Building the environment + # ------------------------ + # + # First let's write a helper function that will output an environment. As usual, + # the "raw" environment may be too simple to be used in practice and we'll need + # some data transformation to expose its output to the policy. + # + # We will be using five transforms: + # + # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; + # - :class:`torchrl.envs.transforms.ToTensorImage` will convert a ``[W, H, C]`` uint8 + # tensor in a floating point tensor in the ``[0, 1]`` space with shape + # ``[C, W, H]``; + # - :class:`torchrl.envs.transforms.RewardScaling` to reduce the scale of the return; + # - :class:`torchrl.envs.transforms.GrayScale` will turn our image into grayscale; + # - :class:`torchrl.envs.transforms.Resize` will resize the image in a 64x64 format; + # - :class:`torchrl.envs.transforms.CatFrames` will concatenate an arbitrary number of + # successive frames (``N=4``) in a single tensor along the channel dimension. + # This is useful as a single image does not carry information about the + # motion of the cartpole. Some memory about past observations and actions + # is needed, either via a recurrent neural network or using a stack of + # frames. + # - :class:`torchrl.envs.transforms.ObservationNorm` which will normalize our observations + # given some custom summary statistics. + # + # In practice, our environment builder has two arguments: + # + # - ``parallel``: determines whether multiple environments have to be run in + # parallel. We stack the transforms after the + # :class:`torchrl.envs.ParallelEnv` to take advantage + # of vectorization of the operations on device, although this would + # technically work with every single environment attached to its own set of + # transforms. + # - ``obs_norm_sd`` will contain the normalizing constants for + # the :class:`torchrl.envs.ObservationNorm` transform. + # + + + def make_env( + parallel=False, + obs_norm_sd=None, + ): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) + ), + ) else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter - - -############################################################################### -# Let's get started with the various pieces we need for our algorithm: -# -# - An environment; -# - A policy (and related modules that we group under the "model" umbrella); -# - A data collector, which makes the policy play in the environment and -# delivers training data; -# - A replay buffer to store the training data; -# - A loss module, which computes the objective function to train our policy -# to maximise the return; -# - An optimizer, which performs parameter updates based on our loss. -# -# Additional modules include a logger, a recorder (executes the policy in -# "eval" mode) and a target network updater. With all these components into -# place, it is easy to see how one could misplace or misuse one component in -# the training script. The trainer is there to orchestrate everything for you! 
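###############################################################################
# The sketch below is a deliberately simplified picture of the nested loop the
# trainer runs for us (the real :class:`torchrl.trainers.Trainer` also executes
# the registered hooks at each stage, which is not shown here); it only serves
# to show where the collector, buffer, loss module and optimizer fit together.
# All arguments are placeholders:

def simplified_training_loop(collector, replay_buffer, loss_module, optimizer, n_optim):
    # outer loop: data collection
    for batch in collector:
        replay_buffer.extend(batch.reshape(-1))
        # inner loop: consume the collected (or replayed) data
        for _ in range(n_optim):
            sample = replay_buffer.sample()
            losses = loss_module(sample)
            # the loss module returns a TensorDict; sum the entries whose key
            # starts with "loss"
            loss = sum(value for key, value in losses.items() if key.startswith("loss"))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # sync the collector's copy of the policy weights
        collector.update_policy_weights_()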
-# -# Building the environment -# ------------------------ -# -# First let's write a helper function that will output an environment. As usual, -# the "raw" environment may be too simple to be used in practice and we'll need -# some data transformation to expose its output to the policy. -# -# We will be using five transforms: -# -# - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; -# - :class:`torchrl.envs.transforms.ToTensorImage` will convert a ``[W, H, C]`` uint8 -# tensor in a floating point tensor in the ``[0, 1]`` space with shape -# ``[C, W, H]``; -# - :class:`torchrl.envs.transforms.RewardScaling` to reduce the scale of the return; -# - :class:`torchrl.envs.transforms.GrayScale` will turn our image into grayscale; -# - :class:`torchrl.envs.transforms.Resize` will resize the image in a 64x64 format; -# - :class:`torchrl.envs.transforms.CatFrames` will concatenate an arbitrary number of -# successive frames (``N=4``) in a single tensor along the channel dimension. -# This is useful as a single image does not carry information about the -# motion of the cartpole. Some memory about past observations and actions -# is needed, either via a recurrent neural network or using a stack of -# frames. -# - :class:`torchrl.envs.transforms.ObservationNorm` which will normalize our observations -# given some custom summary statistics. -# -# In practice, our environment builder has two arguments: -# -# - ``parallel``: determines whether multiple environments have to be run in -# parallel. We stack the transforms after the -# :class:`torchrl.envs.ParallelEnv` to take advantage -# of vectorization of the operations on device, although this would -# technically work with every single environment attached to its own set of -# transforms. -# - ``obs_norm_sd`` will contain the normalizing constants for -# the :class:`torchrl.envs.ObservationNorm` transform. -# - - -def make_env( - parallel=False, - obs_norm_sd=None, -): - if obs_norm_sd is None: - obs_norm_sd = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) + base_env = GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) + + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), ), ) - else: - base_env = GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, + return env + + + ############################################################################### + # Compute normalizing constants + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # To normalize images, we don't want to normalize each pixel independently + # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` + # shaped set of normalizing constants (loc and scale parameters). 
+ # We will be using the ``reduce_dim`` argument + # of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which + # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that + # not all dimensions disappear in the process: + # + + + def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) + ) + obs_norm_sd = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). + print("state dict of the observation norm:", obs_norm_sd) + return obs_norm_sd + + + ############################################################################### + # Building the model (Deep Q-network) + # ----------------------------------- + # + # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` + # object which is a simple CNN followed by a two-layer MLP. The only trick used + # here is that the action values (i.e. left and right action value) are + # computed using + # + # .. math:: + # + # \mathbb{v} = b(obs) + v(obs) - \mathbb{E}[v(obs)] + # + # where :math:`\mathbb{v}` is our vector of action values, + # :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a + # :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for + # :math:`n = \# obs` and :math:`m = \# actions`. + # + # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, + # which will read the state-action + # values, pick up the one with the maximum value and write all those results + # in the input :class:`tensordict.TensorDict`. + # + + + def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) + + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) + + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, ) - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **obs_norm_sd), - ), - ) - return env - - -############################################################################### -# Compute normalizing constants -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# To normalize images, we don't want to normalize each pixel independently -# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` -# shaped set of normalizing constants (loc and scale parameters). 
-# We will be using the ``reduce_dim`` argument -# of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which -# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that -# not all dimensions disappear in the process: -# - - -def get_norm_stats(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) + return actor, actor_explore + + + ############################################################################### + # Collecting and storing data + # --------------------------- + # + # Replay buffers + # ~~~~~~~~~~~~~~ + # + # Replay buffers play a central role in off-policy RL algorithms such as DQN. + # They constitute the dataset we will be sampling from during training. + # + # Here, we will use a regular sampling strategy, although a prioritized RB + # could improve the performance significantly. + # + # We place the storage on disk using + # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This + # storage is created in a lazy manner: it will only be instantiated once the + # first batch of data is passed to it. + # + # The only requirement of this storage is that the data passed to it at write + # time must always have the same shape. + + + def get_replay_buffer(buffer_size, n_optim, batch_size): + replay_buffer = TensorDictReplayBuffer( + batch_size=batch_size, + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, + ) + return replay_buffer + + + ############################################################################### + # Data collector + # ~~~~~~~~~~~~~~ + # + # As in `PPO `_ and + # `DDPG `_, we will be using + # a data collector as a dataloader in the outer loop. + # + # We choose the following configuration: we will be running a series of + # parallel environments synchronously in parallel in different collectors, + # themselves running in parallel but asynchronously. + # The advantage of this configuration is that we can balance the amount of + # compute that is executed in batch with what we want to be executed + # asynchronously. We encourage the reader to experiment how the collection + # speed is impacted by modifying the number of collectors (ie the number of + # environment constructors passed to the collector) and the number of + # environment executed in parallel in each collector (controlled by the + # ``num_workers`` hyperparameter). + # + # When building the collector, we can choose on which device we want the + # environment and policy to execute the operations through the ``device`` + # keyword argument. The ``storing_devices`` argument will modify the + # location of the data being collected: if the batches that we are gathering + # have a considerable size, we may want to store them on a different location + # than the device where the computation is happening. For asynchronous data + # collectors such as ours, different storing devices mean that the data that + # we collect won't sit on the same device each time, which is something that + # out training loop must account for. For simplicity, we set the devices to + # the same value for all sub-collectors. 
+ + + def get_collector( + obs_norm_sd, + num_collectors, + actor_explore, + frames_per_batch, + total_frames, + device, + ): + data_collector = MultiaSyncDataCollector( + [ + make_env(parallel=True, obs_norm_sd=obs_norm_sd), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + postproc=MultiStep(gamma=gamma, n_steps=5), + ) + return data_collector + + + ############################################################################### + # Loss function + # ------------- + # + # Building our loss function is straightforward: we only need to provide + # the model and a bunch of hyperparameters to the DQNLoss class. + # + # Target parameters + # ~~~~~~~~~~~~~~~~~ + # + # Many off-policy RL algorithms use the concept of "target parameters" when it + # comes to estimate the value of the next state or state-action pair. + # The target parameters are lagged copies of the model parameters. Because + # their predictions mismatch those of the current model configuration, they + # help learning by putting a pessimistic bound on the value being estimated. + # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous + # in similar algorithms. + # + + + def get_loss_module(actor, gamma): + loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) + target_updater = SoftUpdate(loss_module) + return loss_module, target_updater + + + ############################################################################### + # Hyperparameters + # --------------- + # + # Let's start with our hyperparameters. The following setting should work well + # in practice, and the performance of the algorithm should hopefully not be + # too sensitive to slight variations of these. + + device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" + + ############################################################################### + # Optimizer + # ~~~~~~~~~ + + # the learning rate of the optimizer + lr = 2e-3 + # weight decay + wd = 1e-5 + # the beta parameters of Adam + betas = (0.9, 0.999) + # Optimization steps per batch collected (aka UPD or updates per data) + n_optim = 8 + + ############################################################################### + # DQN parameters + # ~~~~~~~~~~~~~~ + # gamma decay factor + gamma = 0.99 + + ############################################################################### + # Smooth target network update decay parameter. + # This loosely corresponds to a 1/tau interval with hard target network + # update + tau = 0.02 + + ############################################################################### + # Data collection and replay buffer + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # .. note:: + # Values to be used for proper training have been commented. + # + # Total frames collected in the environment. In other implementations, the + # user defines a maximum number of episodes. + # This is harder to do with our data collectors since they return batches + # of N collected frames, where N is a constant. + # However, one can easily get the same restriction on number of episodes by + # breaking the training loop when a certain number + # episodes has been collected. 
+ total_frames = 10_000 # 500000 + + ############################################################################### + # Random frames used to initialize the replay buffer. + init_random_frames = 100 # 1000 + + ############################################################################### + # Frames in each batch collected. + frames_per_batch = 32 # 128 + + ############################################################################### + # Frames sampled from the replay buffer at each optimization step + batch_size = 32 # 256 + + ############################################################################### + # Size of the replay buffer in terms of frames + buffer_size = min(total_frames, 100000) + + ############################################################################### + # Number of environments run in parallel in each data collector + num_workers = 2 # 8 + num_collectors = 2 # 4 + + ############################################################################### + # Environment and exploration + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # + # We set the initial and final value of the epsilon factor in Epsilon-greedy + # exploration. + # Since our policy is deterministic, exploration is crucial: without it, the + # only source of randomness would be the environment reset. + + eps_greedy_val = 0.1 + eps_greedy_val_env = 0.005 + + ############################################################################### + # To speed up learning, we set the bias of the last layer of our value network + # to a predefined value (this is not mandatory) + init_bias = 2.0 + + ############################################################################### + # .. note:: + # For fast rendering of the tutorial ``total_frames`` hyperparameter + # was set to a very low number. To get a reasonable performance, use a greater + # value e.g. 500000 + # + + ############################################################################### + # Building a Trainer + # ------------------ + # + # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the + # following keyword-only arguments: + # + # - ``collector`` + # - ``loss_module`` + # - ``optimizer`` + # - ``logger``: A logger can be + # - ``total_frames``: this parameter defines the lifespan of the trainer. + # - ``frame_skip``: when a frame-skip is used, the collector must be made + # aware of it in order to accurately count the number of frames + # collected etc. Making the trainer aware of this parameter is not + # mandatory but helps to have a fairer comparison between settings where + # the total number of frames (budget) is fixed but the frame-skip is + # variable. + + stats = get_norm_stats() + test_env = make_env(parallel=False, obs_norm_sd=stats) + # Get model + actor, actor_explore = make_model(test_env) + loss_module, target_net_updater = get_loss_module(actor, gamma) + target_net_updater.init_() + + collector = get_collector( + stats, num_collectors, actor_explore, frames_per_batch, total_frames, device ) - obs_norm_sd = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). 
- print("state dict of the observation norm:", obs_norm_sd) - return obs_norm_sd - - -############################################################################### -# Building the model (Deep Q-network) -# ----------------------------------- -# -# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` -# object which is a simple CNN followed by a two-layer MLP. The only trick used -# here is that the action values (i.e. left and right action value) are -# computed using -# -# .. math:: -# -# \mathbb{v} = b(obs) + v(obs) - \mathbb{E}[v(obs)] -# -# where :math:`\mathbb{v}` is our vector of action values, -# :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a -# :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for -# :math:`n = \# obs` and :math:`m = \# actions`. -# -# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, -# which will read the state-action -# values, pick up the one with the maximum value and write all those results -# in the input :class:`tensordict.TensorDict`. -# - - -def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) - - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. - tensordict = dummy_env.fake_tensordict() - actor(tensordict) - - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, + optimizer = torch.optim.Adam( + loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas ) + exp_name = f"dqn_exp_{uuid.uuid1()}" + tmpdir = tempfile.TemporaryDirectory() + logger = CSVLogger(exp_name=exp_name, log_dir=tmpdir.name) + warnings.warn(f"log dir: {logger.experiment.log_dir}") - return actor, actor_explore - - -############################################################################### -# Collecting and storing data -# --------------------------- -# -# Replay buffers -# ~~~~~~~~~~~~~~ -# -# Replay buffers play a central role in off-policy RL algorithms such as DQN. -# They constitute the dataset we will be sampling from during training. -# -# Here, we will use a regular sampling strategy, although a prioritized RB -# could improve the performance significantly. -# -# We place the storage on disk using -# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This -# storage is created in a lazy manner: it will only be instantiated once the -# first batch of data is passed to it. -# -# The only requirement of this storage is that the data passed to it at write -# time must always have the same shape. 
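###############################################################################
# As a small usage sketch (with made-up data; in this tutorial the trainer will
# do this for us through its hooks), the buffer is written to and sampled from
# with TensorDicts of a fixed structure:

import torch
from tensordict import TensorDict
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

rb = TensorDictReplayBuffer(batch_size=8, storage=LazyMemmapStorage(1000))
fake_transitions = TensorDict(
    {"observation": torch.randn(16, 4), "action": torch.randint(2, (16, 1))},
    batch_size=[16],
)
rb.extend(fake_transitions)  # the memmap storage is instantiated on this first write
sample = rb.sample()         # a TensorDict with batch_size [8]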
- - -def get_replay_buffer(buffer_size, n_optim, batch_size): - replay_buffer = TensorDictReplayBuffer( - batch_size=batch_size, - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer - - -############################################################################### -# Data collector -# ~~~~~~~~~~~~~~ -# -# As in `PPO `_ and -# `DDPG `_, we will be using -# a data collector as a dataloader in the outer loop. -# -# We choose the following configuration: we will be running a series of -# parallel environments synchronously in parallel in different collectors, -# themselves running in parallel but asynchronously. -# The advantage of this configuration is that we can balance the amount of -# compute that is executed in batch with what we want to be executed -# asynchronously. We encourage the reader to experiment how the collection -# speed is impacted by modifying the number of collectors (ie the number of -# environment constructors passed to the collector) and the number of -# environment executed in parallel in each collector (controlled by the -# ``num_workers`` hyperparameter). -# -# When building the collector, we can choose on which device we want the -# environment and policy to execute the operations through the ``device`` -# keyword argument. The ``storing_devices`` argument will modify the -# location of the data being collected: if the batches that we are gathering -# have a considerable size, we may want to store them on a different location -# than the device where the computation is happening. For asynchronous data -# collectors such as ours, different storing devices mean that the data that -# we collect won't sit on the same device each time, which is something that -# out training loop must account for. For simplicity, we set the devices to -# the same value for all sub-collectors. - - -def get_collector( - obs_norm_sd, - num_collectors, - actor_explore, - frames_per_batch, - total_frames, - device, -): - data_collector = MultiaSyncDataCollector( - [ - make_env(parallel=True, obs_norm_sd=obs_norm_sd), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, + ############################################################################### + # We can control how often the scalars should be logged. Here we set this + # to a low value as our training loop is short: + + log_interval = 500 + + trainer = Trainer( + collector=collector, total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices - device=device, - storing_device=device, - split_trajs=False, - postproc=MultiStep(gamma=gamma, n_steps=5), + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=logger, + optim_steps_per_batch=n_optim, + log_interval=log_interval, ) - return data_collector - - -############################################################################### -# Loss function -# ------------- -# -# Building our loss function is straightforward: we only need to provide -# the model and a bunch of hyperparameters to the DQNLoss class. -# -# Target parameters -# ~~~~~~~~~~~~~~~~~ -# -# Many off-policy RL algorithms use the concept of "target parameters" when it -# comes to estimate the value of the next state or state-action pair. -# The target parameters are lagged copies of the model parameters. 
Because -# their predictions mismatch those of the current model configuration, they -# help learning by putting a pessimistic bound on the value being estimated. -# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous -# in similar algorithms. -# - - -def get_loss_module(actor, gamma): - loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) - target_updater = SoftUpdate(loss_module) - return loss_module, target_updater - - -############################################################################### -# Hyperparameters -# --------------- -# -# Let's start with our hyperparameters. The following setting should work well -# in practice, and the performance of the algorithm should hopefully not be -# too sensitive to slight variations of these. - -device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - -############################################################################### -# Optimizer -# ~~~~~~~~~ - -# the learning rate of the optimizer -lr = 2e-3 -# weight decay -wd = 1e-5 -# the beta parameters of Adam -betas = (0.9, 0.999) -# Optimization steps per batch collected (aka UPD or updates per data) -n_optim = 8 - -############################################################################### -# DQN parameters -# ~~~~~~~~~~~~~~ -# gamma decay factor -gamma = 0.99 - -############################################################################### -# Smooth target network update decay parameter. -# This loosely corresponds to a 1/tau interval with hard target network -# update -tau = 0.02 - -############################################################################### -# Data collection and replay buffer -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. note:: -# Values to be used for proper training have been commented. -# -# Total frames collected in the environment. In other implementations, the -# user defines a maximum number of episodes. -# This is harder to do with our data collectors since they return batches -# of N collected frames, where N is a constant. -# However, one can easily get the same restriction on number of episodes by -# breaking the training loop when a certain number -# episodes has been collected. -total_frames = 10_000 # 500000 - -############################################################################### -# Random frames used to initialize the replay buffer. -init_random_frames = 100 # 1000 - -############################################################################### -# Frames in each batch collected. -frames_per_batch = 32 # 128 - -############################################################################### -# Frames sampled from the replay buffer at each optimization step -batch_size = 32 # 256 - -############################################################################### -# Size of the replay buffer in terms of frames -buffer_size = min(total_frames, 100000) - -############################################################################### -# Number of environments run in parallel in each data collector -num_workers = 2 # 8 -num_collectors = 2 # 4 - -############################################################################### -# Environment and exploration -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We set the initial and final value of the epsilon factor in Epsilon-greedy -# exploration. -# Since our policy is deterministic, exploration is crucial: without it, the -# only source of randomness would be the environment reset. 
- -eps_greedy_val = 0.1 -eps_greedy_val_env = 0.005 - -############################################################################### -# To speed up learning, we set the bias of the last layer of our value network -# to a predefined value (this is not mandatory) -init_bias = 2.0 - -############################################################################### -# .. note:: -# For fast rendering of the tutorial ``total_frames`` hyperparameter -# was set to a very low number. To get a reasonable performance, use a greater -# value e.g. 500000 -# - -############################################################################### -# Building a Trainer -# ------------------ -# -# TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the -# following keyword-only arguments: -# -# - ``collector`` -# - ``loss_module`` -# - ``optimizer`` -# - ``logger``: A logger can be -# - ``total_frames``: this parameter defines the lifespan of the trainer. -# - ``frame_skip``: when a frame-skip is used, the collector must be made -# aware of it in order to accurately count the number of frames -# collected etc. Making the trainer aware of this parameter is not -# mandatory but helps to have a fairer comparison between settings where -# the total number of frames (budget) is fixed but the frame-skip is -# variable. - -stats = get_norm_stats() -test_env = make_env(parallel=False, obs_norm_sd=stats) -# Get model -actor, actor_explore = make_model(test_env) -loss_module, target_net_updater = get_loss_module(actor, gamma) -target_net_updater.init_() - -collector = get_collector( - stats, num_collectors, actor_explore, frames_per_batch, total_frames, device -) -optimizer = torch.optim.Adam( - loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas -) -exp_name = f"dqn_exp_{uuid.uuid1()}" -tmpdir = tempfile.TemporaryDirectory() -logger = CSVLogger(exp_name=exp_name, log_dir=tmpdir.name) -warnings.warn(f"log dir: {logger.experiment.log_dir}") - -############################################################################### -# We can control how often the scalars should be logged. Here we set this -# to a low value as our training loop is short: - -log_interval = 500 - -trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=logger, - optim_steps_per_batch=n_optim, - log_interval=log_interval, -) - -############################################################################### -# Registering hooks -# ~~~~~~~~~~~~~~~~~ -# -# Registering hooks can be achieved in two separate ways: -# -# - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` -# method is the first choice. One just needs to provide the trainer as input -# and the hook will be registered with a default name at a default location. -# For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` -# requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which -# can be cumbersome to implement. 
-buffer_hook = ReplayBufferTrainer( - get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), - flatten_tensordicts=True, -) -buffer_hook.register(trainer) -weight_updater = UpdateWeights(collector, update_weights_interval=1) -weight_updater.register(trainer) -recorder = Recorder( - record_interval=100, # log every 100 optimization steps - record_frames=1000, # maximum number of frames in the record - frame_skip=1, - policy_exploration=actor_explore, - environment=test_env, - exploration_mode="mode", - log_keys=[("next", "reward")], - out_keys={("next", "reward"): "rewards"}, - log_pbar=True, -) -recorder.register(trainer) - -############################################################################### -# - Any callable (including :class:`torchrl.trainers.TrainerHookBase` -# subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. -# In this case, a location must be explicitly passed (). This method gives -# more control over the location of the hook but it also requires more -# understanding of the Trainer mechanism. -# Check the `trainer documentation `_ -# for a detailed description of the trainer hooks. -# -trainer.register_op("post_optim", target_net_updater.step) - -############################################################################### -# We can log the training rewards too. Note that this is of limited interest -# with CartPole, as rewards are always 1. The discounted sum of rewards is -# maximised not by getting higher rewards but by keeping the cart-pole alive -# for longer. -# This will be reflected by the `total_rewards` value displayed in the -# progress bar. -# -log_reward = LogReward(log_pbar=True) -log_reward.register(trainer) - -############################################################################### -# .. note:: -# It is possible to link multiple optimizers to the trainer if needed. -# In this case, each optimizer will be tied to a field in the loss -# dictionary. -# Check the :class:`torchrl.trainers.OptimizerHook` to learn more. -# -# Here we are, ready to train our algorithm! A simple call to -# ``trainer.train()`` and we'll be getting our results logged in. -# -trainer.train() - -############################################################################### -# We can now quickly check the CSVs with the results. - - -def print_csv_files_in_folder(folder_path): - """ - Find all CSV files in a folder and return the first 10 lines of each file as a string. - - Args: - folder_path (str): The relative path to the folder. - Returns: - str: A string containing the first 10 lines of each CSV file in the folder. 
- """ - csv_files = [] - output_str = "" - for file in os.listdir(folder_path): - if file.endswith(".csv"): - csv_files.append(os.path.join(folder_path, file)) - for csv_file in csv_files: - output_str += f"File: {csv_file}\n" - with open(csv_file, "r") as f: - for i, line in enumerate(f): - if i == 10: - break - output_str += line.strip() + "\n" - output_str += "\n" - return output_str - - -print_csv_files_in_folder(logger.experiment.log_dir) - -############################################################################### -# Conclusion and possible improvements -# ------------------------------------ -# -# In this tutorial we have learned: -# -# - How to write a Trainer, including building its components and registering -# them in the trainer; -# - How to code a DQN algorithm, including how to create a policy that picks -# up the action with the highest value with -# :class:`torchrl.modules.QValueNetwork`; -# - How to build a multiprocessed data collector; -# -# Possible improvements to this tutorial could include: -# -# - A prioritized replay buffer could also be used. This will give a -# higher priority to samples that have the worst value accuracy. -# Learn more on the -# `replay buffer section `_ -# of the documentation. -# - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` -# for more information). -# - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. + ############################################################################### + # Registering hooks + # ~~~~~~~~~~~~~~~~~ + # + # Registering hooks can be achieved in two separate ways: + # + # - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` + # method is the first choice. One just needs to provide the trainer as input + # and the hook will be registered with a default name at a default location. + # For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` + # requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which + # can be cumbersome to implement. + buffer_hook = ReplayBufferTrainer( + get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), + flatten_tensordicts=True, + ) + buffer_hook.register(trainer) + weight_updater = UpdateWeights(collector, update_weights_interval=1) + weight_updater.register(trainer) + recorder = Recorder( + record_interval=100, # log every 100 optimization steps + record_frames=1000, # maximum number of frames in the record + frame_skip=1, + policy_exploration=actor_explore, + environment=test_env, + exploration_mode="mode", + log_keys=[("next", "reward")], + out_keys={("next", "reward"): "rewards"}, + log_pbar=True, + ) + recorder.register(trainer) + + ############################################################################### + # - Any callable (including :class:`torchrl.trainers.TrainerHookBase` + # subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. + # In this case, a location must be explicitly passed (). This method gives + # more control over the location of the hook but it also requires more + # understanding of the Trainer mechanism. + # Check the `trainer documentation `_ + # for a detailed description of the trainer hooks. + # + trainer.register_op("post_optim", target_net_updater.step) + + ############################################################################### + # We can log the training rewards too. Note that this is of limited interest + # with CartPole, as rewards are always 1. 
The discounted sum of rewards is + # maximised not by getting higher rewards but by keeping the cart-pole alive + # for longer. + # This will be reflected by the `total_rewards` value displayed in the + # progress bar. + # + log_reward = LogReward(log_pbar=True) + log_reward.register(trainer) + + ############################################################################### + # .. note:: + # It is possible to link multiple optimizers to the trainer if needed. + # In this case, each optimizer will be tied to a field in the loss + # dictionary. + # Check the :class:`torchrl.trainers.OptimizerHook` to learn more. + # + # Here we are, ready to train our algorithm! A simple call to + # ``trainer.train()`` and we'll be getting our results logged in. + # + trainer.train() + + ############################################################################### + # We can now quickly check the CSVs with the results. + + + def print_csv_files_in_folder(folder_path): + """ + Find all CSV files in a folder and return the first 10 lines of each file as a string. + + Args: + folder_path (str): The relative path to the folder. + + Returns: + str: A string containing the first 10 lines of each CSV file in the folder. + """ + csv_files = [] + output_str = "" + for file in os.listdir(folder_path): + if file.endswith(".csv"): + csv_files.append(os.path.join(folder_path, file)) + for csv_file in csv_files: + output_str += f"File: {csv_file}\n" + with open(csv_file, "r") as f: + for i, line in enumerate(f): + if i == 10: + break + output_str += line.strip() + "\n" + output_str += "\n" + return output_str + + + print_csv_files_in_folder(logger.experiment.log_dir) + + ############################################################################### + # Conclusion and possible improvements + # ------------------------------------ + # + # In this tutorial we have learned: + # + # - How to write a Trainer, including building its components and registering + # them in the trainer; + # - How to code a DQN algorithm, including how to create a policy that picks + # up the action with the highest value with + # :class:`torchrl.modules.QValueNetwork`; + # - How to build a multiprocessed data collector; + # + # Possible improvements to this tutorial could include: + # + # - A prioritized replay buffer could also be used. This will give a + # higher priority to samples that have the worst value accuracy. + # Learn more on the + # `replay buffer section `_ + # of the documentation. + # - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` + # for more information). + # - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. 
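###############################################################################
# As a hedged sketch of the first suggestion above (reusing the same order of
# magnitude as the ``buffer_size`` and ``batch_size`` hyperparameters of the
# tutorial, and assuming the default ``"td_error"`` priority entry that the DQN
# loss can populate), the regular buffer could be swapped for a prioritized one:

from torchrl.data import LazyMemmapStorage, TensorDictPrioritizedReplayBuffer

prioritized_rb = TensorDictPrioritizedReplayBuffer(
    alpha=0.7,                # how strongly priorities skew the sampling
    beta=0.5,                 # importance-sampling correction exponent
    storage=LazyMemmapStorage(100_000),
    batch_size=32,
    priority_key="td_error",  # entry used to update the sample priorities
)
# Such a buffer should be usable in place of the regular one when building the
# :class:`torchrl.trainers.ReplayBufferTrainer` hook.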
From 89e7b1b156180ba1598ee475904fb9df189b2b1c Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 5 Apr 2023 16:44:58 +0100 Subject: [PATCH 85/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 1399 +++++++++++----------- 1 file changed, 699 insertions(+), 700 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 3d0eef5adc9..47268647e71 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -1,711 +1,710 @@ -if __name__ == "__main__": - # -*- coding: utf-8 -*- - """ - TorchRL trainer: A DQN example - ============================== - **Author**: `Vincent Moens `_ - - """ - - ############################################################################## - # TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle - # your training loop. The trainer executes a nested loop where the outer loop - # is the data collection and the inner loop consumes this data or some data - # retrieved from the replay buffer to train the model. - # At various points in this training loop, hooks can be attached and executed at - # given intervals. - # - # In this tutorial, we will be using the trainer class to train a DQN algorithm - # to solve the CartPole task from scratch. - # - # Main takeaways: - # - # - Building a trainer with its essential components: data collector, loss - # module, replay buffer and optimizer. - # - Adding hooks to a trainer, such as loggers, target network updaters and such. - # - # The trainer is fully customisable and offers a large set of functionalities. - # The tutorial is organised around its construction. - # We will be detailing how to build each of the components of the library first, - # and then put the pieces together using the :class:`torchrl.trainers.Trainer` - # class. - # - # Along the road, we will also focus on some other aspects of the library: - # - # - how to build an environment in TorchRL, including transforms (e.g. data - # normalization, frame concatenation, resizing and turning to grayscale) - # and parallel execution. Unlike what we did in the - # `DDPG tutorial `_, we - # will normalize the pixels and not the state vector. - # - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor - # that estimates the action values and picks up the action with the highest - # estimated return; - # - how to collect data from your environment efficiently and store them - # in a replay buffer; - # - how to use multi-step, a simple preprocessing step for off-policy algorithms; - # - and finally how to evaluate your model. - # - # **Prerequisites**: We encourage you to get familiar with torchrl through the - # `PPO tutorial `_ first. - # - # DQN - # --- - # - # DQN (`Deep Q-Learning `_) was - # the founding work in deep reinforcement learning. - # - # On a high level, the algorithm is quite simple: Q-learning consists in - # learning a table of state-action values in such a way that, when - # encountering any particular state, we know which action to pick just by - # searching for the one with the highest value. This simple setting - # requires the actions and states to be - # discrete, otherwise a lookup table cannot be built. 
- # - # DQN uses a neural network that encodes a map from the state-action space to - # a value (scalar) space, which amortizes the cost of storing and exploring all - # the possible state-action combinations: if a state has not been seen in the - # past, we can still pass it in conjunction with the various actions available - # through our neural network and get an interpolated value for each of the - # actions available. - # - # We will solve the classic control problem of the cart pole. From the - # Gymnasium doc from where this environment is retrieved: - # - # | A pole is attached by an un-actuated joint to a cart, which moves along a - # | frictionless track. The pendulum is placed upright on the cart and the goal - # | is to balance the pole by applying forces in the left and right direction - # | on the cart. - # - # .. figure:: /_static/img/cartpole_demo.gif - # :alt: Cart Pole - # - # We do not aim at giving a SOTA implementation of the algorithm, but rather - # to provide a high-level illustration of TorchRL features in the context - # of this algorithm. - - # sphinx_gallery_start_ignore - import tempfile - import warnings - - warnings.filterwarnings("ignore") - # sphinx_gallery_end_ignore - - import os - import uuid - - import torch - from torch import nn - from torchrl.collectors import MultiaSyncDataCollector - from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer - from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter - from torchrl.envs.libs.gym import GymEnv - from torchrl.envs.transforms import ( - CatFrames, - Compose, - GrayScale, - ObservationNorm, - Resize, - ToTensorImage, - TransformedEnv, - ) - from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor - - from torchrl.objectives import DQNLoss, SoftUpdate - from torchrl.record.loggers.csv import CSVLogger - from torchrl.trainers import ( - LogReward, - Recorder, - ReplayBufferTrainer, - Trainer, - UpdateWeights, - ) - - - def is_notebook() -> bool: - try: - shell = get_ipython().__class__.__name__ - if shell == "ZMQInteractiveShell": - return True # Jupyter notebook or qtconsole - elif shell == "TerminalInteractiveShell": - return False # Terminal running IPython - else: - return False # Other type (?) - except NameError: - return False # Probably standard Python interpreter - - - ############################################################################### - # Let's get started with the various pieces we need for our algorithm: - # - # - An environment; - # - A policy (and related modules that we group under the "model" umbrella); - # - A data collector, which makes the policy play in the environment and - # delivers training data; - # - A replay buffer to store the training data; - # - A loss module, which computes the objective function to train our policy - # to maximise the return; - # - An optimizer, which performs parameter updates based on our loss. - # - # Additional modules include a logger, a recorder (executes the policy in - # "eval" mode) and a target network updater. With all these components into - # place, it is easy to see how one could misplace or misuse one component in - # the training script. The trainer is there to orchestrate everything for you! - # - # Building the environment - # ------------------------ - # - # First let's write a helper function that will output an environment. 
As usual, - # the "raw" environment may be too simple to be used in practice and we'll need - # some data transformation to expose its output to the policy. - # - # We will be using five transforms: - # - # - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; - # - :class:`torchrl.envs.transforms.ToTensorImage` will convert a ``[W, H, C]`` uint8 - # tensor in a floating point tensor in the ``[0, 1]`` space with shape - # ``[C, W, H]``; - # - :class:`torchrl.envs.transforms.RewardScaling` to reduce the scale of the return; - # - :class:`torchrl.envs.transforms.GrayScale` will turn our image into grayscale; - # - :class:`torchrl.envs.transforms.Resize` will resize the image in a 64x64 format; - # - :class:`torchrl.envs.transforms.CatFrames` will concatenate an arbitrary number of - # successive frames (``N=4``) in a single tensor along the channel dimension. - # This is useful as a single image does not carry information about the - # motion of the cartpole. Some memory about past observations and actions - # is needed, either via a recurrent neural network or using a stack of - # frames. - # - :class:`torchrl.envs.transforms.ObservationNorm` which will normalize our observations - # given some custom summary statistics. - # - # In practice, our environment builder has two arguments: - # - # - ``parallel``: determines whether multiple environments have to be run in - # parallel. We stack the transforms after the - # :class:`torchrl.envs.ParallelEnv` to take advantage - # of vectorization of the operations on device, although this would - # technically work with every single environment attached to its own set of - # transforms. - # - ``obs_norm_sd`` will contain the normalizing constants for - # the :class:`torchrl.envs.ObservationNorm` transform. - # - - - def make_env( - parallel=False, - obs_norm_sd=None, - ): - if obs_norm_sd is None: - obs_norm_sd = {"standard_normal": True} - if parallel: - base_env = ParallelEnv( - num_workers, - EnvCreator( - lambda: GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) - ), - ) +# -*- coding: utf-8 -*- +""" +TorchRL trainer: A DQN example +============================== +**Author**: `Vincent Moens `_ + +""" + +############################################################################## +# TorchRL provides a generic :class:`torchrl.trainers.Trainer` class to handle +# your training loop. The trainer executes a nested loop where the outer loop +# is the data collection and the inner loop consumes this data or some data +# retrieved from the replay buffer to train the model. +# At various points in this training loop, hooks can be attached and executed at +# given intervals. +# +# In this tutorial, we will be using the trainer class to train a DQN algorithm +# to solve the CartPole task from scratch. +# +# Main takeaways: +# +# - Building a trainer with its essential components: data collector, loss +# module, replay buffer and optimizer. +# - Adding hooks to a trainer, such as loggers, target network updaters and such. +# +# The trainer is fully customisable and offers a large set of functionalities. +# The tutorial is organised around its construction. +# We will be detailing how to build each of the components of the library first, +# and then put the pieces together using the :class:`torchrl.trainers.Trainer` +# class. +# +# Along the road, we will also focus on some other aspects of the library: +# +# - how to build an environment in TorchRL, including transforms (e.g. 
data +# normalization, frame concatenation, resizing and turning to grayscale) +# and parallel execution. Unlike what we did in the +# `DDPG tutorial `_, we +# will normalize the pixels and not the state vector. +# - how to design a :class:`torchrl.modules.QValueActor` object, i.e. an actor +# that estimates the action values and picks up the action with the highest +# estimated return; +# - how to collect data from your environment efficiently and store them +# in a replay buffer; +# - how to use multi-step, a simple preprocessing step for off-policy algorithms; +# - and finally how to evaluate your model. +# +# **Prerequisites**: We encourage you to get familiar with torchrl through the +# `PPO tutorial `_ first. +# +# DQN +# --- +# +# DQN (`Deep Q-Learning `_) was +# the founding work in deep reinforcement learning. +# +# On a high level, the algorithm is quite simple: Q-learning consists in +# learning a table of state-action values in such a way that, when +# encountering any particular state, we know which action to pick just by +# searching for the one with the highest value. This simple setting +# requires the actions and states to be +# discrete, otherwise a lookup table cannot be built. +# +# DQN uses a neural network that encodes a map from the state-action space to +# a value (scalar) space, which amortizes the cost of storing and exploring all +# the possible state-action combinations: if a state has not been seen in the +# past, we can still pass it in conjunction with the various actions available +# through our neural network and get an interpolated value for each of the +# actions available. +# +# We will solve the classic control problem of the cart pole. From the +# Gymnasium doc from where this environment is retrieved: +# +# | A pole is attached by an un-actuated joint to a cart, which moves along a +# | frictionless track. The pendulum is placed upright on the cart and the goal +# | is to balance the pole by applying forces in the left and right direction +# | on the cart. +# +# .. figure:: /_static/img/cartpole_demo.gif +# :alt: Cart Pole +# +# We do not aim at giving a SOTA implementation of the algorithm, but rather +# to provide a high-level illustration of TorchRL features in the context +# of this algorithm. 
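+
+###############################################################################
+# To make the action-selection rule above concrete, here is a minimal,
+# self-contained sketch of the greedy step DQN relies on. The ``toy_q_net``
+# below is a stand-in used for illustration only, not the model built later
+# in this tutorial:
+#
+# .. code-block:: python
+#
+#     import torch
+#     from torch import nn
+#
+#     n_obs, n_actions = 4, 2
+#     toy_q_net = nn.Linear(n_obs, n_actions)  # one value per discrete action
+#     obs = torch.randn(n_obs)
+#     action_values = toy_q_net(obs)     # estimated return of each action
+#     action = action_values.argmax(-1)  # greedy choice: highest estimated value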
+ +# sphinx_gallery_start_ignore +import tempfile +import warnings + +warnings.filterwarnings("ignore") +# sphinx_gallery_end_ignore + +import os +import uuid + +import torch +from torch import nn +from torchrl.collectors import MultiaSyncDataCollector +from torchrl.data import LazyMemmapStorage, MultiStep, TensorDictReplayBuffer +from torchrl.envs import EnvCreator, ParallelEnv, RewardScaling, StepCounter +from torchrl.envs.libs.gym import GymEnv +from torchrl.envs.transforms import ( + CatFrames, + Compose, + GrayScale, + ObservationNorm, + Resize, + ToTensorImage, + TransformedEnv, +) +from torchrl.modules import DuelingCnnDQNet, EGreedyWrapper, QValueActor + +from torchrl.objectives import DQNLoss, SoftUpdate +from torchrl.record.loggers.csv import CSVLogger +from torchrl.trainers import ( + LogReward, + Recorder, + ReplayBufferTrainer, + Trainer, + UpdateWeights, +) + + +def is_notebook() -> bool: + try: + shell = get_ipython().__class__.__name__ + if shell == "ZMQInteractiveShell": + return True # Jupyter notebook or qtconsole + elif shell == "TerminalInteractiveShell": + return False # Terminal running IPython else: - base_env = GymEnv( - "CartPole-v1", - from_pixels=True, - pixels_only=True, - device=device, - ) - - env = TransformedEnv( - base_env, - Compose( - StepCounter(), # to count the steps of each trajectory - ToTensorImage(), - RewardScaling(loc=0.0, scale=0.1), - GrayScale(), - Resize(64, 64), - CatFrames(4, in_keys=["pixels"], dim=-3), - ObservationNorm(in_keys=["pixels"], **obs_norm_sd), + return False # Other type (?) + except NameError: + return False # Probably standard Python interpreter + + +############################################################################### +# Let's get started with the various pieces we need for our algorithm: +# +# - An environment; +# - A policy (and related modules that we group under the "model" umbrella); +# - A data collector, which makes the policy play in the environment and +# delivers training data; +# - A replay buffer to store the training data; +# - A loss module, which computes the objective function to train our policy +# to maximise the return; +# - An optimizer, which performs parameter updates based on our loss. +# +# Additional modules include a logger, a recorder (executes the policy in +# "eval" mode) and a target network updater. With all these components into +# place, it is easy to see how one could misplace or misuse one component in +# the training script. The trainer is there to orchestrate everything for you! +# +# Building the environment +# ------------------------ +# +# First let's write a helper function that will output an environment. As usual, +# the "raw" environment may be too simple to be used in practice and we'll need +# some data transformation to expose its output to the policy. 
+# +# We will be using five transforms: +# +# - :class:`torchrl.envs.StepCounter` to count the number of steps in each trajectory; +# - :class:`torchrl.envs.transforms.ToTensorImage` will convert a ``[W, H, C]`` uint8 +# tensor in a floating point tensor in the ``[0, 1]`` space with shape +# ``[C, W, H]``; +# - :class:`torchrl.envs.transforms.RewardScaling` to reduce the scale of the return; +# - :class:`torchrl.envs.transforms.GrayScale` will turn our image into grayscale; +# - :class:`torchrl.envs.transforms.Resize` will resize the image in a 64x64 format; +# - :class:`torchrl.envs.transforms.CatFrames` will concatenate an arbitrary number of +# successive frames (``N=4``) in a single tensor along the channel dimension. +# This is useful as a single image does not carry information about the +# motion of the cartpole. Some memory about past observations and actions +# is needed, either via a recurrent neural network or using a stack of +# frames. +# - :class:`torchrl.envs.transforms.ObservationNorm` which will normalize our observations +# given some custom summary statistics. +# +# In practice, our environment builder has two arguments: +# +# - ``parallel``: determines whether multiple environments have to be run in +# parallel. We stack the transforms after the +# :class:`torchrl.envs.ParallelEnv` to take advantage +# of vectorization of the operations on device, although this would +# technically work with every single environment attached to its own set of +# transforms. +# - ``obs_norm_sd`` will contain the normalizing constants for +# the :class:`torchrl.envs.ObservationNorm` transform. +# + + +def make_env( + parallel=False, + obs_norm_sd=None, +): + if obs_norm_sd is None: + obs_norm_sd = {"standard_normal": True} + if parallel: + base_env = ParallelEnv( + num_workers, + EnvCreator( + lambda: GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, + device=device, + ) ), ) - return env - - - ############################################################################### - # Compute normalizing constants - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # To normalize images, we don't want to normalize each pixel independently - # with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` - # shaped set of normalizing constants (loc and scale parameters). - # We will be using the ``reduce_dim`` argument - # of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which - # dimensions must be reduced, and the ``keep_dims`` parameter to ensure that - # not all dimensions disappear in the process: - # - - - def get_norm_stats(): - test_env = make_env() - test_env.transform[-1].init_stats( - num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) - ) - obs_norm_sd = test_env.transform[-1].state_dict() - # let's check that normalizing constants have a size of ``[C, 1, 1]`` where - # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). - print("state dict of the observation norm:", obs_norm_sd) - return obs_norm_sd - - - ############################################################################### - # Building the model (Deep Q-network) - # ----------------------------------- - # - # The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` - # object which is a simple CNN followed by a two-layer MLP. The only trick used - # here is that the action values (i.e. left and right action value) are - # computed using - # - # .. 
math:: - # - # \mathbb{v} = b(obs) + v(obs) - \mathbb{E}[v(obs)] - # - # where :math:`\mathbb{v}` is our vector of action values, - # :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a - # :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for - # :math:`n = \# obs` and :math:`m = \# actions`. - # - # Our network is wrapped in a :class:`torchrl.modules.QValueActor`, - # which will read the state-action - # values, pick up the one with the maximum value and write all those results - # in the input :class:`tensordict.TensorDict`. - # - - - def make_model(dummy_env): - cnn_kwargs = { - "num_cells": [32, 64, 64], - "kernel_sizes": [6, 4, 3], - "strides": [2, 2, 1], - "activation_class": nn.ELU, - # This can be used to reduce the size of the last layer of the CNN - # "squeeze_output": True, - # "aggregator_class": nn.AdaptiveAvgPool2d, - # "aggregator_kwargs": {"output_size": (1, 1)}, - } - mlp_kwargs = { - "depth": 2, - "num_cells": [ - 64, - 64, - ], - "activation_class": nn.ELU, - } - net = DuelingCnnDQNet( - dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs - ).to(device) - net.value[-1].bias.data.fill_(init_bias) - - actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) - # init actor: because the model is composed of lazy conv/linear layers, - # we must pass a fake batch of data through it to instantiate them. - tensordict = dummy_env.fake_tensordict() - actor(tensordict) - - # we wrap our actor in an EGreedyWrapper for data collection - actor_explore = EGreedyWrapper( - actor, - annealing_num_steps=total_frames, - eps_init=eps_greedy_val, - eps_end=eps_greedy_val_env, - ) - - return actor, actor_explore - - - ############################################################################### - # Collecting and storing data - # --------------------------- - # - # Replay buffers - # ~~~~~~~~~~~~~~ - # - # Replay buffers play a central role in off-policy RL algorithms such as DQN. - # They constitute the dataset we will be sampling from during training. - # - # Here, we will use a regular sampling strategy, although a prioritized RB - # could improve the performance significantly. - # - # We place the storage on disk using - # :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. This - # storage is created in a lazy manner: it will only be instantiated once the - # first batch of data is passed to it. - # - # The only requirement of this storage is that the data passed to it at write - # time must always have the same shape. - - - def get_replay_buffer(buffer_size, n_optim, batch_size): - replay_buffer = TensorDictReplayBuffer( - batch_size=batch_size, - storage=LazyMemmapStorage(buffer_size), - prefetch=n_optim, - ) - return replay_buffer - - - ############################################################################### - # Data collector - # ~~~~~~~~~~~~~~ - # - # As in `PPO `_ and - # `DDPG `_, we will be using - # a data collector as a dataloader in the outer loop. - # - # We choose the following configuration: we will be running a series of - # parallel environments synchronously in parallel in different collectors, - # themselves running in parallel but asynchronously. - # The advantage of this configuration is that we can balance the amount of - # compute that is executed in batch with what we want to be executed - # asynchronously. 
We encourage the reader to experiment how the collection - # speed is impacted by modifying the number of collectors (ie the number of - # environment constructors passed to the collector) and the number of - # environment executed in parallel in each collector (controlled by the - # ``num_workers`` hyperparameter). - # - # When building the collector, we can choose on which device we want the - # environment and policy to execute the operations through the ``device`` - # keyword argument. The ``storing_devices`` argument will modify the - # location of the data being collected: if the batches that we are gathering - # have a considerable size, we may want to store them on a different location - # than the device where the computation is happening. For asynchronous data - # collectors such as ours, different storing devices mean that the data that - # we collect won't sit on the same device each time, which is something that - # out training loop must account for. For simplicity, we set the devices to - # the same value for all sub-collectors. - - - def get_collector( - obs_norm_sd, - num_collectors, - actor_explore, - frames_per_batch, - total_frames, - device, - ): - data_collector = MultiaSyncDataCollector( - [ - make_env(parallel=True, obs_norm_sd=obs_norm_sd), - ] - * num_collectors, - policy=actor_explore, - frames_per_batch=frames_per_batch, - total_frames=total_frames, - # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode - exploration_mode="random", - # We set the all the devices to be identical. Below is an example of - # heterogeneous devices + else: + base_env = GymEnv( + "CartPole-v1", + from_pixels=True, + pixels_only=True, device=device, - storing_device=device, - split_trajs=False, - postproc=MultiStep(gamma=gamma, n_steps=5), ) - return data_collector - - - ############################################################################### - # Loss function - # ------------- - # - # Building our loss function is straightforward: we only need to provide - # the model and a bunch of hyperparameters to the DQNLoss class. - # - # Target parameters - # ~~~~~~~~~~~~~~~~~ - # - # Many off-policy RL algorithms use the concept of "target parameters" when it - # comes to estimate the value of the next state or state-action pair. - # The target parameters are lagged copies of the model parameters. Because - # their predictions mismatch those of the current model configuration, they - # help learning by putting a pessimistic bound on the value being estimated. - # This is a powerful trick (known as "Double Q-Learning") that is ubiquitous - # in similar algorithms. - # - - - def get_loss_module(actor, gamma): - loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) - target_updater = SoftUpdate(loss_module) - return loss_module, target_updater - - - ############################################################################### - # Hyperparameters - # --------------- - # - # Let's start with our hyperparameters. The following setting should work well - # in practice, and the performance of the algorithm should hopefully not be - # too sensitive to slight variations of these. 
- - device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" - - ############################################################################### - # Optimizer - # ~~~~~~~~~ - - # the learning rate of the optimizer - lr = 2e-3 - # weight decay - wd = 1e-5 - # the beta parameters of Adam - betas = (0.9, 0.999) - # Optimization steps per batch collected (aka UPD or updates per data) - n_optim = 8 - - ############################################################################### - # DQN parameters - # ~~~~~~~~~~~~~~ - # gamma decay factor - gamma = 0.99 - - ############################################################################### - # Smooth target network update decay parameter. - # This loosely corresponds to a 1/tau interval with hard target network - # update - tau = 0.02 - - ############################################################################### - # Data collection and replay buffer - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # .. note:: - # Values to be used for proper training have been commented. - # - # Total frames collected in the environment. In other implementations, the - # user defines a maximum number of episodes. - # This is harder to do with our data collectors since they return batches - # of N collected frames, where N is a constant. - # However, one can easily get the same restriction on number of episodes by - # breaking the training loop when a certain number - # episodes has been collected. - total_frames = 10_000 # 500000 - - ############################################################################### - # Random frames used to initialize the replay buffer. - init_random_frames = 100 # 1000 - - ############################################################################### - # Frames in each batch collected. - frames_per_batch = 32 # 128 - - ############################################################################### - # Frames sampled from the replay buffer at each optimization step - batch_size = 32 # 256 - - ############################################################################### - # Size of the replay buffer in terms of frames - buffer_size = min(total_frames, 100000) - - ############################################################################### - # Number of environments run in parallel in each data collector - num_workers = 2 # 8 - num_collectors = 2 # 4 - - ############################################################################### - # Environment and exploration - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # - # We set the initial and final value of the epsilon factor in Epsilon-greedy - # exploration. - # Since our policy is deterministic, exploration is crucial: without it, the - # only source of randomness would be the environment reset. - - eps_greedy_val = 0.1 - eps_greedy_val_env = 0.005 - - ############################################################################### - # To speed up learning, we set the bias of the last layer of our value network - # to a predefined value (this is not mandatory) - init_bias = 2.0 - - ############################################################################### - # .. note:: - # For fast rendering of the tutorial ``total_frames`` hyperparameter - # was set to a very low number. To get a reasonable performance, use a greater - # value e.g. 
500000 - # - - ############################################################################### - # Building a Trainer - # ------------------ - # - # TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the - # following keyword-only arguments: - # - # - ``collector`` - # - ``loss_module`` - # - ``optimizer`` - # - ``logger``: A logger can be - # - ``total_frames``: this parameter defines the lifespan of the trainer. - # - ``frame_skip``: when a frame-skip is used, the collector must be made - # aware of it in order to accurately count the number of frames - # collected etc. Making the trainer aware of this parameter is not - # mandatory but helps to have a fairer comparison between settings where - # the total number of frames (budget) is fixed but the frame-skip is - # variable. - - stats = get_norm_stats() - test_env = make_env(parallel=False, obs_norm_sd=stats) - # Get model - actor, actor_explore = make_model(test_env) - loss_module, target_net_updater = get_loss_module(actor, gamma) - target_net_updater.init_() - - collector = get_collector( - stats, num_collectors, actor_explore, frames_per_batch, total_frames, device + + env = TransformedEnv( + base_env, + Compose( + StepCounter(), # to count the steps of each trajectory + ToTensorImage(), + RewardScaling(loc=0.0, scale=0.1), + GrayScale(), + Resize(64, 64), + CatFrames(4, in_keys=["pixels"], dim=-3), + ObservationNorm(in_keys=["pixels"], **obs_norm_sd), + ), ) - optimizer = torch.optim.Adam( - loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas + return env + + +############################################################################### +# Compute normalizing constants +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# To normalize images, we don't want to normalize each pixel independently +# with a full ``[C, W, H]`` normalizing mask, but with simpler ``[C, 1, 1]`` +# shaped set of normalizing constants (loc and scale parameters). +# We will be using the ``reduce_dim`` argument +# of :meth:`torchrl.envs.ObservationNorm.init_stats` to instruct which +# dimensions must be reduced, and the ``keep_dims`` parameter to ensure that +# not all dimensions disappear in the process: +# + + +def get_norm_stats(): + test_env = make_env() + test_env.transform[-1].init_stats( + num_iter=1000, cat_dim=0, reduce_dim=[-1, -2, -4], keep_dims=(-1, -2) ) - exp_name = f"dqn_exp_{uuid.uuid1()}" - tmpdir = tempfile.TemporaryDirectory() - logger = CSVLogger(exp_name=exp_name, log_dir=tmpdir.name) - warnings.warn(f"log dir: {logger.experiment.log_dir}") - - ############################################################################### - # We can control how often the scalars should be logged. Here we set this - # to a low value as our training loop is short: - - log_interval = 500 - - trainer = Trainer( - collector=collector, - total_frames=total_frames, - frame_skip=1, - loss_module=loss_module, - optimizer=optimizer, - logger=logger, - optim_steps_per_batch=n_optim, - log_interval=log_interval, + obs_norm_sd = test_env.transform[-1].state_dict() + # let's check that normalizing constants have a size of ``[C, 1, 1]`` where + # ``C=4`` (because of :class:`torchrl.envs.CatFrames`). 
+ print("state dict of the observation norm:", obs_norm_sd) + return obs_norm_sd + + +############################################################################### +# Building the model (Deep Q-network) +# ----------------------------------- +# +# The following function builds a :class:`torchrl.modules.DuelingCnnDQNet` +# object which is a simple CNN followed by a two-layer MLP. The only trick used +# here is that the action values (i.e. left and right action value) are +# computed using +# +# .. math:: +# +# \mathbb{v} = b(obs) + v(obs) - \mathbb{E}[v(obs)] +# +# where :math:`\mathbb{v}` is our vector of action values, +# :math:`b` is a :math:`\mathbb{R}^n \rightarrow 1` function and :math:`v` is a +# :math:`\mathbb{R}^n \rightarrow \mathbb{R}^m` function, for +# :math:`n = \# obs` and :math:`m = \# actions`. +# +# Our network is wrapped in a :class:`torchrl.modules.QValueActor`, +# which will read the state-action +# values, pick up the one with the maximum value and write all those results +# in the input :class:`tensordict.TensorDict`. +# + + +def make_model(dummy_env): + cnn_kwargs = { + "num_cells": [32, 64, 64], + "kernel_sizes": [6, 4, 3], + "strides": [2, 2, 1], + "activation_class": nn.ELU, + # This can be used to reduce the size of the last layer of the CNN + # "squeeze_output": True, + # "aggregator_class": nn.AdaptiveAvgPool2d, + # "aggregator_kwargs": {"output_size": (1, 1)}, + } + mlp_kwargs = { + "depth": 2, + "num_cells": [ + 64, + 64, + ], + "activation_class": nn.ELU, + } + net = DuelingCnnDQNet( + dummy_env.action_spec.shape[-1], 1, cnn_kwargs, mlp_kwargs + ).to(device) + net.value[-1].bias.data.fill_(init_bias) + + actor = QValueActor(net, in_keys=["pixels"], spec=dummy_env.action_spec).to(device) + # init actor: because the model is composed of lazy conv/linear layers, + # we must pass a fake batch of data through it to instantiate them. + tensordict = dummy_env.fake_tensordict() + actor(tensordict) + + # we wrap our actor in an EGreedyWrapper for data collection + actor_explore = EGreedyWrapper( + actor, + annealing_num_steps=total_frames, + eps_init=eps_greedy_val, + eps_end=eps_greedy_val_env, ) - ############################################################################### - # Registering hooks - # ~~~~~~~~~~~~~~~~~ - # - # Registering hooks can be achieved in two separate ways: - # - # - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` - # method is the first choice. One just needs to provide the trainer as input - # and the hook will be registered with a default name at a default location. - # For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` - # requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which - # can be cumbersome to implement. - buffer_hook = ReplayBufferTrainer( - get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), - flatten_tensordicts=True, + return actor, actor_explore + + +############################################################################### +# Collecting and storing data +# --------------------------- +# +# Replay buffers +# ~~~~~~~~~~~~~~ +# +# Replay buffers play a central role in off-policy RL algorithms such as DQN. +# They constitute the dataset we will be sampling from during training. +# +# Here, we will use a regular sampling strategy, although a prioritized RB +# could improve the performance significantly. +# +# We place the storage on disk using +# :class:`torchrl.data.replay_buffers.storages.LazyMemmapStorage` class. 
This +# storage is created in a lazy manner: it will only be instantiated once the +# first batch of data is passed to it. +# +# The only requirement of this storage is that the data passed to it at write +# time must always have the same shape. + + +def get_replay_buffer(buffer_size, n_optim, batch_size): + replay_buffer = TensorDictReplayBuffer( + batch_size=batch_size, + storage=LazyMemmapStorage(buffer_size), + prefetch=n_optim, ) - buffer_hook.register(trainer) - weight_updater = UpdateWeights(collector, update_weights_interval=1) - weight_updater.register(trainer) - recorder = Recorder( - record_interval=100, # log every 100 optimization steps - record_frames=1000, # maximum number of frames in the record - frame_skip=1, - policy_exploration=actor_explore, - environment=test_env, - exploration_mode="mode", - log_keys=[("next", "reward")], - out_keys={("next", "reward"): "rewards"}, - log_pbar=True, + return replay_buffer + + +############################################################################### +# Data collector +# ~~~~~~~~~~~~~~ +# +# As in `PPO `_ and +# `DDPG `_, we will be using +# a data collector as a dataloader in the outer loop. +# +# We choose the following configuration: we will be running a series of +# parallel environments synchronously in parallel in different collectors, +# themselves running in parallel but asynchronously. +# The advantage of this configuration is that we can balance the amount of +# compute that is executed in batch with what we want to be executed +# asynchronously. We encourage the reader to experiment how the collection +# speed is impacted by modifying the number of collectors (ie the number of +# environment constructors passed to the collector) and the number of +# environment executed in parallel in each collector (controlled by the +# ``num_workers`` hyperparameter). +# +# When building the collector, we can choose on which device we want the +# environment and policy to execute the operations through the ``device`` +# keyword argument. The ``storing_devices`` argument will modify the +# location of the data being collected: if the batches that we are gathering +# have a considerable size, we may want to store them on a different location +# than the device where the computation is happening. For asynchronous data +# collectors such as ours, different storing devices mean that the data that +# we collect won't sit on the same device each time, which is something that +# out training loop must account for. For simplicity, we set the devices to +# the same value for all sub-collectors. + + +def get_collector( + obs_norm_sd, + num_collectors, + actor_explore, + frames_per_batch, + total_frames, + device, +): + data_collector = MultiaSyncDataCollector( + [ + make_env(parallel=True, obs_norm_sd=obs_norm_sd), + ] + * num_collectors, + policy=actor_explore, + frames_per_batch=frames_per_batch, + total_frames=total_frames, + # this is the default behaviour: the collector runs in ``"random"`` (or explorative) mode + exploration_mode="random", + # We set the all the devices to be identical. Below is an example of + # heterogeneous devices + device=device, + storing_device=device, + split_trajs=False, + postproc=MultiStep(gamma=gamma, n_steps=5), ) - recorder.register(trainer) - - ############################################################################### - # - Any callable (including :class:`torchrl.trainers.TrainerHookBase` - # subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. 
- # In this case, a location must be explicitly passed (). This method gives - # more control over the location of the hook but it also requires more - # understanding of the Trainer mechanism. - # Check the `trainer documentation `_ - # for a detailed description of the trainer hooks. - # - trainer.register_op("post_optim", target_net_updater.step) - - ############################################################################### - # We can log the training rewards too. Note that this is of limited interest - # with CartPole, as rewards are always 1. The discounted sum of rewards is - # maximised not by getting higher rewards but by keeping the cart-pole alive - # for longer. - # This will be reflected by the `total_rewards` value displayed in the - # progress bar. - # - log_reward = LogReward(log_pbar=True) - log_reward.register(trainer) - - ############################################################################### - # .. note:: - # It is possible to link multiple optimizers to the trainer if needed. - # In this case, each optimizer will be tied to a field in the loss - # dictionary. - # Check the :class:`torchrl.trainers.OptimizerHook` to learn more. - # - # Here we are, ready to train our algorithm! A simple call to - # ``trainer.train()`` and we'll be getting our results logged in. - # - trainer.train() - - ############################################################################### - # We can now quickly check the CSVs with the results. - - - def print_csv_files_in_folder(folder_path): - """ - Find all CSV files in a folder and return the first 10 lines of each file as a string. - - Args: - folder_path (str): The relative path to the folder. - - Returns: - str: A string containing the first 10 lines of each CSV file in the folder. - """ - csv_files = [] - output_str = "" - for file in os.listdir(folder_path): - if file.endswith(".csv"): - csv_files.append(os.path.join(folder_path, file)) - for csv_file in csv_files: - output_str += f"File: {csv_file}\n" - with open(csv_file, "r") as f: - for i, line in enumerate(f): - if i == 10: - break - output_str += line.strip() + "\n" - output_str += "\n" - return output_str - - - print_csv_files_in_folder(logger.experiment.log_dir) - - ############################################################################### - # Conclusion and possible improvements - # ------------------------------------ - # - # In this tutorial we have learned: - # - # - How to write a Trainer, including building its components and registering - # them in the trainer; - # - How to code a DQN algorithm, including how to create a policy that picks - # up the action with the highest value with - # :class:`torchrl.modules.QValueNetwork`; - # - How to build a multiprocessed data collector; - # - # Possible improvements to this tutorial could include: - # - # - A prioritized replay buffer could also be used. This will give a - # higher priority to samples that have the worst value accuracy. - # Learn more on the - # `replay buffer section `_ - # of the documentation. - # - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` - # for more information). - # - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. + return data_collector + + +############################################################################### +# Loss function +# ------------- +# +# Building our loss function is straightforward: we only need to provide +# the model and a bunch of hyperparameters to the DQNLoss class. 
+# +# Target parameters +# ~~~~~~~~~~~~~~~~~ +# +# Many off-policy RL algorithms use the concept of "target parameters" when it +# comes to estimate the value of the next state or state-action pair. +# The target parameters are lagged copies of the model parameters. Because +# their predictions mismatch those of the current model configuration, they +# help learning by putting a pessimistic bound on the value being estimated. +# This is a powerful trick (known as "Double Q-Learning") that is ubiquitous +# in similar algorithms. +# + + +def get_loss_module(actor, gamma): + loss_module = DQNLoss(actor, gamma=gamma, delay_value=True) + target_updater = SoftUpdate(loss_module) + return loss_module, target_updater + + +############################################################################### +# Hyperparameters +# --------------- +# +# Let's start with our hyperparameters. The following setting should work well +# in practice, and the performance of the algorithm should hopefully not be +# too sensitive to slight variations of these. + +device = "cuda:0" if torch.cuda.device_count() > 0 else "cpu" + +############################################################################### +# Optimizer +# ~~~~~~~~~ + +# the learning rate of the optimizer +lr = 2e-3 +# weight decay +wd = 1e-5 +# the beta parameters of Adam +betas = (0.9, 0.999) +# Optimization steps per batch collected (aka UPD or updates per data) +n_optim = 8 + +############################################################################### +# DQN parameters +# ~~~~~~~~~~~~~~ +# gamma decay factor +gamma = 0.99 + +############################################################################### +# Smooth target network update decay parameter. +# This loosely corresponds to a 1/tau interval with hard target network +# update +tau = 0.02 + +############################################################################### +# Data collection and replay buffer +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# .. note:: +# Values to be used for proper training have been commented. +# +# Total frames collected in the environment. In other implementations, the +# user defines a maximum number of episodes. +# This is harder to do with our data collectors since they return batches +# of N collected frames, where N is a constant. +# However, one can easily get the same restriction on number of episodes by +# breaking the training loop when a certain number +# episodes has been collected. +total_frames = 10_000 # 500000 + +############################################################################### +# Random frames used to initialize the replay buffer. +init_random_frames = 100 # 1000 + +############################################################################### +# Frames in each batch collected. 
+frames_per_batch = 32 # 128 + +############################################################################### +# Frames sampled from the replay buffer at each optimization step +batch_size = 32 # 256 + +############################################################################### +# Size of the replay buffer in terms of frames +buffer_size = min(total_frames, 100000) + +############################################################################### +# Number of environments run in parallel in each data collector +num_workers = 2 # 8 +num_collectors = 2 # 4 + +############################################################################### +# Environment and exploration +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# We set the initial and final value of the epsilon factor in Epsilon-greedy +# exploration. +# Since our policy is deterministic, exploration is crucial: without it, the +# only source of randomness would be the environment reset. + +eps_greedy_val = 0.1 +eps_greedy_val_env = 0.005 + +############################################################################### +# To speed up learning, we set the bias of the last layer of our value network +# to a predefined value (this is not mandatory) +init_bias = 2.0 + +############################################################################### +# .. note:: +# For fast rendering of the tutorial ``total_frames`` hyperparameter +# was set to a very low number. To get a reasonable performance, use a greater +# value e.g. 500000 +# + +############################################################################### +# Building a Trainer +# ------------------ +# +# TorchRL's :class:`torchrl.trainers.Trainer` class constructor takes the +# following keyword-only arguments: +# +# - ``collector`` +# - ``loss_module`` +# - ``optimizer`` +# - ``logger``: A logger can be +# - ``total_frames``: this parameter defines the lifespan of the trainer. +# - ``frame_skip``: when a frame-skip is used, the collector must be made +# aware of it in order to accurately count the number of frames +# collected etc. Making the trainer aware of this parameter is not +# mandatory but helps to have a fairer comparison between settings where +# the total number of frames (budget) is fixed but the frame-skip is +# variable. + +stats = get_norm_stats() +test_env = make_env(parallel=False, obs_norm_sd=stats) +# Get model +actor, actor_explore = make_model(test_env) +loss_module, target_net_updater = get_loss_module(actor, gamma) +target_net_updater.init_() + +collector = get_collector( + stats, num_collectors, actor_explore, frames_per_batch, total_frames, device +) +optimizer = torch.optim.Adam( + loss_module.parameters(), lr=lr, weight_decay=wd, betas=betas +) +exp_name = f"dqn_exp_{uuid.uuid1()}" +tmpdir = tempfile.TemporaryDirectory() +logger = CSVLogger(exp_name=exp_name, log_dir=tmpdir.name) +warnings.warn(f"log dir: {logger.experiment.log_dir}") + +############################################################################### +# We can control how often the scalars should be logged. 
Here we set this +# to a low value as our training loop is short: + +log_interval = 500 + +trainer = Trainer( + collector=collector, + total_frames=total_frames, + frame_skip=1, + loss_module=loss_module, + optimizer=optimizer, + logger=logger, + optim_steps_per_batch=n_optim, + log_interval=log_interval, +) + +############################################################################### +# Registering hooks +# ~~~~~~~~~~~~~~~~~ +# +# Registering hooks can be achieved in two separate ways: +# +# - If the hook has it, the :meth:`torchrl.trainers.TrainerHookBase.register` +# method is the first choice. One just needs to provide the trainer as input +# and the hook will be registered with a default name at a default location. +# For some hooks, the registration can be quite complex: :class:`torchrl.trainers.ReplayBufferTrainer` +# requires 3 hooks (``extend``, ``sample`` and ``update_priority``) which +# can be cumbersome to implement. +buffer_hook = ReplayBufferTrainer( + get_replay_buffer(buffer_size, n_optim, batch_size=batch_size), + flatten_tensordicts=True, +) +buffer_hook.register(trainer) +weight_updater = UpdateWeights(collector, update_weights_interval=1) +weight_updater.register(trainer) +recorder = Recorder( + record_interval=100, # log every 100 optimization steps + record_frames=1000, # maximum number of frames in the record + frame_skip=1, + policy_exploration=actor_explore, + environment=test_env, + exploration_mode="mode", + log_keys=[("next", "reward")], + out_keys={("next", "reward"): "rewards"}, + log_pbar=True, +) +recorder.register(trainer) + +############################################################################### +# - Any callable (including :class:`torchrl.trainers.TrainerHookBase` +# subclasses) can be registered using :meth:`torchrl.trainers.Trainer.register_op`. +# In this case, a location must be explicitly passed (). This method gives +# more control over the location of the hook but it also requires more +# understanding of the Trainer mechanism. +# Check the `trainer documentation `_ +# for a detailed description of the trainer hooks. +# +trainer.register_op("post_optim", target_net_updater.step) + +############################################################################### +# We can log the training rewards too. Note that this is of limited interest +# with CartPole, as rewards are always 1. The discounted sum of rewards is +# maximised not by getting higher rewards but by keeping the cart-pole alive +# for longer. +# This will be reflected by the `total_rewards` value displayed in the +# progress bar. +# +log_reward = LogReward(log_pbar=True) +log_reward.register(trainer) + +############################################################################### +# .. note:: +# It is possible to link multiple optimizers to the trainer if needed. +# In this case, each optimizer will be tied to a field in the loss +# dictionary. +# Check the :class:`torchrl.trainers.OptimizerHook` to learn more. +# +# Here we are, ready to train our algorithm! A simple call to +# ``trainer.train()`` and we'll be getting our results logged in. +# +trainer.train() + +############################################################################### +# We can now quickly check the CSVs with the results. + + +def print_csv_files_in_folder(folder_path): + """ + Find all CSV files in a folder and return the first 10 lines of each file as a string. + + Args: + folder_path (str): The relative path to the folder. 
+ + Returns: + str: A string containing the first 10 lines of each CSV file in the folder. + """ + csv_files = [] + output_str = "" + for file in os.listdir(folder_path): + if file.endswith(".csv"): + csv_files.append(os.path.join(folder_path, file)) + for csv_file in csv_files: + output_str += f"File: {csv_file}\n" + with open(csv_file, "r") as f: + for i, line in enumerate(f): + if i == 10: + break + output_str += line.strip() + "\n" + output_str += "\n" + return output_str + + +print_csv_files_in_folder(logger.experiment.log_dir) + +############################################################################### +# Conclusion and possible improvements +# ------------------------------------ +# +# In this tutorial we have learned: +# +# - How to write a Trainer, including building its components and registering +# them in the trainer; +# - How to code a DQN algorithm, including how to create a policy that picks +# up the action with the highest value with +# :class:`torchrl.modules.QValueNetwork`; +# - How to build a multiprocessed data collector; +# +# Possible improvements to this tutorial could include: +# +# - A prioritized replay buffer could also be used. This will give a +# higher priority to samples that have the worst value accuracy. +# Learn more on the +# `replay buffer section `_ +# of the documentation. +# - A distributional loss (see :class:`torchrl.objectives.DistributionalDQNLoss` +# for more information). +# - More fancy exploration techniques, such as :class:`torchrl.modules.NoisyLinear` layers and such. From 7d65ca46500d4d5a53b4aef3b8ad181e8c99bba4 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 5 Apr 2023 17:36:43 +0100 Subject: [PATCH 86/89] remove prints --- torchrl/trainers/trainers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torchrl/trainers/trainers.py b/torchrl/trainers/trainers.py index 4a04acd4c98..69f33b796de 100644 --- a/torchrl/trainers/trainers.py +++ b/torchrl/trainers/trainers.py @@ -506,7 +506,6 @@ def _log(self, log_pbar=False, **kwargs) -> None: collected_frames = self.collected_frames for key, item in kwargs.items(): self._log_dict[key].append(item) - print(f"collected_frames {collected_frames}, self._last_log.get({key}, 0) {self._last_log.get(key, 0)}, self._log_interval {self._log_interval}") if (collected_frames - self._last_log.get(key, 0)) > self._log_interval: self._last_log[key] = collected_frames _log = True @@ -514,7 +513,6 @@ def _log(self, log_pbar=False, **kwargs) -> None: _log = False method = LOGGER_METHODS.get(key, "log_scalar") if _log and self.logger is not None: - print("logging!", key, self.logger.experiment.log_dir) getattr(self.logger, method)(key, item, step=collected_frames) if method == "log_scalar" and self.progress_bar and log_pbar: if isinstance(item, torch.Tensor): From 0d238d51b4b7b098b20972892e5a0446df01c816 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 5 Apr 2023 18:24:40 +0100 Subject: [PATCH 87/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 47268647e71..f07ec471f3a 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -659,19 +659,18 @@ def get_loss_module(actor, gamma): def print_csv_files_in_folder(folder_path): """ - Find all CSV files in a folder and return the first 10 lines of each file as a string. 
+ Find all CSV files in a folder and prints the first 10 lines of each file. Args: folder_path (str): The relative path to the folder. - Returns: - str: A string containing the first 10 lines of each CSV file in the folder. """ csv_files = [] output_str = "" - for file in os.listdir(folder_path): - if file.endswith(".csv"): - csv_files.append(os.path.join(folder_path, file)) + for dirpath, _, filenames in os.walk(folder_path): + for file in filenames: + if file.endswith(".csv"): + csv_files.append(os.path.join(dirpath, file)) for csv_file in csv_files: output_str += f"File: {csv_file}\n" with open(csv_file, "r") as f: @@ -680,10 +679,10 @@ def print_csv_files_in_folder(folder_path): break output_str += line.strip() + "\n" output_str += "\n" - return output_str + print(output_str) -print_csv_files_in_folder(logger.experiment.log_dir) +print_csv_files_in_folder("/var/folders/zs/9lq15k8x61l1g0c_sf__63c80000gn/T/tmpejpilvhb/dqn_exp_6d35f974-d3c2-11ed-8df6-acde48001122") ############################################################################### # Conclusion and possible improvements From 33133cbe11efafaee7ba58936b2afbc1236e998a Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 6 Apr 2023 11:07:35 +0100 Subject: [PATCH 88/89] amend --- tutorials/sphinx-tutorials/coding_ddpg.py | 5 +++-- tutorials/sphinx-tutorials/coding_dqn.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tutorials/sphinx-tutorials/coding_ddpg.py b/tutorials/sphinx-tutorials/coding_ddpg.py index 914186f4ed9..53a6ae10e47 100644 --- a/tutorials/sphinx-tutorials/coding_ddpg.py +++ b/tutorials/sphinx-tutorials/coding_ddpg.py @@ -1179,8 +1179,9 @@ def ceil_div(x, y): # We make a simple plot of the average rewards during training. We can observe # that our policy learned quite well to solve the task. # -# **Note**: As already mentioned above, to get a more reasonable performance, -# use a greater value for ``total_frames`` e.g. 1M. +# .. note:: +# As already mentioned above, to get a more reasonable performance, +# use a greater value for ``total_frames`` e.g. 1M. from matplotlib import pyplot as plt diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index f07ec471f3a..7b03e13af15 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -486,7 +486,7 @@ def get_loss_module(actor, gamma): # However, one can easily get the same restriction on number of episodes by # breaking the training loop when a certain number # episodes has been collected. -total_frames = 10_000 # 500000 +total_frames = 5_000 # 500000 ############################################################################### # Random frames used to initialize the replay buffer. 
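###############################################################################
# Side note on the comment above about limiting the number of episodes rather
# than the number of frames: a rough, hypothetical sketch of such a loop
# (``collector`` and ``max_episodes`` are assumptions, not part of the patched
# tutorial) could look like this:
#
# .. code-block:: python
#
#     max_episodes = 500
#     n_episodes = 0
#     for batch in collector:
#         # every True entry of ("next", "done") marks the end of an episode
#         n_episodes += batch["next", "done"].sum().item()
#         if n_episodes >= max_episodes:
#             break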
From c221982dbbce16d3427a9bc150e7c7ada953518a Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 6 Apr 2023 12:35:42 +0100 Subject: [PATCH 89/89] amend --- tutorials/sphinx-tutorials/coding_dqn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/sphinx-tutorials/coding_dqn.py b/tutorials/sphinx-tutorials/coding_dqn.py index 7b03e13af15..4603cecf37f 100644 --- a/tutorials/sphinx-tutorials/coding_dqn.py +++ b/tutorials/sphinx-tutorials/coding_dqn.py @@ -682,7 +682,7 @@ def print_csv_files_in_folder(folder_path): print(output_str) -print_csv_files_in_folder("/var/folders/zs/9lq15k8x61l1g0c_sf__63c80000gn/T/tmpejpilvhb/dqn_exp_6d35f974-d3c2-11ed-8df6-acde48001122") +print_csv_files_in_folder(logger.experiment.log_dir) ############################################################################### # Conclusion and possible improvements