Source code for d3rlpy.algos.awr

import numpy as np

from .base import AlgoBase
from .torch.awr_impl import AWRImpl, DiscreteAWRImpl
from ..dataset import compute_lambda_return
from ..optimizers import SGDFactory
from ..argument_utils import check_encoder, check_use_gpu, check_augmentation


[docs]class AWR(AlgoBase):
    r""" Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression way,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\theta) = \mathbb{E}_{s_t, R_t \sim D} [(R_t - V(s_t|\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\lambda`) to mitigate
    high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\phi) = \mathbb{E}_{s_t, a_t, R_t \sim D}
            [\log \pi(a_t|s_t, \phi)
                \exp (\frac{1}{B} (R_t - V(s_t|\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda`  for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda`  for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def __init__(self,
                 *,
                 actor_learning_rate=5e-5,
                 critic_learning_rate=1e-4,
                 actor_optim_factory=SGDFactory(momentum=0.9),
                 critic_optim_factory=SGDFactory(momentum=0.9),
                 actor_encoder_factory='default',
                 critic_encoder_factory='default',
                 batch_size=2048,
                 n_frames=1,
                 gamma=0.99,
                 batch_size_per_update=256,
                 n_actor_updates=1000,
                 n_critic_updates=200,
                 lam=0.95,
                 beta=1.0,
                 max_weight=20.0,
                 use_gpu=False,
                 scaler=None,
                 augmentation=None,
                 dynamics=None,
                 impl=None,
                 **kwargs):
        # batch_size in AWR has different semantic from Q learning algorithms.
        super().__init__(batch_size=batch_size,
                         n_frames=n_frames,
                         n_steps=1,
                         gamma=gamma,
                         scaler=scaler,
                         dynamics=dynamics)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.actor_optim_factory = actor_optim_factory
        self.critic_optim_factory = critic_optim_factory
        self.actor_encoder_factory = check_encoder(actor_encoder_factory)
        self.critic_encoder_factory = check_encoder(critic_encoder_factory)
        self.batch_size_per_update = batch_size_per_update
        self.n_actor_updates = n_actor_updates
        self.n_critic_updates = n_critic_updates
        self.lam = lam
        self.beta = beta
        self.max_weight = max_weight
        self.augmentation = check_augmentation(augmentation)
        self.use_gpu = check_use_gpu(use_gpu)
        self.impl = impl

[docs]    def create_impl(self, observation_shape, action_size):
        self.impl = AWRImpl(observation_shape=observation_shape,
                            action_size=action_size,
                            actor_learning_rate=self.actor_learning_rate,
                            critic_learning_rate=self.critic_learning_rate,
                            actor_optim_factory=self.actor_optim_factory,
                            critic_optim_factory=self.critic_optim_factory,
                            actor_encoder_factory=self.actor_encoder_factory,
                            critic_encoder_factory=self.critic_encoder_factory,
                            use_gpu=self.use_gpu,
                            scaler=self.scaler,
                            augmentation=self.augmentation)
        self.impl.build()

    def _compute_lambda_returns(self, batch):
        # compute TD(lambda)
        lambda_returns = []
        for transition in batch.transitions:
            lambda_return = compute_lambda_return(transition=transition,
                                                  algo=self,
                                                  gamma=self.gamma,
                                                  lam=self.lam,
                                                  n_frames=self.n_frames)
            lambda_returns.append(lambda_return)
        return np.array(lambda_returns).reshape((-1, 1))

    def _compute_advantages(self, returns, batch):
        baselines = self.predict_value(batch.observations).reshape((-1, 1))
        advantages = returns - baselines
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages)
        return (advantages - adv_mean) / (adv_std + 1e-5)

    def _compute_clipped_weights(self, advantages):
        weights = np.exp(advantages / self.beta)
        return np.minimum(weights, self.max_weight)

[docs]    def predict_value(self, x, *args, **kwargs):
        """ Returns predicted state values.

        Args:
            x (numpy.ndarray): observations.

        Returns:
            numpy.ndarray: predicted state values.

        """
        return self.impl.predict_value(x)

[docs]    def update(self, epoch, itr, batch):
        # compute lmabda return
        lambda_returns = self._compute_lambda_returns(batch)

        # calcuate advantage
        advantages = self._compute_advantages(lambda_returns, batch)

        # compute weights
        clipped_weights = self._compute_clipped_weights(advantages)

        n_steps_per_batch = self.batch_size // self.batch_size_per_update

        # update critic
        critic_loss_history = []
        for i in range(self.n_critic_updates // n_steps_per_batch):
            for j in range(n_steps_per_batch):
                head_index = j * self.batch_size_per_update
                tail_index = head_index + self.batch_size_per_update
                observations = batch.observations[head_index:tail_index]
                returns = lambda_returns[head_index:tail_index]
                critic_loss = self.impl.update_critic(observations, returns)
                critic_loss_history.append(critic_loss)
        critic_loss_mean = np.mean(critic_loss_history)

        # update actor
        actor_loss_history = []
        for i in range(self.n_actor_updates // n_steps_per_batch):
            for j in range(n_steps_per_batch):
                head_index = j * self.batch_size_per_update
                tail_index = head_index + self.batch_size_per_update
                observations = batch.observations[head_index:tail_index]
                actions = batch.actions[head_index:tail_index]
                weights = clipped_weights[head_index:tail_index]
                actor_loss = self.impl.update_actor(observations, actions,
                                                    weights)
                actor_loss_history.append(actor_loss)
        actor_loss_mean = np.mean(actor_loss_history)

        return critic_loss_mean, actor_loss_mean, np.mean(clipped_weights)

    def _get_loss_labels(self):
        return ['critic_loss', 'actor_loss', 'weights']


[docs]class DiscreteAWR(AWR):
    r""" Discrete veriosn of Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression way,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\theta) = \mathbb{E}_{s_t, R_t \sim D} [(R_t - V(s_t|\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\lambda`) to mitigate
    high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\phi) = \mathbb{E}_{s_t, a_t, R_t \sim D}
            [\log \pi(a_t|s_t, \phi)
                \exp (\frac{1}{B} (R_t - V(s_t|\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda`  for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda`  for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
[docs]    def create_impl(self, observation_shape, action_size):
        self.impl = DiscreteAWRImpl(
            observation_shape=observation_shape,
            action_size=action_size,
            actor_learning_rate=self.actor_learning_rate,
            critic_learning_rate=self.critic_learning_rate,
            actor_optim_factory=self.actor_optim_factory,
            critic_optim_factory=self.critic_optim_factory,
            actor_encoder_factory=self.actor_encoder_factory,
            critic_encoder_factory=self.critic_encoder_factory,
            use_gpu=self.use_gpu,
            scaler=self.scaler,
            augmentation=self.augmentation)
        self.impl.build()