# Source code for d3rlpy.algos.awr

import numpy as np

from d3rlpy.dataset import compute_lambda_return
from .base import AlgoBase
from .torch.awr_impl import AWRImpl, DiscreteAWRImpl


class AWR(AlgoBase):
    """ Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\\theta) = \\mathbb{E}_{s_t, R_t \\sim D} [(R_t - V(s_t|\\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\\lambda`) to mitigate
    the high-variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\\phi) = \\mathbb{E}_{s_t, a_t, R_t \\sim D}
            [\\log \\pi(a_t|s_t, \\phi)
                \\exp (\\frac{1}{B} (R_t - V(s_t|\\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda`  for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline. ``None`` (the default) is replaced with a
            new empty list.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup. If the
            observation is pixel, you can pass ``filters`` with list of tuples
            consisting with ``(filter_size, kernel_size, stride)`` and
            ``feature_size`` with an integer scalar for the last linear layer
            size. If the observation is vector, you can pass ``hidden_units``
            with list of hidden unit sizes. ``None`` (the default) is replaced
            with a new empty dict.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda`  for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def __init__(self,
                 *,
                 actor_learning_rate=5e-5,
                 critic_learning_rate=1e-4,
                 batch_size=2048,
                 n_frames=1,
                 gamma=0.99,
                 batch_size_per_update=256,
                 n_actor_updates=1000,
                 n_critic_updates=200,
                 lam=0.95,
                 beta=1.0,
                 max_weight=20.0,
                 momentum=0.9,
                 use_batch_norm=False,
                 use_gpu=False,
                 scaler=None,
                 augmentation=None,
                 n_augmentations=1,
                 encoder_params=None,
                 dynamics=None,
                 impl=None,
                 **kwargs):
        # Build fresh containers per instance; a literal ``[]``/``{}`` default
        # would be one object shared (and mutable) across every instance.
        if augmentation is None:
            augmentation = []
        if encoder_params is None:
            encoder_params = {}

        # batch_size in AWR has different semantic from Q learning algorithms.
        # NOTE: extra ``**kwargs`` are accepted but not used by this
        # constructor.
        super().__init__(batch_size=batch_size,
                         n_frames=n_frames,
                         scaler=scaler,
                         augmentation=augmentation,
                         dynamics=dynamics,
                         use_gpu=use_gpu)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size_per_update = batch_size_per_update
        self.n_actor_updates = n_actor_updates
        self.n_critic_updates = n_critic_updates
        self.gamma = gamma
        self.lam = lam
        self.beta = beta
        self.max_weight = max_weight
        self.use_batch_norm = use_batch_norm
        self.momentum = momentum
        self.n_augmentations = n_augmentations
        self.encoder_params = encoder_params
        self.impl = impl

    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``AWRImpl`` from the stored settings and builds it.

        The created object is stored in ``self.impl``.

        Args:
            observation_shape: observation shape forwarded to ``AWRImpl``.
            action_size: action size forwarded to ``AWRImpl``.

        """
        self.impl = AWRImpl(observation_shape=observation_shape,
                            action_size=action_size,
                            actor_learning_rate=self.actor_learning_rate,
                            critic_learning_rate=self.critic_learning_rate,
                            use_batch_norm=self.use_batch_norm,
                            momentum=self.momentum,
                            use_gpu=self.use_gpu,
                            scaler=self.scaler,
                            augmentation=self.augmentation,
                            n_augmentations=self.n_augmentations,
                            encoder_params=self.encoder_params)
        self.impl.build()

    def _compute_lambda_returns(self, batch):
        """ Computes the TD(lambda) return for every transition in the batch.

        Each transition in ``batch.transitions`` is passed to
        ``compute_lambda_return`` together with this algorithm, ``gamma``,
        ``lam`` and ``n_frames``.

        Returns:
            numpy.ndarray: the returns reshaped to a column of shape
                ``(-1, 1)``.

        """
        lambda_returns = [
            compute_lambda_return(transition=transition,
                                  algo=self,
                                  gamma=self.gamma,
                                  lam=self.lam,
                                  n_frames=self.n_frames)
            for transition in batch.transitions
        ]
        return np.array(lambda_returns).reshape((-1, 1))

    def _compute_advantages(self, returns, batch):
        """ Computes standardized advantages.

        The baseline is ``predict_value(batch.observations)`` reshaped to a
        column. The difference ``returns - baselines`` is then shifted by its
        mean and divided by its standard deviation.

        Args:
            returns (numpy.ndarray): returns, as produced by
                ``_compute_lambda_returns``.
            batch: batch providing ``observations``.

        Returns:
            numpy.ndarray: standardized advantages.

        """
        baselines = self.predict_value(batch.observations).reshape((-1, 1))
        advantages = returns - baselines
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages)
        # small epsilon keeps the division finite when the spread is zero.
        return (advantages - adv_mean) / (adv_std + 1e-5)

    def _compute_clipped_weights(self, advantages):
        """ Returns ``exp(advantages / beta)`` capped at ``max_weight``.

        Args:
            advantages (numpy.ndarray): advantages.

        Returns:
            numpy.ndarray: element-wise clipped weights.

        """
        weights = np.exp(advantages / self.beta)
        return np.minimum(weights, self.max_weight)

    def predict_value(self, x, *args, **kwargs):
        """ Returns predicted state values.

        Extra positional and keyword arguments are accepted but not forwarded.

        Args:
            x (numpy.ndarray): observations.

        Returns:
            numpy.ndarray: predicted state values.

        """
        return self.impl.predict_value(x)

    def update(self, epoch, itr, batch):
        """ Runs the critic and actor updates for one batch.

        The TD(lambda) returns, advantages and clipped weights are computed
        once for the whole batch. The batch is then consumed in consecutive
        slices of ``batch_size_per_update`` rows, repeated until roughly
        ``n_critic_updates`` critic steps and ``n_actor_updates`` actor steps
        have been taken (each count is rounded down to a multiple of the
        number of slices per pass).

        Args:
            epoch: not used by this method.
            itr: not used by this method.
            batch: batch providing ``transitions``, ``observations`` and
                ``actions``.

        Returns:
            tuple: mean critic loss, mean actor loss and mean clipped weight.

        """
        # compute lambda return
        lambda_returns = self._compute_lambda_returns(batch)

        # calculate advantage
        advantages = self._compute_advantages(lambda_returns, batch)

        # compute weights
        clipped_weights = self._compute_clipped_weights(advantages)

        # At least one slice per pass: without the clamp a batch_size smaller
        # than batch_size_per_update gives 0 here and the divisions below
        # raise ZeroDivisionError.
        n_steps_per_batch = max(
            self.batch_size // self.batch_size_per_update, 1)

        # update critic
        critic_loss_history = []
        for _ in range(self.n_critic_updates // n_steps_per_batch):
            for j in range(n_steps_per_batch):
                head_index = j * self.batch_size_per_update
                tail_index = head_index + self.batch_size_per_update
                observations = batch.observations[head_index:tail_index]
                returns = lambda_returns[head_index:tail_index]
                critic_loss = self.impl.update_critic(observations, returns)
                critic_loss_history.append(critic_loss)
        critic_loss_mean = np.mean(critic_loss_history)

        # update actor
        actor_loss_history = []
        for _ in range(self.n_actor_updates // n_steps_per_batch):
            for j in range(n_steps_per_batch):
                head_index = j * self.batch_size_per_update
                tail_index = head_index + self.batch_size_per_update
                observations = batch.observations[head_index:tail_index]
                actions = batch.actions[head_index:tail_index]
                weights = clipped_weights[head_index:tail_index]
                actor_loss = self.impl.update_actor(observations, actions,
                                                    weights)
                actor_loss_history.append(actor_loss)
        actor_loss_mean = np.mean(actor_loss_history)

        return critic_loss_mean, actor_loss_mean, np.mean(clipped_weights)

    def _get_loss_labels(self):
        """ Returns labels matching the three values returned by ``update``.
        """
        return ['critic_loss', 'actor_loss', 'weights']


class DiscreteAWR(AWR):
    """ Discrete version of Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\\theta) = \\mathbb{E}_{s_t, R_t \\sim D} [(R_t - V(s_t|\\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\\lambda`) to mitigate
    the high-variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\\phi) = \\mathbb{E}_{s_t, a_t, R_t \\sim D}
            [\\log \\pi(a_t|s_t, \\phi)
                \\exp (\\frac{1}{B} (R_t - V(s_t|\\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda`  for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup. If the
            observation is pixel, you can pass ``filters`` with list of tuples
            consisting with ``(filter_size, kernel_size, stride)`` and
            ``feature_size`` with an integer scalar for the last linear layer
            size. If the observation is vector, you can pass ``hidden_units``
            with list of hidden unit sizes.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda`  for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``DiscreteAWRImpl`` from the stored settings and
        builds it.

        The created object is stored in ``self.impl``.

        Args:
            observation_shape: observation shape forwarded to
                ``DiscreteAWRImpl``.
            action_size: action size forwarded to ``DiscreteAWRImpl``.

        """
        self.impl = DiscreteAWRImpl(
            observation_shape=observation_shape,
            action_size=action_size,
            actor_learning_rate=self.actor_learning_rate,
            critic_learning_rate=self.critic_learning_rate,
            use_batch_norm=self.use_batch_norm,
            momentum=self.momentum,
            use_gpu=self.use_gpu,
            scaler=self.scaler,
            augmentation=self.augmentation,
            n_augmentations=self.n_augmentations,
            encoder_params=self.encoder_params)
        self.impl.build()