# Source code for d3rlpy.algos.awr

import numpy as np

from d3rlpy.dataset import compute_lambda_return
from .base import AlgoBase
from .torch.awr_impl import AWRImpl, DiscreteAWRImpl


class AWR(AlgoBase):
    """ Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\\theta) = \\mathbb{E}_{s_t, R_t \\sim D} [(R_t - V(s_t|\\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\\lambda`) to mitigate
    high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\\phi) = \\mathbb{E}_{s_t, a_t, R_t \\sim D}
            [\\log \\pi(a_t|s_t, \\phi)
                \\exp (\\frac{1}{B} (R_t - V(s_t|\\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda` for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline. ``None`` (the default) is treated as an
            empty list.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup. If the
            observation is pixel, you can pass ``filters`` with list of tuples
            consisting with ``(filter_size, kernel_size, stride)`` and
            ``feature_size`` with an integer scalar for the last linear layer
            size. If the observation is vector, you can pass ``hidden_units``
            with list of hidden unit sizes. ``None`` (the default) is treated
            as an empty dictionary.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        kwargs: extra keyword arguments; accepted but not used by this
            constructor.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda` for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def __init__(self,
                 *,
                 actor_learning_rate=5e-5,
                 critic_learning_rate=1e-4,
                 batch_size=2048,
                 n_frames=1,
                 gamma=0.99,
                 batch_size_per_update=256,
                 n_actor_updates=1000,
                 n_critic_updates=200,
                 lam=0.95,
                 beta=1.0,
                 max_weight=20.0,
                 momentum=0.9,
                 use_batch_norm=False,
                 use_gpu=False,
                 scaler=None,
                 augmentation=None,
                 n_augmentations=1,
                 encoder_params=None,
                 dynamics=None,
                 impl=None,
                 **kwargs):
        # Build fresh containers per instance: a mutable default in the
        # signature would be shared (and could be mutated) across instances.
        if augmentation is None:
            augmentation = []
        if encoder_params is None:
            encoder_params = {}

        # batch_size in AWR has different semantic from Q learning algorithms.
        super().__init__(batch_size=batch_size,
                         n_frames=n_frames,
                         scaler=scaler,
                         augmentation=augmentation,
                         dynamics=dynamics,
                         use_gpu=use_gpu)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size_per_update = batch_size_per_update
        self.n_actor_updates = n_actor_updates
        self.n_critic_updates = n_critic_updates
        self.gamma = gamma
        self.lam = lam
        self.beta = beta
        self.max_weight = max_weight
        self.use_batch_norm = use_batch_norm
        self.momentum = momentum
        self.n_augmentations = n_augmentations
        self.encoder_params = encoder_params
        self.impl = impl

    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``AWRImpl`` from this object's settings and builds it.

        Args:
            observation_shape: observation shape forwarded to the
                implementation.
            action_size: action size forwarded to the implementation.

        """
        self.impl = AWRImpl(observation_shape=observation_shape,
                            action_size=action_size,
                            actor_learning_rate=self.actor_learning_rate,
                            critic_learning_rate=self.critic_learning_rate,
                            use_batch_norm=self.use_batch_norm,
                            momentum=self.momentum,
                            use_gpu=self.use_gpu,
                            scaler=self.scaler,
                            augmentation=self.augmentation,
                            n_augmentations=self.n_augmentations,
                            encoder_params=self.encoder_params)
        self.impl.build()

    def _compute_lambda_returns(self, batch):
        """ Returns TD(lambda) returns of ``batch.transitions`` as a column.

        Each transition is passed to ``compute_lambda_return``; the results
        are stacked and reshaped to ``(-1, 1)``.
        """
        lambda_returns = [
            compute_lambda_return(transition=transition,
                                  algo=self,
                                  gamma=self.gamma,
                                  lam=self.lam,
                                  n_frames=self.n_frames)
            for transition in batch.transitions
        ]
        return np.array(lambda_returns).reshape((-1, 1))

    def _compute_advantages(self, returns, batch):
        """ Returns standardized advantages (returns minus predicted values).

        The raw advantages are shifted by their mean and divided by their
        standard deviation over the whole batch.
        """
        baselines = self.predict_value(batch.observations).reshape((-1, 1))
        advantages = returns - baselines
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages)
        # the small epsilon avoids division by zero for constant advantages
        return (advantages - adv_mean) / (adv_std + 1e-5)

    def _compute_clipped_weights(self, advantages):
        """ Returns ``exp(advantages / beta)`` capped at ``max_weight``. """
        weights = np.exp(advantages / self.beta)
        return np.minimum(weights, self.max_weight)

    def predict_value(self, x, *args, **kwargs):
        """ Returns predicted state values.

        Extra positional and keyword arguments are accepted but ignored.

        Args:
            x (numpy.ndarray): observations.

        Returns:
            numpy.ndarray: predicted state values.

        """
        return self.impl.predict_value(x)

    def _minibatch_slices(self, n_updates):
        """ Yields the mini-batch slice for each gradient step.

        The batch is walked in consecutive windows of
        ``batch_size_per_update`` rows, and the walk is repeated
        ``n_updates // n_steps_per_batch`` times.
        """
        # Clamp to one step per pass: when batch_size is smaller than
        # batch_size_per_update the integer division gives 0, which would
        # otherwise raise ZeroDivisionError below.
        n_steps_per_batch = max(
            1, self.batch_size // self.batch_size_per_update)
        for _ in range(n_updates // n_steps_per_batch):
            for step in range(n_steps_per_batch):
                head_index = step * self.batch_size_per_update
                yield slice(head_index,
                            head_index + self.batch_size_per_update)

    def update(self, epoch, itr, batch):
        """ Runs the critic updates and then the actor updates on one batch.

        Args:
            epoch: current epoch (not used by this method).
            itr: current iteration (not used by this method).
            batch: mini-batch object providing ``transitions``,
                ``observations`` and ``actions``.

        Returns:
            tuple: mean critic loss, mean actor loss and mean clipped weight.
            A mean loss is NaN if the corresponding update count yields no
            gradient steps.

        """
        # compute lambda return
        lambda_returns = self._compute_lambda_returns(batch)

        # calculate advantage
        advantages = self._compute_advantages(lambda_returns, batch)

        # compute weights
        clipped_weights = self._compute_clipped_weights(advantages)

        # update critic
        critic_loss_history = []
        for index in self._minibatch_slices(self.n_critic_updates):
            observations = batch.observations[index]
            returns = lambda_returns[index]
            critic_loss = self.impl.update_critic(observations, returns)
            critic_loss_history.append(critic_loss)
        critic_loss_mean = np.mean(critic_loss_history)

        # update actor
        actor_loss_history = []
        for index in self._minibatch_slices(self.n_actor_updates):
            observations = batch.observations[index]
            actions = batch.actions[index]
            weights = clipped_weights[index]
            actor_loss = self.impl.update_actor(observations, actions,
                                                weights)
            actor_loss_history.append(actor_loss)
        actor_loss_mean = np.mean(actor_loss_history)

        return critic_loss_mean, actor_loss_mean, np.mean(clipped_weights)

    def _get_loss_labels(self):
        """ Returns the labels of the three values returned by ``update``. """
        return ['critic_loss', 'actor_loss', 'weights']
class DiscreteAWR(AWR):
    """ Discrete version of Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains via supervised regression,
    and has shown strong performance in online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\\theta) = \\mathbb{E}_{s_t, R_t \\sim D} [(R_t - V(s_t|\\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\\lambda`) to mitigate
    high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\\phi) = \\mathbb{E}_{s_t, a_t, R_t \\sim D}
            [\\log \\pi(a_t|s_t, \\phi)
                \\exp (\\frac{1}{B} (R_t - V(s_t|\\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda` for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup. If the
            observation is pixel, you can pass ``filters`` with list of tuples
            consisting with ``(filter_size, kernel_size, stride)`` and
            ``feature_size`` with an integer scalar for the last linear layer
            size. If the observation is vector, you can pass ``hidden_units``
            with list of hidden unit sizes.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\\lambda` for TD(:math:`\\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\\text{max}}` for weight clipping.
        momentum (float): momentum for stochastic gradient descent.
        use_batch_norm (bool): flag to insert batch normalization layers.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        n_augmentations (int): the number of data augmentations to update.
        encoder_params (dict): optional arguments for encoder setup.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``DiscreteAWRImpl`` from this object's settings and
        builds it.

        Args:
            observation_shape: observation shape forwarded to the
                implementation.
            action_size: action size forwarded to the implementation.

        """
        # gather every constructor argument first, then hand them over at once
        impl_kwargs = dict(
            observation_shape=observation_shape,
            action_size=action_size,
            actor_learning_rate=self.actor_learning_rate,
            critic_learning_rate=self.critic_learning_rate,
            use_batch_norm=self.use_batch_norm,
            momentum=self.momentum,
            use_gpu=self.use_gpu,
            scaler=self.scaler,
            augmentation=self.augmentation,
            n_augmentations=self.n_augmentations,
            encoder_params=self.encoder_params,
        )
        self.impl = DiscreteAWRImpl(**impl_kwargs)
        self.impl.build()