import numpy as np
from .base import AlgoBase
from .torch.awr_impl import AWRImpl, DiscreteAWRImpl
from ..dataset import compute_lambda_return
from ..optimizers import SGDFactory
from ..argument_utils import check_encoder, check_use_gpu, check_augmentation
class AWR(AlgoBase):
    r""" Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains both the value function and
    the policy by supervised regression, and has shown strong performance in
    online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\theta) = \mathbb{E}_{s_t, R_t \sim D} [(R_t - V(s_t|\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\lambda`) to mitigate
    the high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\phi) = \mathbb{E}_{s_t, a_t, R_t \sim D}
            [\log \pi(a_t|s_t, \phi)
                \exp (\frac{1}{B} (R_t - V(s_t|\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda` for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        kwargs: additional keyword arguments. They are accepted but not read
            by this constructor.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda` for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.AWRImpl): algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def __init__(self,
                 *,
                 actor_learning_rate=5e-5,
                 critic_learning_rate=1e-4,
                 actor_optim_factory=SGDFactory(momentum=0.9),
                 critic_optim_factory=SGDFactory(momentum=0.9),
                 actor_encoder_factory='default',
                 critic_encoder_factory='default',
                 batch_size=2048,
                 n_frames=1,
                 gamma=0.99,
                 batch_size_per_update=256,
                 n_actor_updates=1000,
                 n_critic_updates=200,
                 lam=0.95,
                 beta=1.0,
                 max_weight=20.0,
                 use_gpu=False,
                 scaler=None,
                 augmentation=None,
                 dynamics=None,
                 impl=None,
                 **kwargs):
        # batch_size in AWR has different semantic from Q learning algorithms.
        super().__init__(batch_size=batch_size,
                         n_frames=n_frames,
                         n_steps=1,
                         gamma=gamma,
                         scaler=scaler,
                         dynamics=dynamics)
        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.actor_optim_factory = actor_optim_factory
        self.critic_optim_factory = critic_optim_factory
        self.actor_encoder_factory = check_encoder(actor_encoder_factory)
        self.critic_encoder_factory = check_encoder(critic_encoder_factory)
        self.batch_size_per_update = batch_size_per_update
        self.n_actor_updates = n_actor_updates
        self.n_critic_updates = n_critic_updates
        self.lam = lam
        self.beta = beta
        self.max_weight = max_weight
        self.augmentation = check_augmentation(augmentation)
        self.use_gpu = check_use_gpu(use_gpu)
        self.impl = impl

    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``AWRImpl`` from the stored settings and builds it.

        Args:
            observation_shape: observation shape forwarded to the
                implementation.
            action_size: action size forwarded to the implementation.

        """
        self.impl = AWRImpl(observation_shape=observation_shape,
                            action_size=action_size,
                            actor_learning_rate=self.actor_learning_rate,
                            critic_learning_rate=self.critic_learning_rate,
                            actor_optim_factory=self.actor_optim_factory,
                            critic_optim_factory=self.critic_optim_factory,
                            actor_encoder_factory=self.actor_encoder_factory,
                            critic_encoder_factory=self.critic_encoder_factory,
                            use_gpu=self.use_gpu,
                            scaler=self.scaler,
                            augmentation=self.augmentation)
        self.impl.build()

    def _compute_lambda_returns(self, batch):
        """ Computes the TD(lambda) return of every transition in the batch.

        Args:
            batch: sampled batch exposing ``transitions``.

        Returns:
            numpy.ndarray: lambda returns reshaped to a single column.

        """
        lambda_returns = [
            compute_lambda_return(transition=transition,
                                  algo=self,
                                  gamma=self.gamma,
                                  lam=self.lam,
                                  n_frames=self.n_frames)
            for transition in batch.transitions
        ]
        return np.array(lambda_returns).reshape((-1, 1))

    def _compute_advantages(self, returns, batch):
        """ Computes advantages standardized over the batch.

        Args:
            returns (numpy.ndarray): column of returns.
            batch: sampled batch exposing ``observations``.

        Returns:
            numpy.ndarray: ``returns`` minus the predicted state values,
                shifted by the mean and divided by the standard deviation.

        """
        baselines = self.predict_value(batch.observations).reshape((-1, 1))
        advantages = returns - baselines
        adv_mean = np.mean(advantages)
        adv_std = np.std(advantages)
        # the small constant keeps the division finite when the standard
        # deviation is zero
        return (advantages - adv_mean) / (adv_std + 1e-5)

    def _compute_clipped_weights(self, advantages):
        """ Converts advantages into regression weights capped at max_weight.

        Args:
            advantages (numpy.ndarray): advantages.

        Returns:
            numpy.ndarray: element-wise minimum of ``exp(advantages / beta)``
                and ``max_weight``.

        """
        weights = np.exp(advantages / self.beta)
        return np.minimum(weights, self.max_weight)

    def _minibatch_slices(self):
        """ Returns the slices that cut a sampled batch into mini-batches.

        The number of mini-batches is clamped to at least one, so that a
        ``batch_size`` smaller than ``batch_size_per_update`` yields a single
        mini-batch instead of a zero step count.

        Returns:
            list(slice): consecutive slices of ``batch_size_per_update``
                elements.

        """
        size = self.batch_size_per_update
        n_steps_per_batch = max(1, self.batch_size // size)
        return [
            slice(j * size, (j + 1) * size) for j in range(n_steps_per_batch)
        ]

    def predict_value(self, x, *args, **kwargs):
        """ Returns predicted state values.

        Extra positional and keyword arguments are accepted but not
        forwarded to the implementation.

        Args:
            x (numpy.ndarray): observations.

        Returns:
            numpy.ndarray: predicted state values.

        """
        return self.impl.predict_value(x)

    def update(self, epoch, itr, batch):
        """ Runs the critic and actor updates for one sampled batch.

        The batch is cut into mini-batches of ``batch_size_per_update``
        elements. The critic and then the actor sweep over those mini-batches
        ``n_critic_updates // n_steps`` and ``n_actor_updates // n_steps``
        times respectively, where ``n_steps`` is the number of mini-batches.
        If an update count is smaller than ``n_steps``, that network is not
        updated and the corresponding mean is taken over an empty list.

        Args:
            epoch: current epoch (not used here).
            itr: current iteration (not used here).
            batch: sampled batch exposing ``transitions``, ``observations``
                and ``actions``.

        Returns:
            tuple: mean critic loss, mean actor loss and mean clipped weight,
                in the order given by ``_get_loss_labels``.

        """
        # compute lambda return
        lambda_returns = self._compute_lambda_returns(batch)

        # calculate advantage
        advantages = self._compute_advantages(lambda_returns, batch)

        # compute weights
        clipped_weights = self._compute_clipped_weights(advantages)

        # never empty, so the floor divisions below cannot divide by zero
        minibatches = self._minibatch_slices()
        n_steps_per_batch = len(minibatches)

        # update critic
        critic_loss_history = []
        for _ in range(self.n_critic_updates // n_steps_per_batch):
            for index in minibatches:
                observations = batch.observations[index]
                returns = lambda_returns[index]
                critic_loss = self.impl.update_critic(observations, returns)
                critic_loss_history.append(critic_loss)
        critic_loss_mean = np.mean(critic_loss_history)

        # update actor
        actor_loss_history = []
        for _ in range(self.n_actor_updates // n_steps_per_batch):
            for index in minibatches:
                observations = batch.observations[index]
                actions = batch.actions[index]
                weights = clipped_weights[index]
                actor_loss = self.impl.update_actor(observations, actions,
                                                    weights)
                actor_loss_history.append(actor_loss)
        actor_loss_mean = np.mean(actor_loss_history)

        return critic_loss_mean, actor_loss_mean, np.mean(clipped_weights)

    def _get_loss_labels(self):
        """ Returns the names of the values returned by ``update``. """
        return ['critic_loss', 'actor_loss', 'weights']
class DiscreteAWR(AWR):
    r""" Discrete version of Advantage-Weighted Regression algorithm.

    AWR is an actor-critic algorithm that trains both the value function and
    the policy by supervised regression, and has shown strong performance in
    online and offline settings.

    The value function is trained as a supervised regression problem.

    .. math::

        L(\theta) = \mathbb{E}_{s_t, R_t \sim D} [(R_t - V(s_t|\theta))^2]

    where :math:`R_t` is approximated using TD(:math:`\lambda`) to mitigate
    the high variance issue.

    The policy function is also trained as a supervised regression problem.

    .. math::

        J(\phi) = \mathbb{E}_{s_t, a_t, R_t \sim D}
            [\log \pi(a_t|s_t, \phi)
                \exp (\frac{1}{B} (R_t - V(s_t|\theta)))]

    where :math:`B` is a constant factor.

    References:
        * `Peng et al., Advantage-Weighted Regression: Simple and Scalable
          Off-Policy Reinforcement Learning
          <https://arxiv.org/abs/1910.00177>`_

    Args:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory or str):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda` for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (bool, int or d3rlpy.gpu.Device):
            flag to use GPU, device ID or device.
        scaler (d3rlpy.preprocessing.Scaler or str): preprocessor.
            The available options are `['pixel', 'min_max', 'standard']`
        augmentation (d3rlpy.augmentation.AugmentationPipeline or list(str)):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model for data
            augmentation.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.

    Attributes:
        actor_learning_rate (float): learning rate for policy function.
        critic_learning_rate (float): learning rate for value function.
        actor_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the actor.
        critic_optim_factory (d3rlpy.optimizers.OptimizerFactory):
            optimizer factory for the critic.
        actor_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the actor.
        critic_encoder_factory (d3rlpy.encoders.EncoderFactory):
            encoder factory for the critic.
        batch_size (int): batch size per iteration.
        n_frames (int): the number of frames to stack for image observation.
        gamma (float): discount factor.
        batch_size_per_update (int): mini-batch size.
        n_actor_updates (int): actor gradient steps per iteration.
        n_critic_updates (int): critic gradient steps per iteration.
        lam (float): :math:`\lambda` for TD(:math:`\lambda`).
        beta (float): :math:`B` for weight scale.
        max_weight (float): :math:`w_{\text{max}}` for weight clipping.
        use_gpu (d3rlpy.gpu.Device): GPU device.
        scaler (d3rlpy.preprocessing.Scaler): preprocessor.
        augmentation (d3rlpy.augmentation.AugmentationPipeline):
            augmentation pipeline.
        dynamics (d3rlpy.dynamics.base.DynamicsBase): dynamics model.
        impl (d3rlpy.algos.torch.awr_impl.DiscreteAWRImpl):
            algorithm implementation.
        eval_results_ (dict): evaluation results.

    """
    def create_impl(self, observation_shape, action_size):
        """ Instantiates ``DiscreteAWRImpl`` from the settings and builds it.

        Args:
            observation_shape: observation shape forwarded to the
                implementation.
            action_size: action size forwarded to the implementation.

        """
        # gather the constructor arguments first, then hand them over at once
        impl_params = {
            'observation_shape': observation_shape,
            'action_size': action_size,
            'actor_learning_rate': self.actor_learning_rate,
            'critic_learning_rate': self.critic_learning_rate,
            'actor_optim_factory': self.actor_optim_factory,
            'critic_optim_factory': self.critic_optim_factory,
            'actor_encoder_factory': self.actor_encoder_factory,
            'critic_encoder_factory': self.critic_encoder_factory,
            'use_gpu': self.use_gpu,
            'scaler': self.scaler,
            'augmentation': self.augmentation,
        }
        self.impl = DiscreteAWRImpl(**impl_params)
        self.impl.build()