import dataclasses
from typing import Dict, Generic, TypeVar
from ...base import DeviceArg, LearnableConfig, register_learnable
from ...constants import IMPL_NOT_INITIALIZED_ERROR, ActionSpace
from ...dataset import Shape
from ...models.builders import (
create_deterministic_regressor,
create_discrete_imitator,
create_probablistic_regressor,
)
from ...models.encoders import EncoderFactory, make_encoder_field
from ...models.optimizers import OptimizerFactory, make_optimizer_field
from ...torch_utility import TorchMiniBatch
from .base import QLearningAlgoBase
from .torch.bc_impl import BCBaseImpl, BCImpl, DiscreteBCImpl
# Public names exported by this module; the shared ``_BCBase`` helper is not
# listed here.
__all__ = ["BCConfig", "BC", "DiscreteBCConfig", "DiscreteBC"]
# Type parameter for the config class carried by ``_BCBase``; bound to
# ``LearnableConfig`` (given as a string forward reference).
TBCConfig = TypeVar("TBCConfig", bound="LearnableConfig")
class _BCBase(Generic[TBCConfig], QLearningAlgoBase[BCBaseImpl, TBCConfig]):
    """Shared update step for the behavior cloning algorithms.

    The concrete subclasses in this module assign ``self._impl`` in their
    ``inner_create_impl``; this base class only drives the update.
    """

    def inner_update(self, batch: TorchMiniBatch) -> Dict[str, float]:
        """Run one imitation update on ``batch`` and report its loss.

        Args:
            batch: Mini-batch passed to the implementation's
                ``update_imitator``.

        Returns:
            A dictionary with a single ``"loss"`` entry holding the value
            returned by ``update_imitator``.
        """
        impl = self._impl
        # The implementation object must have been created beforehand.
        assert impl is not None, IMPL_NOT_INITIALIZED_ERROR
        return {"loss": impl.update_imitator(batch)}
@dataclasses.dataclass
class BCConfig(LearnableConfig):
    r"""Config of Behavior Cloning algorithm.

    Behavior Cloning (BC) imitates the actions stored in the dataset with a
    supervised learning approach.
    Because BC only imitates the action distribution, its performance will
    stay close to the mean of the dataset, even though BC mostly works
    better than online RL algorithms.

    .. math::

        L(\theta) = \mathbb{E}_{a_t, s_t \sim D}
            [(a_t - \pi_\theta(s_t))^2]

    Args:
        learning_rate (float): Learning rate.
        optim_factory (d3rlpy.models.optimizers.OptimizerFactory):
            Optimizer factory.
        encoder_factory (d3rlpy.models.encoders.EncoderFactory):
            Encoder factory.
        batch_size (int): Mini-batch size.
        policy_type (str): The policy type. Available options are
            ``['deterministic', 'stochastic']``.
        observation_scaler (d3rlpy.preprocessing.ObservationScaler):
            Observation preprocessor.
        action_scaler (d3rlpy.preprocessing.ActionScaler): Action
            preprocessor.
    """

    # NOTE: field order defines the generated ``__init__`` signature.
    batch_size: int = 100
    learning_rate: float = 1e-3
    policy_type: str = "deterministic"
    optim_factory: OptimizerFactory = make_optimizer_field()
    encoder_factory: EncoderFactory = make_encoder_field()

    def create(self, device: DeviceArg = False) -> "BC":
        """Build a :class:`BC` algorithm object from this config.

        Args:
            device: Device argument forwarded to the ``BC`` constructor.

        Returns:
            A ``BC`` instance constructed with this config and ``device``.
        """
        return BC(self, device)

    @staticmethod
    def get_type() -> str:
        """Return the type name of this config, ``"bc"``."""
        return "bc"
class BC(_BCBase[BCConfig]):
    """Behavior Cloning algorithm for continuous action spaces.

    Configured through :class:`BCConfig`.
    """

    def inner_create_impl(
        self, observation_shape: Shape, action_size: int
    ) -> None:
        """Create the imitator, its optimizer and the ``BCImpl`` object.

        The imitator is a deterministic or a probabilistic regressor,
        selected by ``policy_type`` of the config. The result is stored in
        ``self._impl``.

        Args:
            observation_shape: Observation shape passed to the model
                builder and to ``BCImpl``.
            action_size: Action size passed to the model builder and to
                ``BCImpl``.

        Raises:
            ValueError: If ``policy_type`` is neither ``"deterministic"``
                nor ``"stochastic"``.
        """
        cfg = self._config
        device = self._device
        kind = cfg.policy_type

        if kind == "deterministic":
            imitator = create_deterministic_regressor(
                observation_shape,
                action_size,
                cfg.encoder_factory,
                device=device,
            )
        elif kind == "stochastic":
            imitator = create_probablistic_regressor(
                observation_shape,
                action_size,
                cfg.encoder_factory,
                min_logstd=-4.0,
                max_logstd=15.0,
                device=device,
            )
        else:
            raise ValueError(f"invalid policy_type: {self._config.policy_type}")

        # The optimizer is built over the imitator's parameters only.
        optimizer = cfg.optim_factory.create(
            imitator.parameters(),
            lr=cfg.learning_rate,
        )

        self._impl = BCImpl(
            observation_shape=observation_shape,
            action_size=action_size,
            imitator=imitator,
            optim=optimizer,
            policy_type=kind,
            device=device,
        )

    def get_action_type(self) -> ActionSpace:
        """Return ``ActionSpace.CONTINUOUS``."""
        return ActionSpace.CONTINUOUS
@dataclasses.dataclass
class DiscreteBCConfig(LearnableConfig):
    r"""Config of Behavior Cloning algorithm for discrete control.

    Behavior Cloning (BC) imitates the actions stored in the dataset with a
    supervised learning approach.
    Because BC only imitates the action distribution, its performance will
    stay close to the mean of the dataset, even though BC mostly works
    better than online RL algorithms.

    .. math::

        L(\theta) = \mathbb{E}_{a_t, s_t \sim D}
            [-\sum_a p(a|s_t) \log \pi_\theta(a|s_t)]

    where :math:`p(a|s_t)` is implemented as a one-hot vector.

    Args:
        learning_rate (float): Learning rate.
        optim_factory (d3rlpy.models.optimizers.OptimizerFactory):
            Optimizer factory.
        encoder_factory (d3rlpy.models.encoders.EncoderFactory):
            Encoder factory.
        batch_size (int): Mini-batch size.
        beta (float): Regularization factor.
        observation_scaler (d3rlpy.preprocessing.ObservationScaler):
            Observation preprocessor.
    """

    # NOTE: field order defines the generated ``__init__`` signature.
    batch_size: int = 100
    learning_rate: float = 1e-3
    optim_factory: OptimizerFactory = make_optimizer_field()
    encoder_factory: EncoderFactory = make_encoder_field()
    beta: float = 0.5

    def create(self, device: DeviceArg = False) -> "DiscreteBC":
        """Build a :class:`DiscreteBC` algorithm object from this config.

        Args:
            device: Device argument forwarded to the ``DiscreteBC``
                constructor.

        Returns:
            A ``DiscreteBC`` instance constructed with this config and
            ``device``.
        """
        return DiscreteBC(self, device)

    @staticmethod
    def get_type() -> str:
        """Return the type name of this config, ``"discrete_bc"``."""
        return "discrete_bc"
class DiscreteBC(_BCBase[DiscreteBCConfig]):
    """Behavior Cloning algorithm for discrete action spaces.

    Configured through :class:`DiscreteBCConfig`.
    """

    def inner_create_impl(
        self, observation_shape: Shape, action_size: int
    ) -> None:
        """Create the imitator, its optimizer and the ``DiscreteBCImpl``.

        The config's ``beta`` is passed both to the imitator builder and to
        ``DiscreteBCImpl``. The result is stored in ``self._impl``.

        Args:
            observation_shape: Observation shape passed to the model
                builder and to ``DiscreteBCImpl``.
            action_size: Action size passed to the model builder and to
                ``DiscreteBCImpl``.
        """
        cfg = self._config
        device = self._device

        imitator = create_discrete_imitator(
            observation_shape,
            action_size,
            cfg.beta,
            cfg.encoder_factory,
            device=device,
        )

        # The optimizer is built over the imitator's parameters only.
        optimizer = cfg.optim_factory.create(
            imitator.parameters(),
            lr=cfg.learning_rate,
        )

        self._impl = DiscreteBCImpl(
            observation_shape=observation_shape,
            action_size=action_size,
            imitator=imitator,
            optim=optimizer,
            beta=cfg.beta,
            device=device,
        )

    def get_action_type(self) -> ActionSpace:
        """Return ``ActionSpace.DISCRETE``."""
        return ActionSpace.DISCRETE
# Module-level side effect: both config classes are passed to
# ``register_learnable`` when this module is imported.
# NOTE(review): presumably this makes them discoverable by their
# ``get_type()`` names ("bc" / "discrete_bc") — confirm in ``base``.
register_learnable(BCConfig)
register_learnable(DiscreteBCConfig)