Source code for d3rlpy.algos.qlearning.random_policy

import dataclasses
from typing import Dict

import numpy as np

from ...base import DeviceArg, LearnableConfig, register_learnable
from ...constants import ActionSpace
from ...torch_utility import TorchMiniBatch
from ...types import NDArray, Observation, Shape
from .base import QLearningAlgoBase

__all__ = [
    "RandomPolicyConfig",
    "RandomPolicy",
    "DiscreteRandomPolicyConfig",
    "DiscreteRandomPolicy",
]


@dataclasses.dataclass()
class RandomPolicyConfig(LearnableConfig):
    r"""Random policy for continuous control algorithms.

    This is designed for data collection and lightweight interaction tests.
    The ``fit`` and ``fit_online`` methods will raise exceptions.

    Args:
        action_scaler (d3rlpy.preprocessing.ActionScaler): Action preprocessor.
        distribution (str): Random distribution. Available options are
            ``['uniform', 'normal']``.
        normal_std (float): Standard deviation of the normal distribution.
            This is only used when ``distribution='normal'``.
    """

    distribution: str = "uniform"
    normal_std: float = 1.0

    def create(self, device: DeviceArg = False) -> "RandomPolicy":  # type: ignore
        return RandomPolicy(self)

    @staticmethod
    def get_type() -> str:
        return "random_policy"


class RandomPolicy(QLearningAlgoBase[None, RandomPolicyConfig]):  # type: ignore
    _action_size: int

    def __init__(self, config: RandomPolicyConfig):
        super().__init__(config, False, None)
        self._action_size = 1

    def inner_create_impl(
        self, observation_shape: Shape, action_size: int
    ) -> None:
        self._action_size = action_size

    def predict(self, x: Observation) -> NDArray:
        return self.sample_action(x)

    def sample_action(self, x: Observation) -> NDArray:
        x = np.asarray(x)
        action_shape = (x.shape[0], self._action_size)
        if self._config.distribution == "uniform":
            action = np.random.uniform(-1.0, 1.0, size=action_shape)
        elif self._config.distribution == "normal":
            action = np.random.normal(
                0.0, self._config.normal_std, size=action_shape
            )
        else:
            raise ValueError(
                f"invalid distribution type: {self._config.distribution}"
            )
        # Keep sampled actions within the canonical [-1, 1] range.
        action = np.clip(action, -1.0, 1.0)
        # Map actions back to the environment range if an action scaler is set.
        if self._config.action_scaler:
            action = self._config.action_scaler.reverse_transform_numpy(action)
        return action

    def predict_value(self, x: Observation, action: NDArray) -> NDArray:
        raise NotImplementedError

    def inner_update(self, batch: TorchMiniBatch) -> Dict[str, float]:
        raise NotImplementedError

    def get_action_type(self) -> ActionSpace:
        return ActionSpace.CONTINUOUS

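# A minimal usage sketch (illustration only, not part of the original module):
# build a continuous random policy from its config and sample a batch of
# actions. The observation batch, shapes, and the direct call to
# ``inner_create_impl`` are assumptions made for the example; in normal use the
# base class sets the action size when the algorithm is built against a dataset
# or environment.
def _example_random_policy() -> NDArray:
    config = RandomPolicyConfig(distribution="normal", normal_std=0.5)
    policy = config.create()
    # Pretend the environment has 4-dimensional observations and 2 actions.
    policy.inner_create_impl(observation_shape=(4,), action_size=2)
    observations = np.zeros((8, 4), dtype=np.float32)  # batch of 8 observations
    # Returns an (8, 2) array of actions clipped to [-1, 1].
    return policy.sample_action(observations)

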
@dataclasses.dataclass()
class DiscreteRandomPolicyConfig(LearnableConfig):
    r"""Random policy for discrete control algorithms.

    This is designed for data collection and lightweight interaction tests.
    The ``fit`` and ``fit_online`` methods will raise exceptions.
    """

    def create(self, device: DeviceArg = False) -> "DiscreteRandomPolicy":  # type: ignore
        return DiscreteRandomPolicy(self)

    @staticmethod
    def get_type() -> str:
        return "discrete_random_policy"


class DiscreteRandomPolicy(QLearningAlgoBase[None, DiscreteRandomPolicyConfig]):  # type: ignore
    _action_size: int

    def __init__(self, config: DiscreteRandomPolicyConfig):
        super().__init__(config, False, None)
        self._action_size = 1

    def inner_create_impl(
        self, observation_shape: Shape, action_size: int
    ) -> None:
        self._action_size = action_size

    def predict(self, x: Observation) -> NDArray:
        return self.sample_action(x)

    def sample_action(self, x: Observation) -> NDArray:
        x = np.asarray(x)
        return np.random.randint(self._action_size, size=x.shape[0])

    def predict_value(self, x: Observation, action: NDArray) -> NDArray:
        raise NotImplementedError

    def inner_update(self, batch: TorchMiniBatch) -> Dict[str, float]:
        raise NotImplementedError

    def get_action_type(self) -> ActionSpace:
        return ActionSpace.DISCRETE

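# A similar sketch for the discrete case (illustration only): the policy draws
# integer actions uniformly from ``range(action_size)``. The observation batch
# and the direct ``inner_create_impl`` call are assumptions made for the
# example.
def _example_discrete_random_policy() -> NDArray:
    config = DiscreteRandomPolicyConfig()
    policy = config.create()
    policy.inner_create_impl(observation_shape=(4,), action_size=6)
    observations = np.zeros((8, 4), dtype=np.float32)  # batch of 8 observations
    # Returns an array of 8 integer actions in [0, 6).
    return policy.sample_action(observations)

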
register_learnable(RandomPolicyConfig)
register_learnable(DiscreteRandomPolicyConfig)