# Source code for d3rlpy.online.explorers

import numpy as np

from abc import ABCMeta, abstractmethod


class Explorer(metaclass=ABCMeta):
    """ Abstract base class for explorers.

    Concrete explorers must override :meth:`sample`; because the method is
    marked abstract, this class cannot be instantiated directly.

    """

    @abstractmethod
    def sample(self, algo, x, step):
        """ Returns an action for the given observation.

        The base implementation does nothing and returns ``None``.

        Args:
            algo: algorithm object used to produce the action.
            x: observation.
            step: current step.

        """


class LinearDecayEpsilonGreedy(Explorer):
    """ :math:`\\epsilon`-greedy explorer with linear decay schedule.

    :math:`\\epsilon` is interpolated linearly from ``start_epsilon`` at
    step 0 down to ``end_epsilon`` at step ``duration`` and stays at
    ``end_epsilon`` afterwards.

    Args:
        start_epsilon (float): the beginning :math:`\\epsilon`.
        end_epsilon (float): the end :math:`\\epsilon`.
        duration (int): the scheduling duration.

    Attributes:
        start_epsilon (float): the beginning :math:`\\epsilon`.
        end_epsilon (float): the end :math:`\\epsilon`.
        duration (int): the scheduling duration.

    """
    def __init__(self, start_epsilon=1.0, end_epsilon=0.1, duration=1000000):
        self.start_epsilon = start_epsilon
        self.end_epsilon = end_epsilon
        self.duration = duration

    def sample(self, algo, x, step):
        """ Returns :math:`\\epsilon`-greedy action.

        With probability :math:`\\epsilon` (see :meth:`compute_epsilon`) a
        uniformly random integer in ``[0, algo.impl.action_size)`` is
        returned; otherwise the first element of ``algo.predict([x])``.

        Args:
            algo (d3rlpy.algos.base.AlgoBase): algorithm.
            x (numpy.ndarray): observation.
            step (int): current environment step.

        Returns:
            int: :math:`\\epsilon`-greedy action.

        """
        if np.random.random() < self.compute_epsilon(step):
            # explore: pick any action index uniformly at random
            return np.random.randint(algo.impl.action_size)
        # exploit: the observation is wrapped in a single-element list and
        # the first prediction is taken
        return algo.predict([x])[0]

    def compute_epsilon(self, step):
        """ Returns decayed :math:`\\epsilon`.

        Args:
            step (int): current environment step.

        Returns:
            float: :math:`\\epsilon`.

        """
        # once the schedule is over, epsilon stays at its final value
        if step >= self.duration:
            return self.end_epsilon
        # linear interpolation between start_epsilon and end_epsilon
        base = self.start_epsilon - self.end_epsilon
        return base * (1.0 - step / self.duration) + self.end_epsilon


class NormalNoise(Explorer):
    """ Normal noise explorer.

    Adds noise drawn from a normal distribution to the action sampled from
    the algorithm and clips the result to ``[-1.0, 1.0]``.

    Args:
        mean (float): mean.
        std (float): standard deviation.

    Attributes:
        mean (float): mean.
        std (float): standard deviation.

    """
    def __init__(self, mean=0.0, std=0.1):
        self.mean = mean
        self.std = std

    def sample(self, algo, x, *args):
        """ Returns action with noise injection.

        Args:
            algo (d3rlpy.algos.base.AlgoBase): algorithm.
            x (numpy.ndarray): observation.
            *args: extra positional arguments (e.g. the current step) are
                accepted but not used.

        Returns:
            numpy.ndarray: action with noise injection.

        """
        # the observation is wrapped in a single-element list and the first
        # sampled action is taken
        action = algo.sample_action([x])[0]
        # one independent noise value per action dimension
        noise = np.random.normal(self.mean, self.std, size=action.shape)
        # keep the perturbed action inside the fixed [-1, 1] range
        return np.clip(action + noise, -1.0, 1.0)