Source code for d3rlpy.online.buffers

import numpy as np

from abc import ABCMeta, abstractmethod
from collections import deque
from random import sample
from ..gpu import Device
from ..dataset import Transition, TransitionMiniBatch, _numpy_to_tensor
from .utility import get_action_size_from_env


class Buffer(metaclass=ABCMeta):
    @abstractmethod
    def append(self, observation, action, reward, terminal):
        """ Append observation, action, reward and terminal flag to buffer.

        If the terminal flag is True, Monte-Carlo returns will be computed with
        an entire episode and the whole transitions will be appended.

        Args:
            observation (numpy.ndarray): observation.
            action (numpy.ndarray or int): action.
            reward (float): reward.
            terminal (bool or float): terminal flag.

        """
        pass

    @abstractmethod
    def sample(self, batch_size, n_frames=1):
        """ Returns sampled mini-batch of transitions.

        If observation is image, you can stack arbitrary frames via
        ``n_frames``.

        .. code-block:: python

            buffer.observation_shape == (3, 84, 84)

            # stack 4 frames
            batch = buffer.sample(batch_size=32, n_frames=4)

            batch.observations.shape == (32, 12, 84, 84)

        Args:
            batch_size (int): mini-batch size.
            n_frames (int):
                the number of frames to stack for image observation.

        Returns:
            d3rlpy.dataset.TransitionMiniBatch: mini-batch.

        """
        pass

    @abstractmethod
    def size(self):
        """ Returns the number of appended elements in buffer.

        Returns:
            int: the number of elements in buffer.

        """
        pass


[docs]class ReplayBuffer(Buffer):
    """ Standard Replay Buffer.

    Args:
        maxlen (int): the maximum number of data length.
        env (gym.Env): gym-like environment to extract shape information.
        as_tensor (bool): flag to hold observations as ``torch.Tensor``.
        device (d3rlpy.gpu.Device or int): gpu device or device id for tensor.

    Attributes:
        prev_observation (numpy.ndarray): previously appended observation.
        prev_action (numpy.ndarray or int): previously appended action.
        prev_reward (float): previously appended reward.
        prev_transition (d3rlpy.dataset.Transition):
            previously appended transition.
        transitions (collections.deque): list of transitions.
        observation_shape (tuple): observation shape.
        action_size (int): action size.
        as_tensor (bool): flag to hold observations as ``torch.Tensor``.
        device (d3rlpy.gpu.Device): gpu device.

    """
    def __init__(self, maxlen, env, as_tensor=False, device=None):
        # temporary cache to hold transitions for an entire episode
        self.prev_observation = None
        self.prev_action = None
        self.prev_reward = None
        self.prev_transition = None

        self.transitions = deque(maxlen=maxlen)

        # extract shape information
        self.observation_shape = env.observation_space.shape
        self.action_size = get_action_size_from_env(env)

        # data type option
        if isinstance(device, int):
            self.device = Device(device)
        else:
            self.device = device
        self.as_tensor = as_tensor

[docs]    def append(self, observation, action, reward, terminal):
        # validation
        assert observation.shape == self.observation_shape
        if isinstance(action, np.ndarray):
            assert action.shape[0] == self.action_size
        else:
            action = int(action)
            assert action < self.action_size

        # numpy.ndarray to PyTorch conversion
        if self.as_tensor:
            observation = _numpy_to_tensor(observation, self.device)

        # create Transition object
        if self.prev_observation is not None:
            if isinstance(terminal, bool):
                terminal = 1.0 if terminal else 0.0

            transition = Transition(observation_shape=self.observation_shape,
                                    action_size=self.action_size,
                                    observation=self.prev_observation,
                                    action=self.prev_action,
                                    reward=self.prev_reward,
                                    next_observation=observation,
                                    next_action=action,
                                    next_reward=reward,
                                    terminal=terminal,
                                    prev_transition=self.prev_transition)

            if self.prev_transition:
                self.prev_transition.next_transition = transition

            self.transitions.append(transition)

            self.prev_transition = transition

        self.prev_observation = observation
        self.prev_action = action
        self.prev_reward = reward

        if terminal:
            self.prev_observation = None
            self.prev_action = None
            self.prev_reward = None
            self.prev_transition = None

[docs]    def sample(self, batch_size, n_frames=1):
        transitions = sample(self.transitions, batch_size)
        return TransitionMiniBatch(transitions, n_frames)

[docs]    def size(self):
        return len(self.transitions)

[docs]    def __len__(self):
        return self.size()