Source code for d3rlpy.metrics.comparer

from typing import Callable, List

import numpy as np

from ..dataset import Episode
from .scorer import WINDOW_SIZE, AlgoProtocol, _make_batches


[docs]def compare_continuous_action_diff(
    base_algo: AlgoProtocol,
) -> Callable[[AlgoProtocol, List[Episode]], float]:
    r"""Returns scorer function of action difference between algorithms.

    This metrics suggests how different the two algorithms are in continuous
    action-space.
    If the algorithm to compare with is near-optimal, the small action
    difference would be better.

    .. math::

        \mathbb{E}_{s_t \sim D}
            [(\pi_{\phi_1}(s_t) - \pi_{\phi_2}(s_t))^2]

    .. code-block:: python

        from d3rlpy.algos import CQL
        from d3rlpy.metrics.comparer import compare_continuous_action_diff

        cql1 = CQL()
        cql2 = CQL()

        scorer = compare_continuous_action_diff(cql1)

        squared_action_diff = scorer(cql2, ...)

    Args:
        base_algo: algorithm to comapre with.

    Returns:
        scorer function.

    """

    def scorer(algo: AlgoProtocol, episodes: List[Episode]) -> float:
        total_diffs = []
        for episode in episodes:
            # TODO: handle different n_frames
            for batch in _make_batches(episode, WINDOW_SIZE, algo.n_frames):
                base_actions = base_algo.predict(batch.observations)
                actions = algo.predict(batch.observations)
                diff = ((actions - base_actions) ** 2).sum(axis=1).tolist()
                total_diffs += diff
        # smaller is better, sometimes?
        return -float(np.mean(total_diffs))

    return scorer


[docs]def compare_discrete_action_match(
    base_algo: AlgoProtocol,
) -> Callable[[AlgoProtocol, List[Episode]], float]:
    r"""Returns scorer function of action matches between algorithms.

    This metrics suggests how different the two algorithms are in discrete
    action-space.
    If the algorithm to compare with is near-optimal, the small action
    difference would be better.

    .. math::

        \mathbb{E}_{s_t \sim D} [\parallel
            \{\text{argmax}_a Q_{\theta_1}(s_t, a)
            = \text{argmax}_a Q_{\theta_2}(s_t, a)\}]

    .. code-block:: python

        from d3rlpy.algos import DQN
        from d3rlpy.metrics.comparer import compare_continuous_action_diff

        dqn1 = DQN()
        dqn2 = DQN()

        scorer = compare_continuous_action_diff(dqn1)

        percentage_of_identical_actions = scorer(dqn2, ...)

    Args:
        base_algo: algorithm to comapre with.

    Returns:
        scorer function.

    """

    def scorer(algo: AlgoProtocol, episodes: List[Episode]) -> float:
        total_matches = []
        for episode in episodes:
            # TODO: handle different n_frames
            for batch in _make_batches(episode, WINDOW_SIZE, algo.n_frames):
                base_actions = base_algo.predict(batch.observations)
                actions = algo.predict(batch.observations)
                match = (base_actions == actions).tolist()
                total_matches += match
        return float(np.mean(total_matches))

    return scorer