Source code for compiler_gym.spaces.reward

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import List, Optional, Tuple, Union

import numpy as np

import compiler_gym
from compiler_gym.spaces.scalar import Scalar
from compiler_gym.util.gym_type_hints import ActionType, ObservationType, RewardType


class Reward(Scalar):
    """An extension of the :class:`Scalar <compiler_gym.spaces.Scalar>` space
    that is used for computing a reward signal.

    A :code:`Reward` is a scalar value used to determine the reward for a
    particular action. An instance of :code:`Reward` is used to represent the
    reward function for a particular episode. For every :meth:`env.step()
    <compiler_gym.envs.CompilerEnv.step>` of the environment, the
    :meth:`reward.update() <compiler_gym.spaces.Reward.update>` method is
    called to produce a new incremental reward.

    Environments provide implementations of :code:`Reward` that compute reward
    signals based on observation values computed by the backend service.
    """

    def __init__(
        self,
        name: str,
        observation_spaces: Optional[List[str]] = None,
        default_value: RewardType = 0,
        min: Optional[RewardType] = None,
        max: Optional[RewardType] = None,
        default_negates_returns: bool = False,
        success_threshold: Optional[RewardType] = None,
        deterministic: bool = False,
        platform_dependent: bool = True,
    ):
        """Constructor.

        :param name: The name of the reward space. This is a unique name used
            to represent the reward.

        :param observation_spaces: A list of observation space IDs
            (:class:`space.id <compiler_gym.views.ObservationSpaceSpec>`
            values) that are used to compute the reward. May be an empty list
            if no observations are requested. Requested observations will be
            provided to the :code:`observations` argument of
            :meth:`reward.update() <compiler_gym.spaces.Reward.update>`.

        :param default_value: A default reward. This value will be returned by
            :meth:`env.step() <compiler_gym.envs.CompilerEnv.step>` if the
            service terminates.

        :param min: The lower bound of the reward.

        :param max: The upper bound of the reward.

        :param default_negates_returns: If true, the default value will be
            offset by the sum of all rewards for the current episode. For
            example, given a default reward value of *-10.0* and an episode
            with prior rewards *[0.1, 0.3, -0.15]*, the default value is:
            *-10.0 - sum(0.1, 0.3, -0.15)*.

        :param success_threshold: The cumulative reward threshold before an
            episode is considered successful. For example, episodes where
            reward is scaled to an existing heuristic can be considered
            "successful" when the reward exceeds the existing heuristic.

        :param deterministic: Whether the reward space is deterministic.

        :param platform_dependent: Whether the reward values depend on the
            execution environment of the service.
        """
        super().__init__(
            name=name,
            min=-np.inf if min is None else min,
            max=np.inf if max is None else max,
            dtype=np.float64,
        )
        # NOTE: the original read `self.name = name or id`, which compares
        # against the `id` builtin and is therefore always truthy; check the
        # argument directly instead so a missing name is actually rejected.
        if not name:
            raise TypeError("No name given")
        self.name = name
        self.observation_spaces = observation_spaces or []
        self.default_value: RewardType = default_value
        self.default_negates_returns: bool = default_negates_returns
        self.success_threshold = success_threshold
        self.deterministic = deterministic
        self.platform_dependent = platform_dependent

    def reset(
        self, benchmark: str, observation_view: "compiler_gym.views.ObservationView"
    ) -> None:
        """Reset the reward space. This is called on :meth:`env.reset()
        <compiler_gym.envs.CompilerEnv.reset>`.

        :param benchmark: The URI of the benchmark that is used for this
            episode.

        :param observation_view: An observation view for reward
            initialization.
        """
        pass

    def update(
        self,
        actions: List[ActionType],
        observations: List[ObservationType],
        observation_view: "compiler_gym.views.ObservationView",  # noqa: F821
    ) -> RewardType:
        """Calculate a reward for the given actions.

        :param actions: The actions performed.

        :param observations: A list of observation values as requested by the
            :code:`observation_spaces` constructor argument.

        :param observation_view: The :class:`ObservationView
            <compiler_gym.views.ObservationView>` instance.
        """
        raise NotImplementedError("abstract class")

    def reward_on_error(self, episode_reward: RewardType) -> RewardType:
        """Return the reward value for an error condition.

        This method should be used to produce the reward value that should be
        used if the compiler service cannot be reached, e.g. because it has
        crashed or the connection has dropped.

        :param episode_reward: The current cumulative reward of an episode.

        :return: A reward.
        """
        if self.default_negates_returns:
            return self.default_value - episode_reward
        else:
            return self.default_value
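
    # Worked example (illustrative, not part of the original module): with
    # default_value=-10.0 and default_negates_returns=True, an episode whose
    # cumulative reward is 0.25 at the time of the error receives
    # -10.0 - 0.25 = -10.25, so the episode's total return still sums to the
    # default value of -10.0.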

    @property
    def range(self) -> Tuple[RewardType, RewardType]:
        """The lower and upper bounds of the reward."""
        return (self.min, self.max)

    def __repr__(self):
        return self.name

    def __eq__(self, other: Union["Reward", str]) -> bool:
        if isinstance(other, str):
            return self.name == other
        elif isinstance(other, Reward):
            return self.name == other.name
        else:
            return False
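
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: a minimal Reward
# subclass showing the contract documented above. reset() is called on
# env.reset(), and update() must return an incremental reward for each
# env.step(). The class name and reward space name are hypothetical.
class ExampleStepCountReward(Reward):
    """Emits a reward of +1 for every action applied."""

    def __init__(self):
        super().__init__(
            name="ExampleStepCount",  # hypothetical reward space name
            observation_spaces=[],  # this reward needs no observations
            default_value=0,
            deterministic=True,
            platform_dependent=False,
        )

    def reset(self, benchmark: str, observation_view) -> None:
        del benchmark  # unused
        del observation_view  # unused

    def update(
        self,
        actions: List[ActionType],
        observations: List[ObservationType],
        observation_view: "compiler_gym.views.ObservationView",  # noqa: F821
    ) -> RewardType:
        del observations  # unused
        del observation_view  # unused
        return RewardType(len(actions))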

class DefaultRewardFromObservation(Reward):
    """A reward whose value at each step is the change in a single observation
    value since the previous step."""

    def __init__(self, observation_name: str, **kwargs):
        super().__init__(
            observation_spaces=[observation_name], name=observation_name, **kwargs
        )
        self.previous_value: Optional[ObservationType] = None

    def reset(self, benchmark: str, observation_view) -> None:
        """Called on env.reset(). Reset incremental progress."""
        del benchmark  # unused
        self.previous_value = None

    def update(
        self,
        actions: List[ActionType],
        observations: List[ObservationType],
        observation_view: "compiler_gym.views.ObservationView",  # noqa: F821
    ) -> RewardType:
        """Called on env.step(). Compute and return a new reward."""
        del actions  # unused
        del observation_view  # unused
        value: RewardType = observations[0]
        if self.previous_value is None:
            self.previous_value = 0
        reward = RewardType(value - self.previous_value)
        self.previous_value = value
        return reward
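
# ---------------------------------------------------------------------------
# Usage sketch, not part of the original module: driving
# DefaultRewardFromObservation by hand with mocked observation values to show
# the incremental-delta behavior. The observation space name "ExampleCounter"
# is hypothetical; in a real environment these calls are made for you by
# env.reset() and env.step().
if __name__ == "__main__":
    reward = DefaultRewardFromObservation("ExampleCounter")
    reward.reset(benchmark="benchmark://example-v0/foo", observation_view=None)
    print(reward.update(actions=[0], observations=[10], observation_view=None))
    # -> 10.0 (the first update measures against an initial value of 0)
    print(reward.update(actions=[1], observations=[7], observation_view=None))
    # -> -3.0 (the observed value dropped from 10 to 7)
    print(reward.range)  # -> (-inf, inf) unless min/max were given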