# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from typing import List, Optional, Tuple, Union
import numpy as np
import compiler_gym
from compiler_gym.spaces.scalar import Scalar
from compiler_gym.util.gym_type_hints import ActionType, ObservationType, RewardType


class Reward(Scalar):
"""An extension of the :class:`Scalar <compiler_gym.spaces.Scalar>` space
that is used for computing a reward signal.
A :code:`Reward` is a scalar value used to determine the reward for a
particular action. An instance of :code:`Reward` is used to represent the
reward function for a particular episode. For every
:meth:`env.step() <compiler_gym.envs.CompilerEnv.step>` of the environment,
the :meth:`reward.update() <compiler_gym.spaces.Reward.update>` method is
called to produce a new incremental reward.
Environments provide implementations of :code:`Reward` that compute reward
signals based on observation values computed by the backend service.
"""

    def __init__(
self,
name: str,
observation_spaces: Optional[List[str]] = None,
default_value: RewardType = 0,
min: Optional[RewardType] = None,
max: Optional[RewardType] = None,
default_negates_returns: bool = False,
success_threshold: Optional[RewardType] = None,
deterministic: bool = False,
platform_dependent: bool = True,
):
"""Constructor.
:param name: The name of the reward space. This is a unique name used to
represent the reward.
:param observation_spaces: A list of observation space IDs
(:class:`space.id <compiler_gym.views.ObservationSpaceSpec>` values)
that are used to compute the reward. May be an empty list if no
observations are requested. Requested observations will be provided
to the :code:`observations` argument of :meth:`reward.update()
<compiler_gym.spaces.Reward.update>`.
:param default_value: A default reward. This value will be returned by
:meth:`env.step() <compiler_gym.envs.CompilerEnv.step>` if the
service terminates.
:param min: The lower bound of the reward.
:param max: The upper bound of the reward.
:param default_negates_returns: If true, the default value will be
offset by the sum of all rewards for the current episode. For
example, given a default reward value of *-10.0* and an episode with
prior rewards *[0.1, 0.3, -0.15]*, the default value is: *-10.0 -
sum(0.1, 0.3, -0.15)*.
:param success_threshold: The cumulative reward threshold before an
episode is considered successful. For example, episodes where reward
            is scaled to an existing heuristic can be considered "successful"
when the reward exceeds the existing heuristic.
:param deterministic: Whether the reward space is deterministic.
:param platform_dependent: Whether the reward values depend on the
execution environment of the service.
"""
super().__init__(
name=name,
min=-np.inf if min is None else min,
max=np.inf if max is None else max,
dtype=np.float64,
)
        self.name = name
if not self.name:
raise TypeError("No name given")
self.observation_spaces = observation_spaces or []
self.default_value: RewardType = default_value
self.default_negates_returns: bool = default_negates_returns
self.success_threshold = success_threshold
self.deterministic = deterministic
self.platform_dependent = platform_dependent

    def reset(
        self, benchmark: str, observation_view: "compiler_gym.views.ObservationView"
    ) -> None:
"""Reset the rewards space. This is called on
:meth:`env.reset() <compiler_gym.envs.CompilerEnv.reset>`.
:param benchmark: The URI of the benchmark that is used for this
episode.
:param observation: An observation view for reward initialization
"""
pass

    def update(
        self,
        actions: List[ActionType],
        observations: List[ObservationType],
        observation_view: "compiler_gym.views.ObservationView",  # noqa: F821
    ) -> RewardType:
"""Calculate a reward for the given action.
:param action: The action performed.
:param observations: A list of observation values as requested by the
:code:`observation_spaces` constructor argument.
:param observation_view: The
:class:`ObservationView <compiler_gym.views.ObservationView>`
instance.
"""
raise NotImplementedError("abstract class")

    def reward_on_error(self, episode_reward: RewardType) -> RewardType:
        """Return the reward value for an error condition.

        This method produces the reward value to use if the compiler service
        cannot be reached, e.g. because it has crashed or the connection has
        dropped.

        :param episode_reward: The current cumulative reward of an episode.

        :return: A reward.
if self.default_negates_returns:
return self.default_value - episode_reward
else:
return self.default_value

    @property
def range(self) -> Tuple[RewardType, RewardType]:
"""The lower and upper bounds of the reward."""
return (self.min, self.max)

    def __repr__(self):
return self.name

    def __eq__(self, other: Union["Reward", str]) -> bool:
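        """Equality is tested by name; a reward space also compares equal to
        its own name given as a string.
        """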
if isinstance(other, str):
return self.name == other
elif isinstance(other, Reward):
return self.name == other.name
else:
return False


class DefaultRewardFromObservation(Reward):
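    """A reward that is computed as the difference between the current and
    previous values of a scalar observation.
    """
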
def __init__(self, observation_name: str, **kwargs):
super().__init__(
observation_spaces=[observation_name], name=observation_name, **kwargs
)
self.previous_value: Optional[ObservationType] = None

    def reset(self, benchmark: str, observation_view) -> None:
"""Called on env.reset(). Reset incremental progress."""
del benchmark # unused
self.previous_value = None

    def update(
        self,
        actions: List[ActionType],
        observations: List[ObservationType],
        observation_view: "compiler_gym.views.ObservationView",  # noqa: F821
    ) -> RewardType:
        """Called on env.step(). Compute and return new reward."""
        del actions  # unused
        del observation_view  # unused
value: RewardType = observations[0]
if self.previous_value is None:
self.previous_value = 0
reward = RewardType(value - self.previous_value)
self.previous_value = value
return reward