# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
import os
import re
import shutil
import warnings
from pathlib import Path
from typing import Dict, Iterable, Optional, Union
import numpy as np
# The "deprecated" name is used as a constructor argument to Dataset, so rename
# this import to prevent shadowing.
from deprecated.sphinx import deprecated as mark_deprecated
import compiler_gym.errors
from compiler_gym.datasets.benchmark import Benchmark
from compiler_gym.datasets.uri import BenchmarkUri
logger = logging.getLogger(__name__)
_DATASET_VERSION_PATTERN = r"[a-zA-z0-9-_]+-v(?P<version>[0-9]+)"
_DATASET_VERSION_RE = re.compile(_DATASET_VERSION_PATTERN)
[docs]class Dataset:
"""A dataset is a collection of benchmarks.
The Dataset class has methods for installing and managing groups of
benchmarks, for listing the available benchmark URIs, and for instantiating
:class:`Benchmark <compiler_gym.datasets.Benchmark>` objects.
The Dataset class is an abstract base for implementing datasets. At a
minimum, subclasses must implement the :meth:`benchmark()
<compiler_gym.datasets.Dataset.benchmark>` and :meth:`benchmark_uris()
<compiler_gym.datasets.Dataset.benchmark_uris>` methods, and :meth:`size
<compiler_gym.datasets.Dataset.size>`. Other methods such as
:meth:`install() <compiler_gym.datasets.Dataset.install>` may be used where
helpful.
"""
[docs] def __init__(
self,
name: str,
description: str,
license: str, # pylint: disable=redefined-builtin
site_data_base: Optional[Path] = None,
benchmark_class=Benchmark,
references: Optional[Dict[str, str]] = None,
deprecated: Optional[str] = None,
sort_order: int = 0,
validatable: str = "No",
):
"""Constructor.
:param name: The name of the dataset, in the format:
:code:`scheme://name`.
:param description: A short human-readable description of the dataset.
:param license: The name of the dataset's license.
:param site_data_base: An optional directory that can be used by the
dataset to house the "site data", i.e. persistent files on disk. The
site data directory is a subdirectory of this :code:`site_data_base`
path, which can be shared by multiple datasets. If not provided, the
:attr:`dataset.site_data_path
<compiler_gym.datasets.Dataset.site_data_path>` attribute will raise
an error. Use :attr:`dataset.has_site_data
<compiler_gym.datasets.Dataset.has_site_data>` to check if a site
data path was set.
:param benchmark_class: The class to use when instantiating benchmarks.
It must have the same constructor signature as :class:`Benchmark
<compiler_gym.datasets.Benchmark>`.
:param references: A dictionary of useful named URLs for this dataset
containing extra information, download links, papers, etc.
:param deprecated: Mark the dataset as deprecated and issue a warning
when :meth:`install() <compiler_gym.datasets.Dataset.install>`,
including the given method. Deprecated datasets are excluded from
the :meth:`datasets() <compiler_gym.datasets.Datasets.dataset>`
iterator by default.
:param sort_order: An optional numeric value that should be used to
order this dataset relative to others. Lowest value sorts first.
:param validatable: Whether the dataset is validatable. A validatable
dataset is one where the behavior of the benchmarks can be checked
by compiling the programs to binaries and executing them. If the
benchmarks crash, or are found to have different behavior, then
validation fails. This type of validation is used to check that the
compiler has not broken the semantics of the program. This value
takes a string and is used for documentation purposes only.
Suggested values are "Yes", "No", or "Partial".
:raises ValueError: If :code:`name` does not match the expected type.
"""
self._name = name
uri = BenchmarkUri.from_string(name)
self._description = description
self._license = license
self._scheme = uri.scheme
match = _DATASET_VERSION_RE.match(uri.dataset)
self._version = int(match.group("version") if match else 0)
self._references = references or {}
self._deprecation_message = deprecated
self._validatable = validatable
self.sort_order = sort_order
self.benchmark_class = benchmark_class
# Set up the site data name.
if site_data_base:
self._site_data_path = (
Path(site_data_base).resolve() / uri.scheme / uri.dataset
)
def __repr__(self):
return self.name
@property
def name(self) -> str:
"""The name of the dataset.
:type: str
"""
return self._name
@property
def description(self) -> str:
"""A short human-readable description of the dataset.
:type: str
"""
return self._description
@property
def license(self) -> str:
"""The name of the license of the dataset.
:type: str
"""
return self._license
@property
@mark_deprecated(
version="0.2.2", reason="The `protocol` attribute has been renamed `scheme`"
)
def protocol(self) -> str:
"""The URI scheme that is used to identify benchmarks in this dataset.
:type: str
"""
return self.scheme
@property
def scheme(self) -> str:
"""The URI scheme that is used to identify benchmarks in this dataset.
:type: str
"""
return self._scheme
@property
def version(self) -> int:
"""The version tag for this dataset. Defaults to zero.
:type: int
"""
return self._version
@property
def references(self) -> Dict[str, str]:
"""A dictionary of useful named URLs for this dataset containing extra
information, download links, papers, etc.
For example:
>>> dataset.references
{'Paper': 'https://arxiv.org/pdf/1407.3487.pdf',
'Homepage': 'https://ctuning.org/wiki/index.php/CTools:CBench'}
:type: Dict[str, str]
"""
return self._references
@property
def deprecated(self) -> bool:
"""Whether the dataset is included in the iterable sequence of datasets
of a containing :class:`Datasets <compiler_gym.datasets.Datasets>`
collection.
:type: bool
"""
return self._deprecation_message is not None
@property
def validatable(self) -> str:
"""Whether the dataset is validatable. A validatable dataset is one
where the behavior of the benchmarks can be checked by compiling the
programs to binaries and executing them. If the benchmarks crash, or are
found to have different behavior, then validation fails. This type of
validation is used to check that the compiler has not broken the
semantics of the program.
This property takes a string and is used for documentation purposes
only. Suggested values are "Yes", "No", or "Partial".
:type: str
"""
return self._validatable
@property
def has_site_data(self) -> bool:
"""Return whether the dataset has a site data directory.
:type: bool
"""
return hasattr(self, "_site_data_path")
@property
def site_data_path(self) -> Path:
"""The filesystem path used to store persistent dataset files.
This directory may not exist.
:type: Path
:raises ValueError: If no site data path was specified at constructor
time.
"""
if not self.has_site_data:
raise ValueError(f"Dataset has no site data path: {self.name}")
return self._site_data_path
@property
def site_data_size_in_bytes(self) -> int:
"""The total size of the on-disk data used by this dataset.
:type: int
"""
if not self.has_site_data:
return 0
if not self.site_data_path.is_dir():
return 0
total_size = 0
for dirname, _, filenames in os.walk(self.site_data_path):
total_size += sum(
os.path.getsize(os.path.join(dirname, f)) for f in filenames
)
return total_size
@property
def size(self) -> int:
"""The number of benchmarks in the dataset.
If the number of benchmarks is unknown or unbounded, for example because
the dataset represents a program generator that can produce an infinite
number of programs, the value is 0.
:type: int
"""
return 0
[docs] def __len__(self) -> int:
"""The number of benchmarks in the dataset.
This is the same as :meth:`Dataset.size
<compiler_gym.datasets.Dataset.size>`:
>>> len(dataset) == dataset.size
True
If the number of benchmarks is unknown or unbounded, for example because
the dataset represents a program generator that can produce an infinite
number of programs, the value is 0.
:return: An integer.
"""
return self.size
def __eq__(self, other: Union["Dataset", str]) -> bool:
if isinstance(other, Dataset):
return self.name == other.name
return self.name == other
def __lt__(self, other: Union["Dataset", str]) -> bool:
if isinstance(other, Dataset):
return self.name < other.name
return self.name < other
def __le__(self, other: Union["Dataset", str]) -> bool:
return self < other or self == other
@property
def installed(self) -> bool:
"""Whether the dataset is installed locally. Installation occurs
automatically on first use, or by calling :meth:`install()
<compiler_gym.datasets.Dataset.install>`.
:type: bool
"""
return True
[docs] def install(self) -> None:
"""Install this dataset locally.
Implementing this method is optional. If implementing this method, you
must call :code:`super().install()` first.
This method should not perform redundant work. This method should first
detect whether any work needs to be done so that repeated calls to
:code:`install()` will complete quickly.
"""
if self.deprecated:
warnings.warn(
f"Dataset '{self.name}' is marked as deprecated. {self._deprecation_message}",
category=DeprecationWarning,
stacklevel=2,
)
[docs] def uninstall(self) -> None:
"""Remove any local data for this benchmark.
This method undoes the work of :meth:`install()
<compiler_gym.datasets.Dataset.install>`. The dataset can still be used
after calling this method.
"""
if self.has_site_data() and self.site_data_path.is_dir():
shutil.rmtree(self.site_data_path)
[docs] def benchmarks(self) -> Iterable[Benchmark]:
"""Enumerate the (possibly infinite) benchmarks lazily.
Iteration order is consistent across runs. The order of
:meth:`benchmarks() <compiler_gym.datasets.Dataset.benchmarks>` and
:meth:`benchmark_uris() <compiler_gym.datasets.Dataset.benchmark_uris>`
is the same.
If the number of benchmarks in the dataset is infinite
(:code:`len(dataset) == math.inf`), the iterable returned by this method
will continue indefinitely.
:return: An iterable sequence of :class:`Benchmark
<compiler_gym.datasets.Benchmark>` instances.
"""
# Default implementation. Subclasses may wish to provide an alternative
# implementation that is optimized to specific use cases.
yield from (self.benchmark(uri) for uri in self.benchmark_uris())
[docs] def __iter__(self) -> Iterable[Benchmark]:
"""Enumerate the (possibly infinite) benchmarks lazily.
This is the same as :meth:`Dataset.benchmarks()
<compiler_gym.datasets.Dataset.benchmarks>`:
>>> from itertools import islice
>>> list(islice(dataset, 100)) == list(islice(datset.benchmarks(), 100))
True
:return: An iterable sequence of :meth:`Benchmark
<compiler_gym.datasets.Benchmark>` instances.
"""
yield from self.benchmarks()
[docs] def benchmark_uris(self) -> Iterable[str]:
"""Enumerate the (possibly infinite) benchmark URIs.
Iteration order is consistent across runs. The order of
:meth:`benchmarks() <compiler_gym.datasets.Dataset.benchmarks>` and
:meth:`benchmark_uris() <compiler_gym.datasets.Dataset.benchmark_uris>`
is the same.
If the number of benchmarks in the dataset is infinite
(:code:`len(dataset) == math.inf`), the iterable returned by this method
will continue indefinitely.
:return: An iterable sequence of benchmark URI strings.
"""
raise NotImplementedError("abstract class")
[docs] def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
"""Select a benchmark.
Subclasses must implement this method. Implementors may assume that the
URI is well formed and that the :code:`scheme` and :code:`dataset`
components are correct.
:param uri: The parsed URI of the benchmark to return.
:return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
instance.
:raise LookupError: If :code:`uri` is not found.
:raise ValueError: If the URI is invalid.
"""
raise NotImplementedError("abstract class")
[docs] def benchmark(self, uri: str) -> Benchmark:
"""Select a benchmark.
:param uri: The URI of the benchmark to return.
:return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
instance.
:raise LookupError: If :code:`uri` is not found.
:raise ValueError: If the URI is invalid.
"""
return self.benchmark_from_parsed_uri(BenchmarkUri.from_string(uri))
[docs] def random_benchmark(
self, random_state: Optional[np.random.Generator] = None
) -> Benchmark:
"""Select a benchmark randomly.
:param random_state: A random number generator. If not provided, a
default :code:`np.random.default_rng()` is used.
:return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
instance.
"""
random_state = random_state or np.random.default_rng()
return self._random_benchmark(random_state)
def _random_benchmark(self, random_state: np.random.Generator) -> Benchmark:
"""Private implementation of the random benchmark getter.
Subclasses must implement this method so that it selects a benchmark
from the available benchmarks with uniform probability, using only
:code:`random_state` as a source of randomness.
"""
raise NotImplementedError("abstract class")
[docs] def __getitem__(self, uri: str) -> Benchmark:
"""Select a benchmark by URI.
This is the same as :meth:`Dataset.benchmark(uri)
<compiler_gym.datasets.Dataset.benchmark>`:
>>> dataset["benchmark://cbench-v1/crc32"] == dataset.benchmark("benchmark://cbench-v1/crc32")
True
:return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
instance.
:raise LookupError: If :code:`uri` does not exist.
"""
return self.benchmark(uri)
# Deprecated since v0.2.4.
# This type is for backwards compatibility that will be removed in a future release.
# Please, use errors from `compiler_gym.errors`.
DatasetInitError = compiler_gym.errors.DatasetInitError
@mark_deprecated(
version="0.1.4",
reason=(
"Datasets are now automatically activated. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def activate(env, dataset: Union[str, Dataset]) -> bool:
"""Deprecated function for managing datasets.
:param dataset: The name of the dataset to download, or a :class:`Dataset
<compiler_gym.datasets.Dataset>` instance.
:return: :code:`True` if the dataset was activated, else :code:`False` if
already active.
:raises ValueError: If there is no dataset with that name.
"""
return False
@mark_deprecated(
version="0.1.4",
reason=(
"Please use :meth:`del env.datasets[dataset] <compiler_gym.datasets.Datasets.__delitem__>`. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def delete(env, dataset: Union[str, Dataset]) -> bool:
"""Deprecated function for managing datasets.
Please use :meth:`del env.datasets[dataset]
<compiler_gym.datasets.Datasets.__delitem__>`.
:param dataset: The name of the dataset to download, or a :class:`Dataset
<compiler_gym.datasets.Dataset>` instance.
:return: :code:`True` if the dataset was deleted, else :code:`False` if
already deleted.
"""
del env.datasets[dataset]
return False
@mark_deprecated(
version="0.1.4",
reason=(
"Please use :meth:`env.datasets.deactivate() <compiler_gym.datasets.Datasets.deactivate>`. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def deactivate(env, dataset: Union[str, Dataset]) -> bool:
"""Deprecated function for managing datasets.
Please use :meth:`del env.datasets[dataset]
<compiler_gym.datasets.Datasets.__delitem__>`.
:param dataset: The name of the dataset to download, or a :class:`Dataset
<compiler_gym.datasets.Dataset>` instance.
:return: :code:`True` if the dataset was deactivated, else :code:`False` if
already inactive.
"""
del env.datasets[dataset]
return False
@mark_deprecated(
version="0.1.7",
reason=(
"Datasets are now installed automatically, there is no need to call :code:`require()`. "
"`More information <https://github.com/facebookresearch/CompilerGym/issues/45>`_."
),
)
def require(env, dataset: Union[str, Dataset]) -> bool:
"""Deprecated function for managing datasets.
Datasets are now installed automatically. See :class:`env.datasets
<compiler_gym.datasets.Datasets>`.
:param env: The environment that this dataset is required for.
:param dataset: The name of the dataset to download, or a :class:`Dataset
<compiler_gym.datasets.Dataset>` instance.
:return: :code:`True` if the dataset was downloaded, or :code:`False` if the
dataset was already available.
"""
return False