Source code for compiler_gym.envs.gcc.datasets.chstone

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
from pathlib import Path
from typing import Iterable

from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
from compiler_gym.datasets.uri import BenchmarkUri
from compiler_gym.envs.gcc.gcc import Gcc
from compiler_gym.util.decorators import memoized_property
from compiler_gym.util.filesystem import atomic_file_write

# URIs of every benchmark in the CHStone dataset, in sorted order. The final
# path component doubles as the name of the benchmark's source directory inside
# the extracted archive.
# NOTE(review): the entries were missing from this copy of the file; they have
# been restored from the published CHStone program list — confirm against
# upstream before relying on them.
URIS = [
    "benchmark://chstone-v0/adpcm",
    "benchmark://chstone-v0/aes",
    "benchmark://chstone-v0/blowfish",
    "benchmark://chstone-v0/dfadd",
    "benchmark://chstone-v0/dfdiv",
    "benchmark://chstone-v0/dfmul",
    "benchmark://chstone-v0/dfsin",
    "benchmark://chstone-v0/gsm",
    "benchmark://chstone-v0/jpeg",
    "benchmark://chstone-v0/mips",
    "benchmark://chstone-v0/motion",
    "benchmark://chstone-v0/sha",
]

# TODO: This can be merged with the LLVM implementation.
class CHStoneDataset(TarDatasetWithManifest):
    """The CHStone suite of C programs for C-based high-level synthesis.

    The dataset is from:

        Hara, Yuko, Hiroyuki Tomiyama, Shinya Honda, Hiroaki Takada, and Katsuya
        Ishii. "Chstone: A benchmark program suite for practical c-based
        high-level synthesis." In 2008 IEEE International Symposium on Circuits
        and Systems, pp. 1192-1195. IEEE, 2008.

    Benchmarks are served as preprocessed C. The first time a benchmark is
    requested its source file is run through ``gcc -E`` and the output is
    stored next to the source as ``src.c``; later requests reuse that file.
    """

    def __init__(
        self,
        gcc_bin: Path,
        site_data_base: Path,
        sort_order: int = 0,
    ):
        """Create the dataset.

        :param gcc_bin: Passed as ``bin`` to :class:`Gcc` when the compiler is
            first needed for preprocessing.
        :param site_data_base: Forwarded unchanged to the parent dataset.
        :param sort_order: Forwarded unchanged to the parent dataset.
        """
        # NOTE(review): the "Paper" / "Homepage" references and the tar URL are
        # blank strings in this copy of the file; they look like they were
        # stripped during extraction. Restore them from upstream — with an
        # empty tar URL there is nothing to download.
        super().__init__(
            name="benchmark://chstone-v0",
            description="Benchmarks for C-based High-Level Synthesis",
            references={
                "Paper": "",
                "Homepage": "",
            },
            license="Mixture of open source and public domain licenses",
            site_data_base=site_data_base,
            tar_urls=[
                ""
            ],
            tar_sha256="f7acab9d3c3dc7b971e62c8454bc909d84bddb6d0a96378e41beb94231739acb",
            strip_prefix="patmos_HLS-e62d878ceb91e5a18007ca2e0a9602ee44ff7d59/benchmarks/CHStone",
            tar_compression="gz",
            benchmark_file_suffix=".c",
            sort_order=sort_order,
            # We provide our own manifest.
            manifest_urls=[],
            manifest_sha256="",
        )
        self.gcc_bin = gcc_bin

    def benchmark_uris(self) -> Iterable[str]:
        """Yield the URI of every benchmark in the dataset."""
        yield from URIS

    @memoized_property
    def gcc(self):
        """The :class:`Gcc` wrapper used to preprocess benchmark sources."""
        # Defer instantiation of Gcc from the constructor as it will fail if the
        # given Gcc is not available. Memoize the result as initialization is
        # expensive.
        return Gcc(bin=self.gcc_bin)

    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
        """Return the benchmark for ``uri``, preprocessing it if necessary.

        :param uri: The parsed benchmark URI. Its path, minus the leading
            character, selects the benchmark's source directory.
        :return: A benchmark built from the preprocessed ``src.c`` file.
        :raises LookupError: If no preprocessed file exists yet and the
            benchmark's C source file is not found either.
        """
        self.install()

        benchmark_name = uri.path[1:]

        # Most of the source files are named after the parent directory, but not
        # all.
        c_file_name = {
            "blowfish": "bf.c",
            "motion": "mpeg2.c",
            "sha": "sha_driver.c",
            "jpeg": "main.c",
        }.get(benchmark_name, f"{benchmark_name}.c")
        source_dir_path = Path(os.path.normpath(f"{self.dataset_root}/{uri.path}"))
        source_path = source_dir_path / c_file_name
        preprocessed_path = source_dir_path / "src.c"

        # If the file does not exist, preprocess it on-demand.
        if not preprocessed_path.is_file():
            if not source_path.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {source_path})"
                )

            with atomic_file_write(preprocessed_path) as tmp_path:
                # TODO: Send the unprocessed code to the service and have the
                # service preprocess it. Until then, we do it client side with
                # GCC having to be fixed by an environment variable.
                #
                # The output operand of "-o" was missing here, which made the
                # call a syntax error. GCC runs with cwd=source_dir_path, so
                # the output is named by basename only.
                # NOTE(review): assumes atomic_file_write() yields a Path to a
                # temporary file in preprocessed_path's directory — confirm.
                self.gcc(
                    "-E",
                    "-o",
                    tmp_path.name,
                    c_file_name,
                    cwd=source_dir_path,
                    timeout=300,
                )

        return Benchmark.from_file(uri, preprocessed_path)

    @property
    def size(self) -> int:
        """The number of benchmarks in the dataset."""
        return len(URIS)