build caches: collect files to relocate while tarballing w/o file (#48212)

A few changes to tarball creation (for build caches):
- do not run file to distinguish binary from text
- file is slow, even when running it in a batched fashion -- it usually reads all bytes and has slow logic to categorize specific types
- we don't need a highly detailed file categorization; a crude categorization of elf, mach-o, text suffices.
detecting elf and mach-o is straightforward and cheap
- detecting utf-8 (and with that ascii) is highly accurate: the false positive rate decays exponentially as file size increases. Furthermore, utf-8 is not only the most common encoding; utf-8 text is also the most common file type in package prefixes.
iso-8859-1 is cheaply (but heuristically) detected too, and this is sufficiently accurate because binaries and utf-8 files have already been classified by the earlier checks
- remove file as a dependency of Spack in general, which makes Spack itself easier to install
- detect file type and need to relocate as part of creating the tarball, which is more cache friendly and thus faster
This commit is contained in:
Harmen Stoppels 2024-12-24 18:53:13 +01:00 committed by GitHub
parent aca469b329
commit e9cdcc4af0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 249 additions and 411 deletions

View File

@ -161,11 +161,7 @@ jobs:
source share/spack/setup-env.sh
spack -d gpg list
tree $HOME/.spack/bootstrap/store/
- name: Bootstrap File
run: |
source share/spack/setup-env.sh
spack -d python share/spack/qa/bootstrap-file.py
tree $HOME/.spack/bootstrap/store/
windows:
runs-on: "windows-latest"
@ -196,9 +192,3 @@ jobs:
spack -d gpg list
./share/spack/qa/validate_last_exit.ps1
tree $env:userprofile/.spack/bootstrap/store/
- name: Bootstrap File
run: |
./share/spack/setup-env.ps1
spack -d python share/spack/qa/bootstrap-file.py
./share/spack/qa/validate_last_exit.ps1
tree $env:userprofile/.spack/bootstrap/store/

View File

@ -140,7 +140,7 @@ jobs:
- name: Install dependencies
run: |
dnf install -y \
bzip2 curl file gcc-c++ gcc gcc-gfortran git gnupg2 gzip \
bzip2 curl gcc-c++ gcc gcc-gfortran git gnupg2 gzip \
make patch tcl unzip which xz
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
- name: Setup repo and non-root user

View File

@ -35,7 +35,7 @@ A build matrix showing which packages are working on which systems is shown belo
.. code-block:: console
apt update
apt install bzip2 ca-certificates file g++ gcc gfortran git gzip lsb-release patch python3 tar unzip xz-utils zstd
apt install bzip2 ca-certificates g++ gcc gfortran git gzip lsb-release patch python3 tar unzip xz-utils zstd
.. tab-item:: RHEL

View File

@ -8,7 +8,6 @@ unzip, , , Compress/Decompress archives
bzip2, , , Compress/Decompress archives
xz, , , Compress/Decompress archives
zstd, , Optional, Compress/Decompress archives
file, , , Create/Use Buildcaches
lsb-release, , , Linux: identify operating system version
gnupg2, , , Sign/Verify Buildcaches
git, , , Manage Software Repositories

1 Name Supported Versions Notes Requirement Reason
8 bzip2 Compress/Decompress archives
9 xz Compress/Decompress archives
10 zstd Optional Compress/Decompress archives
file Create/Use Buildcaches
11 lsb-release Linux: identify operating system version
12 gnupg2 Sign/Verify Buildcaches
13 git Manage Software Repositories

View File

@ -863,8 +863,10 @@ def elide_list(line_list: List[str], max_num: int = 10) -> List[str]:
if sys.version_info >= (3, 9):
PatternStr = re.Pattern[str]
PatternBytes = re.Pattern[bytes]
else:
PatternStr = typing.Pattern[str]
PatternBytes = typing.Pattern[bytes]
def fnmatch_translate_multiple(named_patterns: Dict[str, str]) -> str:

View File

@ -24,13 +24,12 @@
import urllib.request
import warnings
from contextlib import closing
from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple, Union
from typing import IO, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple, Union
import llnl.util.filesystem as fsys
import llnl.util.lang
import llnl.util.tty as tty
from llnl.util.filesystem import BaseDirectoryVisitor, mkdirp, visit_directory_tree
from llnl.util.symlink import readlink
from llnl.util.filesystem import mkdirp
import spack.caches
import spack.config as config
@ -54,7 +53,6 @@
import spack.util.archive
import spack.util.crypto
import spack.util.file_cache as file_cache
import spack.util.filesystem as ssys
import spack.util.gpg
import spack.util.parallel
import spack.util.path
@ -587,129 +585,11 @@ def read_buildinfo_file(prefix):
return syaml.load(f)
class BuildManifestVisitor(BaseDirectoryVisitor):
"""Visitor that collects a list of files and symlinks
that can be checked for need of relocation. It knows how
to dedupe hardlinks and deal with symlinks to files and
directories."""
def __init__(self):
# Save unique identifiers of hardlinks to avoid relocating them multiple times
self.visited = set()
# Lists of files we will check
self.files = []
self.symlinks = []
def seen_before(self, root, rel_path):
stat_result = os.lstat(os.path.join(root, rel_path))
if stat_result.st_nlink == 1:
return False
identifier = (stat_result.st_dev, stat_result.st_ino)
if identifier in self.visited:
return True
else:
self.visited.add(identifier)
return False
def visit_file(self, root, rel_path, depth):
if self.seen_before(root, rel_path):
return
self.files.append(rel_path)
def visit_symlinked_file(self, root, rel_path, depth):
# Note: symlinks *can* be hardlinked, but it is unclear if
# symlinks can be relinked in-place (preserving inode).
# Therefore, we do *not* de-dupe hardlinked symlinks.
self.symlinks.append(rel_path)
def before_visit_dir(self, root, rel_path, depth):
return os.path.basename(rel_path) not in (".spack", "man")
def before_visit_symlinked_dir(self, root, rel_path, depth):
# Treat symlinked directories simply as symlinks.
self.visit_symlinked_file(root, rel_path, depth)
# Never recurse into symlinked directories.
return False
def file_matches(path, regex):
with open(path, "rb") as f:
contents = f.read()
return bool(regex.search(contents))
def get_buildfile_manifest(spec):
"""
Return a data structure with information about a build, including
text_to_relocate, binary_to_relocate, binary_to_relocate_fullpath
link_to_relocate, and other, which means it doesn't fit any of previous
checks (and should not be relocated). We exclude docs (man) and
metadata (.spack). This can be used to find a particular kind of file
in spack, or to generate the build metadata.
"""
data = {
"text_to_relocate": [],
"binary_to_relocate": [],
"link_to_relocate": [],
"other": [],
"binary_to_relocate_fullpath": [],
"hardlinks_deduped": True,
}
# Guard against filesystem footguns of hardlinks and symlinks by using
# a visitor to retrieve a list of files and symlinks, so we don't have
# to worry about hardlinks of symlinked dirs and what not.
visitor = BuildManifestVisitor()
root = spec.prefix
visit_directory_tree(root, visitor)
# Collect a list of prefixes for this package and it's dependencies, Spack will
# look for them to decide if text file needs to be relocated or not
prefixes = [d.prefix for d in spec.traverse(root=True, deptype="all") if not d.external]
prefixes.append(spack.hooks.sbang.sbang_install_path())
prefixes.append(str(spack.store.STORE.layout.root))
# Create a giant regex that matches all prefixes
regex = utf8_paths_to_single_binary_regex(prefixes)
# Symlinks.
# Obvious bugs:
# 1. relative links are not relocated.
# 2. paths are used as strings.
for rel_path in visitor.symlinks:
abs_path = os.path.join(root, rel_path)
link = readlink(abs_path)
if os.path.isabs(link) and link.startswith(spack.store.STORE.layout.root):
data["link_to_relocate"].append(rel_path)
# Non-symlinks.
for rel_path in visitor.files:
abs_path = os.path.join(root, rel_path)
m_type, m_subtype = ssys.mime_type(abs_path)
if relocate.needs_binary_relocation(m_type, m_subtype):
# Why is this branch not part of needs_binary_relocation? :(
if (
(
m_subtype in ("x-executable", "x-sharedlib", "x-pie-executable")
and sys.platform != "darwin"
)
or (m_subtype in ("x-mach-binary") and sys.platform == "darwin")
or (not rel_path.endswith(".o"))
):
data["binary_to_relocate"].append(rel_path)
data["binary_to_relocate_fullpath"].append(abs_path)
continue
elif relocate.needs_text_relocation(m_type, m_subtype) and file_matches(abs_path, regex):
data["text_to_relocate"].append(rel_path)
continue
data["other"].append(abs_path)
return data
def file_matches(f: IO[bytes], regex: llnl.util.lang.PatternBytes) -> bool:
    """Tell whether ``regex`` matches anywhere in the bytes read from ``f``.

    The stream is read from its current position to the end. It is always rewound to
    offset 0 afterwards, also when reading or searching raises, so the same handle can be
    passed on to the next consumer.

    Args:
        f: readable and seekable binary stream
        regex: compiled bytes pattern to search for

    Returns:
        True if the pattern is found, False otherwise.
    """
    try:
        contents = f.read()
        return regex.search(contents) is not None
    finally:
        f.seek(0)
def deps_to_relocate(spec):
@ -742,17 +622,15 @@ def deps_to_relocate(spec):
def get_buildinfo_dict(spec):
"""Create metadata for a tarball"""
manifest = get_buildfile_manifest(spec)
return {
"sbang_install_path": spack.hooks.sbang.sbang_install_path(),
"buildpath": spack.store.STORE.layout.root,
"spackprefix": spack.paths.prefix,
"relative_prefix": os.path.relpath(spec.prefix, spack.store.STORE.layout.root),
"relocate_textfiles": manifest["text_to_relocate"],
"relocate_binaries": manifest["binary_to_relocate"],
"relocate_links": manifest["link_to_relocate"],
"hardlinks_deduped": manifest["hardlinks_deduped"],
# "relocate_textfiles": [],
# "relocate_binaries": [],
# "relocate_links": [],
"hardlinks_deduped": True,
"hash_to_prefix": {d.dag_hash(): str(d.prefix) for d in deps_to_relocate(spec)},
}
@ -1042,7 +920,55 @@ def generate_key_index(key_prefix: str, tmpdir: str) -> None:
) from e
def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
class FileTypes:
    """Coarse file categories returned by ``file_type``."""

    # BINARY: the file starts with ELF or Mach-O magic bytes.
    # TEXT: the file passed the utf-8 or the iso-8859-1 text check.
    # UNKNOWN: none of the checks matched.
    BINARY, TEXT, UNKNOWN = 0, 1, 2
NOT_ISO8859_1_TEXT = re.compile(b"[\x00\x7F-\x9F]")
def file_type(f: IO[bytes]) -> int:
    """Classify an open binary stream as one of the ``FileTypes`` constants.

    The checks run in this order:

    1. Fewer than 8 bytes available: ``FileTypes.UNKNOWN``.
    2. ELF or Mach-O magic in the first 8 bytes: ``FileTypes.BINARY``.
    3. The whole stream decodes as strict utf-8: ``FileTypes.TEXT``.
    4. No chunk matches ``NOT_ISO8859_1_TEXT``: ``FileTypes.TEXT``.
    5. Otherwise: ``FileTypes.UNKNOWN``.

    The stream is rewound to offset 0 before returning, whatever the outcome.

    Args:
        f: readable and seekable binary stream

    Returns:
        One of ``FileTypes.BINARY``, ``FileTypes.TEXT``, ``FileTypes.UNKNOWN``.
    """
    try:
        # Binaries are recognized by their leading magic bytes only.
        header = f.read(8)
        if len(header) < 8:
            return FileTypes.UNKNOWN
        if relocate.is_elf_magic(header) or relocate.is_macho_magic(header):
            return FileTypes.BINARY

        f.seek(0)

        # utf-8 has a fast exponential decay in false positive rate with file size.
        # Decoding chunk by chunk lets invalid input bail out early.
        decoder = io.TextIOWrapper(f, encoding="utf-8", errors="strict")
        try:
            while decoder.read(1024):
                pass
            return FileTypes.TEXT
        except UnicodeError:
            # Not utf-8: rewind so the next check starts from the beginning.
            decoder.seek(0)
        finally:
            # Hand the underlying stream back without closing it.
            decoder.detach()

        # Every byte value is valid iso-8859-1, so this is a heuristic: treat the file as
        # text unless some chunk contains a byte matched by NOT_ISO8859_1_TEXT.
        while True:
            chunk = f.read(1024)
            if not chunk:
                return FileTypes.TEXT
            if NOT_ISO8859_1_TEXT.search(chunk):
                return FileTypes.UNKNOWN
    finally:
        f.seek(0)
def tarfile_of_spec_prefix(
tar: tarfile.TarFile, prefix: str, prefixes_to_relocate: List[str]
) -> dict:
"""Create a tarfile of an install prefix of a spec. Skips existing buildinfo file.
Args:
@ -1058,6 +984,33 @@ def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
except OSError:
skip = lambda entry: False
binary_regex = utf8_paths_to_single_binary_regex(prefixes_to_relocate)
relocate_binaries = []
relocate_links = []
relocate_textfiles = []
# use callbacks to add files and symlinks, so we can register which files need relocation upon
# extraction.
def add_file(tar: tarfile.TarFile, info: tarfile.TarInfo, path: str):
with open(path, "rb") as f:
relpath = os.path.relpath(path, prefix)
# no need to relocate anything in the .spack directory
if relpath.split(os.sep, 1)[0] == ".spack":
tar.addfile(info, f)
return
f_type = file_type(f)
if f_type == FileTypes.BINARY:
relocate_binaries.append(os.path.relpath(path, prefix))
elif f_type == FileTypes.TEXT and file_matches(f, binary_regex):
relocate_textfiles.append(os.path.relpath(path, prefix))
tar.addfile(info, f)
def add_symlink(tar: tarfile.TarFile, info: tarfile.TarInfo, path: str):
if os.path.isabs(info.linkname) and binary_regex.match(info.linkname.encode("utf-8")):
relocate_links.append(os.path.relpath(path, prefix))
tar.addfile(info)
spack.util.archive.reproducible_tarfile_from_prefix(
tar,
prefix,
@ -1065,29 +1018,51 @@ def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
# used in runtimes like AWS lambda.
include_parent_directories=True,
skip=skip,
add_file=add_file,
add_symlink=add_symlink,
)
return {
"relocate_binaries": relocate_binaries,
"relocate_links": relocate_links,
"relocate_textfiles": relocate_textfiles,
}
def create_tarball(spec: spack.spec.Spec, tarfile_path: str) -> Tuple[str, str]:
    """Write a tarball of the install prefix of ``spec`` to ``tarfile_path``.

    Args:
        spec: spec whose prefix is archived
        tarfile_path: path of the tarball to create

    Returns:
        The checksum of the compressed tarfile and the checksum of the uncompressed tarfile.
    """
    install_prefix = spec.prefix
    metadata = get_buildinfo_dict(spec)
    relocatable_prefixes = prefixes_to_relocate(spec)
    return _do_create_tarball(
        tarfile_path,
        install_prefix,
        buildinfo=metadata,
        prefixes_to_relocate=relocatable_prefixes,
    )
def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
def _do_create_tarball(
tarfile_path: str, prefix: str, buildinfo: dict, prefixes_to_relocate: List[str]
) -> Tuple[str, str]:
with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
tar,
inner_checksum,
outer_checksum,
tar_gz_checksum,
tar_checksum,
):
# Tarball the install prefix
tarfile_of_spec_prefix(tar, binaries_dir)
files_to_relocate = tarfile_of_spec_prefix(tar, prefix, prefixes_to_relocate)
buildinfo.update(files_to_relocate)
# Serialize buildinfo for the tarball
bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
tarinfo = tarfile.TarInfo(
name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
name=spack.util.archive.default_path_to_name(buildinfo_file_name(prefix))
)
tarinfo.type = tarfile.REGTYPE
tarinfo.size = len(bstring)
tarinfo.mode = 0o644
tar.addfile(tarinfo, io.BytesIO(bstring))
return inner_checksum.hexdigest(), outer_checksum.hexdigest()
return tar_gz_checksum.hexdigest(), tar_checksum.hexdigest()
class ExistsInBuildcache(NamedTuple):
@ -1137,6 +1112,13 @@ def _exists_in_buildcache(spec: spack.spec.Spec, tmpdir: str, out_url: str) -> E
return ExistsInBuildcache(signed, unsigned, tarball)
def prefixes_to_relocate(spec):
    """Return the list of prefixes to look for when deciding what needs relocation.

    The list holds the prefix of every spec yielded by ``deps_to_relocate(spec)``, followed
    by the sbang install path and the root of the store layout (as a string).
    """
    return [
        *(dep.prefix for dep in deps_to_relocate(spec)),
        spack.hooks.sbang.sbang_install_path(),
        str(spack.store.STORE.layout.root),
    ]
def _url_upload_tarball_and_specfile(
spec: spack.spec.Spec,
tmpdir: str,
@ -1146,7 +1128,7 @@ def _url_upload_tarball_and_specfile(
):
files = BuildcacheFiles(spec, tmpdir, out_url)
tarball = files.local_tarball()
checksum, _ = _do_create_tarball(tarball, spec.prefix, get_buildinfo_dict(spec))
checksum, _ = create_tarball(spec, tarball)
spec_dict = spec.to_dict(hash=ht.dag_hash)
spec_dict["buildcache_layout_version"] = CURRENT_BUILD_CACHE_LAYOUT_VERSION
spec_dict["binary_cache_checksum"] = {"hash_algorithm": "sha256", "hash": checksum}
@ -1470,13 +1452,11 @@ def _oci_push_pkg_blob(
filename = os.path.join(tmpdir, f"{spec.dag_hash()}.tar.gz")
# Create an oci.image.layer aka tarball of the package
compressed_tarfile_checksum, tarfile_checksum = _do_create_tarball(
filename, spec.prefix, get_buildinfo_dict(spec)
)
tar_gz_checksum, tar_checksum = create_tarball(spec, filename)
blob = spack.oci.oci.Blob(
Digest.from_sha256(compressed_tarfile_checksum),
Digest.from_sha256(tarfile_checksum),
Digest.from_sha256(tar_gz_checksum),
Digest.from_sha256(tar_checksum),
os.path.getsize(filename),
)
@ -2435,6 +2415,14 @@ def _tar_strip_component(tar: tarfile.TarFile, prefix: str):
yield m
def extract_buildcache_tarball(tarfile_path: str, destination: str) -> None:
    """Extract the build cache tarball at ``tarfile_path`` into ``destination``.

    Args:
        tarfile_path: path of the tarball to read
        destination: directory the entries are extracted into
    """
    with closing(tarfile.open(tarfile_path, "r")) as tar:
        # Drop the prefix shared by all entries, so they land directly in the destination.
        shared_prefix = _ensure_common_prefix(tar)
        stripped_members = _tar_strip_component(tar, prefix=shared_prefix)
        tar.extractall(path=destination, members=stripped_members)
def extract_tarball(spec, download_result, force=False, timer=timer.NULL_TIMER):
"""
extract binary tarball for given package into install area
@ -2504,12 +2492,7 @@ def extract_tarball(spec, download_result, force=False, timer=timer.NULL_TIMER):
tarfile_path, size, contents, "sha256", expected, local_checksum
)
try:
with closing(tarfile.open(tarfile_path, "r")) as tar:
# Remove install prefix from tarfil to extract directly into spec.prefix
tar.extractall(
path=spec.prefix,
members=_tar_strip_component(tar, prefix=_ensure_common_prefix(tar)),
)
extract_buildcache_tarball(tarfile_path, destination=spec.prefix)
except Exception:
shutil.rmtree(spec.prefix, ignore_errors=True)
_delete_staged_downloads(download_result)

View File

@ -9,7 +9,6 @@
all_core_root_specs,
ensure_clingo_importable_or_raise,
ensure_core_dependencies,
ensure_file_in_path_or_raise,
ensure_gpg_in_path_or_raise,
ensure_patchelf_in_path_or_raise,
)
@ -20,7 +19,6 @@
"is_bootstrapping",
"ensure_bootstrap_configuration",
"ensure_core_dependencies",
"ensure_file_in_path_or_raise",
"ensure_gpg_in_path_or_raise",
"ensure_clingo_importable_or_raise",
"ensure_patchelf_in_path_or_raise",

View File

@ -481,19 +481,6 @@ def ensure_gpg_in_path_or_raise() -> None:
)
def file_root_spec() -> str:
"""Return the root spec used to bootstrap file"""
root_spec_name = "win-file" if IS_WINDOWS else "file"
return _root_spec(root_spec_name)
def ensure_file_in_path_or_raise() -> None:
"""Ensure file is in the PATH or raise"""
return ensure_executables_in_path_or_raise(
executables=["file"], abstract_spec=file_root_spec()
)
def patchelf_root_spec() -> str:
"""Return the root spec used to bootstrap patchelf"""
# 0.13.1 is the last version not to require C++17.
@ -577,15 +564,13 @@ def ensure_core_dependencies() -> None:
"""Ensure the presence of all the core dependencies."""
if sys.platform.lower() == "linux":
ensure_patchelf_in_path_or_raise()
elif sys.platform == "win32":
ensure_file_in_path_or_raise()
ensure_gpg_in_path_or_raise()
ensure_clingo_importable_or_raise()
def all_core_root_specs() -> List[str]:
"""Return a list of all the core root specs that may be used to bootstrap Spack"""
return [clingo_root_spec(), gnupg_root_spec(), patchelf_root_spec(), file_root_spec()]
return [clingo_root_spec(), gnupg_root_spec(), patchelf_root_spec()]
def bootstrapping_sources(scope: Optional[str] = None):

View File

@ -3,8 +3,8 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""Query the status of bootstrapping on this machine"""
import platform
from typing import List, Optional, Sequence, Tuple, Union
import sys
from typing import Dict, List, Optional, Sequence, Tuple, Union
import spack.util.executable
@ -72,7 +72,7 @@ def _core_requirements() -> List[RequiredResponseType]:
"bzip2": _missing("bzip2", "required to compress/decompress code archives"),
"git": _missing("git", "required to fetch/manage git repositories"),
}
if platform.system().lower() == "linux":
if sys.platform == "linux":
_core_system_exes["xz"] = _missing("xz", "required to compress/decompress code archives")
# Executables that are not bootstrapped yet
@ -87,17 +87,16 @@ def _core_requirements() -> List[RequiredResponseType]:
def _buildcache_requirements() -> List[RequiredResponseType]:
_buildcache_exes = {
"file": _missing("file", "required to analyze files for buildcaches", system_only=False),
("gpg2", "gpg"): _missing("gpg2", "required to sign/verify buildcaches", False),
_buildcache_exes: Dict[ExecutablesType, str] = {
("gpg2", "gpg"): _missing("gpg2", "required to sign/verify buildcaches", False)
}
if platform.system().lower() == "darwin":
if sys.platform == "darwin":
_buildcache_exes["otool"] = _missing("otool", "required to relocate binaries")
# Executables that are not bootstrapped yet
result = [_required_system_executable(exe, msg) for exe, msg in _buildcache_exes.items()]
if platform.system().lower() == "linux":
if sys.platform == "linux":
result.append(
_required_executable(
"patchelf",

View File

@ -103,7 +103,7 @@
from spack.phase_callbacks import run_after, run_before
from spack.spec import InvalidSpecDetected, Spec
from spack.util.executable import *
from spack.util.filesystem import file_command, fix_darwin_install_name, mime_type
from spack.util.filesystem import fix_darwin_install_name
from spack.variant import any_combination_of, auto_or_any_combination_of, disjoint_sets
from spack.version import Version, ver

View File

@ -23,7 +23,6 @@
import spack.store
import spack.util.elf as elf
import spack.util.executable as executable
import spack.util.filesystem as ssys
from .relocate_text import BinaryFilePrefixReplacer, TextFilePrefixReplacer
@ -350,32 +349,6 @@ def _set_elf_rpaths_and_interpreter(
return None
def needs_binary_relocation(m_type, m_subtype):
"""Returns True if the file with MIME type/subtype passed as arguments
needs binary relocation, False otherwise.
Args:
m_type (str): MIME type of the file
m_subtype (str): MIME subtype of the file
"""
subtypes = ("x-executable", "x-sharedlib", "x-mach-binary", "x-pie-executable")
if m_type == "application":
if m_subtype in subtypes:
return True
return False
def needs_text_relocation(m_type, m_subtype):
"""Returns True if the file with MIME type/subtype passed as arguments
needs text relocation, False otherwise.
Args:
m_type (str): MIME type of the file
m_subtype (str): MIME subtype of the file
"""
return m_type == "text"
def relocate_macho_binaries(
path_names, old_layout_root, new_layout_root, prefix_to_prefix, rel, old_prefix, new_prefix
):
@ -623,24 +596,32 @@ def relocate_text_bin(binaries, prefixes):
return BinaryFilePrefixReplacer.from_strings_or_bytes(prefixes).apply(binaries)
def is_binary(filename):
"""Returns true if a file is binary, False otherwise
def is_macho_magic(magic: bytes) -> bool:
    """Return True if ``magic`` starts with a Mach-O magic number.

    Universal binaries share the 0xcafebabe magic with JVM class files. In Mach-O the next
    4 bytes hold the number of binaries; in JVM class files they hold the java version
    number. The input is accepted only when that number is below 10, on the assumption
    that a universal binary holds fewer than 10 binaries. Pass at least 8 bytes: if bytes
    4-8 are missing the number decodes to 0 and the input is accepted.

    Args:
        magic: leading bytes of the file
    """
    # In order of popularity: 64-bit mach-o le/be, 32-bit mach-o le/be.
    thin_magics = (
        b"\xCF\xFA\xED\xFE",
        b"\xFE\xED\xFA\xCF",
        b"\xCE\xFA\xED\xFE",
        b"\xFE\xED\xFA\xCE",
    )
    if magic.startswith(thin_magics):
        return True

    # universal binaries: 0xcafebabe be (most common?) or 0xbebafeca le (not sure if exists).
    if magic.startswith(b"\xCA\xFE\xBA\xBE"):
        return int.from_bytes(magic[4:8], "big") < 10
    if magic.startswith(b"\xBE\xBA\xFE\xCA"):
        return int.from_bytes(magic[4:8], "little") < 10
    return False
Args:
filename: file to be tested
Returns:
True or False
"""
m_type, _ = ssys.mime_type(filename)
def is_elf_magic(magic: bytes) -> bool:
    """Return True if ``magic`` begins with the 4-byte ELF signature ``\\x7fELF``."""
    return magic[:4] == b"\x7FELF"
msg = "[{0}] -> ".format(filename)
if m_type == "application":
tty.debug(msg + "BINARY FILE")
return True
tty.debug(msg + "TEXT FILE")
return False
def is_binary(filename: str) -> bool:
    """Return True if the file starts with ELF or Mach-O magic bytes.

    Only the first 8 bytes of the file are read.

    Args:
        filename: path of the file to test
    """
    with open(filename, "rb") as stream:
        header = stream.read(8)
    if is_elf_magic(header):
        return True
    return is_macho_magic(header)
# Memoize this due to repeated calls to libraries in the same directory.
@ -649,6 +630,14 @@ def _exists_dir(dirname):
return os.path.isdir(dirname)
def is_macho_binary(path):
    """Return True if the file at ``path`` starts with a Mach-O magic number.

    Eight bytes are read rather than four. For the 0xcafebabe / 0xbebafeca universal-binary
    magics, ``is_macho_magic`` inspects bytes 4-8 to tell Mach-O from JVM class files. With
    only 4 bytes that slice is empty and decodes to 0, so every JVM class file would be
    reported as Mach-O.

    Args:
        path: path of the file to test

    Returns:
        True for a Mach-O magic, False otherwise, including when the file cannot be opened
        or read (``OSError``).
    """
    try:
        with open(path, "rb") as f:
            return is_macho_magic(f.read(8))
    except OSError:
        return False
def fixup_macos_rpath(root, filename):
"""Apply rpath fixups to the given file.
@ -660,7 +649,8 @@ def fixup_macos_rpath(root, filename):
True if fixups were applied, else False
"""
abspath = os.path.join(root, filename)
if ssys.mime_type(abspath) != ("application", "x-mach-binary"):
if not is_macho_binary(abspath):
return False
# Get Mach-O header commands

View File

@ -4,12 +4,8 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import os
import re
import shutil
import tempfile
from collections import OrderedDict
from llnl.util.symlink import readlink, symlink
import spack.binary_distribution as bindist
import spack.deptypes as dt
@ -20,19 +16,6 @@
import spack.store
def _relocate_spliced_links(links, orig_prefix, new_prefix):
"""Re-linking function which differs from `relocate.relocate_links` by
reading the old link rather than the new link, since the latter wasn't moved
in our case. This still needs to be called after the copy to destination
because it expects the new directory structure to be in place."""
for link in links:
link_target = readlink(os.path.join(orig_prefix, link))
link_target = re.sub("^" + orig_prefix, new_prefix, link_target)
new_link_path = os.path.join(new_prefix, link)
os.unlink(new_link_path)
symlink(link_target, new_link_path)
def rewire(spliced_spec):
"""Given a spliced spec, this function conducts all the rewiring on all
nodes in the DAG of that spec."""
@ -54,13 +37,17 @@ def rewire_node(spec, explicit):
the splice. The resulting package is then 'installed.'"""
tempdir = tempfile.mkdtemp()
# copy anything installed to a temporary directory
shutil.copytree(spec.build_spec.prefix, os.path.join(tempdir, spec.dag_hash()))
# Copy spec.build_spec.prefix to spec.prefix through a temporary tarball
tarball = os.path.join(tempdir, f"{spec.dag_hash()}.tar.gz")
bindist.create_tarball(spec.build_spec, tarball)
spack.hooks.pre_install(spec)
bindist.extract_buildcache_tarball(tarball, destination=spec.prefix)
buildinfo = bindist.read_buildinfo_file(spec.prefix)
# compute prefix-to-prefix for every node from the build spec to the spliced
# spec
prefix_to_prefix = OrderedDict({spec.build_spec.prefix: spec.prefix})
prefix_to_prefix = {spec.build_spec.prefix: spec.prefix}
build_spec_ids = set(id(s) for s in spec.build_spec.traverse(deptype=dt.ALL & ~dt.BUILD))
for s in bindist.deps_to_relocate(spec):
analog = s
@ -77,19 +64,17 @@ def rewire_node(spec, explicit):
prefix_to_prefix[analog.prefix] = s.prefix
manifest = bindist.get_buildfile_manifest(spec.build_spec)
platform = spack.platforms.by_name(spec.platform)
text_to_relocate = [
os.path.join(tempdir, spec.dag_hash(), rel_path)
for rel_path in manifest.get("text_to_relocate", [])
os.path.join(spec.prefix, rel_path) for rel_path in buildinfo["relocate_textfiles"]
]
if text_to_relocate:
relocate.relocate_text(files=text_to_relocate, prefixes=prefix_to_prefix)
links = [os.path.join(spec.prefix, f) for f in buildinfo["relocate_links"]]
relocate.relocate_links(links, prefix_to_prefix)
bins_to_relocate = [
os.path.join(tempdir, spec.dag_hash(), rel_path)
for rel_path in manifest.get("binary_to_relocate", [])
os.path.join(spec.prefix, rel_path) for rel_path in buildinfo["relocate_binaries"]
]
if bins_to_relocate:
if "macho" in platform.binary_formats:
@ -113,22 +98,18 @@ def rewire_node(spec, explicit):
spec.prefix,
)
relocate.relocate_text_bin(binaries=bins_to_relocate, prefixes=prefix_to_prefix)
# Copy package into place, except for spec.json (because spec.json
# describes the old spec and not the new spliced spec).
shutil.copytree(
os.path.join(tempdir, spec.dag_hash()),
spec.prefix,
ignore=shutil.ignore_patterns("spec.json", "install_manifest.json"),
)
if manifest.get("link_to_relocate"):
_relocate_spliced_links(
manifest.get("link_to_relocate"), spec.build_spec.prefix, spec.prefix
)
shutil.rmtree(tempdir)
# Above, we did not copy spec.json: instead, here we write the new
# (spliced) spec into spec.json, without this, Database.add would fail on
# the next line (because it checks the spec.json in the prefix against the
# spec being added to look for mismatches)
install_manifest = os.path.join(
spec.prefix,
spack.store.STORE.layout.metadata_dir,
spack.store.STORE.layout.manifest_file_name,
)
try:
os.unlink(install_manifest)
except FileNotFoundError:
pass
# Write the spliced spec into spec.json. Without this, Database.add would fail because it
# checks the spec.json in the prefix against the spec being added to look for mismatches
spack.store.STORE.layout.write_spec(spec, spack.store.STORE.layout.spec_file_path(spec))
# add to database, not sure about explicit
spack.store.STORE.db.add(spec, explicit=explicit)

View File

@ -23,7 +23,7 @@
import archspec.cpu
from llnl.util.filesystem import copy_tree, join_path, visit_directory_tree
from llnl.util.filesystem import copy_tree, join_path
from llnl.util.symlink import readlink
import spack.binary_distribution as bindist
@ -43,7 +43,7 @@
import spack.util.spack_yaml as syaml
import spack.util.url as url_util
import spack.util.web as web_util
from spack.binary_distribution import CannotListKeys, GenerateIndexError, get_buildfile_manifest
from spack.binary_distribution import CannotListKeys, GenerateIndexError
from spack.directory_layout import DirectoryLayout
from spack.paths import test_path
from spack.spec import Spec
@ -623,60 +623,21 @@ def test_FetchCacheError_pretty_printing_single():
assert str_e.rstrip() == str_e
def test_build_manifest_visitor(tmpdir):
dir = "directory"
file = os.path.join("directory", "file")
with tmpdir.as_cwd():
# Create a file inside a directory
os.mkdir(dir)
with open(file, "wb") as f:
f.write(b"example file")
# Symlink the dir
os.symlink(dir, "symlink_to_directory")
# Symlink the file
os.symlink(file, "symlink_to_file")
# Hardlink the file
os.link(file, "hardlink_of_file")
# Hardlinked symlinks: seems like this is only a thing on Linux,
# on Darwin the symlink *target* is hardlinked, on Linux the
# symlink *itself* is hardlinked.
if sys.platform.startswith("linux"):
os.link("symlink_to_file", "hardlink_of_symlink_to_file")
os.link("symlink_to_directory", "hardlink_of_symlink_to_directory")
visitor = bindist.BuildManifestVisitor()
visit_directory_tree(str(tmpdir), visitor)
# We de-dupe hardlinks of files, so there should really be just one file
assert len(visitor.files) == 1
# We do not de-dupe symlinks, cause it's unclear how to update symlinks
# in-place, preserving inodes.
if sys.platform.startswith("linux"):
assert len(visitor.symlinks) == 4 # includes hardlinks of symlinks.
else:
assert len(visitor.symlinks) == 2
with tmpdir.as_cwd():
assert not any(os.path.islink(f) or os.path.isdir(f) for f in visitor.files)
assert all(os.path.islink(f) for f in visitor.symlinks)
def test_text_relocate_if_needed(install_mockery, temporary_store, mock_fetch, monkeypatch, capfd):
def test_text_relocate_if_needed(install_mockery, temporary_store, mock_fetch, tmp_path):
install_cmd("needs-text-relocation")
spec = temporary_store.db.query_one("needs-text-relocation")
tgz_path = tmp_path / "relocatable.tar.gz"
bindist.create_tarball(spec, str(tgz_path))
specs = temporary_store.db.query("needs-text-relocation")
assert len(specs) == 1
manifest = get_buildfile_manifest(specs[0])
# extract the .spack/binary_distribution file
with tarfile.open(tgz_path) as tar:
entry_name = next(x for x in tar.getnames() if x.endswith(".spack/binary_distribution"))
bd_file = tar.extractfile(entry_name)
manifest = syaml.load(bd_file)
assert join_path("bin", "exe") in manifest["text_to_relocate"]
assert join_path("bin", "otherexe") not in manifest["text_to_relocate"]
assert join_path("bin", "secretexe") not in manifest["text_to_relocate"]
assert join_path("bin", "exe") in manifest["relocate_textfiles"]
assert join_path("bin", "otherexe") not in manifest["relocate_textfiles"]
assert join_path("bin", "secretexe") not in manifest["relocate_textfiles"]
def test_etag_fetching_304():
@ -917,7 +878,7 @@ def test_tarball_doesnt_include_buildinfo_twice(tmp_path: Path):
tarball = str(tmp_path / "prefix.tar.gz")
bindist._do_create_tarball(
tarfile_path=tarball, binaries_dir=str(p), buildinfo={"metadata": "new"}
tarfile_path=tarball, prefix=str(p), buildinfo={"metadata": "new"}, prefixes_to_relocate=[]
)
expected_prefix = str(p).lstrip("/")
@ -926,7 +887,10 @@ def test_tarball_doesnt_include_buildinfo_twice(tmp_path: Path):
# and that the tarball contains the new one, not the old one.
with tarfile.open(tarball) as tar:
assert syaml.load(tar.extractfile(f"{expected_prefix}/.spack/binary_distribution")) == {
"metadata": "new"
"metadata": "new",
"relocate_binaries": [],
"relocate_textfiles": [],
"relocate_links": [],
}
assert tar.getnames() == [
*_all_parents(expected_prefix),
@ -951,11 +915,15 @@ def test_reproducible_tarball_is_reproducible(tmp_path: Path):
# Create a tarball with a certain mtime of bin/app
os.utime(app, times=(0, 0))
bindist._do_create_tarball(tarball_1, binaries_dir=str(p), buildinfo=buildinfo)
bindist._do_create_tarball(
tarball_1, prefix=str(p), buildinfo=buildinfo, prefixes_to_relocate=[]
)
# Do it another time with different mtime of bin/app
os.utime(app, times=(10, 10))
bindist._do_create_tarball(tarball_2, binaries_dir=str(p), buildinfo=buildinfo)
bindist._do_create_tarball(
tarball_2, prefix=str(p), buildinfo=buildinfo, prefixes_to_relocate=[]
)
# They should be bitwise identical:
assert filecmp.cmp(tarball_1, tarball_2, shallow=False)
@ -1001,7 +969,7 @@ def test_tarball_normalized_permissions(tmpdir):
) as f:
f.write("hello world")
bindist._do_create_tarball(tarball, binaries_dir=p.strpath, buildinfo={})
bindist._do_create_tarball(tarball, prefix=p.strpath, buildinfo={}, prefixes_to_relocate=[])
expected_prefix = p.strpath.lstrip("/")
@ -1120,7 +1088,7 @@ def test_tarfile_of_spec_prefix(tmpdir):
file = tmpdir.join("example.tar")
with tarfile.open(file, mode="w") as tar:
bindist.tarfile_of_spec_prefix(tar, prefix.strpath)
bindist.tarfile_of_spec_prefix(tar, prefix.strpath, prefixes_to_relocate=[])
expected_prefix = prefix.strpath.lstrip("/")

View File

@ -36,8 +36,6 @@
macho_find_paths,
macho_make_paths_normal,
macho_make_paths_relative,
needs_binary_relocation,
needs_text_relocation,
relocate_links,
relocate_text,
)
@ -193,16 +191,6 @@ def test_relocate_links(tmpdir):
assert readlink("to_self_but_relative") == "relative"
def test_needs_relocation():
    """Check which (type, subtype) pairs are flagged for binary or text relocation."""
    # Shared libraries and executables need binary relocation; generic octet
    # streams and text do not.
    assert needs_binary_relocation("application", "x-sharedlib")
    assert needs_binary_relocation("application", "x-executable")
    assert not needs_binary_relocation("application", "x-octet-stream")
    assert not needs_binary_relocation("text", "x-")
    # Text needs text relocation, a symbolic link does not.
    assert needs_text_relocation("text", "x-")
    assert not needs_text_relocation("symbolic link to", "x-")
    # Mach-O binaries are relocated as binaries as well.
    assert needs_binary_relocation("application", "x-mach-binary")
def test_replace_paths(tmpdir):
with tmpdir.as_cwd():
suffix = "dylib" if platform.system().lower() == "darwin" else "so"

View File

@ -10,7 +10,7 @@
import tarfile
from contextlib import closing, contextmanager
from gzip import GzipFile
from typing import Callable, Dict, Tuple
from typing import Callable, Dict, List, Tuple
from llnl.util.symlink import readlink
@ -130,6 +130,15 @@ def default_path_to_name(path: str) -> str:
return pathlib.PurePath(*p.parts[1:]).as_posix() if p.is_absolute() else p.as_posix()
def default_add_file(tar: tarfile.TarFile, file_info: tarfile.TarInfo, path: str) -> None:
    """Add a regular file to ``tar``, reading its contents from ``path``.

    Args:
        tar: tarfile, opened for writing, that receives the entry
        file_info: header of the entry to add; ``TarFile.addfile`` copies ``file_info.size``
            bytes from the opened file, so the caller must have set the size
        path: filesystem path of the file whose contents are stored
    """
    with open(path, "rb") as f:
        tar.addfile(file_info, f)
def default_add_link(tar: tarfile.TarFile, file_info: tarfile.TarInfo, path: str) -> None:
    """Add a link entry to ``tar`` from its header alone.

    Args:
        tar: tarfile, opened for writing, that receives the entry
        file_info: header describing the link
        path: filesystem path of the link; unused here, but part of the signature so this
            function has the same shape as ``default_add_file``
    """
    # No file object is passed: only the header is written for this entry.
    tar.addfile(file_info)
def reproducible_tarfile_from_prefix(
tar: tarfile.TarFile,
prefix: str,
@ -137,6 +146,9 @@ def reproducible_tarfile_from_prefix(
include_parent_directories: bool = False,
skip: Callable[[os.DirEntry], bool] = lambda entry: False,
path_to_name: Callable[[str], str] = default_path_to_name,
add_file: Callable[[tarfile.TarFile, tarfile.TarInfo, str], None] = default_add_file,
add_symlink: Callable[[tarfile.TarFile, tarfile.TarInfo, str], None] = default_add_link,
add_hardlink: Callable[[tarfile.TarFile, tarfile.TarInfo, str], None] = default_add_link,
) -> None:
"""Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
@ -170,8 +182,10 @@ def reproducible_tarfile_from_prefix(
tar.addfile(dir_info)
dir_stack = [prefix]
new_dirs: List[str] = []
while dir_stack:
dir = dir_stack.pop()
new_dirs.clear()
# Add the dir before its contents
dir_info = tarfile.TarInfo(path_to_name(dir))
@ -183,7 +197,6 @@ def reproducible_tarfile_from_prefix(
with os.scandir(dir) as it:
entries = sorted(it, key=lambda entry: entry.name)
new_dirs = []
for entry in entries:
if skip(entry):
continue
@ -201,7 +214,7 @@ def reproducible_tarfile_from_prefix(
# st_mode field of the stat structure is unspecified." So we set it to
# something sensible without lstat'ing the link.
file_info.mode = 0o755
tar.addfile(file_info)
add_symlink(tar, file_info, entry.path)
elif entry.is_file(follow_symlinks=False):
# entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
@ -216,15 +229,13 @@ def reproducible_tarfile_from_prefix(
if ident in hardlink_to_tarinfo_name:
file_info.type = tarfile.LNKTYPE
file_info.linkname = hardlink_to_tarinfo_name[ident]
tar.addfile(file_info)
add_hardlink(tar, file_info, entry.path)
continue
hardlink_to_tarinfo_name[ident] = file_info.name
# If file not yet seen, copy it
file_info.type = tarfile.REGTYPE
file_info.size = s.st_size
with open(entry.path, "rb") as f:
tar.addfile(file_info, f)
add_file(tar, file_info, entry.path)
dir_stack.extend(reversed(new_dirs)) # we pop, so reverse to stay alphabetical

View File

@ -10,62 +10,10 @@
import glob
import os
import sys
from llnl.util import tty
from llnl.util.filesystem import edit_in_place_through_temporary_file
from llnl.util.lang import memoized
from spack.util.executable import Executable, which
def _ensure_file_on_win():
"""Ensures the file command is available on Windows
If not, it is bootstrapped.
No-op on all other platforms"""
if sys.platform != "win32":
return
import spack.bootstrap
with spack.bootstrap.ensure_bootstrap_configuration():
spack.bootstrap.ensure_file_in_path_or_raise()
@memoized
def file_command(*args):
    """Create an entry point to the ``file`` system command with the provided arguments.

    Args:
        *args: arguments registered as default arguments on the returned command

    Returns:
        The object returned by ``which("file", required=True)``, after every element of
        ``args`` has been added to it with ``add_default_arg``.
    """
    # NOTE(review): ``memoized`` presumably caches the result per distinct ``args`` -- confirm.
    _ensure_file_on_win()
    file_cmd = which("file", required=True)
    for arg in args:
        file_cmd.add_default_arg(arg)
    return file_cmd
@memoized
def _get_mime_type():
    """Build the ``file`` system command used to acquire the MIME type of a specified path.

    Returns:
        The command produced by ``file_command`` with ``-b`` and ``--mime-type`` set, plus
        ``-h`` on every platform except Windows.
    """
    if sys.platform == "win32":
        # -h option (no-dereference) does not exist in Windows
        return file_command("-b", "--mime-type")
    return file_command("-b", "-h", "--mime-type")
def mime_type(filename):
    """Return the MIME type and subtype of a file.

    Args:
        filename: file to be analyzed

    Returns:
        Tuple containing the MIME type and subtype
    """
    output = _get_mime_type()(filename, output=str, error=str).strip()
    tty.debug("==> " + output)
    # Split on the first "/" only; the local is not called ``type`` so that the builtin
    # stays usable inside this function.
    main_type, _, subtype = output.partition("/")
    return main_type, subtype
from spack.util.executable import Executable
def fix_darwin_install_name(path):

View File

@ -1,4 +0,0 @@
from spack.util.filesystem import file_command
if __name__ == "__main__":
    # The returned command is discarded: the call is made only for its effect.
    # NOTE(review): presumably this exists to make sure ``file`` can be located (and
    # bootstrapped where needed) -- confirm against file_command's implementation.
    file_command()