Create reproducible tarballs in VCSFetchStrategy.archive (#42042)
Currently, when you repeatedly create a bootstrap mirror that includes `clingo-bootstrap@spack`, you get a different tarball every time. This is a general problem with mirroring checkouts from version control as tarballs. I think it's best to create the tarballs ourselves, since that way we have more control over their contents. This PR ensures normalized tarballs, like we do for build caches:

- normalize file permissions (in fact that was already inspired by git, so should be good)
- normalize file creation/modification time (timestamp 0)
- uid / gid = 0, no user or group names
- normalize the gzip header
- order dir entries by `(is_dir, name)`, where string comparison is not locale aware ;)
- POSIX says st_mode of symlinks is unspecified, so work around it and force mode to `0o755`
This commit is contained in:
parent c05ed2c31a · commit 28675478ce
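For reference, the gzip header normalization is what makes the compressed layer deterministic. A minimal standalone sketch using only the Python standard library (file and directory names here are illustrative, not part of this PR):

    import gzip
    import tarfile

    # filename="" keeps the input file name out of the gzip header, and mtime=0
    # pins the header timestamp; together this mimics `gzip --no-name`, so
    # identical tar bytes always compress to identical .tar.gz bytes.
    with open("out.tar.gz", "wb") as f:
        with gzip.GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=f) as gz:
            with tarfile.TarFile(name="", mode="w", fileobj=gz) as tar:
                tar.add("some/dir")  # the PR additionally normalizes every TarInfo entry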
@@ -5,7 +5,6 @@
 
 import codecs
 import collections
-import errno
 import hashlib
 import io
 import itertools
@@ -23,8 +22,7 @@
 import urllib.parse
 import urllib.request
 import warnings
-from contextlib import closing, contextmanager
-from gzip import GzipFile
+from contextlib import closing
 from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
 from urllib.error import HTTPError, URLError
 
@@ -50,6 +48,7 @@
 import spack.stage
 import spack.store
 import spack.traverse as traverse
+import spack.util.archive
 import spack.util.crypto
 import spack.util.file_cache as file_cache
 import spack.util.gpg
@@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None):
         shutil.rmtree(tmpdir)
 
 
-@contextmanager
-def gzip_compressed_tarfile(path):
-    """Create a reproducible, compressed tarfile"""
-    # Create gzip compressed tarball of the install prefix
-    # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
-    #    If the filename="" is dropped, Python will use fileobj.name instead.
-    #    This should effectively mimick `gzip --no-name`.
-    # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
-    #    compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
-    #    compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
-    # So we follow gzip.
-    with open(path, "wb") as f, ChecksumWriter(f) as inner_checksum, closing(
-        GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum)
-    ) as gzip_file, ChecksumWriter(gzip_file) as outer_checksum, tarfile.TarFile(
-        name="", mode="w", fileobj=outer_checksum
-    ) as tar:
-        yield tar, inner_checksum, outer_checksum
-
-
-def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str:
-    """Compute tarfile entry name as the relative path from the (system) root."""
-    return _path(*_path(absolute_path).parts[1:]).as_posix()
-
-
 def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
     """Create a tarfile of an install prefix of a spec. Skips existing buildinfo file.
-    Only adds regular files, symlinks and dirs. Skips devices, fifos. Preserves hardlinks.
-    Normalizes permissions like git. Tar entries are added in depth-first pre-order, with
-    dir entries partitioned by file | dir, and sorted alphabetically, for reproducibility.
-    Partitioning ensures only one dir is in memory at a time, and sorting improves compression.
 
     Args:
         tar: tarfile object to add files to
         prefix: absolute install prefix of spec"""
     if not os.path.isabs(prefix) or not os.path.isdir(prefix):
         raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory")
-    hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
     stat_key = lambda stat: (stat.st_dev, stat.st_ino)
 
     try:  # skip buildinfo file if it exists
         files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))]
+        skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip
     except OSError:
-        files_to_skip = []
+        skip = lambda entry: False
 
-    # First add all directories leading up to `prefix` (Spack <= 0.21 did not do this, leading to
-    # issues when tarballs are used in runtimes like AWS lambda). Skip the file system root.
-    parent_dirs = reversed(pathlib.Path(prefix).parents)
-    next(parent_dirs)  # skip the root: slices are supported from python 3.10
-    for parent_dir in parent_dirs:
-        dir_info = tarfile.TarInfo(_tarinfo_name(str(parent_dir)))
-        dir_info.type = tarfile.DIRTYPE
-        dir_info.mode = 0o755
-        tar.addfile(dir_info)
-
-    dir_stack = [prefix]
-    while dir_stack:
-        dir = dir_stack.pop()
-
-        # Add the dir before its contents
-        dir_info = tarfile.TarInfo(_tarinfo_name(dir))
-        dir_info.type = tarfile.DIRTYPE
-        dir_info.mode = 0o755
-        tar.addfile(dir_info)
-
-        # Sort by name: reproducible & improves compression
-        with os.scandir(dir) as it:
-            entries = sorted(it, key=lambda entry: entry.name)
-
-        new_dirs = []
-        for entry in entries:
-            if entry.is_dir(follow_symlinks=False):
-                new_dirs.append(entry.path)
-                continue
-
-            file_info = tarfile.TarInfo(_tarinfo_name(entry.path))
-
-            s = entry.stat(follow_symlinks=False)
-
-            # Skip existing binary distribution files.
-            id = stat_key(s)
-            if id in files_to_skip:
-                continue
-
-            # Normalize the mode
-            file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755
-
-            if entry.is_symlink():
-                file_info.type = tarfile.SYMTYPE
-                file_info.linkname = os.readlink(entry.path)
-                tar.addfile(file_info)
-
-            elif entry.is_file(follow_symlinks=False):
-                # Deduplicate hardlinks
-                if s.st_nlink > 1:
-                    if id in hardlink_to_tarinfo_name:
-                        file_info.type = tarfile.LNKTYPE
-                        file_info.linkname = hardlink_to_tarinfo_name[id]
-                        tar.addfile(file_info)
-                        continue
-                    hardlink_to_tarinfo_name[id] = file_info.name
-
-                # If file not yet seen, copy it.
-                file_info.type = tarfile.REGTYPE
-                file_info.size = s.st_size
-
-                with open(entry.path, "rb") as f:
-                    tar.addfile(file_info, f)
-
-        dir_stack.extend(reversed(new_dirs))  # we pop, so reverse to stay alphabetical
-
-
-class ChecksumWriter(io.BufferedIOBase):
-    """Checksum writer computes a checksum while writing to a file."""
-
-    myfileobj = None
-
-    def __init__(self, fileobj, algorithm=hashlib.sha256):
-        self.fileobj = fileobj
-        self.hasher = algorithm()
-        self.length = 0
-
-    def hexdigest(self):
-        return self.hasher.hexdigest()
-
-    def write(self, data):
-        if isinstance(data, (bytes, bytearray)):
-            length = len(data)
-        else:
-            data = memoryview(data)
-            length = data.nbytes
-
-        if length > 0:
-            self.fileobj.write(data)
-            self.hasher.update(data)
-
-        self.length += length
-
-        return length
-
-    def read(self, size=-1):
-        raise OSError(errno.EBADF, "read() on write-only object")
-
-    def read1(self, size=-1):
-        raise OSError(errno.EBADF, "read1() on write-only object")
-
-    def peek(self, n):
-        raise OSError(errno.EBADF, "peek() on write-only object")
-
-    @property
-    def closed(self):
-        return self.fileobj is None
-
-    def close(self):
-        fileobj = self.fileobj
-        if fileobj is None:
-            return
-        self.fileobj.close()
-        self.fileobj = None
-
-    def flush(self):
-        self.fileobj.flush()
-
-    def fileno(self):
-        return self.fileobj.fileno()
-
-    def rewind(self):
-        raise OSError("Can't rewind while computing checksum")
-
-    def readable(self):
-        return False
-
-    def writable(self):
-        return True
-
-    def seekable(self):
-        return True
-
-    def tell(self):
-        return self.fileobj.tell()
-
-    def seek(self, offset, whence=io.SEEK_SET):
-        # In principle forward seek is possible with b"0" padding,
-        # but this is not implemented.
-        if offset == 0 and whence == io.SEEK_CUR:
-            return
-        raise OSError("Can't seek while computing checksum")
-
-    def readline(self, size=-1):
-        raise OSError(errno.EBADF, "readline() on write-only object")
+    spack.util.archive.reproducible_tarfile_from_prefix(
+        tar,
+        prefix,
+        # Spack <= 0.21 did not include parent directories, leading to issues when tarballs are
+        # used in runtimes like AWS lambda.
+        include_parent_directories=True,
+        skip=skip,
+    )
 
 
 def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
-    with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum):
+    with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
+        tar,
+        inner_checksum,
+        outer_checksum,
+    ):
         # Tarball the install prefix
         tarfile_of_spec_prefix(tar, binaries_dir)
 
         # Serialize buildinfo for the tarball
         bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
-        tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir)))
+        tarinfo = tarfile.TarInfo(
+            name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
+        )
         tarinfo.type = tarfile.REGTYPE
         tarinfo.size = len(bstring)
         tarinfo.mode = 0o644
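The `skip` callback above compares stat identity rather than file names, so the buildinfo file is excluded even when it is reachable under another name via a hardlink. A hedged sketch of the same pattern outside Spack (the helper name is hypothetical):

    import os

    def skip_by_identity(paths):
        # Collect (st_dev, st_ino) of the files to exclude; lstat so that
        # symlinks pointing at them are not excluded as well.
        keys = set()
        for p in paths:
            try:
                s = os.lstat(p)
                keys.add((s.st_dev, s.st_ino))
            except OSError:
                pass  # missing file: nothing to skip

        def skip(entry: os.DirEntry) -> bool:
            # Mirrors the PR's lambda; note the diff's own caveat that on
            # Windows entry.stat reports zero inode numbers, so os.lstat
            # would be needed there instead.
            s = entry.stat(follow_symlinks=False)
            return (s.st_dev, s.st_ino) in keys

        return skip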
@@ -30,6 +30,7 @@
 import shutil
 import urllib.error
 import urllib.parse
+from pathlib import PurePath
 from typing import List, Optional
 
 import llnl.url
@@ -37,13 +38,14 @@
 import llnl.util.filesystem as fs
 import llnl.util.tty as tty
 from llnl.string import comma_and, quote
-from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir
+from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir
 from llnl.util.symlink import symlink
 
 import spack.config
 import spack.error
 import spack.oci.opener
 import spack.url
+import spack.util.archive
 import spack.util.crypto as crypto
 import spack.util.git
 import spack.util.url as url_util
@@ -600,29 +602,21 @@ def expand(self):
         tty.debug("Source fetched with %s is already expanded." % self.url_attr)
 
     @_needs_stage
-    def archive(self, destination, **kwargs):
+    def archive(self, destination, *, exclude: Optional[str] = None):
         assert llnl.url.extension_from_path(destination) == "tar.gz"
         assert self.stage.source_path.startswith(self.stage.path)
+        # We need to prepend this dir name to every entry of the tarfile
+        top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path))
 
-        tar = which("tar", required=True)
-        patterns = kwargs.get("exclude", None)
-        if patterns is not None:
-            if isinstance(patterns, str):
-                patterns = [patterns]
-            for p in patterns:
-                tar.add_default_arg("--exclude=%s" % p)
-
-        with working_dir(self.stage.path):
-            if self.stage.srcdir:
-                # Here we create an archive with the default repository name.
-                # The 'tar' command has options for changing the name of a
-                # directory that is included in the archive, but they differ
-                # based on OS, so we temporarily rename the repo
-                with temp_rename(self.stage.source_path, self.stage.srcdir):
-                    tar("-czf", destination, self.stage.srcdir)
-            else:
-                tar("-czf", destination, os.path.basename(self.stage.source_path))
+        with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile(
+            destination
+        ) as (tar, _, _):
+            spack.util.archive.reproducible_tarfile_from_prefix(
+                tar=tar,
+                prefix=".",
+                skip=lambda entry: entry.name == exclude,
+                path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(),
+            )
 
     def __str__(self):
         return "VCS: %s" % self.url
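Because the tar is now built from `prefix="."`, the `path_to_name` lambda is what gives the archive its single top-level directory, replacing the old `temp_rename` workaround. Roughly, with a hypothetical stage directory name:

    from pathlib import PurePath

    top_level_dir = PurePath("my-repo")  # hypothetical self.stage.srcdir

    path_to_name = lambda path: (top_level_dir / PurePath(path)).as_posix()

    path_to_name(".")          # -> "my-repo"
    path_to_name("./src/a.c")  # -> "my-repo/src/a.c"

This also holds on Windows, since PurePath parses the native separators while `as_posix()` always emits forward slashes.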
@@ -13,11 +13,11 @@
 
 import spack.environment as ev
 import spack.oci.opener
-from spack.binary_distribution import gzip_compressed_tarfile
 from spack.main import SpackCommand
 from spack.oci.image import Digest, ImageReference, default_config, default_manifest
 from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest
 from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener
+from spack.util.archive import gzip_compressed_tarfile
 
 buildcache = SpackCommand("buildcache")
 mirror = SpackCommand("mirror")
lib/spack/spack/test/util/archive.py (new file, 157 lines)
@@ -0,0 +1,157 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import gzip
+import hashlib
+import os
+import shutil
+import tarfile
+from pathlib import Path, PurePath
+
+import spack.util.crypto
+from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix
+
+
+def test_gzip_compressed_tarball_is_reproducible(tmpdir):
+    """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility"""
+
+    with tmpdir.as_cwd():
+        # Create a few directories
+        root = Path("root")
+        dir_a = root / "a"
+        dir_b = root / "b"
+        root.mkdir(mode=0o777)
+        dir_a.mkdir(mode=0o777)
+        dir_b.mkdir(mode=0o777)
+
+        (root / "y").touch()
+        (root / "x").touch()
+
+        (dir_a / "executable").touch(mode=0o777)
+        (dir_a / "data").touch(mode=0o666)
+        (dir_a / "symlink_file").symlink_to("data")
+        (dir_a / "symlink_dir").symlink_to(PurePath("..", "b"))
+        try:
+            os.link(dir_a / "executable", dir_a / "hardlink")
+            hardlink_support = True
+        except OSError:
+            hardlink_support = False
+
+        (dir_b / "executable").touch(mode=0o777)
+        (dir_b / "data").touch(mode=0o666)
+        (dir_b / "symlink_file").symlink_to("data")
+        (dir_b / "symlink_dir").symlink_to(PurePath("..", "a"))
+
+        # Create the first tarball
+        with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1):
+            reproducible_tarfile_from_prefix(tar, "root")
+
+        # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute
+        # that as we don't know the umask of the user running the test.
+        expected_mode = (
+            lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644
+        )
+
+        # Verify the tarball contents
+        with tarfile.open("fst.tar.gz", "r:gz") as tar:
+            # Directories (mode is always 755)
+            for dir in ("root", "root/a", "root/b"):
+                m = tar.getmember(dir)
+                assert m.isdir()
+                assert m.mode == 0o755
+                assert m.uid == m.gid == 0
+                assert m.uname == m.gname == ""
+
+            # Regular files (mode depends on the executable bit)
+            for file in (
+                "root/x",
+                "root/y",
+                "root/a/data",
+                "root/b/data",
+                "root/a/executable",
+                "root/b/executable",
+            ):
+                m = tar.getmember(file)
+                assert m.isreg()
+                assert m.mode == expected_mode(file)
+                assert m.uid == m.gid == 0
+                assert m.uname == m.gname == ""
+
+            # Symlinks
+            for file in (
+                "root/a/symlink_file",
+                "root/a/symlink_dir",
+                "root/b/symlink_file",
+                "root/b/symlink_dir",
+            ):
+                m = tar.getmember(file)
+                assert m.issym()
+                assert m.mode == 0o755
+                assert m.uid == m.gid == m.mtime == 0
+                assert m.uname == m.gname == ""
+
+            # Verify the symlink targets. Notice that symlink targets are copied verbatim. That
+            # means the value is platform specific for relative symlinks within the current prefix,
+            # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility
+            # is only guaranteed per-platform currently.
+            assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data")
+            assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data")
+            assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b")
+            assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a")
+
+            # Check hardlink if supported
+            if hardlink_support:
+                m = tar.getmember("root/a/hardlink")
+                assert m.islnk()
+                assert m.mode == expected_mode("root/a/hardlink")
+                assert m.uid == m.gid == 0
+                assert m.uname == m.gname == ""
+                # Hardlink targets are always in posix format, as they reference a file that exists
+                # in the tarball.
+                assert m.linkname == "root/a/executable"
+
+            # Finally verify that entries are ordered by (is_dir, name)
+            assert [t.name for t in tar.getmembers()] == [
+                "root",
+                "root/x",
+                "root/y",
+                "root/a",
+                "root/a/data",
+                "root/a/executable",
+                *(["root/a/hardlink"] if hardlink_support else []),
+                "root/a/symlink_dir",
+                "root/a/symlink_file",
+                "root/b",
+                "root/b/data",
+                "root/b/executable",
+                "root/b/symlink_dir",
+                "root/b/symlink_file",
+            ]
+
+        # Delete the current root dir, extract the first tarball, create a second
+        shutil.rmtree(root)
+        with tarfile.open("fst.tar.gz", "r:gz") as tar:
+            tar.extractall()
+
+        # Create the second tarball
+        with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2):
+            reproducible_tarfile_from_prefix(tar, "root")
+
+        # Verify the .tar.gz checksums are identical and correct
+        assert (
+            gzip_checksum_1.hexdigest()
+            == gzip_checksum_2.hexdigest()
+            == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz")
+            == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz")
+        )
+
+        # Verify the .tar checksums are identical and correct
+        with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g:
+            assert (
+                tarfile_checksum_1.hexdigest()
+                == tarfile_checksum_2.hexdigest()
+                == spack.util.crypto.checksum_stream(hashlib.sha256, f)
+                == spack.util.crypto.checksum_stream(hashlib.sha256, g)
+            )
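The end-to-end property the test asserts is simply that two independently created archives are byte-identical. Outside pytest the same check reduces to a digest comparison; a minimal sketch (the fst.tar.gz / snd.tar.gz names are taken from the test above):

    import hashlib

    def sha256sum(path: str) -> str:
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest()

    assert sha256sum("fst.tar.gz") == sha256sum("snd.tar.gz")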
lib/spack/spack/util/archive.py (new file, 228 lines)
@@ -0,0 +1,228 @@
+# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import errno
+import hashlib
+import io
+import os
+import pathlib
+import tarfile
+from contextlib import closing, contextmanager
+from gzip import GzipFile
+from typing import Callable, Dict, Tuple
+
+
+class ChecksumWriter(io.BufferedIOBase):
+    """Checksum writer computes a checksum while writing to a file."""
+
+    myfileobj = None
+
+    def __init__(self, fileobj, algorithm=hashlib.sha256):
+        self.fileobj = fileobj
+        self.hasher = algorithm()
+        self.length = 0
+
+    def hexdigest(self):
+        return self.hasher.hexdigest()
+
+    def write(self, data):
+        if isinstance(data, (bytes, bytearray)):
+            length = len(data)
+        else:
+            data = memoryview(data)
+            length = data.nbytes
+
+        if length > 0:
+            self.fileobj.write(data)
+            self.hasher.update(data)
+
+        self.length += length
+
+        return length
+
+    def read(self, size=-1):
+        raise OSError(errno.EBADF, "read() on write-only object")
+
+    def read1(self, size=-1):
+        raise OSError(errno.EBADF, "read1() on write-only object")
+
+    def peek(self, n):
+        raise OSError(errno.EBADF, "peek() on write-only object")
+
+    @property
+    def closed(self):
+        return self.fileobj is None
+
+    def close(self):
+        fileobj = self.fileobj
+        if fileobj is None:
+            return
+        self.fileobj.close()
+        self.fileobj = None
+
+    def flush(self):
+        self.fileobj.flush()
+
+    def fileno(self):
+        return self.fileobj.fileno()
+
+    def rewind(self):
+        raise OSError("Can't rewind while computing checksum")
+
+    def readable(self):
+        return False
+
+    def writable(self):
+        return True
+
+    def seekable(self):
+        return True
+
+    def tell(self):
+        return self.fileobj.tell()
+
+    def seek(self, offset, whence=io.SEEK_SET):
+        # In principle forward seek is possible with b"0" padding,
+        # but this is not implemented.
+        if offset == 0 and whence == io.SEEK_CUR:
+            return
+        raise OSError("Can't seek while computing checksum")
+
+    def readline(self, size=-1):
+        raise OSError(errno.EBADF, "readline() on write-only object")
+
+
+@contextmanager
+def gzip_compressed_tarfile(path):
+    """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the
+    compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip
+    header (no file name and zero mtime).
+
+    Yields a tuple of the following:
+        tarfile.TarFile: tarfile object
+        ChecksumWriter: checksum of the gzip compressed tarfile
+        ChecksumWriter: checksum of the uncompressed tarfile
+    """
+    # Create gzip compressed tarball of the install prefix
+    # 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
+    #    If the filename="" is dropped, Python will use fileobj.name instead.
+    #    This should effectively mimic `gzip --no-name`.
+    # 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
+    #    compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
+    #    compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
+    # So we follow gzip.
+    with open(path, "wb") as f, ChecksumWriter(f) as gzip_checksum, closing(
+        GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=gzip_checksum)
+    ) as gzip_file, ChecksumWriter(gzip_file) as tarfile_checksum, tarfile.TarFile(
+        name="", mode="w", fileobj=tarfile_checksum
+    ) as tar:
+        yield tar, gzip_checksum, tarfile_checksum
+
+
+def default_path_to_name(path: str) -> str:
+    """Converts a path to a tarfile name, which uses posix path separators."""
+    p = pathlib.PurePath(path)
+    # Drop the leading slash on posix and the drive letter on windows, and always format as a
+    # posix path.
+    return pathlib.PurePath(*p.parts[1:]).as_posix() if p.is_absolute() else p.as_posix()
+
+
+def reproducible_tarfile_from_prefix(
+    tar: tarfile.TarFile,
+    prefix: str,
+    *,
+    include_parent_directories: bool = False,
+    skip: Callable[[os.DirEntry], bool] = lambda entry: False,
+    path_to_name: Callable[[str], str] = default_path_to_name,
+) -> None:
+    """Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
+    Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
+    added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted
+    lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a
+    time, and sorting improves compression.
+
+    Args:
+        tar: tarfile object opened in write mode
+        prefix: path to directory to tar (either absolute or relative)
+        include_parent_directories: whether to include every directory leading up to ``prefix`` in
+            the tarball
+        skip: function that receives a DirEntry and returns True if the entry should be skipped,
+            whether it is a file or directory. Default implementation does not skip anything.
+        path_to_name: function that converts a path string to a tarfile entry name, which should be
+            in posix format. Not only is it necessary to transform paths in certain cases, such as
+            windows path to posix format, but it can also be used to prepend a directory to each
+            entry even if it does not exist on the filesystem. The default implementation drops the
+            leading slash on posix and the drive letter on windows for absolute paths, and formats
+            the result as a posix path."""
+
+    hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
+
+    if include_parent_directories:
+        parent_dirs = reversed(pathlib.Path(prefix).parents)
+        next(parent_dirs)  # skip the root: slices are supported from python 3.10
+        for parent_dir in parent_dirs:
+            dir_info = tarfile.TarInfo(path_to_name(str(parent_dir)))
+            dir_info.type = tarfile.DIRTYPE
+            dir_info.mode = 0o755
+            tar.addfile(dir_info)
+
+    dir_stack = [prefix]
+    while dir_stack:
+        dir = dir_stack.pop()
+
+        # Add the dir before its contents
+        dir_info = tarfile.TarInfo(path_to_name(dir))
+        dir_info.type = tarfile.DIRTYPE
+        dir_info.mode = 0o755
+        tar.addfile(dir_info)
+
+        # Sort by name: reproducible & improves compression
+        with os.scandir(dir) as it:
+            entries = sorted(it, key=lambda entry: entry.name)
+
+        new_dirs = []
+        for entry in entries:
+            if skip(entry):
+                continue
+
+            if entry.is_dir(follow_symlinks=False):
+                new_dirs.append(entry.path)
+                continue
+
+            file_info = tarfile.TarInfo(path_to_name(entry.path))
+
+            if entry.is_symlink():
+                file_info.type = tarfile.SYMTYPE
+                file_info.linkname = os.readlink(entry.path)
+                # According to POSIX: "the value of the file mode bits returned in the
+                # st_mode field of the stat structure is unspecified." So we set it to
+                # something sensible without lstat'ing the link.
+                file_info.mode = 0o755
+                tar.addfile(file_info)
+
+            elif entry.is_file(follow_symlinks=False):
+                # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
+                s = os.lstat(entry.path)
+
+                # Normalize permissions like git
+                file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644
+
+                # Deduplicate hardlinks
+                if s.st_nlink > 1:
+                    ident = (s.st_dev, s.st_ino)
+                    if ident in hardlink_to_tarinfo_name:
+                        file_info.type = tarfile.LNKTYPE
+                        file_info.linkname = hardlink_to_tarinfo_name[ident]
+                        tar.addfile(file_info)
+                        continue
+                    hardlink_to_tarinfo_name[ident] = file_info.name
+
+                # If file not yet seen, copy it
+                file_info.type = tarfile.REGTYPE
+                file_info.size = s.st_size
+
+                with open(entry.path, "rb") as f:
+                    tar.addfile(file_info, f)
+
+        dir_stack.extend(reversed(new_dirs))  # we pop, so reverse to stay alphabetical
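ChecksumWriter is a plain pass-through writer: every chunk is forwarded to the wrapped file object and fed to the hasher, which is why gzip_compressed_tarfile can report both the compressed and uncompressed digests from a single write pass. A standalone usage sketch (the payload file name is arbitrary):

    import hashlib

    from spack.util.archive import ChecksumWriter

    with open("payload.bin", "wb") as f, ChecksumWriter(f, algorithm=hashlib.sha256) as w:
        w.write(b"hello world")

    print(w.hexdigest())  # sha256 of everything written through the wrapper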