Create reproducible tarballs in VCSFetchStrategy.archive (#42042)

Currently when you repeatedly create a bootstrap mirror that includes
`clingo-bootstrap@spack` you get different tarballs every time.

This is a general problem with mirroring checkouts from version control
as tarballs. I think it's best to create tarballs ourselves, since that way we
have more control over their contents.

This PR ensures normalized tarballs like we do for build caches:

- normalize file permissions (in fact that was already inspired by git, so
  should be good)
- normalized file creation/modification time (timestamp 0)
- uid / gid = 0, no usernames
- normalized gzip header
- dir entries are ordered by `(is_dir, name)` where strings are not locale aware ;)

- POSIX says st_mode of symlinks is unspecified, so work around it and
  force mode to `0o755`
This commit is contained in:
Harmen Stoppels 2024-01-17 06:11:43 +01:00 committed by GitHub
parent c05ed2c31a
commit 28675478ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 421 additions and 202 deletions

View File

@ -5,7 +5,6 @@
import codecs
import collections
import errno
import hashlib
import io
import itertools
@ -23,8 +22,7 @@
import urllib.parse
import urllib.request
import warnings
from contextlib import closing, contextmanager
from gzip import GzipFile
from contextlib import closing
from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
from urllib.error import HTTPError, URLError
@ -50,6 +48,7 @@
import spack.stage
import spack.store
import spack.traverse as traverse
import spack.util.archive
import spack.util.crypto
import spack.util.file_cache as file_cache
import spack.util.gpg
@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None):
shutil.rmtree(tmpdir)
@contextmanager
def gzip_compressed_tarfile(path):
    """Yield a tarfile that writes a reproducible gzip-compressed archive at ``path``.

    Yields a 3-tuple ``(tar, inner_checksum, outer_checksum)`` where the checksum
    writers track the compressed and the uncompressed byte streams respectively.
    """
    # Reproducibility notes:
    # 1) filename="" and mtime=0 normalize the gzip header (the effect of
    #    `gzip --no-name`). Without filename="" Python embeds fileobj.name.
    # 2) Compression speed (AMD Ryzen 3700X, SSD):
    #    compresslevel=6 (gzip CLI default): llvm takes 4mins, roughly 2.1GB
    #    compresslevel=9 (python default):   llvm takes 12mins, roughly 2.1GB
    #    So we follow gzip.
    with open(path, "wb") as raw:
        with ChecksumWriter(raw) as inner_checksum:
            compressor = GzipFile(
                filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum
            )
            with closing(compressor):
                with ChecksumWriter(compressor) as outer_checksum:
                    with tarfile.TarFile(name="", mode="w", fileobj=outer_checksum) as tar:
                        yield tar, inner_checksum, outer_checksum
def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str:
"""Compute tarfile entry name as the relative path from the (system) root."""
return _path(*_path(absolute_path).parts[1:]).as_posix()
def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
    """Create a tarfile of an install prefix of a spec. Skips an existing buildinfo file.

    Only adds regular files, symlinks and dirs. Skips devices and fifos. Preserves
    hardlinks. Normalizes permissions like git. Tar entries are added in depth-first
    pre-order, with dir entries partitioned by file | dir, and sorted alphabetically,
    for reproducibility. Partitioning ensures only one dir listing is in memory at a
    time, and sorting improves compression.

    Args:
        tar: tarfile object to add files to
        prefix: absolute install prefix of spec
    """
    if not os.path.isabs(prefix) or not os.path.isdir(prefix):
        raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory")
    # Maps (st_dev, st_ino) of an already-added hardlinked file to its tar entry name,
    # so later links are stored as LNKTYPE entries pointing at the first occurrence.
    hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()
    stat_key = lambda stat: (stat.st_dev, stat.st_ino)

    try:  # skip buildinfo file if it exists
        files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))]
        # NOTE(review): `skip` is assigned but not referenced in this function —
        # TODO confirm whether it belongs to a different call path before removing.
        skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip
    except OSError:
        files_to_skip = []
        skip = lambda entry: False

    # First add all directories leading up to `prefix` (Spack <= 0.21 did not do this,
    # leading to issues when tarballs are used in runtimes like AWS lambda). Skip the
    # file system root.
    parent_dirs = reversed(pathlib.Path(prefix).parents)
    next(parent_dirs)  # skip the root: slices are supported from python 3.10
    for parent_dir in parent_dirs:
        dir_info = tarfile.TarInfo(_tarinfo_name(str(parent_dir)))
        dir_info.type = tarfile.DIRTYPE
        dir_info.mode = 0o755
        tar.addfile(dir_info)

    dir_stack = [prefix]
    while dir_stack:
        dir = dir_stack.pop()

        # Add the dir before its contents
        dir_info = tarfile.TarInfo(_tarinfo_name(dir))
        dir_info.type = tarfile.DIRTYPE
        dir_info.mode = 0o755
        tar.addfile(dir_info)

        # Sort by name: reproducible & improves compression
        with os.scandir(dir) as it:
            entries = sorted(it, key=lambda entry: entry.name)

        new_dirs = []
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                # Defer directories: files of this dir first, subdirs afterwards.
                new_dirs.append(entry.path)
                continue

            file_info = tarfile.TarInfo(_tarinfo_name(entry.path))
            s = entry.stat(follow_symlinks=False)

            # Skip existing binary distribution files.
            id = stat_key(s)
            if id in files_to_skip:
                continue

            # Normalize the mode like git: 644, or 755 when owner-executable.
            file_info.mode = 0o644 if s.st_mode & 0o100 == 0 else 0o755

            if entry.is_symlink():
                file_info.type = tarfile.SYMTYPE
                file_info.linkname = os.readlink(entry.path)
                tar.addfile(file_info)
            elif entry.is_file(follow_symlinks=False):
                # Deduplicate hardlinks
                if s.st_nlink > 1:
                    if id in hardlink_to_tarinfo_name:
                        file_info.type = tarfile.LNKTYPE
                        file_info.linkname = hardlink_to_tarinfo_name[id]
                        tar.addfile(file_info)
                        continue
                    hardlink_to_tarinfo_name[id] = file_info.name

                # If file not yet seen, copy it.
                file_info.type = tarfile.REGTYPE
                file_info.size = s.st_size

                with open(entry.path, "rb") as f:
                    tar.addfile(file_info, f)

        dir_stack.extend(reversed(new_dirs))  # we pop, so reverse to stay alphabetical
class ChecksumWriter(io.BufferedIOBase):
    """Checksum writer computes a checksum while writing to a file.

    Wraps a writable binary stream and forwards every ``write`` to it while feeding
    the same bytes into a hash object (sha256 by default). The digest is available
    via :meth:`hexdigest`. All read-oriented methods raise, mirroring a write-only
    stream.
    """

    # NOTE(review): appears unused in this class — TODO confirm before removing.
    myfileobj = None

    def __init__(self, fileobj, algorithm=hashlib.sha256):
        self.fileobj = fileobj  # underlying writable binary stream
        self.hasher = algorithm()  # running hash of all bytes written
        self.length = 0  # total number of bytes written so far

    def hexdigest(self):
        """Return the hex digest of everything written so far."""
        return self.hasher.hexdigest()

    def write(self, data):
        """Write ``data`` to the underlying stream and update the checksum.

        Accepts bytes/bytearray directly; any other buffer-protocol object is
        wrapped in a memoryview to get its size without copying. Returns the
        number of bytes written.
        """
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(data)
            self.hasher.update(data)

        self.length += length

        return length

    def read(self, size=-1):
        raise OSError(errno.EBADF, "read() on write-only object")

    def read1(self, size=-1):
        raise OSError(errno.EBADF, "read1() on write-only object")

    def peek(self, n):
        raise OSError(errno.EBADF, "peek() on write-only object")

    @property
    def closed(self):
        # Closed once the underlying stream reference has been dropped by close().
        return self.fileobj is None

    def close(self):
        """Close the underlying stream; safe to call more than once."""
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj.close()
        self.fileobj = None

    def flush(self):
        self.fileobj.flush()

    def fileno(self):
        return self.fileobj.fileno()

    def rewind(self):
        raise OSError("Can't rewind while computing checksum")

    def readable(self):
        return False

    def writable(self):
        return True

    def seekable(self):
        return True

    def tell(self):
        return self.fileobj.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        # In principle forward seek is possible with b"0" padding,
        # but this is not implemented. A zero-length relative seek is a no-op.
        if offset == 0 and whence == io.SEEK_CUR:
            return
        raise OSError("Can't seek while computing checksum")

    def readline(self, size=-1):
        raise OSError(errno.EBADF, "readline() on write-only object")
spack.util.archive.reproducible_tarfile_from_prefix(
tar,
prefix,
# Spack <= 0.21 did not include parent directories, leading to issues when tarballs are
# used in runtimes like AWS lambda.
include_parent_directories=True,
skip=skip,
)
def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum):
with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
tar,
inner_checksum,
outer_checksum,
):
# Tarball the install prefix
tarfile_of_spec_prefix(tar, binaries_dir)
# Serialize buildinfo for the tarball
bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir)))
tarinfo = tarfile.TarInfo(
name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
)
tarinfo.type = tarfile.REGTYPE
tarinfo.size = len(bstring)
tarinfo.mode = 0o644

View File

@ -30,6 +30,7 @@
import shutil
import urllib.error
import urllib.parse
from pathlib import PurePath
from typing import List, Optional
import llnl.url
@ -37,13 +38,14 @@
import llnl.util.filesystem as fs
import llnl.util.tty as tty
from llnl.string import comma_and, quote
from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir
from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir
from llnl.util.symlink import symlink
import spack.config
import spack.error
import spack.oci.opener
import spack.url
import spack.util.archive
import spack.util.crypto as crypto
import spack.util.git
import spack.util.url as url_util
@ -600,29 +602,21 @@ def expand(self):
tty.debug("Source fetched with %s is already expanded." % self.url_attr)
@_needs_stage
def archive(self, destination, **kwargs):
def archive(self, destination, *, exclude: Optional[str] = None):
assert llnl.url.extension_from_path(destination) == "tar.gz"
assert self.stage.source_path.startswith(self.stage.path)
# We need to prepend this dir name to every entry of the tarfile
top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path))
tar = which("tar", required=True)
patterns = kwargs.get("exclude", None)
if patterns is not None:
if isinstance(patterns, str):
patterns = [patterns]
for p in patterns:
tar.add_default_arg("--exclude=%s" % p)
with working_dir(self.stage.path):
if self.stage.srcdir:
# Here we create an archive with the default repository name.
# The 'tar' command has options for changing the name of a
# directory that is included in the archive, but they differ
# based on OS, so we temporarily rename the repo
with temp_rename(self.stage.source_path, self.stage.srcdir):
tar("-czf", destination, self.stage.srcdir)
else:
tar("-czf", destination, os.path.basename(self.stage.source_path))
with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile(
destination
) as (tar, _, _):
spack.util.archive.reproducible_tarfile_from_prefix(
tar=tar,
prefix=".",
skip=lambda entry: entry.name == exclude,
path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(),
)
def __str__(self):
return "VCS: %s" % self.url

View File

@ -13,11 +13,11 @@
import spack.environment as ev
import spack.oci.opener
from spack.binary_distribution import gzip_compressed_tarfile
from spack.main import SpackCommand
from spack.oci.image import Digest, ImageReference, default_config, default_manifest
from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest
from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener
from spack.util.archive import gzip_compressed_tarfile
buildcache = SpackCommand("buildcache")
mirror = SpackCommand("mirror")

View File

@ -0,0 +1,157 @@
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import gzip
import hashlib
import os
import shutil
import tarfile
from pathlib import Path, PurePath
import spack.util.crypto
from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix
def test_gzip_compressed_tarball_is_reproducible(tmpdir):
    """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility"""
    with tmpdir.as_cwd():
        # Create a few directories
        root = Path("root")
        dir_a = root / "a"
        dir_b = root / "b"
        root.mkdir(mode=0o777)
        dir_a.mkdir(mode=0o777)
        dir_b.mkdir(mode=0o777)

        # Files created deliberately out of alphabetical order, to exercise sorting.
        (root / "y").touch()
        (root / "x").touch()

        (dir_a / "executable").touch(mode=0o777)
        (dir_a / "data").touch(mode=0o666)
        (dir_a / "symlink_file").symlink_to("data")
        (dir_a / "symlink_dir").symlink_to(PurePath("..", "b"))
        # Hardlinks may be unavailable on some filesystems; skip those checks then.
        try:
            os.link(dir_a / "executable", dir_a / "hardlink")
            hardlink_support = True
        except OSError:
            hardlink_support = False

        (dir_b / "executable").touch(mode=0o777)
        (dir_b / "data").touch(mode=0o666)
        (dir_b / "symlink_file").symlink_to("data")
        (dir_b / "symlink_dir").symlink_to(PurePath("..", "a"))

        # Create the first tarball
        with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1):
            reproducible_tarfile_from_prefix(tar, "root")

        # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute
        # that as we don't know the umask of the user running the test.
        expected_mode = (
            lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644
        )

        # Verify the tarball contents
        with tarfile.open("fst.tar.gz", "r:gz") as tar:
            # Directories (mode is always 755)
            for dir in ("root", "root/a", "root/b"):
                m = tar.getmember(dir)
                assert m.isdir()
                assert m.mode == 0o755
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""

            # Regular files (mode depends on the executable bit, see expected_mode)
            for file in (
                "root/x",
                "root/y",
                "root/a/data",
                "root/b/data",
                "root/a/executable",
                "root/b/executable",
            ):
                m = tar.getmember(file)
                assert m.isreg()
                assert m.mode == expected_mode(file)
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""

            # Symlinks (mode is forced to 755, since POSIX leaves it unspecified)
            for file in (
                "root/a/symlink_file",
                "root/a/symlink_dir",
                "root/b/symlink_file",
                "root/b/symlink_dir",
            ):
                m = tar.getmember(file)
                assert m.issym()
                assert m.mode == 0o755
                assert m.uid == m.gid == m.mtime == 0
                assert m.uname == m.gname == ""

            # Verify the symlink targets. Notice that symlink targets are copied verbatim. That
            # means the value is platform specific for relative symlinks within the current prefix,
            # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility
            # is only guaranteed per-platform currently.
            assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data")
            assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data")
            assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b")
            assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a")

            # Check hardlink if supported
            if hardlink_support:
                m = tar.getmember("root/a/hardlink")
                assert m.islnk()
                assert m.mode == expected_mode("root/a/hardlink")
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""
                # Hardlink targets are always in posix format, as they reference a file that exists
                # in the tarball.
                assert m.linkname == "root/a/executable"

            # Finally verify if entries are ordered by (is_dir, name)
            assert [t.name for t in tar.getmembers()] == [
                "root",
                "root/x",
                "root/y",
                "root/a",
                "root/a/data",
                "root/a/executable",
                *(["root/a/hardlink"] if hardlink_support else []),
                "root/a/symlink_dir",
                "root/a/symlink_file",
                "root/b",
                "root/b/data",
                "root/b/executable",
                "root/b/symlink_dir",
                "root/b/symlink_file",
            ]

        # Delete the current root dir, extract the first tarball, create a second
        shutil.rmtree(root)
        with tarfile.open("fst.tar.gz", "r:gz") as tar:
            tar.extractall()

        # Create the second tarball
        with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2):
            reproducible_tarfile_from_prefix(tar, "root")

        # Verify the .tar.gz checksums are identical and correct
        assert (
            gzip_checksum_1.hexdigest()
            == gzip_checksum_2.hexdigest()
            == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz")
            == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz")
        )

        # Verify the .tar checksums are identical and correct
        with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g:
            assert (
                tarfile_checksum_1.hexdigest()
                == tarfile_checksum_2.hexdigest()
                == spack.util.crypto.checksum_stream(hashlib.sha256, f)
                == spack.util.crypto.checksum_stream(hashlib.sha256, g)
            )

View File

@ -0,0 +1,228 @@
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import errno
import hashlib
import io
import os
import pathlib
import tarfile
from contextlib import closing, contextmanager
from gzip import GzipFile
from typing import Callable, Dict, Tuple
class ChecksumWriter(io.BufferedIOBase):
    """Checksum writer computes a checksum while writing to a file.

    Wraps a writable binary stream and forwards every ``write`` to it while feeding
    the same bytes into a hash object (sha256 by default). The digest is available
    via :meth:`hexdigest`. All read-oriented methods raise, mirroring a write-only
    stream.
    """

    # NOTE(review): appears unused in this class — TODO confirm before removing.
    myfileobj = None

    def __init__(self, fileobj, algorithm=hashlib.sha256):
        self.fileobj = fileobj  # underlying writable binary stream
        self.hasher = algorithm()  # running hash of all bytes written
        self.length = 0  # total number of bytes written so far

    def hexdigest(self):
        """Return the hex digest of everything written so far."""
        return self.hasher.hexdigest()

    def write(self, data):
        """Write ``data`` to the underlying stream and update the checksum.

        Accepts bytes/bytearray directly; any other buffer-protocol object is
        wrapped in a memoryview to get its size without copying. Returns the
        number of bytes written.
        """
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(data)
            self.hasher.update(data)

        self.length += length

        return length

    def read(self, size=-1):
        raise OSError(errno.EBADF, "read() on write-only object")

    def read1(self, size=-1):
        raise OSError(errno.EBADF, "read1() on write-only object")

    def peek(self, n):
        raise OSError(errno.EBADF, "peek() on write-only object")

    @property
    def closed(self):
        # Closed once the underlying stream reference has been dropped by close().
        return self.fileobj is None

    def close(self):
        """Close the underlying stream; safe to call more than once."""
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj.close()
        self.fileobj = None

    def flush(self):
        self.fileobj.flush()

    def fileno(self):
        return self.fileobj.fileno()

    def rewind(self):
        raise OSError("Can't rewind while computing checksum")

    def readable(self):
        return False

    def writable(self):
        return True

    def seekable(self):
        return True

    def tell(self):
        return self.fileobj.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        # In principle forward seek is possible with b"0" padding,
        # but this is not implemented. A zero-length relative seek is a no-op.
        if offset == 0 and whence == io.SEEK_CUR:
            return
        raise OSError("Can't seek while computing checksum")

    def readline(self, size=-1):
        raise OSError(errno.EBADF, "readline() on write-only object")
@contextmanager
def gzip_compressed_tarfile(path):
    """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the
    compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip
    header (no file name and zero mtime).

    Yields a tuple of the following:
        tarfile.TarFile: tarfile object
        ChecksumWriter: checksum of the gzip compressed tarfile
        ChecksumWriter: checksum of the uncompressed tarfile
    """
    # Reproducibility notes:
    # 1) filename="" and mtime=0 normalize the gzip header (the effect of
    #    `gzip --no-name`). Without filename="" Python embeds fileobj.name.
    # 2) Compression speed (AMD Ryzen 3700X, SSD):
    #    compresslevel=6 (gzip CLI default): llvm takes 4mins, roughly 2.1GB
    #    compresslevel=9 (python default):   llvm takes 12mins, roughly 2.1GB
    #    So we follow gzip.
    with open(path, "wb") as raw:
        with ChecksumWriter(raw) as gzip_checksum:
            compressor = GzipFile(
                filename="", mode="wb", compresslevel=6, mtime=0, fileobj=gzip_checksum
            )
            with closing(compressor):
                with ChecksumWriter(compressor) as tarfile_checksum:
                    with tarfile.TarFile(name="", mode="w", fileobj=tarfile_checksum) as tar:
                        yield tar, gzip_checksum, tarfile_checksum
def default_path_to_name(path: str) -> str:
    """Converts a path to a tarfile name, which uses posix path separators."""
    pure = pathlib.PurePath(path)
    if not pure.is_absolute():
        return pure.as_posix()
    # Drop the leading slash on posix / the drive letter on windows, then
    # join the remaining components back together as a posix path.
    return pathlib.PurePath(*pure.parts[1:]).as_posix()
def reproducible_tarfile_from_prefix(
    tar: tarfile.TarFile,
    prefix: str,
    *,
    include_parent_directories: bool = False,
    skip: Callable[[os.DirEntry], bool] = lambda entry: False,
    path_to_name: Callable[[str], str] = default_path_to_name,
) -> None:
    """Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
    Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
    added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted
    lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a
    time, and sorting improves compression.

    Args:
        tar: tarfile object opened in write mode
        prefix: path to directory to tar (either absolute or relative)
        include_parent_directories: whether to include every directory leading up to ``prefix`` in
            the tarball
        skip: function that receives a DirEntry and returns True if the entry should be skipped,
            whether it is a file or directory. Default implementation does not skip anything.
        path_to_name: function that converts a path string to a tarfile entry name, which should be
            in posix format. Not only is it necessary to transform paths in certain cases, such as
            windows path to posix format, but it can also be used to prepend a directory to each
            entry even if it does not exist on the filesystem. The default implementation drops the
            leading slash on posix and the drive letter on windows for absolute paths, and formats
            as a posix path."""
    # Maps (st_dev, st_ino) of an already-added hardlinked file to its tar entry name,
    # so later links are stored as LNKTYPE entries pointing at the first occurrence.
    hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = dict()

    if include_parent_directories:
        # NOTE(review): for a prefix at the filesystem root (or "."), `parents` is
        # empty and next() would raise StopIteration — callers appear to pass deeper
        # absolute paths; TODO confirm.
        parent_dirs = reversed(pathlib.Path(prefix).parents)
        next(parent_dirs)  # skip the root: slices are supported from python 3.10
        for parent_dir in parent_dirs:
            dir_info = tarfile.TarInfo(path_to_name(str(parent_dir)))
            dir_info.type = tarfile.DIRTYPE
            dir_info.mode = 0o755
            tar.addfile(dir_info)

    dir_stack = [prefix]
    while dir_stack:
        dir = dir_stack.pop()

        # Add the dir before its contents
        dir_info = tarfile.TarInfo(path_to_name(dir))
        dir_info.type = tarfile.DIRTYPE
        dir_info.mode = 0o755
        tar.addfile(dir_info)

        # Sort by name: reproducible & improves compression
        with os.scandir(dir) as it:
            entries = sorted(it, key=lambda entry: entry.name)

        new_dirs = []
        for entry in entries:
            if skip(entry):
                continue

            if entry.is_dir(follow_symlinks=False):
                # Defer directories: files of this dir come first, subdirs afterwards.
                new_dirs.append(entry.path)
                continue

            file_info = tarfile.TarInfo(path_to_name(entry.path))

            if entry.is_symlink():
                file_info.type = tarfile.SYMTYPE
                file_info.linkname = os.readlink(entry.path)
                # According to POSIX: "the value of the file mode bits returned in the
                # st_mode field of the stat structure is unspecified." So we set it to
                # something sensible without lstat'ing the link.
                file_info.mode = 0o755
                tar.addfile(file_info)

            elif entry.is_file(follow_symlinks=False):
                # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
                s = os.lstat(entry.path)

                # Normalize permissions like git
                file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644

                # Deduplicate hardlinks
                if s.st_nlink > 1:
                    ident = (s.st_dev, s.st_ino)
                    if ident in hardlink_to_tarinfo_name:
                        file_info.type = tarfile.LNKTYPE
                        file_info.linkname = hardlink_to_tarinfo_name[ident]
                        tar.addfile(file_info)
                        continue
                    hardlink_to_tarinfo_name[ident] = file_info.name

                # If file not yet seen, copy it
                file_info.type = tarfile.REGTYPE
                file_info.size = s.st_size

                with open(entry.path, "rb") as f:
                    tar.addfile(file_info, f)

        dir_stack.extend(reversed(new_dirs))  # we pop, so reverse to stay alphabetical