Create reproducible tarballs in VCSFetchStrategy.archive (#42042)
Currently when you repeatedly create a bootstrap mirror that includes `clingo-bootstrap@spack` you get different tarballs every time. This is a general problem with mirroring checkouts from version control as tarballs. I think it's best to create tarballs ourselves, since that way we have more control over their contents. This PR ensures normalized tarballs like we do for build caches: - normalize file permissions (in fact that was already inspired by git, so should be good) - normalized file creation/modification time (timestamp 0) - uid / gid = 0, no usernames - normalized gzip header - dir entries are ordered by `(is_dir, name)` where strings are not locale aware ;) - POSIX says st_mode of symlinks is unspecified, so work around it and force mode to `0o755`
This commit is contained in:
parent
c05ed2c31a
commit
28675478ce
@ -5,7 +5,6 @@
|
||||
|
||||
import codecs
|
||||
import collections
|
||||
import errno
|
||||
import hashlib
|
||||
import io
|
||||
import itertools
|
||||
@ -23,8 +22,7 @@
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import warnings
|
||||
from contextlib import closing, contextmanager
|
||||
from gzip import GzipFile
|
||||
from contextlib import closing
|
||||
from typing import Dict, Iterable, List, NamedTuple, Optional, Set, Tuple
|
||||
from urllib.error import HTTPError, URLError
|
||||
|
||||
@ -50,6 +48,7 @@
|
||||
import spack.stage
|
||||
import spack.store
|
||||
import spack.traverse as traverse
|
||||
import spack.util.archive
|
||||
import spack.util.crypto
|
||||
import spack.util.file_cache as file_cache
|
||||
import spack.util.gpg
|
||||
@ -1133,205 +1132,46 @@ def generate_key_index(key_prefix, tmpdir=None):
|
||||
shutil.rmtree(tmpdir)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def gzip_compressed_tarfile(path):
|
||||
"""Create a reproducible, compressed tarfile"""
|
||||
# Create gzip compressed tarball of the install prefix
|
||||
# 1) Use explicit empty filename and mtime 0 for gzip header reproducibility.
|
||||
# If the filename="" is dropped, Python will use fileobj.name instead.
|
||||
# This should effectively mimick `gzip --no-name`.
|
||||
# 2) On AMD Ryzen 3700X and an SSD disk, we have the following on compression speed:
|
||||
# compresslevel=6 gzip default: llvm takes 4mins, roughly 2.1GB
|
||||
# compresslevel=9 python default: llvm takes 12mins, roughly 2.1GB
|
||||
# So we follow gzip.
|
||||
with open(path, "wb") as f, ChecksumWriter(f) as inner_checksum, closing(
|
||||
GzipFile(filename="", mode="wb", compresslevel=6, mtime=0, fileobj=inner_checksum)
|
||||
) as gzip_file, ChecksumWriter(gzip_file) as outer_checksum, tarfile.TarFile(
|
||||
name="", mode="w", fileobj=outer_checksum
|
||||
) as tar:
|
||||
yield tar, inner_checksum, outer_checksum
|
||||
|
||||
|
||||
def _tarinfo_name(absolute_path: str, *, _path=pathlib.PurePath) -> str:
|
||||
"""Compute tarfile entry name as the relative path from the (system) root."""
|
||||
return _path(*_path(absolute_path).parts[1:]).as_posix()
|
||||
|
||||
|
||||
def tarfile_of_spec_prefix(tar: tarfile.TarFile, prefix: str) -> None:
    """Create a tarfile of an install prefix of a spec. Skips an existing buildinfo file.

    Entry normalization (permissions, timestamps, uid/gid, ordering) is delegated to
    ``spack.util.archive.reproducible_tarfile_from_prefix``.

    Args:
        tar: tarfile object to add files to
        prefix: absolute install prefix of spec

    Raises:
        ValueError: if ``prefix`` is not an absolute path to an existing directory
    """
    if not os.path.isabs(prefix) or not os.path.isdir(prefix):
        raise ValueError(f"prefix '{prefix}' must be an absolute path to a directory")

    # Identify files by (device, inode) so the buildinfo file is skipped even if it is
    # reached through a different path (e.g. a hardlink).
    stat_key = lambda stat: (stat.st_dev, stat.st_ino)

    try:  # skip buildinfo file if it exists
        files_to_skip = [stat_key(os.lstat(buildinfo_file_name(prefix)))]
        skip = lambda entry: stat_key(entry.stat(follow_symlinks=False)) in files_to_skip
    except OSError:
        # No buildinfo file present: nothing to skip.
        skip = lambda entry: False

    spack.util.archive.reproducible_tarfile_from_prefix(
        tar,
        prefix,
        # Spack <= 0.21 did not include parent directories, leading to issues when tarballs are
        # used in runtimes like AWS lambda.
        include_parent_directories=True,
        skip=skip,
    )
|
||||
|
||||
|
||||
def _do_create_tarball(tarfile_path: str, binaries_dir: str, buildinfo: dict):
|
||||
with gzip_compressed_tarfile(tarfile_path) as (tar, inner_checksum, outer_checksum):
|
||||
with spack.util.archive.gzip_compressed_tarfile(tarfile_path) as (
|
||||
tar,
|
||||
inner_checksum,
|
||||
outer_checksum,
|
||||
):
|
||||
# Tarball the install prefix
|
||||
tarfile_of_spec_prefix(tar, binaries_dir)
|
||||
|
||||
# Serialize buildinfo for the tarball
|
||||
bstring = syaml.dump(buildinfo, default_flow_style=True).encode("utf-8")
|
||||
tarinfo = tarfile.TarInfo(name=_tarinfo_name(buildinfo_file_name(binaries_dir)))
|
||||
tarinfo = tarfile.TarInfo(
|
||||
name=spack.util.archive.default_path_to_name(buildinfo_file_name(binaries_dir))
|
||||
)
|
||||
tarinfo.type = tarfile.REGTYPE
|
||||
tarinfo.size = len(bstring)
|
||||
tarinfo.mode = 0o644
|
||||
|
@ -30,6 +30,7 @@
|
||||
import shutil
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
from pathlib import PurePath
|
||||
from typing import List, Optional
|
||||
|
||||
import llnl.url
|
||||
@ -37,13 +38,14 @@
|
||||
import llnl.util.filesystem as fs
|
||||
import llnl.util.tty as tty
|
||||
from llnl.string import comma_and, quote
|
||||
from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, temp_rename, working_dir
|
||||
from llnl.util.filesystem import get_single_file, mkdirp, temp_cwd, working_dir
|
||||
from llnl.util.symlink import symlink
|
||||
|
||||
import spack.config
|
||||
import spack.error
|
||||
import spack.oci.opener
|
||||
import spack.url
|
||||
import spack.util.archive
|
||||
import spack.util.crypto as crypto
|
||||
import spack.util.git
|
||||
import spack.util.url as url_util
|
||||
@ -600,29 +602,21 @@ def expand(self):
|
||||
tty.debug("Source fetched with %s is already expanded." % self.url_attr)
|
||||
|
||||
@_needs_stage
def archive(self, destination, *, exclude: Optional[str] = None):
    """Archive the fetched checkout as a reproducible gzip-compressed tarball.

    Builds the tarball with Spack's own reproducible tar writer instead of shelling
    out to ``tar``, so repeated archives of the same checkout are byte-identical
    (normalized permissions, zero timestamps, uid/gid 0, sorted entries, and a
    normalized gzip header).

    Args:
        destination: path of the ``.tar.gz`` file to create
        exclude: a single entry name to omit from the archive (e.g. ``.git``)
    """
    assert llnl.url.extension_from_path(destination) == "tar.gz"
    assert self.stage.source_path.startswith(self.stage.path)

    # We need to prepend this dir name to every entry of the tarfile
    top_level_dir = PurePath(self.stage.srcdir or os.path.basename(self.stage.source_path))

    with working_dir(self.stage.source_path), spack.util.archive.gzip_compressed_tarfile(
        destination
    ) as (tar, _, _):
        spack.util.archive.reproducible_tarfile_from_prefix(
            tar=tar,
            prefix=".",
            skip=lambda entry: entry.name == exclude,
            # Rewrite each on-disk path so entries appear under the top-level dir name,
            # which may differ from the actual stage directory name.
            path_to_name=lambda path: (top_level_dir / PurePath(path)).as_posix(),
        )
|
||||
|
||||
def __str__(self):
    """Return a short human-readable description of this fetcher by its URL."""
    return "VCS: %s" % self.url
|
||||
|
@ -13,11 +13,11 @@
|
||||
|
||||
import spack.environment as ev
|
||||
import spack.oci.opener
|
||||
from spack.binary_distribution import gzip_compressed_tarfile
|
||||
from spack.main import SpackCommand
|
||||
from spack.oci.image import Digest, ImageReference, default_config, default_manifest
|
||||
from spack.oci.oci import blob_exists, get_manifest_and_config, upload_blob, upload_manifest
|
||||
from spack.test.oci.mock_registry import DummyServer, InMemoryOCIRegistry, create_opener
|
||||
from spack.util.archive import gzip_compressed_tarfile
|
||||
|
||||
buildcache = SpackCommand("buildcache")
|
||||
mirror = SpackCommand("mirror")
|
||||
|
157
lib/spack/spack/test/util/archive.py
Normal file
157
lib/spack/spack/test/util/archive.py
Normal file
@ -0,0 +1,157 @@
|
||||
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
|
||||
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||
#
|
||||
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
import shutil
|
||||
import tarfile
|
||||
from pathlib import Path, PurePath
|
||||
|
||||
import spack.util.crypto
|
||||
from spack.util.archive import gzip_compressed_tarfile, reproducible_tarfile_from_prefix
|
||||
|
||||
|
||||
def test_gzip_compressed_tarball_is_reproducible(tmpdir):
    """Test gzip_compressed_tarfile and reproducible_tarfile_from_prefix for reproducibility.

    Builds a directory tree with files, dirs, symlinks and (where supported) hardlinks,
    tars it twice (the second time from an extraction of the first tarball), and checks
    that entry metadata is normalized and both the compressed and uncompressed checksums
    are identical across the two runs."""

    with tmpdir.as_cwd():
        # Create a few directories
        root = Path("root")
        dir_a = root / "a"
        dir_b = root / "b"
        root.mkdir(mode=0o777)
        dir_a.mkdir(mode=0o777)
        dir_b.mkdir(mode=0o777)

        (root / "y").touch()
        (root / "x").touch()

        (dir_a / "executable").touch(mode=0o777)
        (dir_a / "data").touch(mode=0o666)
        (dir_a / "symlink_file").symlink_to("data")
        (dir_a / "symlink_dir").symlink_to(PurePath("..", "b"))
        # Hardlinks are not available on every filesystem/platform; detect support.
        try:
            os.link(dir_a / "executable", dir_a / "hardlink")
            hardlink_support = True
        except OSError:
            hardlink_support = False

        (dir_b / "executable").touch(mode=0o777)
        (dir_b / "data").touch(mode=0o666)
        (dir_b / "symlink_file").symlink_to("data")
        (dir_b / "symlink_dir").symlink_to(PurePath("..", "a"))

        # Create the first tarball
        with gzip_compressed_tarfile("fst.tar.gz") as (tar, gzip_checksum_1, tarfile_checksum_1):
            reproducible_tarfile_from_prefix(tar, "root")

        # Expected mode for non-dirs is 644 if not executable, 755 if executable. Better to compute
        # that as we don't know the umask of the user running the test.
        expected_mode = (
            lambda name: 0o755 if Path(*name.split("/")).lstat().st_mode & 0o100 else 0o644
        )

        # Verify the tarball contents
        with tarfile.open("fst.tar.gz", "r:gz") as tar:
            # Directories (mode is always 755)
            for dir in ("root", "root/a", "root/b"):
                m = tar.getmember(dir)
                assert m.isdir()
                assert m.mode == 0o755
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""

            # Regular files (executable and non-executable; mode follows the owner x-bit)
            for file in (
                "root/x",
                "root/y",
                "root/a/data",
                "root/b/data",
                "root/a/executable",
                "root/b/executable",
            ):
                m = tar.getmember(file)
                assert m.isreg()
                assert m.mode == expected_mode(file)
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""

            # Symlinks (mode forced to 755, mtime normalized to 0)
            for file in (
                "root/a/symlink_file",
                "root/a/symlink_dir",
                "root/b/symlink_file",
                "root/b/symlink_dir",
            ):
                m = tar.getmember(file)
                assert m.issym()
                assert m.mode == 0o755
                assert m.uid == m.gid == m.mtime == 0
                assert m.uname == m.gname == ""

            # Verify the symlink targets. Notice that symlink targets are copied verbatim. That
            # means the value is platform specific for relative symlinks within the current prefix,
            # as on Windows they'd be ..\a and ..\b instead of ../a and ../b. So, reproducibility
            # is only guaranteed per-platform currently.
            assert PurePath(tar.getmember("root/a/symlink_file").linkname) == PurePath("data")
            assert PurePath(tar.getmember("root/b/symlink_file").linkname) == PurePath("data")
            assert PurePath(tar.getmember("root/a/symlink_dir").linkname) == PurePath("..", "b")
            assert PurePath(tar.getmember("root/b/symlink_dir").linkname) == PurePath("..", "a")

            # Check hardlink if supported
            if hardlink_support:
                m = tar.getmember("root/a/hardlink")
                assert m.islnk()
                assert m.mode == expected_mode("root/a/hardlink")
                assert m.uid == m.gid == 0
                assert m.uname == m.gname == ""
                # Hardlink targets are always in posix format, as they reference a file that exists
                # in the tarball.
                assert m.linkname == "root/a/executable"

            # Finally verify if entries are ordered by (is_dir, name)
            assert [t.name for t in tar.getmembers()] == [
                "root",
                "root/x",
                "root/y",
                "root/a",
                "root/a/data",
                "root/a/executable",
                *(["root/a/hardlink"] if hardlink_support else []),
                "root/a/symlink_dir",
                "root/a/symlink_file",
                "root/b",
                "root/b/data",
                "root/b/executable",
                "root/b/symlink_dir",
                "root/b/symlink_file",
            ]

        # Delete the current root dir, extract the first tarball, create a second
        shutil.rmtree(root)
        with tarfile.open("fst.tar.gz", "r:gz") as tar:
            tar.extractall()

        # Create the second tarball
        with gzip_compressed_tarfile("snd.tar.gz") as (tar, gzip_checksum_2, tarfile_checksum_2):
            reproducible_tarfile_from_prefix(tar, "root")

        # Verify the .tar.gz checksums are identical and correct
        assert (
            gzip_checksum_1.hexdigest()
            == gzip_checksum_2.hexdigest()
            == spack.util.crypto.checksum(hashlib.sha256, "fst.tar.gz")
            == spack.util.crypto.checksum(hashlib.sha256, "snd.tar.gz")
        )

        # Verify the .tar checksums are identical and correct
        with gzip.open("fst.tar.gz", "rb") as f, gzip.open("snd.tar.gz", "rb") as g:
            assert (
                tarfile_checksum_1.hexdigest()
                == tarfile_checksum_2.hexdigest()
                == spack.util.crypto.checksum_stream(hashlib.sha256, f)
                == spack.util.crypto.checksum_stream(hashlib.sha256, g)
            )
|
228
lib/spack/spack/util/archive.py
Normal file
228
lib/spack/spack/util/archive.py
Normal file
@ -0,0 +1,228 @@
|
||||
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
|
||||
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||
#
|
||||
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||
import errno
|
||||
import hashlib
|
||||
import io
|
||||
import os
|
||||
import pathlib
|
||||
import tarfile
|
||||
from contextlib import closing, contextmanager
|
||||
from gzip import GzipFile
|
||||
from typing import Callable, Dict, Tuple
|
||||
|
||||
|
||||
class ChecksumWriter(io.BufferedIOBase):
    """Write-only file wrapper that feeds a running hash with every byte written.

    Wraps ``fileobj`` and forwards all writes to it, while also updating a digest
    (``hexdigest()``) and a running byte count (``length``). All read-oriented
    operations raise, since the checksum can only be maintained for a pure write
    stream."""

    myfileobj = None

    def __init__(self, fileobj, algorithm=hashlib.sha256):
        self.fileobj = fileobj
        self.hasher = algorithm()
        self.length = 0

    def hexdigest(self):
        """Hex digest of everything written so far."""
        return self.hasher.hexdigest()

    def write(self, data):
        # bytes/bytearray know their length directly; any other buffer object is
        # measured through a memoryview.
        if not isinstance(data, (bytes, bytearray)):
            data = memoryview(data)
            nbytes = data.nbytes
        else:
            nbytes = len(data)

        if nbytes:
            self.fileobj.write(data)
            self.hasher.update(data)

        self.length += nbytes
        return nbytes

    def read(self, size=-1):
        raise OSError(errno.EBADF, "read() on write-only object")

    def read1(self, size=-1):
        raise OSError(errno.EBADF, "read1() on write-only object")

    def peek(self, n):
        raise OSError(errno.EBADF, "peek() on write-only object")

    @property
    def closed(self):
        # The wrapped file object is dropped on close, so None means closed.
        return self.fileobj is None

    def close(self):
        if self.fileobj is None:
            return  # already closed: close() must be idempotent
        self.fileobj.close()
        self.fileobj = None

    def flush(self):
        self.fileobj.flush()

    def fileno(self):
        return self.fileobj.fileno()

    def rewind(self):
        raise OSError("Can't rewind while computing checksum")

    def readable(self):
        return False

    def writable(self):
        return True

    def seekable(self):
        return True

    def tell(self):
        return self.fileobj.tell()

    def seek(self, offset, whence=io.SEEK_SET):
        # Only the no-op relative seek is allowed; any real movement would desync
        # the hash from the file contents. (Forward seek could in principle be
        # emulated with b"0" padding, but that is not implemented.)
        if offset != 0 or whence != io.SEEK_CUR:
            raise OSError("Can't seek while computing checksum")

    def readline(self, size=-1):
        raise OSError(errno.EBADF, "readline() on write-only object")
|
||||
|
||||
|
||||
@contextmanager
def gzip_compressed_tarfile(path):
    """Create a reproducible, gzip compressed tarfile, and keep track of shasums of both the
    compressed and uncompressed tarfile. Reproducibility is achieved by normalizing the gzip
    header (no file name and zero mtime).

    Yields a tuple of the following:
        tarfile.TarFile: tarfile object
        ChecksumWriter: checksum of the gzip compressed tarfile
        ChecksumWriter: checksum of the uncompressed tarfile
    """
    # Reproducibility notes:
    # 1) The explicit filename="" and mtime=0 normalize the gzip header; without
    #    filename="" Python would embed fileobj.name. This mimics `gzip --no-name`.
    # 2) compresslevel=6 (gzip's default) is used instead of Python's default 9:
    #    on an AMD Ryzen 3700X with an SSD, llvm compresses in ~4min at level 6
    #    vs ~12min at level 9, both yielding roughly 2.1GB, so we follow gzip.
    with open(path, "wb") as raw_file:
        with ChecksumWriter(raw_file) as compressed_checksum:
            with closing(
                GzipFile(
                    filename="", mode="wb", compresslevel=6, mtime=0, fileobj=compressed_checksum
                )
            ) as gzip_stream:
                with ChecksumWriter(gzip_stream) as uncompressed_checksum:
                    with tarfile.TarFile(name="", mode="w", fileobj=uncompressed_checksum) as tar:
                        yield tar, compressed_checksum, uncompressed_checksum
|
||||
|
||||
|
||||
def default_path_to_name(path: str) -> str:
    """Converts a path to a tarfile name, which uses posix path separators."""
    pure = pathlib.PurePath(path)
    if pure.is_absolute():
        # Drop the anchor: the leading slash on posix, the drive letter on windows.
        pure = pathlib.PurePath(*pure.parts[1:])
    return pure.as_posix()
|
||||
|
||||
|
||||
def reproducible_tarfile_from_prefix(
    tar: tarfile.TarFile,
    prefix: str,
    *,
    include_parent_directories: bool = False,
    skip: Callable[[os.DirEntry], bool] = lambda entry: False,
    path_to_name: Callable[[str], str] = default_path_to_name,
) -> None:
    """Create a tarball from a given directory. Only adds regular files, symlinks and dirs.
    Skips devices, fifos. Preserves hardlinks. Normalizes permissions like git. Tar entries are
    added in depth-first pre-order, with dir entries partitioned by file | dir, and sorted
    lexicographically, for reproducibility. Partitioning ensures only one dir is in memory at a
    time, and sorting improves compression.

    Args:
        tar: tarfile object opened in write mode
        prefix: path to directory to tar (either absolute or relative)
        include_parent_directories: whether to include every directory leading up to ``prefix`` in
            the tarball
        skip: function that receives a DirEntry and returns True if the entry should be skipped,
            whether it is a file or directory. Default implementation does not skip anything.
        path_to_name: function that converts a path string to a tarfile entry name, which should be
            in posix format. Not only is it necessary to transform paths in certain cases, such as
            windows path to posix format, but it can also be used to prepend a directory to each
            entry even if it does not exist on the filesystem. The default implementation drops the
            leading slash on posix and the drive letter on windows for absolute paths, and formats
            as a posix path."""

    # Maps (st_dev, st_ino) of a seen hardlinked file to its tar entry name, so later
    # links can be stored as LNKTYPE entries referencing the first occurrence.
    hardlink_to_tarinfo_name: Dict[Tuple[int, int], str] = {}

    if include_parent_directories:
        parent_dirs = reversed(pathlib.Path(prefix).parents)
        next(parent_dirs)  # skip the root: slices are supported from python 3.10
        for parent_dir in parent_dirs:
            dir_info = tarfile.TarInfo(path_to_name(str(parent_dir)))
            dir_info.type = tarfile.DIRTYPE
            dir_info.mode = 0o755
            tar.addfile(dir_info)

    dir_stack = [prefix]
    while dir_stack:
        # Renamed from `dir` to avoid shadowing the builtin.
        current_dir = dir_stack.pop()

        # Add the dir before its contents
        dir_info = tarfile.TarInfo(path_to_name(current_dir))
        dir_info.type = tarfile.DIRTYPE
        dir_info.mode = 0o755
        tar.addfile(dir_info)

        # Sort by name: reproducible & improves compression
        with os.scandir(current_dir) as it:
            entries = sorted(it, key=lambda entry: entry.name)

        new_dirs = []
        for entry in entries:
            if skip(entry):
                continue

            if entry.is_dir(follow_symlinks=False):
                new_dirs.append(entry.path)
                continue

            file_info = tarfile.TarInfo(path_to_name(entry.path))

            if entry.is_symlink():
                file_info.type = tarfile.SYMTYPE
                file_info.linkname = os.readlink(entry.path)
                # According to POSIX: "the value of the file mode bits returned in the
                # st_mode field of the stat structure is unspecified." So we set it to
                # something sensible without lstat'ing the link.
                file_info.mode = 0o755
                tar.addfile(file_info)

            elif entry.is_file(follow_symlinks=False):
                # entry.stat has zero (st_ino, st_dev, st_nlink) on Windows: use lstat.
                s = os.lstat(entry.path)

                # Normalize permissions like git
                file_info.mode = 0o755 if s.st_mode & 0o100 else 0o644

                # Deduplicate hardlinks
                if s.st_nlink > 1:
                    ident = (s.st_dev, s.st_ino)
                    if ident in hardlink_to_tarinfo_name:
                        file_info.type = tarfile.LNKTYPE
                        file_info.linkname = hardlink_to_tarinfo_name[ident]
                        tar.addfile(file_info)
                        continue
                    hardlink_to_tarinfo_name[ident] = file_info.name

                # If file not yet seen, copy it
                file_info.type = tarfile.REGTYPE
                file_info.size = s.st_size

                with open(entry.path, "rb") as f:
                    tar.addfile(file_info, f)

        dir_stack.extend(reversed(new_dirs))  # we pop, so reverse to stay alphabetical
|
Loading…
Reference in New Issue
Block a user