Windows: prefer Python decompression support (#36507)

On Windows, several commonly available system tools for decompression
are unreliable (gz/bz2/xz). This commit refactors `decompressor_for`
to call out to a Windows or Unix-specific method:

* The decompressor_for_nix method behaves the same as before and
  generally treats the Python/system support options for decompression
  as interchangeable (although it avoids using Python's built-in tar
  support, since that has had issues with permissions).
* The decompressor_for_win method can only use Python support for
  gz/bz2/xz, although for a tar.gz it does use system support for
  untar (after the decompression step). .zip uses the system tar
  utility, and .Z depends on external support (i.e. that the user
  has installed 7zip).

A naming scheme has been introduced for the various _decompression
methods:

* _system_gunzip means to use a system tool (and fail if it's not
    available)
* _py_gunzip means to use Python's built-in support for decompressing
    .gz files (and fail if it's not available)
* _gunzip is a method that can do either (it uses Python's built-in
    support when available and otherwise falls back to the system tool)
This commit is contained in:
John W. Parent 2023-05-10 18:07:56 -04:00 committed by GitHub
parent 830ee6a1eb
commit 85cc9097cb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 254 additions and 110 deletions

View File

@ -5,6 +5,7 @@
import os import os
import shutil import shutil
import sys
import pytest import pytest
@ -62,6 +63,7 @@ def test_native_unpacking(tmpdir_factory, archive_file):
assert "TEST" in contents assert "TEST" in contents
@pytest.mark.skipif(sys.platform == "win32", reason="Only Python unpacking available on Windows")
@pytest.mark.parametrize("archive_file", ext_archive.keys(), indirect=True) @pytest.mark.parametrize("archive_file", ext_archive.keys(), indirect=True)
def test_system_unpacking(tmpdir_factory, archive_file, compr_support_check): def test_system_unpacking(tmpdir_factory, archive_file, compr_support_check):
# actually run test # actually run test

View File

@ -14,6 +14,7 @@
from llnl.util import tty from llnl.util import tty
import spack.util.path as spath import spack.util.path as spath
from spack.error import SpackError
from spack.util.executable import CommandNotFoundError, which from spack.util.executable import CommandNotFoundError, which
# Supported archive extensions. # Supported archive extensions.
@ -68,12 +69,9 @@ def allowed_archive(path):
return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES) return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
def _untar(archive_file): def _system_untar(archive_file):
"""Untar archive. Prefer native Python `tarfile` """Returns path to unarchived tar file.
but fall back to system utility if there is a failure Untars archive via system tar.
to find the native Python module (tar on Unix).
Filters archives through native support gzip and xz
compression formats.
Args: Args:
archive_file (str): absolute path to the archive to be extracted. archive_file (str): absolute path to the archive to be extracted.
@ -88,32 +86,50 @@ def _untar(archive_file):
def _bunzip2(archive_file): def _bunzip2(archive_file):
"""Use Python's bz2 module to decompress bz2 compressed archives """Returns path to decompressed file.
Uses Python's bz2 module to decompress bz2 compressed archives
Fall back to system utility failing to find Python module `bz2` Fall back to system utility failing to find Python module `bz2`
Args: Args:
archive_file (str): absolute path to the bz2 archive to be decompressed archive_file (str): absolute path to the bz2 archive to be decompressed
""" """
if is_bz2_supported():
return _py_bunzip(archive_file)
else:
return _system_bunzip(archive_file)
def _py_bunzip(archive_file):
    """Decompress a bz2 compressed archive/file via Python's ``bz2`` module.

    The decompressed data is written to the current working directory, under
    the base name of ``archive_file`` with its ``bz2`` extension stripped.

    Args:
        archive_file (str): path to the bz2 compressed file

    Returns:
        str: path to the decompressed file
    """
    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
    archive_out = os.path.join(os.getcwd(), decompressed_file)
    # Both handles are managed by the ``with`` statement so they are closed
    # even when the copy fails part way through (the compressed handle was
    # previously leaked in that case).
    with bz2.BZ2File(archive_file, mode="rb") as f_bz, open(archive_out, "wb") as ar:
        shutil.copyfileobj(f_bz, ar)
    return archive_out
def _system_bunzip(archive_file):
"""Returns path to decompressed file.
Decompresses bz2 compressed archives/files via system bzip2 utility"""
compressed_file_name = os.path.basename(archive_file) compressed_file_name = os.path.basename(archive_file)
decompressed_file = os.path.basename(strip_extension(archive_file, "bz2")) decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
working_dir = os.getcwd() working_dir = os.getcwd()
archive_out = os.path.join(working_dir, decompressed_file) archive_out = os.path.join(working_dir, decompressed_file)
copy_path = os.path.join(working_dir, compressed_file_name) copy_path = os.path.join(working_dir, compressed_file_name)
if is_bz2_supported(): shutil.copy(archive_file, copy_path)
f_bz = bz2.BZ2File(archive_file, mode="rb") bunzip2 = which("bunzip2", required=True)
with open(archive_out, "wb") as ar: bunzip2.add_default_arg("-q")
shutil.copyfileobj(f_bz, ar) bunzip2(copy_path)
f_bz.close()
else:
shutil.copy(archive_file, copy_path)
bunzip2 = which("bunzip2", required=True)
bunzip2.add_default_arg("-q")
return bunzip2(copy_path)
return archive_out return archive_out
def _gunzip(archive_file): def _gunzip(archive_file):
"""Decompress `.gz` extensions. Prefer native Python `gzip` module. """Returns path to gunzip'd file
Decompresses `.gz` extensions. Prefer native Python `gzip` module.
Failing back to system utility gunzip. Failing back to system utility gunzip.
Like gunzip, but extracts in the current working directory Like gunzip, but extracts in the current working directory
instead of in-place. instead of in-place.
@ -121,34 +137,42 @@ def _gunzip(archive_file):
Args: Args:
archive_file (str): absolute path of the file to be decompressed archive_file (str): absolute path of the file to be decompressed
""" """
decompressed_file = os.path.basename(strip_extension(archive_file, "gz")) if is_gzip_supported():
return _py_gunzip(archive_file)
else:
return _system_gunzip(archive_file)
def _py_gunzip(archive_file):
"""Returns path to gunzip'd file
Decompresses `.gz` compressed archvies via python gzip module"""
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "gz"))
working_dir = os.getcwd() working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file) destination_abspath = os.path.join(working_dir, decompressed_file)
if is_gzip_supported(): f_in = gzip.open(archive_file, "rb")
f_in = gzip.open(archive_file, "rb") with open(destination_abspath, "wb") as f_out:
with open(destination_abspath, "wb") as f_out: shutil.copyfileobj(f_in, f_out)
shutil.copyfileobj(f_in, f_out) f_in.close()
f_in.close()
else:
_system_gunzip(archive_file)
return destination_abspath return destination_abspath
def _system_gunzip(archive_file): def _system_gunzip(archive_file):
decompressed_file = os.path.basename(strip_extension(archive_file, "gz")) """Returns path to gunzip'd file
Decompresses `.gz` compressed files via system gzip"""
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "gz"))
working_dir = os.getcwd() working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file) destination_abspath = os.path.join(working_dir, decompressed_file)
compressed_file = os.path.basename(archive_file) compressed_file = os.path.basename(archive_file)
copy_path = os.path.join(working_dir, compressed_file) copy_path = os.path.join(working_dir, compressed_file)
shutil.copy(archive_file, copy_path) shutil.copy(archive_file, copy_path)
gzip = which("gzip") gzip = which("gzip", required=True)
gzip.add_default_arg("-d") gzip.add_default_arg("-d")
gzip(copy_path) gzip(copy_path)
return destination_abspath return destination_abspath
def _unzip(archive_file): def _unzip(archive_file):
""" """Returns path to extracted zip archive
Extract Zipfile, searching for unzip system executable Extract Zipfile, searching for unzip system executable
If unavailable, search for 'tar' executable on system and use instead If unavailable, search for 'tar' executable on system and use instead
@ -157,7 +181,7 @@ def _unzip(archive_file):
""" """
extracted_file = os.path.basename(strip_extension(archive_file, "zip")) extracted_file = os.path.basename(strip_extension(archive_file, "zip"))
if sys.platform == "win32": if sys.platform == "win32":
return _untar(archive_file) return _system_untar(archive_file)
else: else:
exe = "unzip" exe = "unzip"
arg = "-q" arg = "-q"
@ -167,66 +191,76 @@ def _unzip(archive_file):
return extracted_file return extracted_file
def _unZ(archive_file): def _system_unZ(archive_file):
"""Returns path to decompressed file
Decompress UNIX compress style compression
Utilizes gunzip on unix and 7zip on Windows
"""
if sys.platform == "win32": if sys.platform == "win32":
result = _7zip(archive_file) result = _system_7zip(archive_file)
else: else:
result = _system_gunzip(archive_file) result = _system_gunzip(archive_file)
return result return result
def _lzma_decomp(archive_file): def _lzma_decomp(archive_file):
"""Decompress lzma compressed files. Prefer Python native """Returns path to decompressed xz file.
Decompress lzma compressed files. Prefer Python native
lzma module, but fall back on command line xz tooling lzma module, but fall back on command line xz tooling
to find available Python support. This is the xz command to find available Python support."""
on Unix and 7z on Windows"""
if is_lzma_supported(): if is_lzma_supported():
decompressed_file = os.path.basename(strip_extension(archive_file, "xz")) return _py_lzma(archive_file)
archive_out = os.path.join(os.getcwd(), decompressed_file)
with open(archive_out, "wb") as ar:
with lzma.open(archive_file) as lar:
shutil.copyfileobj(lar, ar)
else: else:
if sys.platform == "win32": return _xz(archive_file)
return _7zip(archive_file)
else:
return _xz(archive_file)
def _win_compressed_tarball_handler(archive_file): def _win_compressed_tarball_handler(decompressor):
"""Decompress and extract compressed tarballs on Windows. """Returns function pointer to two stage decompression
This method uses 7zip in conjunction with the tar utility and extraction method
to perform decompression and extraction in a two step process Decompress and extract compressed tarballs on Windows.
first using 7zip to decompress, and tar to extract. This method uses a decompression method in conjunction with
the tar utility to perform decompression and extraction in
a two step process first using decompressor to decompress,
and tar to extract.
The motivation for this method is the inability of 7zip The motivation for this method is Windows tar utility's lack
to directly decompress and extract compressed archives of access to the xz tool (unsupported natively on Windows) but
in a single shot without undocumented workarounds, and can be installed manually or via spack
the Windows tar utility's lack of access to the xz tool (unsupported on Windows)
""" """
# perform intermediate extraction step
# record name of new archive so we can extract def unarchive(archive_file):
# and later clean up # perform intermediate extraction step
decomped_tarball = _7zip(archive_file) # record name of new archive so we can extract
# 7zip is able to one shot extract compressed archives # and later clean up
# that have been named .txz. If that is the case, there will decomped_tarball = decompressor(archive_file)
# be no intermediate archvie to extract. if check_extension(decomped_tarball, "tar"):
if check_extension(decomped_tarball, "tar"): # run tar on newly decomped archive
# run tar on newly decomped archive outfile = _system_untar(decomped_tarball)
outfile = _untar(decomped_tarball) # clean intermediate archive to mimic end result
# clean intermediate archive to mimic end result # produced by one shot decomp/extraction
# produced by one shot decomp/extraction os.remove(decomped_tarball)
os.remove(decomped_tarball) return outfile
return outfile return decomped_tarball
return decomped_tarball
return unarchive
def _py_lzma(archive_file):
    """Decompress an lzma compressed ``.xz`` file via Python's ``lzma`` module.

    The decompressed data is written to the current working directory, under
    the base name of ``archive_file`` with its ``xz`` extension stripped.

    Args:
        archive_file (str): path to the xz compressed file

    Returns:
        str: path to the decompressed file
    """
    # Use strip_compression_extension, as _py_gunzip and _py_bunzip do: it
    # expands contracted extensions (e.g. txz) before stripping, whereas
    # strip_extension only attempts to remove the literal "xz" extension.
    decompressed_file = os.path.basename(strip_compression_extension(archive_file, "xz"))
    archive_out = os.path.join(os.getcwd(), decompressed_file)
    with open(archive_out, "wb") as ar, lzma.open(archive_file) as lar:
        shutil.copyfileobj(lar, ar)
    return archive_out
def _xz(archive_file): def _xz(archive_file):
"""Decompress lzma compressed .xz files via xz command line """Returns path to decompressed xz files
tool. Available only on Unix Decompress lzma compressed .xz files via xz command line
tool.
""" """
if sys.platform == "win32":
raise RuntimeError("XZ tool unavailable on Windows")
decompressed_file = os.path.basename(strip_extension(archive_file, "xz")) decompressed_file = os.path.basename(strip_extension(archive_file, "xz"))
working_dir = os.getcwd() working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file) destination_abspath = os.path.join(working_dir, decompressed_file)
@ -239,21 +273,20 @@ def _xz(archive_file):
return destination_abspath return destination_abspath
def _7zip(archive_file): def _system_7zip(archive_file):
"""Unpack/decompress with 7z executable """Returns path to decompressed file
Unpack/decompress with 7z executable
7z is able to handle a number file extensions however 7z is able to handle a number file extensions however
it may not be available on system. it may not be available on system.
Without 7z, Windows users with certain versions of Python may Without 7z, Windows users with certain versions of Python may
be unable to extract .xz files, and all Windows users will be unable be unable to extract .xz files, and all Windows users will be unable
to extract .Z files. If we cannot find 7z either externally or a to extract .Z files. If we cannot find 7z either externally or a
Spack installed copy, we fail, but inform the user that 7z can Spack installed copy, we fail, but inform the user that 7z can
be installed via `spack install 7zip` be installed via `spack install 7zip`
Args: Args:
archive_file (str): absolute path of file to be unarchived archive_file (str): absolute path of file to be unarchived
""" """
outfile = os.path.basename(strip_last_extension(archive_file)) outfile = os.path.basename(strip_compression_extension(archive_file))
_7z = which("7z") _7z = which("7z")
if not _7z: if not _7z:
raise CommandNotFoundError( raise CommandNotFoundError(
@ -267,12 +300,10 @@ def _7zip(archive_file):
def decompressor_for(path, extension=None): def decompressor_for(path, extension=None):
"""Returns a function pointer to appropriate decompression """Returns appropriate decompression/extraction algorithm function pointer
algorithm based on extension type. for provided extension. If extension is none, it is computed
from the `path` and the decompression function is derived
Args: from that information."""
path (str): path of the archive file requiring decompression
"""
if not extension: if not extension:
extension = extension_from_file(path, decompress=True) extension = extension_from_file(path, decompress=True)
@ -282,14 +313,28 @@ def decompressor_for(path, extension=None):
unrecognized file extension: '%s'" unrecognized file extension: '%s'"
% extension % extension
) )
if sys.platform == "win32":
return decompressor_for_win(extension)
else:
return decompressor_for_nix(extension)
if re.match(r"\.?zip$", extension) or path.endswith(".zip"):
def decompressor_for_nix(extension):
"""Returns a function pointer to appropriate decompression
algorithm based on extension type and unix specific considerations
i.e. a reasonable expectation system utils like gzip, bzip2, and xz are
available
Args:
path (str): path of the archive file requiring decompression
"""
if re.match(r"zip$", extension):
return _unzip return _unzip
if re.match(r"gz", extension): if re.match(r"gz$", extension):
return _gunzip return _gunzip
if re.match(r"bz2", extension): if re.match(r"bz2$", extension):
return _bunzip2 return _bunzip2
# Python does not have native support # Python does not have native support
@ -297,21 +342,80 @@ def decompressor_for(path, extension=None):
# we rely on external tools such as tar, # we rely on external tools such as tar,
# 7z, or uncompressZ # 7z, or uncompressZ
if re.match(r"Z$", extension): if re.match(r"Z$", extension):
return _unZ return _system_unZ
# Python and platform may not have support for lzma # Python and platform may not have support for lzma
# compression. If no lzma support, use tools available on systems # compression. If no lzma support, use tools available on systems
# 7zip on Windows and the xz tool on Unix systems. if re.match(r"xz$", extension):
if re.match(r"xz", extension):
return _lzma_decomp return _lzma_decomp
# Catch tar.xz/tar.Z files here for Windows return _system_untar
# as the tar utility on Windows cannot handle such
# compression types directly
if ("xz" in extension or "Z" in extension) and sys.platform == "win32":
return _win_compressed_tarball_handler
return _untar
def _determine_py_decomp_archive_strategy(extension):
"""Returns appropriate python based decompression strategy
based on extension type"""
# Only rely on Python decompression support for gz
if re.match(r"gz$", extension):
return _py_gunzip
# Only rely on Python decompression support for bzip2
if re.match(r"bz2$", extension):
return _py_bunzip
# Only rely on Python decompression support for xz
if re.match(r"xz$", extension):
return _py_lzma
return None
def decompressor_for_win(extension):
    """Returns a function pointer to appropriate decompression
    algorithm based on extension type and Windows specific considerations

    Windows natively vendors *only* tar, no other archive/compression utilities
    So we must rely exclusively on Python module support for all compression
    operations, tar for tarballs and zip files, and 7zip for Z compressed archives
    and files as Python does not provide support for the UNIX compress algorithm

    Args:
        extension (str): archive extension, contracted (e.g. tgz) or expanded

    Returns:
        A callable taking the path of the archive to decompress/extract.

    Raises:
        SpackError: if no decompression strategy can be derived for ``extension``
    """
    # Expand extension from contracted extension i.e. tar.gz from .tgz
    # no-op on non contracted extensions
    extension = expand_contracted_extension(extension)
    # Windows native tar can handle .zip extensions, use standard
    # unzip method
    if re.match(r"zip$", extension):
        return _unzip

    # if extension is standard tarball, invoke Windows native tar
    if re.match(r"tar$", extension):
        return _system_untar

    # Python does not have native support
    # of any kind for .Z files. In these cases,
    # we rely on 7zip, which must be installed outside
    # of spack and added to the PATH or externally detected
    if re.match(r"Z$", extension):
        return _system_unZ

    # Windows vendors no native decompression tools, attempt to derive
    # python based decompression strategy
    compression_extension = compression_ext_from_compressed_archive(extension)
    # No compression extension found means no strategy; skip the lookup so
    # the SpackError below is raised instead of a TypeError from re.match(None)
    decompressor = None
    if compression_extension:
        decompressor = _determine_py_decomp_archive_strategy(compression_extension)
    if not decompressor:
        # Adjacent literals are concatenated verbatim, so each fragment
        # carries its own separating whitespace/punctuation.
        raise SpackError(
            "Spack was unable to determine a proper decompression strategy for"
            f" valid extension: {extension}."
            " This is a bug, please file an issue at https://github.com/spack/spack/issues"
        )
    # Plain compressed file: decompression is the whole job
    if "tar" not in extension:
        return decompressor

    # Compressed tarball: decompress first, then untar in a second step
    return _win_compressed_tarball_handler(decompressor)
class FileTypeInterface: class FileTypeInterface:
@ -589,7 +693,7 @@ def extension_from_file(file, decompress=False):
def extension_from_path(path): def extension_from_path(path):
"""Get the allowed archive extension for a path. """Returns the allowed archive extension for a path.
If path does not include a valid archive extension If path does not include a valid archive extension
(see`spack.util.compression.ALLOWED_ARCHIVE_TYPES`) return None (see`spack.util.compression.ALLOWED_ARCHIVE_TYPES`) return None
""" """
@ -602,19 +706,23 @@ def extension_from_path(path):
return None return None
def strip_last_extension(path): def strip_compression_extension(path, ext=None):
"""Strips last supported archive extension from path""" """Returns path with last supported or provided archive extension stripped"""
if path: path = expand_contracted_extension_in_path(path)
for ext in ALLOWED_SINGLE_EXT_ARCHIVE_TYPES: exts_to_check = EXTS
mod_path = check_and_remove_ext(path, ext) if ext:
if mod_path != path: exts_to_check = [ext]
return mod_path for ext_check in exts_to_check:
mod_path = check_and_remove_ext(path, ext_check)
if mod_path != path:
return mod_path
return path return path
def strip_extension(path, ext=None): def strip_extension(path, ext=None):
"""Get the part of a path that does not include its compressed """Returns the part of a path that does not include extension.
type extension.""" If ext is given, only attempts to remove that extension. If no
extension given, attempts to strip any valid extension from path"""
if ext: if ext:
return check_and_remove_ext(path, ext) return check_and_remove_ext(path, ext)
for t in ALLOWED_ARCHIVE_TYPES: for t in ALLOWED_ARCHIVE_TYPES:
@ -625,7 +733,8 @@ def strip_extension(path, ext=None):
def check_extension(path, ext): def check_extension(path, ext):
"""Check if extension is present in path""" """Returns true if extension is present in path
false otherwise"""
# Strip sourceforge suffix. # Strip sourceforge suffix.
prefix, _ = spath.find_sourceforge_suffix(path) prefix, _ = spath.find_sourceforge_suffix(path)
if not ext.startswith(r"\."): if not ext.startswith(r"\."):
@ -636,7 +745,7 @@ def check_extension(path, ext):
def reg_remove_ext(path, ext): def reg_remove_ext(path, ext):
"""Regex remove ext from path""" """Returns path with ext remove via regex"""
if path and ext: if path and ext:
suffix = r"\.%s$" % ext suffix = r"\.%s$" % ext
return re.sub(suffix, "", path) return re.sub(suffix, "", path)
@ -644,8 +753,41 @@ def reg_remove_ext(path, ext):
def check_and_remove_ext(path, ext): def check_and_remove_ext(path, ext):
"""If given extension is present in path, remove and return, """Returns path with extension removed if extension
otherwise just return path""" is present in path. Otherwise just returns path"""
if check_extension(path, ext): if check_extension(path, ext):
return reg_remove_ext(path, ext) return reg_remove_ext(path, ext)
return path return path
def _substitute_extension(path, old_ext, new_ext):
"""Returns path with old_ext replaced with new_ext.
old_ext and new_ext can be extension strings or regexs"""
return re.sub(rf"{old_ext}", rf"{new_ext}", path)
def expand_contracted_extension_in_path(path, ext=None):
    """Returns path with any contraction extension (i.e. tgz) expanded
    (i.e. tar.gz). If ext is specified, only attempt to expand that extension

    Args:
        path (str): path whose extension should be expanded
        ext (str or None): extension to expand; derived from ``path`` when
            not given

    Returns:
        str: ``path`` with the contracted extension expanded, or ``path``
        unchanged when there is nothing to expand
    """
    if not ext:
        ext = extension_from_path(path)
    # extension_from_path returns None when path has no valid archive
    # extension; there is nothing to expand in that case.
    if not ext:
        return path
    # Normalize away surrounding dots: expand_contracted_extension strips them
    # from its result, so comparing against a dotted ext would always differ,
    # and a dotted ext used as a regex would also swallow the "." separator.
    ext = ext.strip(".")
    expanded_ext = expand_contracted_extension(ext)
    if expanded_ext != ext:
        # NOTE(review): _substitute_extension replaces every occurrence of ext
        # in path, not only the trailing extension - confirm that is acceptable
        # for paths that contain the contracted extension elsewhere.
        return _substitute_extension(path, ext, expanded_ext)
    return path
def expand_contracted_extension(extension):
    """Translate a contracted archive extension into its expanded form.

    Leading/trailing dots are removed first, so ``.tgz`` and ``tgz`` both give
    ``tar.gz``. An extension that is not a known contraction is returned as is
    (minus the surrounding dots).
    """
    bare = extension.strip(".")
    known_contractions = (
        ("tgz", "tar.gz"),
        ("txz", "tar.xz"),
        ("tbz", "tar.bz2"),
        ("tbz2", "tar.bz2"),
    )
    for contracted, expanded in known_contractions:
        if bare == contracted:
            return expanded
    return bare
def compression_ext_from_compressed_archive(extension):
    """Find the compression extension contained in a compressed archive extension.

    The extension is expanded first (e.g. tgz -> tar.gz), then the first entry
    of ``EXTS`` that occurs in it is returned. Returns None when no entry of
    ``EXTS`` occurs in the extension.
    """
    expanded = expand_contracted_extension(extension)
    return next((candidate for candidate in EXTS if candidate in expanded), None)