Allow spec.json files to be clearsigned, use transparent compression for *.json (#34490)

This commit allows (remote) spec.json files to be clearsigned and gzipped.

The idea is to reduce the number of requests and the number of bytes transferred.
Author: Harmen Stoppels
Date:   2023-01-07 12:22:40 +01:00 (committed by GitHub)
Parent: 86e346e906
Commit: 8a1b817978
5 changed files with 252 additions and 155 deletions
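
To make the reduction concrete, here is a minimal standalone sketch of the transparent-decompression idea (illustrative names, not Spack's API): a mirror may serve spec.json either plain or gzipped, and the client sniffs the two gzip magic bytes rather than trusting the file extension.

    import gzip

    GZIP_MAGIC = b"\x1f\x8b"

    def maybe_decompress(raw: bytes) -> str:
        # A remote spec.json may or may not be gzipped; sniff the magic
        # bytes instead of relying on the file extension.
        if raw.startswith(GZIP_MAGIC):
            raw = gzip.decompress(raw)
        return raw.decode("utf-8")

    # A gzipped payload and a plain payload round-trip to the same text.
    payload = '{"spec": {}}'
    assert maybe_decompress(gzip.compress(payload.encode("utf-8"))) == payload
    assert maybe_decompress(payload.encode("utf-8")) == payload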

==== File 1 of 5 ====

@@ -5,7 +5,9 @@
 import codecs
 import collections
+import gzip
 import hashlib
+import io
 import json
 import multiprocessing.pool
 import os
@@ -613,6 +615,28 @@ def read_buildinfo_file(prefix):
     return buildinfo


+def transparently_decompress_bytes(binary_stream):
+    """Wrap stream in a decompress wrapper if gzip compressed"""
+    # Get magic bytes
+    if isinstance(binary_stream, io.BytesIO):
+        # Not peekable... Alternatively io.BufferedReader(io.BytesIO(...)),
+        # but to add yet another wrapper just to read two bytes that are
+        # already in memory... sigh.
+        magic = binary_stream.read(2)
+        binary_stream.seek(0)
+    else:
+        magic = binary_stream.peek(2)
+
+    # Verify magic
+    if magic.startswith(b"\x1f\x8b"):
+        return gzip.GzipFile(fileobj=binary_stream)
+
+    return binary_stream
+
+
+def transparently_decompress_bytes_to_string(binary_stream, encoding="utf-8"):
+    return codecs.getreader(encoding)(transparently_decompress_bytes(binary_stream))
+
+
 class BuildManifestVisitor(BaseDirectoryVisitor):
     """Visitor that collects a list of files and symlinks
     that can be checked for need of relocation. It knows how
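
A standalone usage sketch of the helper above, showing why it special-cases io.BytesIO: BytesIO has no peek(), while a buffered stream (such as an HTTP response wrapped in io.BufferedReader) can be peeked without consuming bytes. Note that peek(n) may return more than n bytes, which is why the helper compares with startswith().

    import gzip
    import io

    data = gzip.compress(b'{"hello": "world"}')

    # io.BytesIO has no peek(); read the magic bytes and rewind instead.
    bio = io.BytesIO(data)
    magic = bio.read(2)
    bio.seek(0)
    assert magic == b"\x1f\x8b"

    # A buffered stream can be peeked without consuming anything. peek(2)
    # may return more than two bytes, hence startswith() in the helper.
    buffered = io.BufferedReader(io.BytesIO(data))
    assert buffered.peek(2).startswith(b"\x1f\x8b")
    assert gzip.GzipFile(fileobj=buffered).read() == b'{"hello": "world"}'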
@@ -845,6 +869,42 @@ def sign_specfile(key, force, specfile_path):
     spack.util.gpg.sign(key, specfile_path, signed_specfile_path, clearsign=True)


+def _load_clearsigned_json(stream):
+    # Skip the PGP header
+    stream.readline()
+    stream.readline()
+    json = stream.read()
+    footer_index = json.rfind("-----BEGIN PGP SIGNATURE-----")
+
+    if footer_index == -1 or not json[footer_index - 1].isspace():
+        raise ValueError("Could not find PGP signature in clearsigned json file.")
+
+    return sjson.load(json[:footer_index])
+
+
+def _load_possibly_clearsigned_json(stream):
+    if _is_clearsigned_stream(stream):
+        return _load_clearsigned_json(stream)
+    return sjson.load(stream)
+
+
+def _is_clearsigned_stream(stream):
+    curr = stream.tell()
+    header = stream.read(34)
+    stream.seek(curr)
+    return header == "-----BEGIN PGP SIGNED MESSAGE-----"
+
+
+def is_clearsigned_file(path):
+    with open(path, "r") as f:
+        return _is_clearsigned_stream(f)
+
+
+def load_possibly_clearsigned_json(s):
+    """Deserialize JSON from a string or stream s, removing any clearsign
+    header/footer."""
+    s = io.StringIO(s) if isinstance(s, str) else s
+    return _load_possibly_clearsigned_json(s)
+
+
 def _read_specs_and_push_index(file_list, read_method, cache_prefix, db, temp_dir, concurrency):
     """Read all the specs listed in the provided list, using the given thread parallelism,
     generate the index, and push it to the mirror.
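
The parsing strategy of _load_clearsigned_json, restated as a self-contained sketch: skip the two header lines, then cut the payload at the last occurrence of the signature marker. Using rfind plus the preceding-whitespace check means a JSON string that merely contains the marker cannot truncate the payload in the wrong place (see the tests further down).

    import io
    import json

    clearsigned = "\n".join(
        [
            "-----BEGIN PGP SIGNED MESSAGE-----",
            "Hash: SHA512",
            '{"name": "zlib"}',
            "-----BEGIN PGP SIGNATURE-----",
            "...base64 signature...",
            "-----END PGP SIGNATURE-----",
        ]
    )

    stream = io.StringIO(clearsigned)
    stream.readline()  # skip "-----BEGIN PGP SIGNED MESSAGE-----"
    stream.readline()  # skip "Hash: SHA512"
    body = stream.read()

    footer = body.rfind("-----BEGIN PGP SIGNATURE-----")
    # The real footer is the *last* marker and starts on its own line, so a
    # marker embedded in a JSON string can never be mistaken for it.
    assert footer != -1 and body[footer - 1].isspace()
    assert json.loads(body[:footer]) == {"name": "zlib"}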
@@ -867,11 +927,7 @@ def _fetch_spec_from_mirror(spec_url):
         if spec_file_contents:
             # Need full spec.json name or this gets confused with index.json.
-            if spec_url.endswith(".json.sig"):
-                specfile_json = Spec.extract_json_from_clearsig(spec_file_contents)
-                return Spec.from_dict(specfile_json)
-            if spec_url.endswith(".json"):
-                return Spec.from_json(spec_file_contents)
+            return Spec.from_dict(load_possibly_clearsigned_json(spec_file_contents))

     tp = multiprocessing.pool.ThreadPool(processes=concurrency)
     try:
@@ -933,8 +989,8 @@ def _specs_from_cache_aws_cli(cache_prefix):
     aws = which("aws")

     def file_read_method(file_path):
-        with open(file_path) as fd:
-            return fd.read()
+        with open(file_path, "rb") as f:
+            return transparently_decompress_bytes_to_string(f).read()

     tmpspecsdir = tempfile.mkdtemp()
     sync_command_args = [
@@ -981,7 +1037,7 @@ def url_read_method(url):
     contents = None
     try:
         _, _, spec_file = web_util.read_from_url(url)
-        contents = codecs.getreader("utf-8")(spec_file).read()
+        contents = transparently_decompress_bytes_to_string(spec_file).read()
     except (URLError, web_util.SpackWebError) as url_err:
         tty.error("Error reading specfile: {0}".format(url))
         tty.error(url_err)
@@ -1379,7 +1435,7 @@ def try_verify(specfile_path):
     return True


-def try_fetch(url_to_fetch):
+def try_fetch(url_to_fetch, try_decompress=False):
     """Utility function to try and fetch a file from a url, stage it
     locally, and return the path to the staged file.

@@ -1398,6 +1454,21 @@ def try_fetch(url_to_fetch):
         stage.destroy()
         return None

+    if not try_decompress:
+        return stage
+
+    # Stage has some logic for automatically expanding archives, but it
+    # is based on file extensions. So instead, we more or less repeat the
+    # logic here.
+    try:
+        tmp = stage.save_filename + ".tmp"
+        with gzip.open(stage.save_filename, "rb") as compressed:
+            with open(tmp, "wb") as decompressed:
+                shutil.copyfileobj(compressed, decompressed)
+        os.rename(tmp, stage.save_filename)
+    except OSError:
+        pass
+
     return stage
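
Since Stage expands archives based on file extensions, the change above decompresses in place when asked. Here is the same write-to-a-temp-file-then-rename pattern as a standalone sketch; gzip.open only fails at read time, and gzip.BadGzipFile is an OSError subclass, which is what the except clause relies on to leave non-gzip files untouched.

    import gzip
    import os
    import shutil
    import tempfile

    def decompress_in_place(path):
        tmp = path + ".tmp"
        try:
            with gzip.open(path, "rb") as compressed:
                with open(tmp, "wb") as decompressed:
                    shutil.copyfileobj(compressed, decompressed)
            os.rename(tmp, path)  # replace the original only on success
        except OSError:
            pass  # not gzip data: keep the file as-is

    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, "spec.json")
        with open(path, "wb") as f:
            f.write(gzip.compress(b'{"spec": {}}'))
        decompress_in_place(path)
        with open(path, "rb") as f:
            assert f.read() == b'{"spec": {}}'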
@@ -1467,61 +1538,45 @@ def download_tarball(spec, unsigned=False, mirrors_for_spec=None):
             }
         )

-    tried_to_verify_sigs = []
+    verification_failure = False

+    # Assumes we care more about finding a spec file by preferred extension
+    # than by mirror priority. This can be made less complicated as we
+    # remove support for deprecated spec formats and buildcache layouts.
     for ext in ["json.sig", "json"]:
-        for mirror_to_try in mirrors_to_try:
-            specfile_url = "{0}.{1}".format(mirror_to_try["specfile"], ext)
-            spackfile_url = mirror_to_try["spackfile"]
-            local_specfile_stage = try_fetch(specfile_url)
-            if local_specfile_stage:
-                local_specfile_path = local_specfile_stage.save_filename
-                signature_verified = False
-
-                if ext.endswith(".sig") and not unsigned:
-                    # If we found a signed specfile at the root, try to verify
-                    # the signature immediately. We will not download the
-                    # tarball if we could not verify the signature.
-                    tried_to_verify_sigs.append(specfile_url)
-                    signature_verified = try_verify(local_specfile_path)
-                    if not signature_verified:
-                        tty.warn("Failed to verify: {0}".format(specfile_url))
-
-                if unsigned or signature_verified or not ext.endswith(".sig"):
-                    # We will download the tarball in one of three cases:
-                    # 1. user asked for --no-check-signature
-                    # 2. user didn't ask for --no-check-signature, but we
-                    # found a spec.json.sig and verified the signature already
-                    # 3. neither of the first two cases are true, but this file
-                    # is *not* a signed json (not a spec.json.sig file). That
-                    # means we already looked at all the mirrors and either didn't
-                    # find any .sig files or couldn't verify any of them. But it
-                    # is still possible to find an old style binary package where
-                    # the signature is a detached .asc file in the outer archive
-                    # of the tarball, and in that case, the only way to know is to
-                    # download the tarball. This is a deprecated use case, so if
-                    # something goes wrong during the extraction process (can't
-                    # verify signature, checksum doesn't match) we will fail at
-                    # that point instead of trying to download more tarballs from
-                    # the remaining mirrors, looking for one we can use.
-                    tarball_stage = try_fetch(spackfile_url)
-                    if tarball_stage:
-                        return {
-                            "tarball_stage": tarball_stage,
-                            "specfile_stage": local_specfile_stage,
-                            "signature_verified": signature_verified,
-                        }
-
-                local_specfile_stage.destroy()
+        for mirror in mirrors_to_try:
+            # Try to download the specfile. For any legacy version of Spack's
+            # buildcache we definitely require this file.
+            specfile_url = "{0}.{1}".format(mirror["specfile"], ext)
+            specfile_stage = try_fetch(specfile_url, try_decompress=True)
+            if not specfile_stage:
+                continue
+
+            specfile_path = specfile_stage.save_filename
+
+            # If it is a clearsign file, we must verify it (unless disabled)
+            should_verify = not unsigned and is_clearsigned_file(specfile_path)
+            if should_verify and not try_verify(specfile_path):
+                verification_failure = True
+                tty.warn("Failed to verify: {0}".format(specfile_url))
+                specfile_stage.destroy()
+                continue
+
+            # In case the spec.json is not clearsigned, it is a legacy format,
+            # where either the signature is in the tarball with the binaries,
+            # or the package is unsigned. Verification is then postponed.
+            spackfile_url = mirror["spackfile"]
+            tarball_stage = try_fetch(spackfile_url)
+            if tarball_stage:
+                return {
+                    "tarball_stage": tarball_stage,
+                    "specfile_stage": specfile_stage,
+                    "signature_verified": should_verify,  # should_verify implies it was verified
+                }
+
+            specfile_stage.destroy()

     # Falling through the nested loops means we exhaustively searched
     # for all known kinds of spec files on all mirrors and did not find
     # an acceptable one for which we could download a tarball.
-    if tried_to_verify_sigs:
+    if verification_failure:
         raise NoVerifyException(
             (
                 "Spack found new style signed binary packages, "
@@ -1802,11 +1857,7 @@ def extract_tarball(spec, download_result, allow_root=False, unsigned=False, for
     specfile_path = download_result["specfile_stage"].save_filename

     with open(specfile_path, "r") as inputfile:
-        content = inputfile.read()
-        if specfile_path.endswith(".json.sig"):
-            spec_dict = Spec.extract_json_from_clearsig(content)
-        else:
-            spec_dict = sjson.load(content)
+        spec_dict = load_possibly_clearsigned_json(inputfile)

     bchecksum = spec_dict["binary_cache_checksum"]
     filename = download_result["tarball_stage"].save_filename
@@ -1971,54 +2022,35 @@ def try_direct_fetch(spec, mirrors=None):
     """
     specfile_name = tarball_name(spec, ".spec.json")
     signed_specfile_name = tarball_name(spec, ".spec.json.sig")
-    specfile_is_signed = False
     found_specs = []

     for mirror in spack.mirror.MirrorCollection(mirrors=mirrors).values():
-        buildcache_fetch_url_json = url_util.join(
-            mirror.fetch_url, _build_cache_relative_path, specfile_name
-        )
-        buildcache_fetch_url_signed_json = url_util.join(
-            mirror.fetch_url, _build_cache_relative_path, signed_specfile_name
-        )
-        try:
-            _, _, fs = web_util.read_from_url(buildcache_fetch_url_signed_json)
-            specfile_is_signed = True
-        except (URLError, web_util.SpackWebError, HTTPError) as url_err:
+        for file in (specfile_name, signed_specfile_name):
+            url = url_util.join(mirror.fetch_url, _build_cache_relative_path, file)
             try:
-                _, _, fs = web_util.read_from_url(buildcache_fetch_url_json)
-            except (URLError, web_util.SpackWebError, HTTPError) as url_err_x:
+                _, _, fs = web_util.read_from_url(url)
+            except (URLError, web_util.SpackWebError, HTTPError) as url_err:
                 tty.debug(
-                    "Did not find {0} on {1}".format(
-                        specfile_name, buildcache_fetch_url_signed_json
-                    ),
+                    "Did not find {0} on {1}".format(specfile_name, url),
                     url_err,
                     level=2,
                 )
-                tty.debug(
-                    "Did not find {0} on {1}".format(specfile_name, buildcache_fetch_url_json),
-                    url_err_x,
-                    level=2,
-                )
                 continue
-        specfile_contents = codecs.getreader("utf-8")(fs).read()

-        # read the spec from the build cache file. All specs in build caches
-        # are concrete (as they are built) so we need to mark this spec
-        # concrete on read-in.
-        if specfile_is_signed:
-            specfile_json = Spec.extract_json_from_clearsig(specfile_contents)
-            fetched_spec = Spec.from_dict(specfile_json)
-        else:
-            fetched_spec = Spec.from_json(specfile_contents)
-        fetched_spec._mark_concrete()
+            # read the spec from the build cache file. All specs in build caches
+            # are concrete (as they are built) so we need to mark this spec
+            # concrete on read-in.
+            stream = transparently_decompress_bytes_to_string(fs)
+            fetched_spec = Spec.from_dict(load_possibly_clearsigned_json(stream))
+            fetched_spec._mark_concrete()

-        found_specs.append(
-            {
-                "mirror_url": mirror.fetch_url,
-                "spec": fetched_spec,
-            }
-        )
+            found_specs.append(
+                {
+                    "mirror_url": mirror.fetch_url,
+                    "spec": fetched_spec,
+                }
+            )
+            break

     return found_specs
@@ -2097,7 +2129,7 @@ def get_keys(install=False, trust=False, force=False, mirrors=None):
     try:
         _, _, json_file = web_util.read_from_url(keys_index)
-        json_index = sjson.load(codecs.getreader("utf-8")(json_file))
+        json_index = sjson.load(transparently_decompress_bytes_to_string(json_file))
     except (URLError, web_util.SpackWebError) as url_err:
         if web_util.url_exists(keys_index):
             err_msg = [
@@ -2422,11 +2454,15 @@ def conditional_fetch(self):
            raise FetchIndexError("Could not fetch index from {}".format(url_index), e)

        try:
-            result = codecs.getreader("utf-8")(response).read()
+            binary_result = response.read()
        except ValueError as e:
            return FetchCacheError("Remote index {} is invalid".format(url_index), e)

-        computed_hash = compute_hash(result)
+        # The hash is computed on the raw bytes
+        computed_hash = compute_hash(binary_result)
+
+        # Only then decode as string, possibly decompress
+        result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read()

        # We don't handle computed_hash != remote_hash here, which can happen
        # when remote index.json and index.json.hash are out of sync, or if
@@ -2480,15 +2516,21 @@ def conditional_fetch(self):
            raise FetchIndexError("Could not fetch index {}".format(url), e) from e

        try:
-            result = codecs.getreader("utf-8")(response).read()
+            binary_result = response.read()
        except ValueError as e:
            raise FetchIndexError("Remote index {} is invalid".format(url), e) from e

+        # The hash is computed on the raw bytes
+        computed_hash = compute_hash(binary_result)
+
+        # Only then decode as string, possibly decompress
+        result = transparently_decompress_bytes_to_string(io.BytesIO(binary_result)).read()
+
        headers = response.headers
        etag_header_value = headers.get("Etag", None) or headers.get("etag", None)
        return FetchIndexResult(
            etag=web_util.parse_etag(etag_header_value),
-            hash=compute_hash(result),
+            hash=computed_hash,
            data=result,
            fresh=False,
        )
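
Both conditional_fetch changes hinge on the same ordering: hash first, over the raw bytes exactly as served (possibly gzipped), and only then decode or decompress; otherwise the client-side hash could never match the mirror's published index.json.hash for a compressed index. A sketch of that ordering, assuming for illustration that compute_hash is a sha256 hex digest (Spack's actual helper may differ):

    import gzip
    import hashlib

    def compute_hash(data):
        # Assumption: sha256 hex digest standing in for Spack's compute_hash.
        return hashlib.sha256(data).hexdigest()

    served = gzip.compress(b'{"database": {}}')  # bytes exactly as served by the mirror
    computed_hash = compute_hash(served)  # hash the raw bytes first...
    result = gzip.decompress(served).decode("utf-8")  # ...decode/decompress afterwards

    # Hashing the decoded text instead would not match the published hash.
    assert computed_hash != compute_hash(result.encode("utf-8"))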

==== File 2 of 5 ====

@@ -156,16 +156,6 @@
 default_format += "{%compiler.name}{@compiler.version}{compiler_flags}"
 default_format += "{variants}{arch=architecture}"

-#: Regular expression to pull spec contents out of clearsigned signature
-#: file.
-CLEARSIGN_FILE_REGEX = re.compile(
-    (
-        r"^-----BEGIN PGP SIGNED MESSAGE-----"
-        r"\s+Hash:\s+[^\s]+\s+(.+)-----BEGIN PGP SIGNATURE-----"
-    ),
-    re.MULTILINE | re.DOTALL,
-)
-
 #: specfile format version. Must increase monotonically
 specfile_format_version = 3
@@ -2453,27 +2443,6 @@ def from_json(stream):
        except Exception as e:
            raise sjson.SpackJSONError("error parsing JSON spec:", str(e)) from e

-    @staticmethod
-    def extract_json_from_clearsig(data):
-        m = CLEARSIGN_FILE_REGEX.search(data)
-        if m:
-            return sjson.load(m.group(1))
-        return sjson.load(data)
-
-    @staticmethod
-    def from_signed_json(stream):
-        """Construct a spec from clearsigned json spec file.
-
-        Args:
-            stream: string or file object to read from.
-        """
-        data = stream
-        if hasattr(stream, "read"):
-            data = stream.read()
-
-        extracted_json = Spec.extract_json_from_clearsig(data)
-        return Spec.from_dict(extracted_json)
-
    @staticmethod
    def from_detection(spec_str, extra_attributes=None):
        """Construct a spec from a spec string determined during external

==== File 3 of 5 ====

@@ -3,7 +3,9 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import glob
+import gzip
 import io
+import json
 import os
 import platform
 import sys
@@ -889,3 +891,106 @@ def urlopen(request: urllib.request.Request):

     with pytest.raises(bindist.FetchIndexError, match="Could not fetch index"):
         fetcher.conditional_fetch()
+
+
+def test_read_spec_from_signed_json():
+    spec_dir = os.path.join(test_path, "data", "mirrors", "signed_json")
+    file_name = (
+        "linux-ubuntu18.04-haswell-gcc-8.4.0-"
+        "zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig"
+    )
+    spec_path = os.path.join(spec_dir, file_name)
+
+    def check_spec(spec_to_check):
+        assert spec_to_check.name == "zlib"
+        assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb"
+
+    with open(spec_path) as fd:
+        s = Spec.from_dict(bindist.load_possibly_clearsigned_json(fd))
+    check_spec(s)
+
+
+def test_load_clearsigned_json():
+    obj = {"hello": "world"}
+    clearsigned = """\
+-----BEGIN PGP SIGNED MESSAGE-----
+Hash: SHA512
+{}
+-----BEGIN PGP SIGNATURE-----
+xyz
+-----END PGP SIGNATURE-----""".format(
+        json.dumps(obj)
+    )
+
+    # Should accept strings and streams
+    assert bindist.load_possibly_clearsigned_json(clearsigned) == obj
+    assert bindist.load_possibly_clearsigned_json(io.StringIO(clearsigned)) == obj
+
+
+def test_load_without_clearsigned_json():
+    obj = {"hello": "world"}
+    not_clearsigned = json.dumps(obj)
+
+    # Should accept strings and streams
+    assert bindist.load_possibly_clearsigned_json(not_clearsigned) == obj
+    assert bindist.load_possibly_clearsigned_json(io.StringIO(not_clearsigned)) == obj
+
+
+def test_json_containing_clearsigned_message_is_json():
+    # Test that we don't interpret json with a PGP signed message as a string
+    # somewhere as a clearsigned message. It should just deserialize the json
+    # contents.
+    val = """\
+"-----BEGIN PGP SIGNED MESSAGE-----
+Hash: SHA512
+{}
+-----BEGIN PGP SIGNATURE-----
+signature
+-----END PGP SIGNATURE-----
+"""
+    input = json.dumps({"key": val})
+    assert bindist.load_possibly_clearsigned_json(input)["key"] == val
+
+
+def test_clearsign_signature_part_of_json_string():
+    # Check if we can deal with a string in json containing the string that
+    # is used at the start of a PGP signature.
+    obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"}
+    input = """\
+-----BEGIN PGP SIGNED MESSAGE-----
+Hash: SHA512
+{}
+-----BEGIN PGP SIGNATURE-----
+signature
+-----END PGP SIGNATURE-----
+""".format(
+        json.dumps(obj)
+    )
+    assert bindist.load_possibly_clearsigned_json(input) == obj
+
+
+def test_broken_clearsign_signature():
+    # In this test there is no PGP signature.
+    obj = {"-----BEGIN PGP SIGNATURE-----": "-----BEGIN PGP SIGNATURE-----"}
+    input = """\
+-----BEGIN PGP SIGNED MESSAGE-----
+Hash: SHA512
+{}
+""".format(
+        json.dumps(obj)
+    )
+    with pytest.raises(ValueError, match="Could not find PGP signature"):
+        bindist.load_possibly_clearsigned_json(input)
+
+
+def test_transparent_decompression():
+    """Test a roundtrip of string -> gzip -> string."""
+    text = "this is utf8 that should be compressed"
+    gzipped = io.BytesIO()
+    with gzip.GzipFile(fileobj=gzipped, mode="w", compresslevel=1, mtime=0) as f:
+        f.write(text.encode("utf-8"))
+    gzipped.seek(0)
+    assert bindist.transparently_decompress_bytes_to_string(gzipped).read() == text

==== File 4 of 5 ====

@@ -1324,7 +1324,9 @@ def test_push_mirror_contents(
            if file_name.endswith(".spec.json.sig"):
                spec_json_path = os.path.join(buildcache_path, file_name)
                with open(spec_json_path) as json_fd:
-                    json_object = Spec.extract_json_from_clearsig(json_fd.read())
+                    json_object = spack.binary_distribution.load_possibly_clearsigned_json(
+                        json_fd
+                    )
                    jsonschema.validate(json_object, specfile_schema)

    logs_dir = working_dir.join("logs_dir")

==== File 5 of 5 ====

@@ -47,27 +47,6 @@ def test_simple_spec():
     check_json_round_trip(spec)


-def test_read_spec_from_signed_json():
-    spec_dir = os.path.join(spack.paths.test_path, "data", "mirrors", "signed_json")
-    file_name = (
-        "linux-ubuntu18.04-haswell-gcc-8.4.0-"
-        "zlib-1.2.12-g7otk5dra3hifqxej36m5qzm7uyghqgb.spec.json.sig"
-    )
-    spec_path = os.path.join(spec_dir, file_name)
-
-    def check_spec(spec_to_check):
-        assert spec_to_check.name == "zlib"
-        assert spec_to_check._hash == "g7otk5dra3hifqxej36m5qzm7uyghqgb"
-
-    with open(spec_path) as fd:
-        s = Spec.from_signed_json(fd)
-    check_spec(s)
-
-    with open(spec_path) as fd:
-        s = Spec.from_signed_json(fd.read())
-    check_spec(s)
-
-
 def test_normal_spec(mock_packages):
     spec = Spec("mpileaks+debug~opt")
     spec.normalize()