Include all deps and package content in the dag_hash()

For a long time, Spack has used a coarser hash to identify packages
than it likely should. Packages are identified by `dag_hash()`, which
includes only link and run dependencies. Build dependencies are
stripped before hashing, and we have not included hashes of build
artifacts or the `package.py` files used to build.  This means the
DAG hash actually doesn't represent all the things Spack can build,
and it reduces reproducibility.

We did this because, in the early days, users were (rightly) annoyed
when a new version of CMake, autotools, or some other build dependency
would necessitate a rebuild of their entire stack. Coarsening the hash
avoided this issue and enabled a modicum of stability when only reusing
packages by hash match.

Now that we have `--reuse`, we don't need to be so careful. Users can
avoid unnecessary rebuilds much more easily, and we can add more
provenance to the spec without worrying that frequent hash changes
will cause too many rebuilds.

This commit starts the refactor with the following major change:

- [x] Make `Spec.dag_hash()` include build, run, and link
      dependencies and the package hash (it is now equivalent to
      `full_hash()`).

It also adds a couple of bugfixes for problems discovered during
the switch:

- [x] Don't add a `package_hash()` in `to_node_dict()` unless
      the spec is concrete (fixes breaks on abstract specs)

- [x] Don't add source ids to the package hash for packages without
      a known fetch strategy (many mock packages are like this)

- [x] Change how `Spec.patches` is memoized. Using
      `llnl.util.lang.memoized` on `Spec` objects causes specs to
      be stored in a `dict`, which means they need a hash.  But,
      `dag_hash()` now includes patch `sha256`'s via the package
      hash, which can lead to infinite recursion
This commit is contained in:
Todd Gamblin 2022-01-19 01:04:34 -08:00
parent d900ac2003
commit e02020c80a
4 changed files with 43 additions and 39 deletions

View File

@ -34,9 +34,14 @@ def attr(self):
return '_' + self.name
#: Default Hash descriptor, used by Spec.dag_hash() and stored in the DB.
#: Spack's deployment hash. Includes all inputs that can affect how a package is built.
dag_hash = SpecHashDescriptor(
deptype=('link', 'run'), package_hash=False, name='hash')
deptype=('build', 'link', 'run'), package_hash=True, name='hash')
#: Same as dag_hash; old name.
full_hash = SpecHashDescriptor(
deptype=('build', 'link', 'run'), package_hash=True, name='full_hash')
#: Hash descriptor that includes build dependencies.
@ -51,10 +56,6 @@ def attr(self):
name='process_hash'
)
#: Full hash used in build pipelines to determine when to rebuild packages.
full_hash = SpecHashDescriptor(
deptype=('build', 'link', 'run'), package_hash=True, name='full_hash')
#: Package hash used as part of full hash
package_hash = SpecHashDescriptor(

View File

@ -1684,8 +1684,12 @@ def content_hash(self, content=None):
hash_content = list()
try:
source_id = fs.for_package_version(self, self.version).source_id()
except fs.ExtrapolationError:
except (fs.ExtrapolationError, fs.InvalidArgsError):
# ExtrapolationError happens if the package has no fetchers defined.
# InvalidArgsError happens when there are version directives with args,
# but none of them identifies an actual fetcher.
source_id = None
if not source_id:
# TODO? in cases where a digest or source_id isn't available,
# should this attempt to download the source and set one? This
@ -1699,6 +1703,7 @@ def content_hash(self, content=None):
hash_content.append(''.encode('utf-8'))
else:
hash_content.append(source_id.encode('utf-8'))
hash_content.extend(':'.join((p.sha256, str(p.level))).encode('utf-8')
for p in self.spec.patches)
hash_content.append(package_hash(self.spec, source=content).encode('utf-8'))

View File

@ -1803,13 +1803,20 @@ def package_hash(self):
def dag_hash(self, length=None):
"""This is Spack's default hash, used to identify installations.
At the moment, it excludes build dependencies to avoid rebuilding
packages whenever build dependency versions change. We will
revise this to include more detailed provenance when the
concretizer can more aggressievly reuse installed dependencies.
Same as the full hash (includes package hash and build/link/run deps).
NOTE: Versions of Spack prior to 0.18 only included link and run deps.
"""
return self._cached_hash(ht.dag_hash, length)
def full_hash(self, length=None):
"""Hash that includes all build and run inputs for a spec.
Inputs are: the package hash (to identify the package.py),
build, link, and run dependencies.
"""
return self._cached_hash(ht.full_hash, length)
def build_hash(self, length=None):
"""Hash used to store specs in environments.
@ -1819,21 +1826,13 @@ def build_hash(self, length=None):
return self._cached_hash(ht.build_hash, length)
def process_hash(self, length=None):
"""Hash used to store specs in environments.
"""Hash used to transfer specs among processes.
This hash includes build and test dependencies and is only used to
serialize a spec and pass it around among processes.
"""
return self._cached_hash(ht.process_hash, length)
def full_hash(self, length=None):
"""Hash to determine when to rebuild packages in the build pipeline.
This hash includes the package hash, so that we know when package
files has changed between builds.
"""
return self._cached_hash(ht.full_hash, length)
def dag_hash_bit_prefix(self, bits):
"""Get the first <bits> bits of the DAG hash as an integer type."""
return spack.util.hash.base32_prefix_bits(self.dag_hash(), bits)
@ -1931,7 +1930,7 @@ def to_node_dict(self, hash=ht.dag_hash):
if hasattr(variant, '_patches_in_order_of_appearance'):
d['patches'] = variant._patches_in_order_of_appearance
if hash.package_hash:
if self._concrete and hash.package_hash:
package_hash = self.package_hash()
# Full hashes are in bytes
@ -2845,13 +2844,13 @@ def ensure_external_path_if_external(external_spec):
@staticmethod
def ensure_no_deprecated(root):
"""Raise is a deprecated spec is in the dag.
"""Raise if a deprecated spec is in the dag.
Args:
root (Spec): root spec to be analyzed
Raises:
SpecDeprecatedError: is any deprecated spec is found
SpecDeprecatedError: if any deprecated spec is found
"""
deprecated = []
with spack.store.db.read_transaction():
@ -3683,7 +3682,6 @@ def virtual_dependencies(self):
return [spec for spec in self.traverse() if spec.virtual]
@property # type: ignore[misc] # decorated prop not supported in mypy
@lang.memoized
def patches(self):
"""Return patch objects for any patch sha256 sums on this Spec.
@ -3696,21 +3694,21 @@ def patches(self):
if not self.concrete:
raise spack.error.SpecError("Spec is not concrete: " + str(self))
if 'patches' not in self.variants:
return []
if not hasattr(self, "_patches"):
self._patches = []
if 'patches' in self.variants:
# FIXME: _patches_in_order_of_appearance is attached after
# FIXME: concretization to store the order of patches somewhere.
# FIXME: Needs to be refactored in a cleaner way.
# FIXME: _patches_in_order_of_appearance is attached after
# FIXME: concretization to store the order of patches somewhere.
# FIXME: Needs to be refactored in a cleaner way.
# translate patch sha256sums to patch objects by consulting the index
self._patches = []
for sha256 in self.variants['patches']._patches_in_order_of_appearance:
index = spack.repo.path.patch_index
patch = index.patch_for_package(sha256, self.package)
self._patches.append(patch)
# translate patch sha256sums to patch objects by consulting the index
patches = []
for sha256 in self.variants['patches']._patches_in_order_of_appearance:
index = spack.repo.path.patch_index
patch = index.patch_for_package(sha256, self.package)
patches.append(patch)
return patches
return self._patches
def _dup(self, other, deps=True, cleardeps=True):
"""Copy the spec other into self. This is an overwriting

View File

@ -3,9 +3,9 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""Test YAML serialization for specs.
"""Test YAML and JSON serialization for specs.
YAML format preserves DAG information in the spec.
The YAML and JSON formats preserve DAG information in the spec.
"""
import ast