Extract prefix locks and failure markers from Database (#39024)
This PR extracts two responsibilities from the `Database` class:

1. Managing locks for prefixes during an installation
2. Marking installation failures

and pushes them into their own classes (`SpecLocker` and `FailureTracker`). These responsibilities are also moved up into the `Store`, leaving `Database` with only the duty of managing `index.json` files.

`SpecLocker` instances no longer share a global list of locks; locks are kept per instance. A lock's identifier is simply `(dag hash, package name)` rather than the spec's prefix path, which avoids circular dependencies across `Store` / `Database` / `Spec`.
parent 27f04b3544 · commit ba1d295023
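
For orientation, here is a minimal sketch — not part of the diff — of what call sites look like after the change. It assumes a Spack checkout on `sys.path` and some concrete `spec`; the attribute and method names match the classes introduced below.

```python
import spack.store


def reinstall_after_failure(spec):
    """Sketch: installer-style code using the new Store-level objects."""
    store = spack.store.STORE

    # Failure tracking moved off Database onto store.failure_tracker:
    if store.failure_tracker.has_failed(spec):
        store.failure_tracker.clear(spec, force=True)

    # Prefix locking moved off Database onto store.prefix_locker:
    with store.prefix_locker.write_lock(spec):
        ...  # (re)install under the per-spec prefix lock
```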
@@ -17,6 +17,7 @@
 import spack.config
 import spack.repo
 import spack.stage
+import spack.store
 import spack.util.path
 from spack.paths import lib_path, var_path
@@ -121,7 +122,7 @@ def clean(parser, args):

    if args.failures:
        tty.msg("Removing install failure marks")
-        spack.installer.clear_failures()
+        spack.store.STORE.failure_tracker.clear_all()

    if args.misc_cache:
        tty.msg("Removing cached information on repositories")
@@ -21,10 +21,11 @@
 import contextlib
 import datetime
 import os
+import pathlib
 import socket
 import sys
 import time
-from typing import Dict, List, NamedTuple, Set, Type, Union
+from typing import Any, Callable, Dict, Generator, List, NamedTuple, Set, Type, Union

 try:
     import uuid
@@ -141,22 +142,23 @@ class InstallStatuses:
     def canonicalize(cls, query_arg):
         if query_arg is True:
             return [cls.INSTALLED]
-        elif query_arg is False:
+        if query_arg is False:
             return [cls.MISSING]
-        elif query_arg is any:
+        if query_arg is any:
             return [cls.INSTALLED, cls.DEPRECATED, cls.MISSING]
-        elif isinstance(query_arg, InstallStatus):
+        if isinstance(query_arg, InstallStatus):
             return [query_arg]
-        else:
-            try:  # Try block catches if it is not an iterable at all
-                if any(type(x) != InstallStatus for x in query_arg):
-                    raise TypeError
-            except TypeError:
-                raise TypeError(
-                    "installation query must be `any`, boolean, "
-                    "InstallStatus, or iterable of InstallStatus"
-                )
-            return query_arg
+        try:
+            statuses = list(query_arg)
+            if all(isinstance(x, InstallStatus) for x in statuses):
+                return statuses
+        except TypeError:
+            pass
+
+        raise TypeError(
+            "installation query must be `any`, boolean, "
+            "InstallStatus, or iterable of InstallStatus"
+        )


 class InstallRecord:
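
As an aside, the rewrite above also fixes a subtle bug: the old code consumed `query_arg` with `any(...)` and then returned the (possibly exhausted) iterable, while the new code materializes it with `list(query_arg)` first. An illustrative sketch of the resulting behavior (names as in the diff; not runnable outside a Spack checkout):

```python
# Illustrative behavior of the rewritten classmethod (names from the diff above):
InstallStatuses.canonicalize(True)      # -> [InstallStatuses.INSTALLED]
InstallStatuses.canonicalize(False)     # -> [InstallStatuses.MISSING]
InstallStatuses.canonicalize(any)       # -> [INSTALLED, DEPRECATED, MISSING]
InstallStatuses.canonicalize(object())  # -> raises TypeError with the usage message
```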
@@ -306,15 +308,16 @@ def __reduce__(self):

    """

-#: Data class to configure locks in Database objects
-#:
-#: Args:
-#:     enable (bool): whether to enable locks or not.
-#:     database_timeout (int or None): timeout for the database lock
-#:     package_timeout (int or None): timeout for the package lock
-

 class LockConfiguration(NamedTuple):
+    """Data class to configure locks in Database objects
+
+    Args:
+        enable: whether to enable locks or not.
+        database_timeout: timeout for the database lock
+        package_timeout: timeout for the package lock
+    """
+
     enable: bool
     database_timeout: Optional[int]
     package_timeout: Optional[int]
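
A quick sketch of constructing the named tuple under its relocated docstring (illustrative values; `None` means no timeout, matching the "No timeout" handling in `Store.__init__` further down):

```python
# Illustrative only:
cfg = LockConfiguration(enable=True, database_timeout=120, package_timeout=None)
assert cfg.enable and cfg.package_timeout is None  # None -> "No timeout"
```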
@@ -348,13 +351,230 @@ def lock_configuration(configuration):
     )


+def prefix_lock_path(root_dir: Union[str, pathlib.Path]) -> pathlib.Path:
+    """Returns the path of the prefix lock file, given the root directory.
+
+    Args:
+        root_dir: root directory containing the database directory
+    """
+    return pathlib.Path(root_dir) / _DB_DIRNAME / "prefix_lock"
+
+
+def failures_lock_path(root_dir: Union[str, pathlib.Path]) -> pathlib.Path:
+    """Returns the path of the failures lock file, given the root directory.
+
+    Args:
+        root_dir: root directory containing the database directory
+    """
+    return pathlib.Path(root_dir) / _DB_DIRNAME / "prefix_failures"
+
+
+class SpecLocker:
+    """Manages acquiring and releasing read or write locks on concrete specs."""
+
+    def __init__(self, lock_path: Union[str, pathlib.Path], default_timeout: Optional[float]):
+        self.lock_path = pathlib.Path(lock_path)
+        self.default_timeout = default_timeout
+
+        # Maps (spec.dag_hash(), spec.name) to the corresponding lock object
+        self.locks: Dict[Tuple[str, str], lk.Lock] = {}
+
+    def lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock:
+        """Returns a lock on a concrete spec.
+
+        The lock is a byte range lock on the nth byte of a file.
+
+        The lock file is ``self.lock_path``.
+
+        n is the sys.maxsize-bit prefix of the DAG hash. This makes the likelihood of
+        collision very low AND it gives us readers-writer lock semantics with just a
+        single lockfile, so no cleanup required.
+        """
+        assert spec.concrete, "cannot lock a non-concrete spec"
+        timeout = timeout or self.default_timeout
+        key = self._lock_key(spec)
+
+        if key not in self.locks:
+            self.locks[key] = self.raw_lock(spec, timeout=timeout)
+        else:
+            self.locks[key].default_timeout = timeout
+
+        return self.locks[key]
+
+    def raw_lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock:
+        """Returns a raw lock for a Spec, but doesn't keep track of it."""
+        return lk.Lock(
+            str(self.lock_path),
+            start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)),
+            length=1,
+            default_timeout=timeout,
+            desc=spec.name,
+        )
+
+    def has_lock(self, spec: "spack.spec.Spec") -> bool:
+        """Returns True if the spec is already managed by this spec locker"""
+        return self._lock_key(spec) in self.locks
+
+    def _lock_key(self, spec: "spack.spec.Spec") -> Tuple[str, str]:
+        return (spec.dag_hash(), spec.name)
+
+    @contextlib.contextmanager
+    def write_lock(self, spec: "spack.spec.Spec") -> Generator["SpecLocker", None, None]:
+        lock = self.lock(spec)
+        lock.acquire_write()
+
+        try:
+            yield self
+        except lk.LockError:
+            # This addresses the case where a nested lock attempt fails inside
+            # of this context manager
+            raise
+        except (Exception, KeyboardInterrupt):
+            lock.release_write()
+            raise
+        else:
+            lock.release_write()
+
+    def clear(self, spec: "spack.spec.Spec") -> Tuple[bool, Optional[lk.Lock]]:
+        key = self._lock_key(spec)
+        lock = self.locks.pop(key, None)
+        return bool(lock), lock
+
+    def clear_all(self, clear_fn: Optional[Callable[[lk.Lock], Any]] = None) -> None:
+        if clear_fn is not None:
+            for lock in self.locks.values():
+                clear_fn(lock)
+        self.locks.clear()
+
+
+class FailureTracker:
+    """Tracks installation failures.
+
+    Prefix failure marking takes the form of a byte range lock on the nth
+    byte of a file for coordinating between concurrent parallel build
+    processes and a persistent file, named with the full hash and
+    containing the spec, in a subdirectory of the database to enable
+    persistence across overlapping but separate related build processes.
+
+    The failure lock file lives alongside the install DB.
+
+    ``n`` is the sys.maxsize-bit prefix of the associated DAG hash to make
+    the likelihood of collision very low with no cleanup required.
+    """
+
+    def __init__(self, root_dir: Union[str, pathlib.Path], default_timeout: Optional[float]):
+        #: Ensure a persistent location for dealing with parallel installation
+        #: failures (e.g., across near-concurrent processes).
+        self.dir = pathlib.Path(root_dir) / _DB_DIRNAME / "failures"
+        self.dir.mkdir(parents=True, exist_ok=True)
+
+        self.locker = SpecLocker(failures_lock_path(root_dir), default_timeout=default_timeout)
+
+    def clear(self, spec: "spack.spec.Spec", force: bool = False) -> None:
+        """Removes any persistent and cached failure tracking for the spec.
+
+        see `mark()`.
+
+        Args:
+            spec: the spec whose failure indicators are being removed
+            force: True if the failure information should be cleared when a failure lock
+                exists for the file, or False if the failure should not be cleared (e.g.,
+                it may be associated with a concurrent build)
+        """
+        locked = self.lock_taken(spec)
+        if locked and not force:
+            tty.msg(f"Retaining failure marking for {spec.name} due to lock")
+            return
+
+        if locked:
+            tty.warn(f"Removing failure marking despite lock for {spec.name}")
+
+        succeeded, lock = self.locker.clear(spec)
+        if succeeded and lock is not None:
+            lock.release_write()
+
+        if self.persistent_mark(spec):
+            path = self._path(spec)
+            tty.debug(f"Removing failure marking for {spec.name}")
+            try:
+                path.unlink()
+            except OSError as err:
+                tty.warn(
+                    f"Unable to remove failure marking for {spec.name} ({str(path)}): {str(err)}"
+                )
+
+    def clear_all(self) -> None:
+        """Force remove install failure tracking files."""
+        tty.debug("Releasing prefix failure locks")
+        self.locker.clear_all(
+            clear_fn=lambda x: x.release_write() if x.is_write_locked() else True
+        )
+
+        tty.debug("Removing prefix failure tracking files")
+        try:
+            for fail_mark in os.listdir(str(self.dir)):
+                try:
+                    (self.dir / fail_mark).unlink()
+                except OSError as exc:
+                    tty.warn(f"Unable to remove failure marking file {fail_mark}: {str(exc)}")
+        except OSError as exc:
+            tty.warn(f"Unable to remove failure marking files: {str(exc)}")
+
+    def mark(self, spec: "spack.spec.Spec") -> lk.Lock:
+        """Marks a spec as failing to install.
+
+        Args:
+            spec: spec that failed to install
+        """
+        # Dump the spec to the failure file for (manual) debugging purposes
+        path = self._path(spec)
+        path.write_text(spec.to_json())
+
+        # Also ensure a failure lock is taken to prevent cleanup removal
+        # of failure status information during a concurrent parallel build.
+        if not self.locker.has_lock(spec):
+            try:
+                mark = self.locker.lock(spec)
+                mark.acquire_write()
+            except lk.LockTimeoutError:
+                # Unlikely that another process failed to install at the same
+                # time but log it anyway.
+                tty.debug(f"PID {os.getpid()} failed to mark install failure for {spec.name}")
+                tty.warn(f"Unable to mark {spec.name} as failed.")
+
+        return self.locker.lock(spec)
+
+    def has_failed(self, spec: "spack.spec.Spec") -> bool:
+        """Return True if the spec is marked as failed."""
+        # The failure was detected in this process.
+        if self.locker.has_lock(spec):
+            return True
+
+        # The failure was detected by a concurrent process (e.g., an srun),
+        # which is expected to be holding a write lock if that is the case.
+        if self.lock_taken(spec):
+            return True
+
+        # Determine if the spec may have been marked as failed by a separate
+        # spack build process running concurrently.
+        return self.persistent_mark(spec)
+
+    def lock_taken(self, spec: "spack.spec.Spec") -> bool:
+        """Return True if another process has a failure lock on the spec."""
+        check = self.locker.raw_lock(spec)
+        return check.is_write_locked()
+
+    def persistent_mark(self, spec: "spack.spec.Spec") -> bool:
+        """Determine if the spec has a persistent failure marking."""
+        return self._path(spec).exists()
+
+    def _path(self, spec: "spack.spec.Spec") -> pathlib.Path:
+        """Return the path to the spec's failure file, which may not exist."""
+        assert spec.concrete, "concrete spec required for failure path"
+        return self.dir / f"{spec.name}-{spec.dag_hash()}"
+
+
 class Database:
-    #: Per-process lock objects for each install prefix
-    _prefix_locks: Dict[str, lk.Lock] = {}
-
-    #: Per-process failure (lock) objects for each install prefix
-    _prefix_failures: Dict[str, lk.Lock] = {}
-
     #: Fields written for each install record
     record_fields: Tuple[str, ...] = DEFAULT_INSTALL_RECORD_FIELDS
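
The new classes can be exercised standalone, which is exactly how the updated tests at the bottom of this commit use them. A condensed sketch (assumes a Spack checkout on `sys.path`; `_mark_concrete()` is the test-only shortcut the diff itself relies on, and `zlib` is an arbitrary example package):

```python
import tempfile

import spack.database
import spack.spec

root = tempfile.mkdtemp()  # stands in for a store root
failures = spack.database.FailureTracker(root, default_timeout=0.1)

spec = spack.spec.Spec("zlib")
spec._mark_concrete()  # test-only shortcut, as in the updated tests below

failures.mark(spec)               # writes the <name>-<dag_hash> file and takes the byte lock
assert failures.has_failed(spec)
failures.clear(spec, force=True)  # drops the persistent marking
assert not failures.has_failed(spec)
```

Note that the lock key is `(spec.dag_hash(), spec.name)` rather than the install prefix, which is what breaks the Store / Database / Spec dependency cycle mentioned in the description.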
@@ -392,24 +612,10 @@ def __init__(
        self._verifier_path = os.path.join(self.database_directory, "index_verifier")
        self._lock_path = os.path.join(self.database_directory, "lock")

-        # This is for other classes to use to lock prefix directories.
-        self.prefix_lock_path = os.path.join(self.database_directory, "prefix_lock")
-
-        # Ensure a persistent location for dealing with parallel installation
-        # failures (e.g., across near-concurrent processes).
-        self._failure_dir = os.path.join(self.database_directory, "failures")
-
-        # Support special locks for handling parallel installation failures
-        # of a spec.
-        self.prefix_fail_path = os.path.join(self.database_directory, "prefix_failures")
-
        # Create needed directories and files
        if not is_upstream and not os.path.exists(self.database_directory):
            fs.mkdirp(self.database_directory)

-        if not is_upstream and not os.path.exists(self._failure_dir):
-            fs.mkdirp(self._failure_dir)
-
        self.is_upstream = is_upstream
        self.last_seen_verifier = ""
        # Failed write transactions (interrupted by exceptions) will alert
@@ -423,15 +629,7 @@ def __init__(

        # initialize rest of state.
        self.db_lock_timeout = lock_cfg.database_timeout
-        self.package_lock_timeout = lock_cfg.package_timeout
-
        tty.debug("DATABASE LOCK TIMEOUT: {0}s".format(str(self.db_lock_timeout)))
-        timeout_format_str = (
-            "{0}s".format(str(self.package_lock_timeout))
-            if self.package_lock_timeout
-            else "No timeout"
-        )
-        tty.debug("PACKAGE LOCK TIMEOUT: {0}".format(str(timeout_format_str)))

        self.lock: Union[ForbiddenLock, lk.Lock]
        if self.is_upstream:
@@ -471,212 +669,6 @@ def read_transaction(self):
        """Get a read lock context manager for use in a `with` block."""
        return self._read_transaction_impl(self.lock, acquire=self._read)

-    def _failed_spec_path(self, spec):
-        """Return the path to the spec's failure file, which may not exist."""
-        if not spec.concrete:
-            raise ValueError("Concrete spec required for failure path for {0}".format(spec.name))
-
-        return os.path.join(self._failure_dir, "{0}-{1}".format(spec.name, spec.dag_hash()))
-
-    def clear_all_failures(self) -> None:
-        """Force remove install failure tracking files."""
-        tty.debug("Releasing prefix failure locks")
-        for pkg_id in list(self._prefix_failures.keys()):
-            lock = self._prefix_failures.pop(pkg_id, None)
-            if lock:
-                lock.release_write()
-
-        # Remove all failure markings (aka files)
-        tty.debug("Removing prefix failure tracking files")
-        for fail_mark in os.listdir(self._failure_dir):
-            try:
-                os.remove(os.path.join(self._failure_dir, fail_mark))
-            except OSError as exc:
-                tty.warn(
-                    "Unable to remove failure marking file {0}: {1}".format(fail_mark, str(exc))
-                )
-
-    def clear_failure(self, spec: "spack.spec.Spec", force: bool = False) -> None:
-        """
-        Remove any persistent and cached failure tracking for the spec.
-
-        see `mark_failed()`.
-
-        Args:
-            spec: the spec whose failure indicators are being removed
-            force: True if the failure information should be cleared when a prefix failure
-                lock exists for the file, or False if the failure should not be cleared (e.g.,
-                it may be associated with a concurrent build)
-        """
-        failure_locked = self.prefix_failure_locked(spec)
-        if failure_locked and not force:
-            tty.msg("Retaining failure marking for {0} due to lock".format(spec.name))
-            return
-
-        if failure_locked:
-            tty.warn("Removing failure marking despite lock for {0}".format(spec.name))
-
-        lock = self._prefix_failures.pop(spec.prefix, None)
-        if lock:
-            lock.release_write()
-
-        if self.prefix_failure_marked(spec):
-            try:
-                path = self._failed_spec_path(spec)
-                tty.debug("Removing failure marking for {0}".format(spec.name))
-                os.remove(path)
-            except OSError as err:
-                tty.warn(
-                    "Unable to remove failure marking for {0} ({1}): {2}".format(
-                        spec.name, path, str(err)
-                    )
-                )
-
-    def mark_failed(self, spec: "spack.spec.Spec") -> lk.Lock:
-        """
-        Mark a spec as failing to install.
-
-        Prefix failure marking takes the form of a byte range lock on the nth
-        byte of a file for coordinating between concurrent parallel build
-        processes and a persistent file, named with the full hash and
-        containing the spec, in a subdirectory of the database to enable
-        persistence across overlapping but separate related build processes.
-
-        The failure lock file, ``spack.store.STORE.db.prefix_failures``, lives
-        alongside the install DB. ``n`` is the sys.maxsize-bit prefix of the
-        associated DAG hash to make the likelihood of collision very low with
-        no cleanup required.
-        """
-        # Dump the spec to the failure file for (manual) debugging purposes
-        path = self._failed_spec_path(spec)
-        with open(path, "w") as f:
-            spec.to_json(f)
-
-        # Also ensure a failure lock is taken to prevent cleanup removal
-        # of failure status information during a concurrent parallel build.
-        err = "Unable to mark {0.name} as failed."
-
-        prefix = spec.prefix
-        if prefix not in self._prefix_failures:
-            mark = lk.Lock(
-                self.prefix_fail_path,
-                start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)),
-                length=1,
-                default_timeout=self.package_lock_timeout,
-                desc=spec.name,
-            )
-
-            try:
-                mark.acquire_write()
-            except lk.LockTimeoutError:
-                # Unlikely that another process failed to install at the same
-                # time but log it anyway.
-                tty.debug(
-                    "PID {0} failed to mark install failure for {1}".format(os.getpid(), spec.name)
-                )
-                tty.warn(err.format(spec))
-
-            # Whether we or another process marked it as a failure, track it
-            # as such locally.
-            self._prefix_failures[prefix] = mark
-
-        return self._prefix_failures[prefix]
-
-    def prefix_failed(self, spec: "spack.spec.Spec") -> bool:
-        """Return True if the prefix (installation) is marked as failed."""
-        # The failure was detected in this process.
-        if spec.prefix in self._prefix_failures:
-            return True
-
-        # The failure was detected by a concurrent process (e.g., an srun),
-        # which is expected to be holding a write lock if that is the case.
-        if self.prefix_failure_locked(spec):
-            return True
-
-        # Determine if the spec may have been marked as failed by a separate
-        # spack build process running concurrently.
-        return self.prefix_failure_marked(spec)
-
-    def prefix_failure_locked(self, spec: "spack.spec.Spec") -> bool:
-        """Return True if a process has a failure lock on the spec."""
-        check = lk.Lock(
-            self.prefix_fail_path,
-            start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)),
-            length=1,
-            default_timeout=self.package_lock_timeout,
-            desc=spec.name,
-        )
-
-        return check.is_write_locked()
-
-    def prefix_failure_marked(self, spec: "spack.spec.Spec") -> bool:
-        """Determine if the spec has a persistent failure marking."""
-        return os.path.exists(self._failed_spec_path(spec))
-
-    def prefix_lock(self, spec: "spack.spec.Spec", timeout: Optional[float] = None) -> lk.Lock:
-        """Get a lock on a particular spec's installation directory.
-
-        NOTE: The installation directory **does not** need to exist.
-
-        Prefix lock is a byte range lock on the nth byte of a file.
-
-        The lock file is ``spack.store.STORE.db.prefix_lock`` -- the DB
-        tells us what to call it and it lives alongside the install DB.
-
-        n is the sys.maxsize-bit prefix of the DAG hash. This makes
-        likelihood of collision is very low AND it gives us
-        readers-writer lock semantics with just a single lockfile, so no
-        cleanup required.
-        """
-        timeout = timeout or self.package_lock_timeout
-        prefix = spec.prefix
-        if prefix not in self._prefix_locks:
-            self._prefix_locks[prefix] = lk.Lock(
-                self.prefix_lock_path,
-                start=spec.dag_hash_bit_prefix(bit_length(sys.maxsize)),
-                length=1,
-                default_timeout=timeout,
-                desc=spec.name,
-            )
-        elif timeout != self._prefix_locks[prefix].default_timeout:
-            self._prefix_locks[prefix].default_timeout = timeout
-
-        return self._prefix_locks[prefix]
-
-    @contextlib.contextmanager
-    def prefix_read_lock(self, spec):
-        prefix_lock = self.prefix_lock(spec)
-        prefix_lock.acquire_read()
-
-        try:
-            yield self
-        except lk.LockError:
-            # This addresses the case where a nested lock attempt fails inside
-            # of this context manager
-            raise
-        except (Exception, KeyboardInterrupt):
-            prefix_lock.release_read()
-            raise
-        else:
-            prefix_lock.release_read()
-
-    @contextlib.contextmanager
-    def prefix_write_lock(self, spec):
-        prefix_lock = self.prefix_lock(spec)
-        prefix_lock.acquire_write()
-
-        try:
-            yield self
-        except lk.LockError:
-            # This addresses the case where a nested lock attempt fails inside
-            # of this context manager
-            raise
-        except (Exception, KeyboardInterrupt):
-            prefix_lock.release_write()
-            raise
-        else:
-            prefix_lock.release_write()
-
    def _write_to_file(self, stream):
        """Write out the database in JSON format to the stream passed
        as argument.
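
Both the removed methods above and their `SpecLocker` replacements rely on the same trick: every concrete spec maps to one byte of a single shared lock file, and a byte-range lock on that byte gives readers-writer semantics with no per-spec lock files to clean up. A self-contained sketch of the offset computation (`hash_bit_prefix` is an illustrative re-creation of Spack's `dag_hash_bit_prefix` helper, not the real one):

```python
import hashlib
import sys


def hash_bit_prefix(hex_digest: str, bits: int) -> int:
    """First `bits` bits of a hex digest, as an integer byte offset."""
    return int(hex_digest, 16) >> (4 * len(hex_digest) - bits)


# sys.maxsize.bit_length() is 63 on 64-bit CPython, so the offset fits the
# range a byte-range lock can address.
digest = hashlib.sha256(b"example-spec-dag-hash").hexdigest()
offset = hash_bit_prefix(digest, sys.maxsize.bit_length())
print(offset)  # the single byte (length=1) that gets read- or write-locked
```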
@@ -519,13 +519,6 @@ def _try_install_from_binary_cache(
    )


-def clear_failures() -> None:
-    """
-    Remove all failure tracking markers for the Spack instance.
-    """
-    spack.store.STORE.db.clear_all_failures()
-
-
 def combine_phase_logs(phase_log_files: List[str], log_path: str) -> None:
    """
    Read set or list of logs and combine them into one file.
@@ -1126,15 +1119,13 @@ class PackageInstaller:
    instance.
    """

-    def __init__(self, installs: List[Tuple["spack.package_base.PackageBase", dict]] = []):
+    def __init__(self, installs: List[Tuple["spack.package_base.PackageBase", dict]] = []) -> None:
        """Initialize the installer.

        Args:
            installs (list): list of tuples, where each
                tuple consists of a package (PackageBase) and its associated
                install arguments (dict)
-        Return:
-            PackageInstaller: instance
        """
        # List of build requests
        self.build_requests = [BuildRequest(pkg, install_args) for pkg, install_args in installs]
@@ -1287,7 +1278,7 @@ def _check_deps_status(self, request: BuildRequest) -> None:
            dep_id = package_id(dep_pkg)

            # Check for failure since a prefix lock is not required
-            if spack.store.STORE.db.prefix_failed(dep):
+            if spack.store.STORE.failure_tracker.has_failed(dep):
                action = "'spack install' the dependency"
                msg = "{0} is marked as an install failure: {1}".format(dep_id, action)
                raise InstallError(err.format(request.pkg_id, msg), pkg=dep_pkg)
@@ -1502,7 +1493,7 @@ def _ensure_locked(
        if lock is None:
            tty.debug(msg.format("Acquiring", desc, pkg_id, pretty_seconds(timeout or 0)))
            op = "acquire"
-            lock = spack.store.STORE.db.prefix_lock(pkg.spec, timeout)
+            lock = spack.store.STORE.prefix_locker.lock(pkg.spec, timeout)
            if timeout != lock.default_timeout:
                tty.warn(
                    "Expected prefix lock timeout {0}, not {1}".format(
@@ -1627,12 +1618,12 @@ def _add_tasks(self, request: BuildRequest, all_deps):
                # Clear any persistent failure markings _unless_ they are
                # associated with another process in this parallel build
                # of the spec.
-                spack.store.STORE.db.clear_failure(dep, force=False)
+                spack.store.STORE.failure_tracker.clear(dep, force=False)

        install_package = request.install_args.get("install_package")
        if install_package and request.pkg_id not in self.build_tasks:
            # Be sure to clear any previous failure
-            spack.store.STORE.db.clear_failure(request.spec, force=True)
+            spack.store.STORE.failure_tracker.clear(request.spec, force=True)

            # If not installing dependencies, then determine their
            # installation status before proceeding
@@ -1888,7 +1879,7 @@ def _update_failed(
        err = "" if exc is None else ": {0}".format(str(exc))
        tty.debug("Flagging {0} as failed{1}".format(pkg_id, err))
        if mark:
-            self.failed[pkg_id] = spack.store.STORE.db.mark_failed(task.pkg.spec)
+            self.failed[pkg_id] = spack.store.STORE.failure_tracker.mark(task.pkg.spec)
        else:
            self.failed[pkg_id] = None
        task.status = STATUS_FAILED
@@ -2074,7 +2065,7 @@ def install(self) -> None:

            # Flag a failed spec. Do not need an (install) prefix lock since
            # assume using a separate (failed) prefix lock file.
-            if pkg_id in self.failed or spack.store.STORE.db.prefix_failed(spec):
+            if pkg_id in self.failed or spack.store.STORE.failure_tracker.has_failed(spec):
                term_status.clear()
                tty.warn("{0} failed to install".format(pkg_id))
                self._update_failed(task)
@@ -2209,7 +2209,7 @@ def uninstall_by_spec(spec, force=False, deprecator=None):
    pkg = None

    # Pre-uninstall hook runs first.
-    with spack.store.STORE.db.prefix_write_lock(spec):
+    with spack.store.STORE.prefix_locker.write_lock(spec):
        if pkg is not None:
            try:
                spack.hooks.pre_uninstall(spec)
@@ -326,7 +326,7 @@ def __init__(
        self.keep = keep

        # File lock for the stage directory. We use one file for all
-        # stage locks. See spack.database.Database.prefix_lock for
+        # stage locks. See spack.database.SpecLocker.lock for
        # details on this approach.
        self._lock = None
        if lock:
@@ -25,13 +25,14 @@
 from typing import Any, Callable, Dict, Generator, List, Optional, Union

 import llnl.util.lang
-import llnl.util.tty as tty
+from llnl.util import tty

 import spack.config
 import spack.database
 import spack.directory_layout
 import spack.error
 import spack.paths
+import spack.spec
 import spack.util.path

 #: default installation root, relative to the Spack install path
@@ -134,18 +135,21 @@ def parse_install_tree(config_dict):
 class Store:
    """A store is a path full of installed Spack packages.

-    Stores consist of packages installed according to a
-    ``DirectoryLayout``, along with an index, or _database_ of their
-    contents. The directory layout controls what paths look like and how
-    Spack ensures that each unique spec gets its own unique directory (or
-    not, though we don't recommend that). The database is a single file
-    that caches metadata for the entire Spack installation. It prevents
-    us from having to spider the install tree to figure out what's there.
+    Stores consist of packages installed according to a ``DirectoryLayout``, along with a database
+    of their contents.
+
+    The directory layout controls what paths look like and how Spack ensures that each unique spec
+    gets its own unique directory (or not, though we don't recommend that).
+
+    The database is a single file that caches metadata for the entire Spack installation. It
+    prevents us from having to spider the install tree to figure out what's there.
+
+    The store is also able to lock installation prefixes, and to mark installation failures.

    Args:
        root: path to the root of the install tree
-        unpadded_root: path to the root of the install tree without padding.
-            The sbang script has to be installed here to work with padded roots
+        unpadded_root: path to the root of the install tree without padding. The sbang script has
+            to be installed here to work with padded roots
        projections: expression according to guidelines that describes how to construct a path to
            a package prefix in this store
        hash_length: length of the hashes used in the directory layout. Spec hash suffixes will be
@@ -170,6 +174,19 @@ def __init__(
        self.upstreams = upstreams
        self.lock_cfg = lock_cfg
        self.db = spack.database.Database(root, upstream_dbs=upstreams, lock_cfg=lock_cfg)
+
+        timeout_format_str = (
+            f"{str(lock_cfg.package_timeout)}s" if lock_cfg.package_timeout else "No timeout"
+        )
+        tty.debug("PACKAGE LOCK TIMEOUT: {0}".format(str(timeout_format_str)))
+
+        self.prefix_locker = spack.database.SpecLocker(
+            spack.database.prefix_lock_path(root), default_timeout=lock_cfg.package_timeout
+        )
+        self.failure_tracker = spack.database.FailureTracker(
+            self.root, default_timeout=lock_cfg.package_timeout
+        )
+
        self.layout = spack.directory_layout.DirectoryLayout(
            root, projections=projections, hash_length=hash_length
        )
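
With this wiring, both lock files end up next to the install database under the store root, via the `prefix_lock_path` / `failures_lock_path` helpers added above. A small sketch (the `/tmp/opt/spack` root is hypothetical, and `_DB_DIRNAME` is assumed to be `.spack-db`, its value in Spack):

```python
import spack.database

root = "/tmp/opt/spack"  # hypothetical store root
print(spack.database.prefix_lock_path(root))    # /tmp/opt/spack/.spack-db/prefix_lock
print(spack.database.failures_lock_path(root))  # /tmp/opt/spack/.spack-db/prefix_failures
```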
@@ -10,9 +10,11 @@
 import llnl.util.filesystem as fs

 import spack.caches
+import spack.cmd.clean
 import spack.main
 import spack.package_base
 import spack.stage
+import spack.store

 clean = spack.main.SpackCommand("clean")
@@ -33,7 +35,7 @@ def __call__(self, *args, **kwargs):
    monkeypatch.setattr(spack.stage, "purge", Counter("stages"))
    monkeypatch.setattr(spack.caches.fetch_cache, "destroy", Counter("downloads"), raising=False)
    monkeypatch.setattr(spack.caches.misc_cache, "destroy", Counter("caches"))
-    monkeypatch.setattr(spack.installer, "clear_failures", Counter("failures"))
+    monkeypatch.setattr(spack.store.STORE.failure_tracker, "clear_all", Counter("failures"))
    monkeypatch.setattr(spack.cmd.clean, "remove_python_cache", Counter("python_cache"))

    yield counts
@@ -23,6 +23,7 @@
 import spack.environment as ev
 import spack.hash_types as ht
 import spack.package_base
+import spack.store
 import spack.util.executable
 from spack.error import SpackError
 from spack.main import SpackCommand
@@ -705,9 +706,11 @@ def test_cache_only_fails(tmpdir, mock_fetch, install_mockery, capfd):
    assert "was not installed" in out

    # Check that failure prefix locks are still cached
-    failure_lock_prefixes = ",".join(spack.store.STORE.db._prefix_failures.keys())
-    assert "libelf" in failure_lock_prefixes
-    assert "libdwarf" in failure_lock_prefixes
+    failed_packages = [
+        pkg_name for dag_hash, pkg_name in spack.store.STORE.failure_tracker.locker.locks.keys()
+    ]
+    assert "libelf" in failed_packages
+    assert "libdwarf" in failed_packages


 def test_install_only_dependencies(tmpdir, mock_fetch, install_mockery):
@@ -950,21 +950,14 @@ def disable_compiler_execution(monkeypatch, request):


 @pytest.fixture(scope="function")
-def install_mockery(temporary_store, mutable_config, mock_packages):
+def install_mockery(temporary_store: spack.store.Store, mutable_config, mock_packages):
    """Hooks a fake install directory, DB, and stage directory into Spack."""
    # We use a fake package, so temporarily disable checksumming
    with spack.config.override("config:checksum", False):
        yield

-    # Also wipe out any cached prefix failure locks (associated with
-    # the session-scoped mock archive).
-    for pkg_id in list(temporary_store.db._prefix_failures.keys()):
-        lock = spack.store.STORE.db._prefix_failures.pop(pkg_id, None)
-        if lock:
-            try:
-                lock.release_write()
-            except Exception:
-                pass
+    # Wipe out any cached prefix failure locks (associated with the session-scoped mock archive)
+    temporary_store.failure_tracker.clear_all()


 @pytest.fixture(scope="function")
@@ -807,22 +807,22 @@ def test_query_spec_with_non_conditional_virtual_dependency(database):
 def test_failed_spec_path_error(database):
    """Ensure spec not concrete check is covered."""
    s = spack.spec.Spec("a")
-    with pytest.raises(ValueError, match="Concrete spec required"):
-        spack.store.STORE.db._failed_spec_path(s)
+    with pytest.raises(AssertionError, match="concrete spec required"):
+        spack.store.STORE.failure_tracker.mark(s)


 @pytest.mark.db
 def test_clear_failure_keep(mutable_database, monkeypatch, capfd):
    """Add test coverage for clear_failure operation when to be retained."""

-    def _is(db, spec):
+    def _is(self, spec):
        return True

    # Pretend the spec has been failure locked
-    monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is)
+    monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", _is)

-    s = spack.spec.Spec("a")
-    spack.store.STORE.db.clear_failure(s)
+    s = spack.spec.Spec("a").concretized()
+    spack.store.STORE.failure_tracker.clear(s)
    out = capfd.readouterr()[0]
    assert "Retaining failure marking" in out
@@ -831,16 +831,16 @@ def _is(db, spec):
 def test_clear_failure_forced(default_mock_concretization, mutable_database, monkeypatch, capfd):
    """Add test coverage for clear_failure operation when force."""

-    def _is(db, spec):
+    def _is(self, spec):
        return True

    # Pretend the spec has been failure locked
-    monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is)
+    monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", _is)
    # Ensure raise OSError when try to remove the non-existent marking
-    monkeypatch.setattr(spack.database.Database, "prefix_failure_marked", _is)
+    monkeypatch.setattr(spack.database.FailureTracker, "persistent_mark", _is)

    s = default_mock_concretization("a")
-    spack.store.STORE.db.clear_failure(s, force=True)
+    spack.store.STORE.failure_tracker.clear(s, force=True)
    out = capfd.readouterr()[1]
    assert "Removing failure marking despite lock" in out
    assert "Unable to remove failure marking" in out
@@ -858,55 +858,34 @@ def _raise_exc(lock):

    with tmpdir.as_cwd():
        s = default_mock_concretization("a")
-        spack.store.STORE.db.mark_failed(s)
+        spack.store.STORE.failure_tracker.mark(s)

        out = str(capsys.readouterr()[1])
        assert "Unable to mark a as failed" in out

-    # Clean up the failure mark to ensure it does not interfere with other
-    # tests using the same spec.
-    del spack.store.STORE.db._prefix_failures[s.prefix]
+    spack.store.STORE.failure_tracker.clear_all()


 @pytest.mark.db
 def test_prefix_failed(default_mock_concretization, mutable_database, monkeypatch):
-    """Add coverage to prefix_failed operation."""
-
-    def _is(db, spec):
-        return True
+    """Add coverage to failed operation."""

    s = default_mock_concretization("a")

    # Confirm the spec is not already marked as failed
-    assert not spack.store.STORE.db.prefix_failed(s)
+    assert not spack.store.STORE.failure_tracker.has_failed(s)

    # Check that a failure entry is sufficient
-    spack.store.STORE.db._prefix_failures[s.prefix] = None
-    assert spack.store.STORE.db.prefix_failed(s)
+    spack.store.STORE.failure_tracker.mark(s)
+    assert spack.store.STORE.failure_tracker.has_failed(s)

    # Remove the entry and check again
-    del spack.store.STORE.db._prefix_failures[s.prefix]
-    assert not spack.store.STORE.db.prefix_failed(s)
+    spack.store.STORE.failure_tracker.clear(s)
+    assert not spack.store.STORE.failure_tracker.has_failed(s)

    # Now pretend that the prefix failure is locked
-    monkeypatch.setattr(spack.database.Database, "prefix_failure_locked", _is)
-    assert spack.store.STORE.db.prefix_failed(s)
+    monkeypatch.setattr(spack.database.FailureTracker, "lock_taken", lambda self, spec: True)
+    assert spack.store.STORE.failure_tracker.has_failed(s)


-def test_prefix_read_lock_error(default_mock_concretization, mutable_database, monkeypatch):
-    """Cover the prefix read lock exception."""
-
-    def _raise(db, spec):
-        raise lk.LockError("Mock lock error")
-
-    s = default_mock_concretization("a")
-
-    # Ensure subsequent lock operations fail
-    monkeypatch.setattr(lk.Lock, "acquire_read", _raise)
-
-    with pytest.raises(Exception):
-        with spack.store.STORE.db.prefix_read_lock(s):
-            assert False
-
-
 def test_prefix_write_lock_error(default_mock_concretization, mutable_database, monkeypatch):
@@ -921,7 +900,7 @@ def _raise(db, spec):
    monkeypatch.setattr(lk.Lock, "acquire_write", _raise)

    with pytest.raises(Exception):
-        with spack.store.STORE.db.prefix_write_lock(s):
+        with spack.store.STORE.prefix_locker.write_lock(s):
            assert False
@@ -159,7 +159,7 @@ def test_partial_install_delete_prefix_and_stage(install_mockery, mock_fetch, wo
    s.package.remove_prefix = rm_prefix_checker.remove_prefix

    # must clear failure markings for the package before re-installing it
-    spack.store.STORE.db.clear_failure(s, True)
+    spack.store.STORE.failure_tracker.clear(s, True)

    s.package.set_install_succeed()
    s.package.stage = MockStage(s.package.stage)
|
|||||||
assert os.path.exists(s.package.prefix)
|
assert os.path.exists(s.package.prefix)
|
||||||
|
|
||||||
# must clear failure markings for the package before re-installing it
|
# must clear failure markings for the package before re-installing it
|
||||||
spack.store.STORE.db.clear_failure(s, True)
|
spack.store.STORE.failure_tracker.clear(s, True)
|
||||||
|
|
||||||
s.package.set_install_succeed()
|
s.package.set_install_succeed()
|
||||||
s.package.stage = MockStage(s.package.stage)
|
s.package.stage = MockStage(s.package.stage)
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
import spack.compilers
|
import spack.compilers
|
||||||
import spack.concretize
|
import spack.concretize
|
||||||
import spack.config
|
import spack.config
|
||||||
|
import spack.database
|
||||||
import spack.installer as inst
|
import spack.installer as inst
|
||||||
import spack.package_base
|
import spack.package_base
|
||||||
import spack.package_prefs as prefs
|
import spack.package_prefs as prefs
|
||||||
@ -364,7 +365,7 @@ def test_ensure_locked_err(install_mockery, monkeypatch, tmpdir, capsys):
|
|||||||
"""Test _ensure_locked when a non-lock exception is raised."""
|
"""Test _ensure_locked when a non-lock exception is raised."""
|
||||||
mock_err_msg = "Mock exception error"
|
mock_err_msg = "Mock exception error"
|
||||||
|
|
||||||
def _raise(lock, timeout):
|
def _raise(lock, timeout=None):
|
||||||
raise RuntimeError(mock_err_msg)
|
raise RuntimeError(mock_err_msg)
|
||||||
|
|
||||||
const_arg = installer_args(["trivial-install-test-package"], {})
|
const_arg = installer_args(["trivial-install-test-package"], {})
|
||||||
@ -432,7 +433,7 @@ def test_ensure_locked_new_lock(install_mockery, tmpdir, lock_type, reads, write
|
|||||||
|
|
||||||
|
|
||||||
def test_ensure_locked_new_warn(install_mockery, monkeypatch, tmpdir, capsys):
|
def test_ensure_locked_new_warn(install_mockery, monkeypatch, tmpdir, capsys):
|
||||||
orig_pl = spack.database.Database.prefix_lock
|
orig_pl = spack.database.SpecLocker.lock
|
||||||
|
|
||||||
def _pl(db, spec, timeout):
|
def _pl(db, spec, timeout):
|
||||||
lock = orig_pl(db, spec, timeout)
|
lock = orig_pl(db, spec, timeout)
|
||||||
@ -444,7 +445,7 @@ def _pl(db, spec, timeout):
|
|||||||
installer = create_installer(const_arg)
|
installer = create_installer(const_arg)
|
||||||
spec = installer.build_requests[0].pkg.spec
|
spec = installer.build_requests[0].pkg.spec
|
||||||
|
|
||||||
monkeypatch.setattr(spack.database.Database, "prefix_lock", _pl)
|
monkeypatch.setattr(spack.database.SpecLocker, "lock", _pl)
|
||||||
|
|
||||||
lock_type = "read"
|
lock_type = "read"
|
||||||
ltype, lock = installer._ensure_locked(lock_type, spec.package)
|
ltype, lock = installer._ensure_locked(lock_type, spec.package)
|
||||||
@ -597,59 +598,50 @@ def _repoerr(repo, name):
|
|||||||
assert "Couldn't copy in provenance for cmake" in out
|
assert "Couldn't copy in provenance for cmake" in out
|
||||||
|
|
||||||
|
|
||||||
def test_clear_failures_success(install_mockery):
|
def test_clear_failures_success(tmpdir):
|
||||||
"""Test the clear_failures happy path."""
|
"""Test the clear_failures happy path."""
|
||||||
|
failures = spack.database.FailureTracker(str(tmpdir), default_timeout=0.1)
|
||||||
|
|
||||||
|
spec = spack.spec.Spec("a")
|
||||||
|
spec._mark_concrete()
|
||||||
|
|
||||||
# Set up a test prefix failure lock
|
# Set up a test prefix failure lock
|
||||||
lock = lk.Lock(
|
failures.mark(spec)
|
||||||
spack.store.STORE.db.prefix_fail_path, start=1, length=1, default_timeout=1e-9, desc="test"
|
assert failures.has_failed(spec)
|
||||||
)
|
|
||||||
try:
|
|
||||||
lock.acquire_write()
|
|
||||||
except lk.LockTimeoutError:
|
|
||||||
tty.warn("Failed to write lock the test install failure")
|
|
||||||
spack.store.STORE.db._prefix_failures["test"] = lock
|
|
||||||
|
|
||||||
# Set up a fake failure mark (or file)
|
|
||||||
fs.touch(os.path.join(spack.store.STORE.db._failure_dir, "test"))
|
|
||||||
|
|
||||||
# Now clear failure tracking
|
# Now clear failure tracking
|
||||||
inst.clear_failures()
|
failures.clear_all()
|
||||||
|
|
||||||
# Ensure there are no cached failure locks or failure marks
|
# Ensure there are no cached failure locks or failure marks
|
||||||
assert len(spack.store.STORE.db._prefix_failures) == 0
|
assert len(failures.locker.locks) == 0
|
||||||
assert len(os.listdir(spack.store.STORE.db._failure_dir)) == 0
|
assert len(os.listdir(failures.dir)) == 0
|
||||||
|
|
||||||
# Ensure the core directory and failure lock file still exist
|
# Ensure the core directory and failure lock file still exist
|
||||||
assert os.path.isdir(spack.store.STORE.db._failure_dir)
|
assert os.path.isdir(failures.dir)
|
||||||
|
|
||||||
# Locks on windows are a no-op
|
# Locks on windows are a no-op
|
||||||
if sys.platform != "win32":
|
if sys.platform != "win32":
|
||||||
assert os.path.isfile(spack.store.STORE.db.prefix_fail_path)
|
assert os.path.isfile(failures.locker.lock_path)
|
||||||
|
|
||||||
|
|
||||||
def test_clear_failures_errs(install_mockery, monkeypatch, capsys):
|
@pytest.mark.xfail(sys.platform == "win32", reason="chmod does not prevent removal on Win")
|
||||||
|
def test_clear_failures_errs(tmpdir, capsys):
|
||||||
"""Test the clear_failures exception paths."""
|
"""Test the clear_failures exception paths."""
|
||||||
orig_fn = os.remove
|
failures = spack.database.FailureTracker(str(tmpdir), default_timeout=0.1)
|
||||||
err_msg = "Mock os remove"
|
spec = spack.spec.Spec("a")
|
||||||
|
spec._mark_concrete()
|
||||||
|
failures.mark(spec)
|
||||||
|
|
||||||
def _raise_except(path):
|
# Make the file marker not writeable, so that clearing_failures fails
|
||||||
raise OSError(err_msg)
|
failures.dir.chmod(0o000)
|
||||||
|
|
||||||
# Set up a fake failure mark (or file)
|
|
||||||
fs.touch(os.path.join(spack.store.STORE.db._failure_dir, "test"))
|
|
||||||
|
|
||||||
monkeypatch.setattr(os, "remove", _raise_except)
|
|
||||||
|
|
||||||
# Clear failure tracking
|
# Clear failure tracking
|
||||||
inst.clear_failures()
|
failures.clear_all()
|
||||||
|
|
||||||
# Ensure expected warning generated
|
# Ensure expected warning generated
|
||||||
out = str(capsys.readouterr()[1])
|
out = str(capsys.readouterr()[1])
|
||||||
assert "Unable to remove failure" in out
|
assert "Unable to remove failure" in out
|
||||||
assert err_msg in out
|
failures.dir.chmod(0o750)
|
||||||
|
|
||||||
# Restore remove for teardown
|
|
||||||
monkeypatch.setattr(os, "remove", orig_fn)
|
|
||||||
|
|
||||||
|
|
||||||
def test_combine_phase_logs(tmpdir):
|
def test_combine_phase_logs(tmpdir):
|
||||||
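
The rewritten error-path test above injects the failure with real filesystem permissions instead of monkeypatching `os.remove`. The underlying trick, as a self-contained sketch (POSIX semantics, non-root user assumed):

```python
import pathlib
import tempfile

d = pathlib.Path(tempfile.mkdtemp())
(d / "marker").write_text("x")
d.chmod(0o000)  # without write+execute on the directory, unlinking children fails
try:
    (d / "marker").unlink()
except OSError as err:
    print("removal blocked:", err)
finally:
    d.chmod(0o750)  # restore permissions so the directory can be cleaned up
```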
@@ -694,14 +686,18 @@ def test_combine_phase_logs_does_not_care_about_encoding(tmpdir):
        assert f.read() == data * 2


-def test_check_deps_status_install_failure(install_mockery, monkeypatch):
+def test_check_deps_status_install_failure(install_mockery):
+    """Tests that checking the dependency status on a request to install
+    'a' fails, if we mark the dependency as failed.
+    """
+    s = spack.spec.Spec("a").concretized()
+    for dep in s.traverse(root=False):
+        spack.store.STORE.failure_tracker.mark(dep)
+
    const_arg = installer_args(["a"], {})
    installer = create_installer(const_arg)
    request = installer.build_requests[0]

-    # Make sure the package is identified as failed
-    monkeypatch.setattr(spack.database.Database, "prefix_failed", _true)
-
    with pytest.raises(inst.InstallError, match="install failure"):
        installer._check_deps_status(request)
@@ -1006,7 +1002,7 @@ def test_install_failed(install_mockery, monkeypatch, capsys):
    installer = create_installer(const_arg)

    # Make sure the package is identified as failed
-    monkeypatch.setattr(spack.database.Database, "prefix_failed", _true)
+    monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true)

    with pytest.raises(inst.InstallError, match="request failed"):
        installer.install()
@@ -1022,7 +1018,7 @@ def test_install_failed_not_fast(install_mockery, monkeypatch, capsys):
    installer = create_installer(const_arg)

    # Make sure the package is identified as failed
-    monkeypatch.setattr(spack.database.Database, "prefix_failed", _true)
+    monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true)

    with pytest.raises(inst.InstallError, match="request failed"):
        installer.install()
@@ -1121,7 +1117,7 @@ def test_install_fail_fast_on_detect(install_mockery, monkeypatch, capsys):
    #
    # This will prevent b from installing, which will cause the build of a
    # to be skipped.
-    monkeypatch.setattr(spack.database.Database, "prefix_failed", _true)
+    monkeypatch.setattr(spack.database.FailureTracker, "has_failed", _true)

    with pytest.raises(inst.InstallError, match="after first install failure"):
        installer.install()