avoid 7K stat calls

This commit is contained in:
Harmen Stoppels 2024-08-22 09:06:18 +02:00
parent ea823d2308
commit 225a4ed1ff
5 changed files with 70 additions and 32 deletions

View File

@ -168,8 +168,12 @@ def repo_zip(args):
except spack.repo.RepoError:
tty.die(f"No repository at path: {key}")
def _zip_repo_skip(entry: os.DirEntry):
return entry.name == "__pycache__"
def _zip_repo_skip(entry: os.DirEntry, depth: int):
if entry.name == "__pycache__":
return True
if depth == 0 and not os.path.exists(os.path.join(entry.path, "package.py")):
return True
return False
def _zip_repo_path_to_name(path: str) -> str:
# use spack/pkg/<repo>/* prefix and rename `package.py` as `__init__.py`

View File

@ -9,7 +9,7 @@
import pathlib
import sys
import zipfile
from typing import Any, Dict, Optional, Tuple, Type, Union
from typing import Any, Dict, Optional, Set, Tuple, Type, Union
import llnl.util.filesystem
from llnl.url import allowed_archive
@ -155,7 +155,7 @@ def __hash__(self) -> int:
return hash(self.sha256)
zipfilecache = {}
zipfilecache: Dict[str, Tuple[zipfile.ZipFile, Set[str]]] = {}
class FilePatch(Patch):
@ -202,9 +202,8 @@ def __init__(
if "packages.zip" in path.parts:
# check if it exists in the zip file.
idx = path.parts.index("packages.zip")
zip_path, entry_path = pathlib.PurePath(*path.parts[: idx + 1]), pathlib.PurePath(
*path.parts[idx + 1 :]
)
zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
lookup = zipfilecache.get(zip_path)
if lookup is None:
@ -213,7 +212,7 @@ def __init__(
zipfilecache[zip_path] = (zip, namelist)
else:
zip, namelist = lookup
if str(entry_path) in namelist:
if entry_path in namelist:
abs_path = str(path)
break
elif path.exists():
@ -241,17 +240,16 @@ def sha256(self) -> str:
if "packages.zip" in path.parts:
# split in path to packages.zip and the path within the zip
idx = path.parts.index("packages.zip")
path_to_zip, path_in_zip = pathlib.PurePath(
*path.parts[: idx + 1]
), pathlib.PurePath(*path.parts[idx + 1 :])
lookup = zipfilecache.get(path_to_zip)
zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
lookup = zipfilecache.get(zip_path)
if lookup is None:
zip = zipfile.ZipFile(path_to_zip, "r")
zip = zipfile.ZipFile(zip_path, "r")
namelist = set(zip.namelist())
zipfilecache[path_to_zip] = (zip, namelist)
zipfilecache[zip_path] = (zip, namelist)
else:
zip, namelist = lookup
f = zip.open(str(path_in_zip), "r")
f = zip.open(entry_path, "r")
else:
f = open(self.path, "rb")
self._sha256 = checksum_stream(hashlib.sha256, f)

View File

@ -26,6 +26,7 @@
import types
import uuid
import warnings
import zipfile
import zipimport
from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Type, Union
@ -165,10 +166,10 @@ def compute_loader(self, fullname):
if package_name:
# annoyingly there is a many to one mapping for pkg module to file, have to
# figure out how to deal with this properly.
return (
(repo.zipimporter, f"{namespace}.{package_name}")
if repo.zipimporter
else (
if repo.zipimporter:
return repo.zipimporter, f"{namespace}.{package_name}"
else:
return (
_PrependFileLoader(
fullname=fullname,
path=repo.filename_for_package_name(package_name),
@ -176,7 +177,6 @@ def compute_loader(self, fullname):
),
fullname,
)
)
# We are importing a full namespace like 'spack.pkg.builtin'
if fullname == repo.full_namespace:
@ -364,6 +364,37 @@ def __getattr__(self, name):
return getattr(self, name)
class EvenFasterPackageChecker(collections.abc.Mapping):
    """Mapping from package names to mtimes, backed by a ``packages.zip`` listing.

    A single stat of the zip archive stands in for the mtime of every package
    it contains, avoiding one stat call per package directory.
    """

    def __init__(self, packages_path):
        # The path of the repository managed by this instance
        self.packages_path = packages_path
        self.zipfile = zipfile.ZipFile(
            os.path.join(packages_path, "..", "packages.zip"), "r"
        )
        self.invalidate()

    def invalidate(self):
        """Re-stat the archive and rebuild the name -> mtime mapping."""
        self.mtime = os.stat(self.zipfile.filename).st_mtime
        # Top-level directory entries of the archive are the package names.
        top_level_dirs = (
            name
            for name in self.zipfile.namelist()
            if name.endswith("/") and name.count("/") == 1 and name != "./"
        )
        self.pkgs = {name.rstrip("/"): self.mtime for name in top_level_dirs}

    def last_mtime(self):
        # Every package shares the archive's mtime.
        return self.mtime

    def modified_since(self, since: float) -> List[str]:
        # Either the whole archive changed, or nothing did.
        if self.mtime > since:
            return list(self.pkgs)
        return []

    def __getitem__(self, item):
        return self.pkgs[item]

    def __iter__(self):
        return iter(self.pkgs)

    def __len__(self):
        return len(self.pkgs)
class FastPackageChecker(collections.abc.Mapping):
"""Cache that maps package names to the stats obtained on the
'package.py' files associated with them.
@ -578,7 +609,7 @@ class RepoIndex:
def __init__(
self,
package_checker: FastPackageChecker,
package_checker: Union[FastPackageChecker, EvenFasterPackageChecker],
namespace: str,
cache: "spack.caches.FileCacheType",
):
@ -1016,7 +1047,9 @@ def check(condition, msg):
self._finder: Optional[RepoPath] = None
# Maps that goes from package name to corresponding file stat
self._fast_package_checker: Optional[FastPackageChecker] = None
self._fast_package_checker: Optional[
Union[EvenFasterPackageChecker, FastPackageChecker]
] = None
# Indexes for this repository, computed lazily
self._repo_index: Optional[RepoIndex] = None
@ -1190,9 +1223,12 @@ def filename_for_package_name(self, pkg_name: str) -> str:
return os.path.join(pkg_dir, package_file_name)
@property
def _pkg_checker(self) -> Union[FastPackageChecker, EvenFasterPackageChecker]:
    """Lazily construct and cache the package checker for this repository.

    Zip-backed repositories use EvenFasterPackageChecker (one stat for the
    whole archive); plain directory repositories fall back to the per-file
    FastPackageChecker.
    """
    if self._fast_package_checker is None:
        checker_cls = (
            EvenFasterPackageChecker if self.zipimporter else FastPackageChecker
        )
        self._fast_package_checker = checker_cls(self.packages_path)
    return self._fast_package_checker
def all_package_names(self, include_virtuals: bool = False) -> List[str]:

View File

@ -12,7 +12,7 @@
import zipfile
from contextlib import closing, contextmanager
from gzip import GzipFile
from typing import Callable, Dict, Tuple
from typing import Callable, Dict, List, Tuple
from llnl.util.symlink import readlink
@ -236,13 +236,13 @@ def reproducible_zipfile_from_prefix(
zip: zipfile.ZipFile,
prefix: str,
*,
skip: Callable[[os.DirEntry], bool] = lambda entry: False,
skip: Callable[[os.DirEntry, int], bool] = lambda entry, depth: False,
path_to_name: Callable[[str], str] = default_path_to_name,
) -> None:
"""Similar to ``reproducible_tarfile_from_prefix`` but for zipfiles."""
dir_stack = [prefix]
dir_stack: List[Tuple[str, int]] = [(prefix, 0)]
while dir_stack:
dir = dir_stack.pop()
dir, depth = dir_stack.pop()
# Add the dir before its contents. zip.mkdir is Python 3.11.
dir_info = zipfile.ZipInfo(path_to_name(dir))
@ -259,11 +259,11 @@ def reproducible_zipfile_from_prefix(
new_dirs = []
for entry in entries:
if skip(entry):
if skip(entry, depth):
continue
if entry.is_dir(follow_symlinks=False):
new_dirs.append(entry.path)
new_dirs.append((entry.path, depth + 1))
continue
# symlink / hardlink support in ZIP is poor or non-existent: make copies.

View File

@ -4,7 +4,7 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import hashlib
from typing import BinaryIO, Callable, Dict, Optional
from typing import IO, Callable, Dict, Optional
import llnl.util.tty as tty
@ -80,7 +80,7 @@ def hash_fun_for_digest(hexdigest: str) -> HashFactory:
return hash_fun_for_algo(hash_algo_for_digest(hexdigest))
def checksum_stream(hashlib_algo: HashFactory, fp: BinaryIO, *, block_size: int = 2**20) -> str:
def checksum_stream(hashlib_algo: HashFactory, fp: IO[bytes], *, block_size: int = 2**20) -> str:
"""Returns a hex digest of the stream generated using given algorithm from hashlib."""
hasher = hashlib_algo()
while True: