avoid 7K stat calls

This commit is contained in:
Harmen Stoppels 2024-08-22 09:06:18 +02:00
parent ea823d2308
commit 225a4ed1ff
5 changed files with 70 additions and 32 deletions

View File

@@ -168,8 +168,12 @@ def repo_zip(args):
except spack.repo.RepoError: except spack.repo.RepoError:
tty.die(f"No repository at path: {key}") tty.die(f"No repository at path: {key}")
def _zip_repo_skip(entry: os.DirEntry): def _zip_repo_skip(entry: os.DirEntry, depth: int):
return entry.name == "__pycache__" if entry.name == "__pycache__":
return True
if depth == 0 and not os.path.exists(os.path.join(entry.path, "package.py")):
return True
return False
def _zip_repo_path_to_name(path: str) -> str: def _zip_repo_path_to_name(path: str) -> str:
# use spack/pkg/<repo>/* prefix and rename `package.py` as `__init__.py` # use spack/pkg/<repo>/* prefix and rename `package.py` as `__init__.py`

View File

@@ -9,7 +9,7 @@
import pathlib import pathlib
import sys import sys
import zipfile import zipfile
from typing import Any, Dict, Optional, Tuple, Type, Union from typing import Any, Dict, Optional, Set, Tuple, Type, Union
import llnl.util.filesystem import llnl.util.filesystem
from llnl.url import allowed_archive from llnl.url import allowed_archive
@@ -155,7 +155,7 @@ def __hash__(self) -> int:
return hash(self.sha256) return hash(self.sha256)
zipfilecache = {} zipfilecache: Dict[str, Tuple[zipfile.ZipFile, Set[str]]] = {}
class FilePatch(Patch): class FilePatch(Patch):
@@ -202,9 +202,8 @@ def __init__(
if "packages.zip" in path.parts: if "packages.zip" in path.parts:
# check if it exists in the zip file. # check if it exists in the zip file.
idx = path.parts.index("packages.zip") idx = path.parts.index("packages.zip")
zip_path, entry_path = pathlib.PurePath(*path.parts[: idx + 1]), pathlib.PurePath( zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
*path.parts[idx + 1 :] entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
)
lookup = zipfilecache.get(zip_path) lookup = zipfilecache.get(zip_path)
if lookup is None: if lookup is None:
@@ -213,7 +212,7 @@ def __init__(
zipfilecache[zip_path] = (zip, namelist) zipfilecache[zip_path] = (zip, namelist)
else: else:
zip, namelist = lookup zip, namelist = lookup
if str(entry_path) in namelist: if entry_path in namelist:
abs_path = str(path) abs_path = str(path)
break break
elif path.exists(): elif path.exists():
@@ -241,17 +240,16 @@ def sha256(self) -> str:
if "packages.zip" in path.parts: if "packages.zip" in path.parts:
# split in path to packages.zip and the path within the zip # split in path to packages.zip and the path within the zip
idx = path.parts.index("packages.zip") idx = path.parts.index("packages.zip")
path_to_zip, path_in_zip = pathlib.PurePath( zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
*path.parts[: idx + 1] entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
), pathlib.PurePath(*path.parts[idx + 1 :]) lookup = zipfilecache.get(zip_path)
lookup = zipfilecache.get(path_to_zip)
if lookup is None: if lookup is None:
zip = zipfile.ZipFile(path_to_zip, "r") zip = zipfile.ZipFile(zip_path, "r")
namelist = set(zip.namelist()) namelist = set(zip.namelist())
zipfilecache[path_to_zip] = (zip, namelist) zipfilecache[zip_path] = (zip, namelist)
else: else:
zip, namelist = lookup zip, namelist = lookup
f = zip.open(str(path_in_zip), "r") f = zip.open(entry_path, "r")
else: else:
f = open(self.path, "rb") f = open(self.path, "rb")
self._sha256 = checksum_stream(hashlib.sha256, f) self._sha256 = checksum_stream(hashlib.sha256, f)

View File

@@ -26,6 +26,7 @@
import types import types
import uuid import uuid
import warnings import warnings
import zipfile
import zipimport import zipimport
from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Type, Union from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Type, Union
@@ -165,10 +166,10 @@ def compute_loader(self, fullname):
if package_name: if package_name:
# annoyingly there is a many to one mapping for pkg module to file, have to # annoyingly there is a many to one mapping for pkg module to file, have to
# figure out how to deal with this properly. # figure out how to deal with this properly.
return ( if repo.zipimporter:
(repo.zipimporter, f"{namespace}.{package_name}") return repo.zipimporter, f"{namespace}.{package_name}"
if repo.zipimporter else:
else ( return (
_PrependFileLoader( _PrependFileLoader(
fullname=fullname, fullname=fullname,
path=repo.filename_for_package_name(package_name), path=repo.filename_for_package_name(package_name),
@@ -176,7 +177,6 @@ def compute_loader(self, fullname):
), ),
fullname, fullname,
) )
)
# We are importing a full namespace like 'spack.pkg.builtin' # We are importing a full namespace like 'spack.pkg.builtin'
if fullname == repo.full_namespace: if fullname == repo.full_namespace:
@@ -364,6 +364,37 @@ def __getattr__(self, name):
return getattr(self, name) return getattr(self, name)
class EvenFasterPackageChecker(collections.abc.Mapping):
def __init__(self, packages_path):
# The path of the repository managed by this instance
self.packages_path = packages_path
self.zipfile = zipfile.ZipFile(os.path.join(packages_path, "..", "packages.zip"), "r")
self.invalidate()
def invalidate(self):
self.mtime = os.stat(self.zipfile.filename).st_mtime
self.pkgs = {
f.rstrip("/"): self.mtime
for f in self.zipfile.namelist()
if f.endswith("/") and f.count("/") == 1 and f != "./"
}
def last_mtime(self):
return self.mtime
def modified_since(self, since: float) -> List[str]:
return list(self.pkgs) if self.mtime > since else []
def __getitem__(self, item):
return self.pkgs[item]
def __iter__(self):
return iter(self.pkgs)
def __len__(self):
return len(self.pkgs)
class FastPackageChecker(collections.abc.Mapping): class FastPackageChecker(collections.abc.Mapping):
"""Cache that maps package names to the stats obtained on the """Cache that maps package names to the stats obtained on the
'package.py' files associated with them. 'package.py' files associated with them.
@@ -578,7 +609,7 @@ class RepoIndex:
def __init__( def __init__(
self, self,
package_checker: FastPackageChecker, package_checker: Union[FastPackageChecker, EvenFasterPackageChecker],
namespace: str, namespace: str,
cache: "spack.caches.FileCacheType", cache: "spack.caches.FileCacheType",
): ):
@@ -1016,7 +1047,9 @@ def check(condition, msg):
self._finder: Optional[RepoPath] = None self._finder: Optional[RepoPath] = None
# Maps that goes from package name to corresponding file stat # Maps that goes from package name to corresponding file stat
self._fast_package_checker: Optional[FastPackageChecker] = None self._fast_package_checker: Optional[
Union[EvenFasterPackageChecker, FastPackageChecker]
] = None
# Indexes for this repository, computed lazily # Indexes for this repository, computed lazily
self._repo_index: Optional[RepoIndex] = None self._repo_index: Optional[RepoIndex] = None
@@ -1190,9 +1223,12 @@ def filename_for_package_name(self, pkg_name: str) -> str:
return os.path.join(pkg_dir, package_file_name) return os.path.join(pkg_dir, package_file_name)
@property @property
def _pkg_checker(self) -> FastPackageChecker: def _pkg_checker(self) -> Union[FastPackageChecker, EvenFasterPackageChecker]:
if self._fast_package_checker is None: if self._fast_package_checker is None:
self._fast_package_checker = FastPackageChecker(self.packages_path) if self.zipimporter:
self._fast_package_checker = EvenFasterPackageChecker(self.packages_path)
else:
self._fast_package_checker = FastPackageChecker(self.packages_path)
return self._fast_package_checker return self._fast_package_checker
def all_package_names(self, include_virtuals: bool = False) -> List[str]: def all_package_names(self, include_virtuals: bool = False) -> List[str]:

View File

@@ -12,7 +12,7 @@
import zipfile import zipfile
from contextlib import closing, contextmanager from contextlib import closing, contextmanager
from gzip import GzipFile from gzip import GzipFile
from typing import Callable, Dict, Tuple from typing import Callable, Dict, List, Tuple
from llnl.util.symlink import readlink from llnl.util.symlink import readlink
@@ -236,13 +236,13 @@ def reproducible_zipfile_from_prefix(
zip: zipfile.ZipFile, zip: zipfile.ZipFile,
prefix: str, prefix: str,
*, *,
skip: Callable[[os.DirEntry], bool] = lambda entry: False, skip: Callable[[os.DirEntry, int], bool] = lambda entry, depth: False,
path_to_name: Callable[[str], str] = default_path_to_name, path_to_name: Callable[[str], str] = default_path_to_name,
) -> None: ) -> None:
"""Similar to ``reproducible_tarfile_from_prefix`` but for zipfiles.""" """Similar to ``reproducible_tarfile_from_prefix`` but for zipfiles."""
dir_stack = [prefix] dir_stack: List[Tuple[str, int]] = [(prefix, 0)]
while dir_stack: while dir_stack:
dir = dir_stack.pop() dir, depth = dir_stack.pop()
# Add the dir before its contents. zip.mkdir is Python 3.11. # Add the dir before its contents. zip.mkdir is Python 3.11.
dir_info = zipfile.ZipInfo(path_to_name(dir)) dir_info = zipfile.ZipInfo(path_to_name(dir))
@@ -259,11 +259,11 @@ def reproducible_zipfile_from_prefix(
new_dirs = [] new_dirs = []
for entry in entries: for entry in entries:
if skip(entry): if skip(entry, depth):
continue continue
if entry.is_dir(follow_symlinks=False): if entry.is_dir(follow_symlinks=False):
new_dirs.append(entry.path) new_dirs.append((entry.path, depth + 1))
continue continue
# symlink / hardlink support in ZIP is poor or non-existent: make copies. # symlink / hardlink support in ZIP is poor or non-existent: make copies.

View File

@@ -4,7 +4,7 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT) # SPDX-License-Identifier: (Apache-2.0 OR MIT)
import hashlib import hashlib
from typing import BinaryIO, Callable, Dict, Optional from typing import IO, Callable, Dict, Optional
import llnl.util.tty as tty import llnl.util.tty as tty
@@ -80,7 +80,7 @@ def hash_fun_for_digest(hexdigest: str) -> HashFactory:
return hash_fun_for_algo(hash_algo_for_digest(hexdigest)) return hash_fun_for_algo(hash_algo_for_digest(hexdigest))
def checksum_stream(hashlib_algo: HashFactory, fp: BinaryIO, *, block_size: int = 2**20) -> str: def checksum_stream(hashlib_algo: HashFactory, fp: IO[bytes], *, block_size: int = 2**20) -> str:
"""Returns a hex digest of the stream generated using given algorithm from hashlib.""" """Returns a hex digest of the stream generated using given algorithm from hashlib."""
hasher = hashlib_algo() hasher = hashlib_algo()
while True: while True: