avoid 7K stat calls

This commit is contained in:
Harmen Stoppels 2024-08-22 09:06:18 +02:00
parent ea823d2308
commit 225a4ed1ff
5 changed files with 70 additions and 32 deletions

View File

@ -168,8 +168,12 @@ def repo_zip(args):
except spack.repo.RepoError:
tty.die(f"No repository at path: {key}")
def _zip_repo_skip(entry: os.DirEntry):
return entry.name == "__pycache__"
def _zip_repo_skip(entry: os.DirEntry, depth: int):
if entry.name == "__pycache__":
return True
if depth == 0 and not os.path.exists(os.path.join(entry.path, "package.py")):
return True
return False
def _zip_repo_path_to_name(path: str) -> str:
# use spack/pkg/<repo>/* prefix and rename `package.py` as `__init__.py`

View File

@ -9,7 +9,7 @@
import pathlib
import sys
import zipfile
from typing import Any, Dict, Optional, Tuple, Type, Union
from typing import Any, Dict, Optional, Set, Tuple, Type, Union
import llnl.util.filesystem
from llnl.url import allowed_archive
@ -155,7 +155,7 @@ def __hash__(self) -> int:
return hash(self.sha256)
zipfilecache = {}
zipfilecache: Dict[str, Tuple[zipfile.ZipFile, Set[str]]] = {}
class FilePatch(Patch):
@ -202,9 +202,8 @@ def __init__(
if "packages.zip" in path.parts:
# check if it exists in the zip file.
idx = path.parts.index("packages.zip")
zip_path, entry_path = pathlib.PurePath(*path.parts[: idx + 1]), pathlib.PurePath(
*path.parts[idx + 1 :]
)
zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
lookup = zipfilecache.get(zip_path)
if lookup is None:
@ -213,7 +212,7 @@ def __init__(
zipfilecache[zip_path] = (zip, namelist)
else:
zip, namelist = lookup
if str(entry_path) in namelist:
if entry_path in namelist:
abs_path = str(path)
break
elif path.exists():
@ -241,17 +240,16 @@ def sha256(self) -> str:
if "packages.zip" in path.parts:
# split in path to packages.zip and the path within the zip
idx = path.parts.index("packages.zip")
path_to_zip, path_in_zip = pathlib.PurePath(
*path.parts[: idx + 1]
), pathlib.PurePath(*path.parts[idx + 1 :])
lookup = zipfilecache.get(path_to_zip)
zip_path = str(pathlib.PurePath(*path.parts[: idx + 1]))
entry_path = str(pathlib.PurePath(*path.parts[idx + 1 :]))
lookup = zipfilecache.get(zip_path)
if lookup is None:
zip = zipfile.ZipFile(path_to_zip, "r")
zip = zipfile.ZipFile(zip_path, "r")
namelist = set(zip.namelist())
zipfilecache[path_to_zip] = (zip, namelist)
zipfilecache[zip_path] = (zip, namelist)
else:
zip, namelist = lookup
f = zip.open(str(path_in_zip), "r")
f = zip.open(entry_path, "r")
else:
f = open(self.path, "rb")
self._sha256 = checksum_stream(hashlib.sha256, f)

View File

@ -26,6 +26,7 @@
import types
import uuid
import warnings
import zipfile
import zipimport
from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Type, Union
@ -165,10 +166,10 @@ def compute_loader(self, fullname):
if package_name:
# annoyingly there is a many to one mapping for pkg module to file, have to
# figure out how to deal with this properly.
return (
(repo.zipimporter, f"{namespace}.{package_name}")
if repo.zipimporter
else (
if repo.zipimporter:
return repo.zipimporter, f"{namespace}.{package_name}"
else:
return (
_PrependFileLoader(
fullname=fullname,
path=repo.filename_for_package_name(package_name),
@ -176,7 +177,6 @@ def compute_loader(self, fullname):
),
fullname,
)
)
# We are importing a full namespace like 'spack.pkg.builtin'
if fullname == repo.full_namespace:
@ -364,6 +364,37 @@ def __getattr__(self, name):
return getattr(self, name)
class EvenFasterPackageChecker(collections.abc.Mapping):
    """Mapping from package names to mtimes, backed by a ``packages.zip`` listing.

    A single stat of the zip archive stands in for the mtime of every package
    it contains, avoiding one stat call per package directory.
    """

    def __init__(self, packages_path):
        # The path of the repository managed by this instance
        self.packages_path = packages_path
        self.zipfile = zipfile.ZipFile(
            os.path.join(packages_path, "..", "packages.zip"), "r"
        )
        self.invalidate()

    def invalidate(self):
        """Re-stat the archive and rebuild the name -> mtime mapping."""
        self.mtime = os.stat(self.zipfile.filename).st_mtime
        # Top-level directory entries of the archive are the package names.
        top_level_dirs = (
            name
            for name in self.zipfile.namelist()
            if name.endswith("/") and name.count("/") == 1 and name != "./"
        )
        self.pkgs = {name.rstrip("/"): self.mtime for name in top_level_dirs}

    def last_mtime(self):
        # Every package shares the archive's mtime.
        return self.mtime

    def modified_since(self, since: float) -> List[str]:
        # Either the whole archive changed, or nothing did.
        if self.mtime > since:
            return list(self.pkgs)
        return []

    def __getitem__(self, item):
        return self.pkgs[item]

    def __iter__(self):
        return iter(self.pkgs)

    def __len__(self):
        return len(self.pkgs)
class FastPackageChecker(collections.abc.Mapping):
"""Cache that maps package names to the stats obtained on the
'package.py' files associated with them.
@ -578,7 +609,7 @@ class RepoIndex:
def __init__(
self,
package_checker: FastPackageChecker,
package_checker: Union[FastPackageChecker, EvenFasterPackageChecker],
namespace: str,
cache: "spack.caches.FileCacheType",
):
@ -1016,7 +1047,9 @@ def check(condition, msg):
self._finder: Optional[RepoPath] = None
# Maps that goes from package name to corresponding file stat
self._fast_package_checker: Optional[FastPackageChecker] = None
self._fast_package_checker: Optional[
Union[EvenFasterPackageChecker, FastPackageChecker]
] = None
# Indexes for this repository, computed lazily
self._repo_index: Optional[RepoIndex] = None
@ -1190,9 +1223,12 @@ def filename_for_package_name(self, pkg_name: str) -> str:
return os.path.join(pkg_dir, package_file_name)
@property
def _pkg_checker(self) -> Union[FastPackageChecker, EvenFasterPackageChecker]:
    """Lazily construct and cache the package checker for this repository.

    Zip-backed repositories use EvenFasterPackageChecker (one stat for the
    whole archive); plain directory repositories fall back to the per-file
    FastPackageChecker.
    """
    if self._fast_package_checker is None:
        checker_cls = (
            EvenFasterPackageChecker if self.zipimporter else FastPackageChecker
        )
        self._fast_package_checker = checker_cls(self.packages_path)
    return self._fast_package_checker
def all_package_names(self, include_virtuals: bool = False) -> List[str]:

View File

@ -12,7 +12,7 @@
import zipfile
from contextlib import closing, contextmanager
from gzip import GzipFile
from typing import Callable, Dict, Tuple
from typing import Callable, Dict, List, Tuple
from llnl.util.symlink import readlink
@ -236,13 +236,13 @@ def reproducible_zipfile_from_prefix(
zip: zipfile.ZipFile,
prefix: str,
*,
skip: Callable[[os.DirEntry], bool] = lambda entry: False,
skip: Callable[[os.DirEntry, int], bool] = lambda entry, depth: False,
path_to_name: Callable[[str], str] = default_path_to_name,
) -> None:
"""Similar to ``reproducible_tarfile_from_prefix`` but for zipfiles."""
dir_stack = [prefix]
dir_stack: List[Tuple[str, int]] = [(prefix, 0)]
while dir_stack:
dir = dir_stack.pop()
dir, depth = dir_stack.pop()
# Add the dir before its contents. zip.mkdir is Python 3.11.
dir_info = zipfile.ZipInfo(path_to_name(dir))
@ -259,11 +259,11 @@ def reproducible_zipfile_from_prefix(
new_dirs = []
for entry in entries:
if skip(entry):
if skip(entry, depth):
continue
if entry.is_dir(follow_symlinks=False):
new_dirs.append(entry.path)
new_dirs.append((entry.path, depth + 1))
continue
# symlink / hardlink support in ZIP is poor or non-existent: make copies.

View File

@ -4,7 +4,7 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import hashlib
from typing import BinaryIO, Callable, Dict, Optional
from typing import IO, Callable, Dict, Optional
import llnl.util.tty as tty
@ -80,7 +80,7 @@ def hash_fun_for_digest(hexdigest: str) -> HashFactory:
return hash_fun_for_algo(hash_algo_for_digest(hexdigest))
def checksum_stream(hashlib_algo: HashFactory, fp: BinaryIO, *, block_size: int = 2**20) -> str:
def checksum_stream(hashlib_algo: HashFactory, fp: IO[bytes], *, block_size: int = 2**20) -> str:
"""Returns a hex digest of the stream generated using given algorithm from hashlib."""
hasher = hashlib_algo()
while True: