llnl.util.filesystem: multiple entrypoints and max_depth (#47495)

If a package `foo` doesn't implement `libs`, the default behavior was to search recursively for `libfoo` whenever `spec["foo"].libs` was requested (this also happens automatically if a package includes `foo` as a link dependency).

This can lead to some strange behavior:
1. A package that is normally used as a build dependency (e.g. `cmake` at one point) is referenced like
   `depends_on("cmake")`, which leads to a fully recursive search for `libcmake` (this can take
   "forever" when CMake is registered as an external with a prefix like `/usr`, particularly on NFS mounts).
2. A similar hang can occur if a package is registered as an external with an incorrect prefix.

- [x] Update the default library search to stop after a maximum depth (by default, search
  the root prefix and each directory in it, but no lower); see the sketch below.
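
A rough sketch of what the bounded search looks like from the caller's side; the `libfoo` name, the `/usr` prefix, and the exact depth are illustrative assumptions, not values taken from this change:

```python
from llnl.util.filesystem import find_libraries

# Search the package prefix and one level of subdirectories (e.g. /usr/lib,
# /usr/lib64) instead of recursing through the entire tree.
libs = find_libraries("libfoo", root="/usr", recursive=True, max_depth=1)
```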

The following is a list of known changes to `find` compared to `develop`:

1. Matching directories are no longer returned -- `find` consistently returns only non-directory
   files, even at `max_depth`.
2. Symlinked directories are followed (needed to support `max_depth`).
3. `find(..., "dir/*.txt")` is allowed, for finding files inside certain dirs. These "complex"
   patterns are delegated to `glob`, like they are on `develop`.
4. The `root` and `files` arguments both accept generic sequences, and `root` accepts both `str`
   and `pathlib.Path` types. This allows us to specify multiple entry points to `find` (see the
   usage sketch after this list).
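
A short usage sketch of the updated interface; the paths and patterns here are illustrative only, not taken from the changeset:

```python
from llnl.util.filesystem import find

# Multiple roots and multiple patterns in one call (item 4), bounded at depth 2.
headers = find(["/usr", "/usr/local"], ["*.h", "*.hpp"], max_depth=2)

# A pattern with a path component is delegated to glob (item 3); only
# non-directory matches are returned (item 1).
gl_headers = find("/usr", "GL/*.h", recursive=True)
```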

---------

Co-authored-by: Peter Scheibel <scheibel1@llnl.gov>
Authored by Harmen Stoppels on 2024-11-08 22:55:53 +01:00, committed by GitHub
commit 907a37145f (parent 4778d2d332)
7 changed files with 460 additions and 116 deletions


@@ -20,11 +20,23 @@
import tempfile
from contextlib import contextmanager
from itertools import accumulate
from typing import Callable, Iterable, List, Match, Optional, Tuple, Union
from typing import (
Callable,
Deque,
Dict,
Iterable,
List,
Match,
Optional,
Sequence,
Set,
Tuple,
Union,
)
import llnl.util.symlink
from llnl.util import tty
from llnl.util.lang import dedupe, memoized
from llnl.util.lang import dedupe, fnmatch_translate_multiple, memoized
from llnl.util.symlink import islink, readlink, resolve_link_target_relative_to_the_link, symlink
from ..path import path_to_os_path, system_path_filter
@@ -85,6 +97,8 @@
"visit_directory_tree",
]
Path = Union[str, pathlib.Path]
if sys.version_info < (3, 7, 4):
# monkeypatch shutil.copystat to fix PermissionError when copying read-only
# files on Lustre when using Python < 3.7.4
@@ -1673,105 +1687,199 @@ def find_first(root: str, files: Union[Iterable[str], str], bfs_depth: int = 2)
return FindFirstFile(root, *files, bfs_depth=bfs_depth).find()
def find(root, files, recursive=True):
"""Search for ``files`` starting from the ``root`` directory.
Like GNU/BSD find but written entirely in Python.
Examples:
.. code-block:: console
$ find /usr -name python
is equivalent to:
>>> find('/usr', 'python')
.. code-block:: console
$ find /usr/local/bin -maxdepth 1 -name python
is equivalent to:
>>> find('/usr/local/bin', 'python', recursive=False)
def find(
root: Union[Path, Sequence[Path]],
files: Union[str, Sequence[str]],
recursive: bool = True,
max_depth: Optional[int] = None,
) -> List[str]:
"""Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
This function returns a deterministic result for the same input and directory structure when
run multiple times. Symlinked directories are followed, and unique directories are searched
only once. Each matching file is returned only once, at the lowest depth, in case multiple paths
exist due to symlinked directories.
Accepts any glob characters accepted by fnmatch:
========== ====================================
Pattern Meaning
========== ====================================
``*`` matches one or more characters
``?`` matches any single character
``[seq]`` matches any character in ``seq``
``[!seq]`` matches any character not in ``seq``
========== ====================================
Parameters:
root (str): The root directory to start searching from
files (str or collections.abc.Sequence): Library name(s) to search for
recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to True.
Returns:
list: The files that have been found
Examples:
>>> find("/usr", "*.txt", recursive=True, max_depth=2)
finds all files with the extension ``.txt`` in the directory ``/usr`` and subdirectories up to
depth 2.
>>> find(["/usr", "/var"], ["*.txt", "*.log"], recursive=True)
finds all files with the extension ``.txt`` or ``.log`` in the directories ``/usr`` and
``/var`` at any depth.
>>> find("/usr", "GL/*.h", recursive=True)
finds all header files in a directory GL at any depth in the directory ``/usr``.
Parameters:
root: One or more root directories to start searching from
files: One or more filename patterns to search for
recursive: if False search only root, if True descends from roots. Defaults to True.
max_depth: if set, don't search below this depth. Cannot be set if recursive is False
Returns a list of absolute, matching file paths.
"""
if isinstance(root, (str, pathlib.Path)):
root = [root]
elif not isinstance(root, collections.abc.Sequence):
raise TypeError(f"'root' arg must be a path or a sequence of paths, not '{type(root)}']")
if isinstance(files, str):
files = [files]
elif not isinstance(files, collections.abc.Sequence):
raise TypeError(f"'files' arg must be str or a sequence of str, not '{type(files)}']")
if recursive:
tty.debug(f"Find (recursive): {root} {str(files)}")
result = _find_recursive(root, files)
else:
tty.debug(f"Find (not recursive): {root} {str(files)}")
result = _find_non_recursive(root, files)
# If recursive is false, max_depth can only be None or 0
if max_depth and not recursive:
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False")
tty.debug(f"Find complete: {root} {str(files)}")
tty.debug(f"Find (max depth = {max_depth}): {root} {files}")
if not recursive:
max_depth = 0
elif max_depth is None:
max_depth = sys.maxsize
result = _find_max_depth(root, files, max_depth)
tty.debug(f"Find complete: {root} {files}")
return result
@system_path_filter
def _find_recursive(root, search_files):
# The variable here is **on purpose** a defaultdict. The idea is that
# we want to poke the filesystem as little as possible, but still maintain
# stability in the order of the answer. Thus we are recording each library
# found in a key, and reconstructing the stable order later.
found_files = collections.defaultdict(list)
# Make the path absolute to have os.walk also return an absolute path
root = os.path.abspath(root)
for path, _, list_files in os.walk(root):
for search_file in search_files:
matches = glob.glob(os.path.join(path, search_file))
matches = [os.path.join(path, x) for x in matches]
found_files[search_file].extend(matches)
answer = []
for search_file in search_files:
answer.extend(found_files[search_file])
return answer
def _log_file_access_issue(e: OSError, path: str) -> None:
errno_name = errno.errorcode.get(e.errno, "UNKNOWN")
tty.debug(f"find must skip {path}: {errno_name} {e}")
@system_path_filter
def _find_non_recursive(root, search_files):
# The variable here is **on purpose** a defaultdict as os.list_dir
# can return files in any order (does not preserve stability)
found_files = collections.defaultdict(list)
def _file_id(s: os.stat_result) -> Tuple[int, int]:
# Note: on windows, st_ino is the file index and st_dev is the volume serial number. See
# https://github.com/python/cpython/blob/3.9/Python/fileutils.c
return (s.st_ino, s.st_dev)
# Make the path absolute to have absolute path returned
root = os.path.abspath(root)
for search_file in search_files:
matches = glob.glob(os.path.join(root, search_file))
matches = [os.path.join(root, x) for x in matches]
found_files[search_file].extend(matches)
def _dedupe_files(paths: List[str]) -> List[str]:
"""Deduplicate files by inode and device, dropping files that cannot be accessed."""
unique_files: List[str] = []
# tuple of (inode, device) for each file without following symlinks
visited: Set[Tuple[int, int]] = set()
for path in paths:
try:
stat_info = os.lstat(path)
except OSError as e:
_log_file_access_issue(e, path)
continue
file_id = _file_id(stat_info)
if file_id not in visited:
unique_files.append(path)
visited.add(file_id)
return unique_files
answer = []
for search_file in search_files:
answer.extend(found_files[search_file])
return answer
def _find_max_depth(
roots: Sequence[Path], globs: Sequence[str], max_depth: int = sys.maxsize
) -> List[str]:
"""See ``find`` for the public API."""
# We optimize for the common case of simple filename only patterns: a single, combined regex
# is used. For complex patterns that include path components, we use a slower glob call from
# every directory we visit within max_depth.
filename_only_patterns = {
f"pattern_{i}": os.path.normcase(x) for i, x in enumerate(globs) if "/" not in x
}
complex_patterns = {f"pattern_{i}": x for i, x in enumerate(globs) if "/" in x}
regex = re.compile(fnmatch_translate_multiple(filename_only_patterns))
# Ordered dictionary that keeps track of what pattern found which files
matched_paths: Dict[str, List[str]] = {f"pattern_{i}": [] for i, _ in enumerate(globs)}
# Ensure returned paths are always absolute
roots = [os.path.abspath(r) for r in roots]
# Breadth-first search queue. Each element is a tuple of (depth, dir)
dir_queue: Deque[Tuple[int, str]] = collections.deque()
# Set of visited directories. Each element is a tuple of (inode, device)
visited_dirs: Set[Tuple[int, int]] = set()
for root in roots:
try:
stat_root = os.stat(root)
except OSError as e:
_log_file_access_issue(e, root)
continue
dir_id = _file_id(stat_root)
if dir_id not in visited_dirs:
dir_queue.appendleft((0, root))
visited_dirs.add(dir_id)
while dir_queue:
depth, curr_dir = dir_queue.pop()
try:
dir_iter = os.scandir(curr_dir)
except OSError as e:
_log_file_access_issue(e, curr_dir)
continue
# Use glob.glob for complex patterns.
for pattern_name, pattern in complex_patterns.items():
matched_paths[pattern_name].extend(
path
for path in glob.glob(os.path.join(curr_dir, pattern))
if not os.path.isdir(path)
)
with dir_iter:
ordered_entries = sorted(dir_iter, key=lambda x: x.name)
for dir_entry in ordered_entries:
try:
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
except OSError as e:
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
_log_file_access_issue(e, dir_entry.path)
continue
if it_is_a_dir:
if depth >= max_depth:
continue
try:
# The stat should be performed in a try/except block. We repeat that here
# vs. moving to the above block because we only want to call `stat` if we
# haven't exceeded our max_depth
if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
# have to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else:
stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
_log_file_access_issue(e, dir_entry.path)
continue
dir_id = _file_id(stat_info)
if dir_id not in visited_dirs:
dir_queue.appendleft((depth + 1, dir_entry.path))
visited_dirs.add(dir_id)
elif filename_only_patterns:
m = regex.match(os.path.normcase(dir_entry.name))
if not m:
continue
for pattern_name in filename_only_patterns:
if m.group(pattern_name):
matched_paths[pattern_name].append(dir_entry.path)
break
all_matching_paths = [path for paths in matched_paths.values() for path in paths]
# we only dedupe files if we have any complex patterns, since only they can match the same file
# multiple times
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
# Utilities for libraries and headers
@@ -2210,7 +2318,9 @@ def find_system_libraries(libraries, shared=True):
return libraries_found
def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
def find_libraries(
libraries, root, shared=True, recursive=False, runtime=True, max_depth: Optional[int] = None
):
"""Returns an iterable of full paths to libraries found in a root dir.
Accepts any glob characters accepted by fnmatch:
@@ -2231,6 +2341,8 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
otherwise for static. Defaults to True.
recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to False.
max_depth (int): if set, don't search below this depth. Cannot be set
if recursive is False
runtime (bool): Windows only option, no-op elsewhere. If true,
search for runtime shared libs (.DLL), otherwise, search
for .Lib files. If shared is false, this has no meaning.
@@ -2239,6 +2351,7 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
Returns:
LibraryList: The libraries that have been found
"""
if isinstance(libraries, str):
libraries = [libraries]
elif not isinstance(libraries, collections.abc.Sequence):
@@ -2271,8 +2384,10 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
libraries = ["{0}.{1}".format(lib, suffix) for lib in libraries for suffix in suffixes]
if not recursive:
if max_depth:
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False")
# If not recursive, look for the libraries directly in root
return LibraryList(find(root, libraries, False))
return LibraryList(find(root, libraries, recursive=False))
# To speedup the search for external packages configured e.g. in /usr,
# perform first non-recursive search in root/lib then in root/lib64 and
@@ -2290,7 +2405,7 @@ def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
if found_libs:
break
else:
found_libs = find(root, libraries, True)
found_libs = find(root, libraries, recursive=True, max_depth=max_depth)
return LibraryList(found_libs)


@@ -5,15 +5,17 @@
import collections.abc
import contextlib
import fnmatch
import functools
import itertools
import os
import re
import sys
import traceback
import typing
import warnings
from datetime import datetime, timedelta
from typing import Callable, Iterable, List, Tuple, TypeVar
from typing import Callable, Dict, Iterable, List, Tuple, TypeVar
# Ignore emacs backups when listing modules
ignore_modules = r"^\.#|~$"
@@ -859,6 +861,19 @@ def elide_list(line_list: List[str], max_num: int = 10) -> List[str]:
return line_list
if sys.version_info >= (3, 9):
PatternStr = re.Pattern[str]
else:
PatternStr = typing.Pattern[str]
def fnmatch_translate_multiple(named_patterns: Dict[str, str]) -> str:
"""Similar to ``fnmatch.translate``, but takes an ordered dictionary where keys are pattern
names, and values are filename patterns. The output is a regex that matches any of the
patterns in order, and named capture groups are used to identify which pattern matched."""
return "|".join(f"(?P<{n}>{fnmatch.translate(p)})" for n, p in named_patterns.items())
@contextlib.contextmanager
def nullcontext(*args, **kwargs):
"""Empty context manager.