llnl.util.filesystem.find: multiple entrypoints (#47436)

You can now provide multiple roots to a single `find()` call and all of
them will be searched. The roots can overlap (e.g. can be parents of one
another).

This also adds a library function that translates a set of fnmatch
(glob) patterns into a single combined OR regular expression (and that
library function is used in `find` to improve its performance).
This commit is contained in:
Harmen Stoppels 2024-11-07 00:22:26 +01:00 committed by GitHub
parent 57a90c91a4
commit 73219e4b02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 178 additions and 130 deletions

View File

@ -20,11 +20,11 @@
import tempfile import tempfile
from contextlib import contextmanager from contextlib import contextmanager
from itertools import accumulate from itertools import accumulate
from typing import Callable, Iterable, List, Match, Optional, Tuple, Union from typing import Callable, Deque, Dict, Iterable, List, Match, Optional, Set, Tuple, Union
import llnl.util.symlink import llnl.util.symlink
from llnl.util import tty from llnl.util import tty
from llnl.util.lang import dedupe, memoized from llnl.util.lang import dedupe, fnmatch_translate_multiple, memoized
from llnl.util.symlink import islink, readlink, resolve_link_target_relative_to_the_link, symlink from llnl.util.symlink import islink, readlink, resolve_link_target_relative_to_the_link, symlink
from ..path import path_to_os_path, system_path_filter from ..path import path_to_os_path, system_path_filter
@ -1673,32 +1673,40 @@ def find_first(root: str, files: Union[Iterable[str], str], bfs_depth: int = 2)
return FindFirstFile(root, *files, bfs_depth=bfs_depth).find() return FindFirstFile(root, *files, bfs_depth=bfs_depth).find()
def find(root, files, recursive=True, max_depth: Optional[int] = None): def find(
"""Search for ``files`` starting from the ``root`` directory. root: Union[str, List[str]],
files: Union[str, List[str]],
Like GNU/BSD find but written entirely in Python. recursive: bool = True,
max_depth: Optional[int] = None,
Specifically this behaves like `find -type f`: it only returns ) -> List[str]:
results that are files. When searching recursively, this behaves """Finds all non-directory files matching the filename patterns from ``files`` starting from
as `find` with the `-L` option (follows symlinks). ``root``. This function returns a deterministic result for the same input and directory
structure when run multiple times. Symlinked directories are followed, and unique directories
are searched only once. Each matching file is returned only once at lowest depth in case
multiple paths exist due to symlinked directories. The function has similarities to the Unix
``find`` utility.
Examples: Examples:
.. code-block:: console .. code-block:: console
$ find -L /usr -name python $ find -L /usr -name python3 -type f
is equivalent to: is roughly equivalent to
>>> find('/usr', 'python') >>> find("/usr", "python3")
with the notable difference that this function only lists a single path to each file in case of
symlinked directories.
.. code-block:: console .. code-block:: console
$ find /usr/local/bin -maxdepth 1 -name python $ find -L /usr/local/bin /usr/local/sbin -maxdepth 1 '(' -name python3 -o -name getcap \\
')' -type f
is equivalent to: is roughly equivalent to:
>>> find('/usr/local/bin', 'python', recursive=False) >>> find(["/usr/local/bin", "/usr/local/sbin"], ["python3", "getcap"], recursive=False)
Accepts any glob characters accepted by fnmatch: Accepts any glob characters accepted by fnmatch:
@ -1712,17 +1720,17 @@ def find(root, files, recursive=True, max_depth: Optional[int] = None):
========== ==================================== ========== ====================================
Parameters: Parameters:
root (str): The root directory to start searching from root: One or more root directories to start searching from
files (str or collections.abc.Sequence): Library name(s) to search for files: One or more filename patterns to search for
recursive (bool): if False search only root folder, recursive: if False search only root, if True descends from roots. Defaults to True.
if True descends top-down from the root. Defaults to True. max_depth: if set, don't search below this depth. Cannot be set if recursive is False
max_depth (int): if set, don't search below this depth. Cannot be set
if recursive is False
Returns: Returns a list of absolute, matching file paths.
list: The files that have been found
""" """
if isinstance(files, str): if not isinstance(root, list):
root = [root]
if not isinstance(files, list):
files = [files] files = [files]
# If recursive is false, max_depth can only be None or 0 # If recursive is false, max_depth can only be None or 0
@ -1734,10 +1742,9 @@ def find(root, files, recursive=True, max_depth: Optional[int] = None):
elif max_depth is None: elif max_depth is None:
max_depth = sys.maxsize max_depth = sys.maxsize
tty.debug(f"Find (max depth = {max_depth}): {root} {str(files)}") tty.debug(f"Find (max depth = {max_depth}): {root} {files}")
result = find_max_depth(root, files, max_depth) result = _find_max_depth(root, files, max_depth)
tty.debug(f"Find complete: {root} {files}")
tty.debug(f"Find complete: {root} {str(files)}")
return result return result
@ -1746,56 +1753,36 @@ def _log_file_access_issue(e: OSError, path: str) -> None:
tty.debug(f"find must skip {path}: {errno_name} {e}") tty.debug(f"find must skip {path}: {errno_name} {e}")
@system_path_filter(arg_slice=slice(1)) def _dir_id(s: os.stat_result) -> Tuple[int, int]:
def find_max_depth(root, globs, max_depth: Optional[int] = None): # Note: on windows, st_ino is the file index and st_dev is the volume serial number. See
"""Given a set of non-recursive glob file patterns, finds all # https://github.com/python/cpython/blob/3.9/Python/fileutils.c
files matching those patterns up to a maximum specified depth. return (s.st_ino, s.st_dev)
If a directory has a name which matches an input pattern, it will
not be included in the results.
If ``max_depth`` is specified, does not search below that depth. def _find_max_depth(roots: List[str], globs: List[str], max_depth: int = sys.maxsize) -> List[str]:
"""See ``find`` for the public API."""
# Apply normcase to file patterns and filenames to respect case insensitive filesystems
regex, groups = fnmatch_translate_multiple([os.path.normcase(x) for x in globs])
# Ordered dictionary that keeps track of the files found for each pattern
capture_group_to_paths: Dict[str, List[str]] = {group: [] for group in groups}
# Ensure returned paths are always absolute
roots = [os.path.abspath(r) for r in roots]
# Breadth-first search queue. Each element is a tuple of (depth, directory)
dir_queue: Deque[Tuple[int, str]] = collections.deque()
# Set of visited directories. Each element is a tuple of (inode, device)
visited_dirs: Set[Tuple[int, int]] = set()
If ``globs`` is a list, files matching earlier entries are placed for root in roots:
in the return value before files matching later entries. try:
""" stat_root = os.stat(root)
try: except OSError as e:
stat_root = os.stat(root) _log_file_access_issue(e, root)
except OSError: continue
return [] dir_id = _dir_id(stat_root)
if dir_id not in visited_dirs:
dir_queue.appendleft((0, root))
visited_dirs.add(dir_id)
if max_depth is None:
max_depth = sys.maxsize
if isinstance(globs, str):
globs = [globs]
# Apply normcase to regular expressions and to the filenames:
# this respects case-sensitivity semantics of different OSes
# (e.g. file search is typically case-insensitive on Windows)
regexes = [re.compile(fnmatch.translate(os.path.normcase(x))) for x in globs]
# Note later calls to os.scandir etc. return abspaths if the
# input is absolute, see https://docs.python.org/3/library/os.html#os.DirEntry.path
root = os.path.abspath(root)
found_files = collections.defaultdict(list)
def _dir_id(stat_info):
# Note: on windows, st_ino is the file index and st_dev
# is the volume serial number. See
# https://github.com/python/cpython/blob/3.9/Python/fileutils.c
return (stat_info.st_ino, stat_info.st_dev)
visited_dirs = set([_dir_id(stat_root)])
# Each queue item stores the depth and path
# This achieves a consistent traversal order by iterating through
# each directory in alphabetical order.
# This also traverses in BFS order to ensure finding the shortest
# path to any file (or one of the shortest paths, if there are
# several - the one returned will be consistent given the prior
# point).
dir_queue = collections.deque([(0, root)])
while dir_queue: while dir_queue:
depth, next_dir = dir_queue.pop() depth, next_dir = dir_queue.pop()
try: try:
@ -1810,20 +1797,18 @@ def _dir_id(stat_info):
try: try:
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True) it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
except OSError as e: except OSError as e:
# Possible permission issue, or a symlink that cannot # Possible permission issue, or a symlink that cannot be resolved (ELOOP).
# be resolved (ELOOP).
_log_file_access_issue(e, dir_entry.path) _log_file_access_issue(e, dir_entry.path)
continue continue
if it_is_a_dir and (depth < max_depth): if it_is_a_dir and depth < max_depth:
try: try:
# The stat should be performed in a try/except block. # The stat should be performed in a try/except block. We repeat that here
# We repeat that here vs. moving to the above block # vs. moving to the above block because we only want to call `stat` if we
# because we only want to call `stat` if we haven't # haven't exceeded our max_depth
# exceeded our max_depth
if sys.platform == "win32": if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
# Windows, so we have to call os.stat # have to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True) stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else: else:
stat_info = dir_entry.stat(follow_symlinks=True) stat_info = dir_entry.stat(follow_symlinks=True)
@ -1836,15 +1821,15 @@ def _dir_id(stat_info):
dir_queue.appendleft((depth + 1, dir_entry.path)) dir_queue.appendleft((depth + 1, dir_entry.path))
visited_dirs.add(dir_id) visited_dirs.add(dir_id)
else: else:
fname = os.path.basename(dir_entry.path) m = regex.match(os.path.normcase(os.path.basename(dir_entry.path)))
for pattern in regexes: if not m:
if pattern.match(os.path.normcase(fname)): continue
found_files[pattern].append(os.path.join(next_dir, fname)) for group in capture_group_to_paths:
if m.group(group):
capture_group_to_paths[group].append(dir_entry.path)
break
# TODO: for fully-recursive searches, we can print a warning after return [path for paths in capture_group_to_paths.values() for path in paths]
# after having searched everything up to some fixed depth
return list(itertools.chain(*[found_files[x] for x in regexes]))
# Utilities for libraries and headers # Utilities for libraries and headers

View File

@ -5,12 +5,14 @@
import collections.abc import collections.abc
import contextlib import contextlib
import fnmatch
import functools import functools
import itertools import itertools
import os import os
import re import re
import sys import sys
import traceback import traceback
import typing
import warnings import warnings
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Callable, Iterable, List, Tuple, TypeVar from typing import Callable, Iterable, List, Tuple, TypeVar
@ -859,6 +861,32 @@ def elide_list(line_list: List[str], max_num: int = 10) -> List[str]:
return line_list return line_list
if sys.version_info >= (3, 9):
PatternStr = re.Pattern[str]
else:
PatternStr = typing.Pattern[str]
def fnmatch_translate_multiple(patterns: List[str]) -> Tuple[PatternStr, List[str]]:
"""Same as fnmatch.translate, but creates a single regex of the form
``(?P<pattern0>...)|(?P<pattern1>...)|...`` for each pattern in the iterable, where
``patternN`` is a named capture group that matches the corresponding pattern translated by
``fnmatch.translate``. This can be used to match multiple patterns in a single pass. No case
normalization is performed on the patterns.
Args:
patterns: list of fnmatch patterns
Returns:
Tuple of the combined regex and the list of named capture groups corresponding to each
pattern in the input list.
"""
groups = [f"pattern{i}" for i in range(len(patterns))]
regexes = (fnmatch.translate(p) for p in patterns)
combined = re.compile("|".join(f"(?P<{g}>{r})" for g, r in zip(groups, regexes)))
return combined, groups
@contextlib.contextmanager @contextlib.contextmanager
def nullcontext(*args, **kwargs): def nullcontext(*args, **kwargs):
"""Empty context manager. """Empty context manager.

View File

@ -1072,16 +1072,16 @@ def test_find_max_depth(dir_structure_with_things_to_find):
# Make sure the paths we use to verify are absolute # Make sure the paths we use to verify are absolute
assert os.path.isabs(locations["file_one"]) assert os.path.isabs(locations["file_one"])
assert set(fs.find_max_depth(root, "file_*", 0)) == {locations["file_four"]} assert set(fs.find(root, "file_*", max_depth=0)) == {locations["file_four"]}
assert set(fs.find_max_depth(root, "file_*", 1)) == { assert set(fs.find(root, "file_*", max_depth=1)) == {
locations["file_one"], locations["file_one"],
locations["file_three"], locations["file_three"],
locations["file_four"], locations["file_four"],
} }
assert set(fs.find_max_depth(root, "file_two", 2)) == {locations["file_two"]} assert set(fs.find(root, "file_two", max_depth=2)) == {locations["file_two"]}
assert not set(fs.find_max_depth(root, "file_two", 1)) assert not set(fs.find(root, "file_two", max_depth=1))
assert set(fs.find_max_depth(root, "file_two")) == {locations["file_two"]} assert set(fs.find(root, "file_two")) == {locations["file_two"]}
assert set(fs.find_max_depth(root, "file_*")) == set(locations.values()) assert set(fs.find(root, "file_*")) == set(locations.values())
def test_find_max_depth_relative(dir_structure_with_things_to_find): def test_find_max_depth_relative(dir_structure_with_things_to_find):
@ -1090,8 +1090,8 @@ def test_find_max_depth_relative(dir_structure_with_things_to_find):
""" """
root, locations = dir_structure_with_things_to_find root, locations = dir_structure_with_things_to_find
with fs.working_dir(root): with fs.working_dir(root):
assert set(fs.find_max_depth(".", "file_*", 0)) == {locations["file_four"]} assert set(fs.find(".", "file_*", max_depth=0)) == {locations["file_four"]}
assert set(fs.find_max_depth(".", "file_two", 2)) == {locations["file_two"]} assert set(fs.find(".", "file_two", max_depth=2)) == {locations["file_two"]}
@pytest.mark.parametrize("recursive,max_depth", [(False, -1), (False, 1)]) @pytest.mark.parametrize("recursive,max_depth", [(False, -1), (False, 1)])
@ -1105,7 +1105,8 @@ def test_max_depth_and_recursive_errors(tmpdir, recursive, max_depth):
fs.find_libraries(["some_lib"], root, recursive=recursive, max_depth=max_depth) fs.find_libraries(["some_lib"], root, recursive=recursive, max_depth=max_depth)
def dir_structure_with_things_to_find_links(tmpdir, use_junctions=False): @pytest.fixture(params=[True, False])
def complex_dir_structure(request, tmpdir):
""" """
"lx-dy" means "level x, directory y" "lx-dy" means "level x, directory y"
"lx-fy" means "level x, file y" "lx-fy" means "level x, file y"
@ -1128,8 +1129,11 @@ def dir_structure_with_things_to_find_links(tmpdir, use_junctions=False):
l1-s3 -> l3-d4 # a link that "skips" a directory level l1-s3 -> l3-d4 # a link that "skips" a directory level
l1-s4 -> l2-s3 # a link to a link to a dir l1-s4 -> l2-s3 # a link to a link to a dir
""" """
if sys.platform == "win32" and (not use_junctions) and (not _windows_can_symlink()): use_junctions = request.param
if sys.platform == "win32" and not use_junctions and not _windows_can_symlink():
pytest.skip("This Windows instance is not configured with symlink support") pytest.skip("This Windows instance is not configured with symlink support")
elif sys.platform != "win32" and use_junctions:
pytest.skip("Junctions are a Windows-only feature")
l1_d1 = tmpdir.join("l1-d1").ensure(dir=True) l1_d1 = tmpdir.join("l1-d1").ensure(dir=True)
l2_d1 = l1_d1.join("l2-d1").ensure(dir=True) l2_d1 = l1_d1.join("l2-d1").ensure(dir=True)
@ -1150,44 +1154,60 @@ def dir_structure_with_things_to_find_links(tmpdir, use_junctions=False):
link_fn(l2_d2, l2_s3) link_fn(l2_d2, l2_s3)
link_fn(l2_s3, pathlib.Path(tmpdir) / "l1-s4") link_fn(l2_s3, pathlib.Path(tmpdir) / "l1-s4")
locations = {} locations = {
locations["l4-f1"] = str(l3_d2.join("l4-f1").ensure()) "l4-f1": str(l3_d2.join("l4-f1").ensure()),
locations["l4-f2-full"] = str(l3_d4.join("l4-f2").ensure()) "l4-f2-full": str(l3_d4.join("l4-f2").ensure()),
locations["l4-f2-link"] = str(pathlib.Path(tmpdir) / "l1-s3" / "l4-f2") "l4-f2-link": str(pathlib.Path(tmpdir) / "l1-s3" / "l4-f2"),
locations["l2-f1"] = str(l1_d2.join("l2-f1").ensure()) "l2-f1": str(l1_d2.join("l2-f1").ensure()),
locations["l2-f1-link"] = str(pathlib.Path(tmpdir) / "l1-d1" / "l2-d1" / "l3-s1" / "l2-f1") "l2-f1-link": str(pathlib.Path(tmpdir) / "l1-d1" / "l2-d1" / "l3-s1" / "l2-f1"),
locations["l3-f3-full"] = str(l2_d2.join("l3-f3").ensure()) "l3-f3-full": str(l2_d2.join("l3-f3").ensure()),
locations["l3-f3-link-l1"] = str(pathlib.Path(tmpdir) / "l1-s4" / "l3-f3") "l3-f3-link-l1": str(pathlib.Path(tmpdir) / "l1-s4" / "l3-f3"),
}
return str(tmpdir), locations return str(tmpdir), locations
def _check_find_links(root, locations): def test_find_max_depth_symlinks(complex_dir_structure):
root, locations = complex_dir_structure
root = pathlib.Path(root) root = pathlib.Path(root)
assert set(fs.find_max_depth(root, "l4-f1")) == {locations["l4-f1"]} assert set(fs.find(root, "l4-f1")) == {locations["l4-f1"]}
assert set(fs.find_max_depth(root / "l1-s3", "l4-f2", 0)) == {locations["l4-f2-link"]} assert set(fs.find(root / "l1-s3", "l4-f2", max_depth=0)) == {locations["l4-f2-link"]}
assert set(fs.find_max_depth(root / "l1-d1", "l2-f1")) == {locations["l2-f1-link"]} assert set(fs.find(root / "l1-d1", "l2-f1")) == {locations["l2-f1-link"]}
# File is accessible via symlink and subdir, the link path will be # File is accessible via symlink and subdir, the link path will be
# searched first, and the directory will not be searched again when # searched first, and the directory will not be searched again when
# it is encountered the second time (via not-link) in the traversal # it is encountered the second time (via not-link) in the traversal
assert set(fs.find_max_depth(root, "l4-f2")) == {locations["l4-f2-link"]} assert set(fs.find(root, "l4-f2")) == {locations["l4-f2-link"]}
# File is accessible only via the dir, so the full file path should # File is accessible only via the dir, so the full file path should
# be reported # be reported
assert set(fs.find_max_depth(root / "l1-d1", "l4-f2")) == {locations["l4-f2-full"]} assert set(fs.find(root / "l1-d1", "l4-f2")) == {locations["l4-f2-full"]}
# Check following links to links # Check following links to links
assert set(fs.find_max_depth(root, "l3-f3")) == {locations["l3-f3-link-l1"]} assert set(fs.find(root, "l3-f3")) == {locations["l3-f3-link-l1"]}
@pytest.mark.parametrize( def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure):
"use_junctions", root, locations = complex_dir_structure
[
False, fst = str(pathlib.Path(root) / "l1-d1" / "l2-d1")
pytest.param( snd = str(pathlib.Path(root) / "l1-d2")
True, nonexistent = str(pathlib.Path(root) / "nonexistent")
marks=pytest.mark.skipif(sys.platform != "win32", reason="Only Windows has junctions"),
), assert set(fs.find([fst, snd, fst, snd, nonexistent], ["l*-f*"], max_depth=1)) == {
], locations["l2-f1"],
) locations["l4-f1"],
def test_find_max_depth_symlinks(tmpdir, use_junctions): locations["l4-f2-full"],
root, locations = dir_structure_with_things_to_find_links(tmpdir, use_junctions=use_junctions) locations["l3-f3-full"],
_check_find_links(root, locations) }
def test_multiple_patterns(complex_dir_structure):
root, _ = complex_dir_structure
paths = fs.find(root, ["l2-f1", "l3-f3", "*"])
# There shouldn't be duplicate results with multiple, overlapping patterns
assert len(set(paths)) == len(paths)
# All files should be found
filenames = [os.path.basename(p) for p in paths]
assert set(filenames) == {"l2-f1", "l3-f3", "l4-f1", "l4-f2"}
# They are ordered by first matching pattern (this is a bit of an implementation detail,
# and we could decide to change the exact order in the future)
assert filenames[0] == "l2-f1"
assert filenames[1] == "l3-f3"

View File

@ -373,3 +373,18 @@ class _SomeClass:
_SomeClass.deprecated.error_lvl = 2 _SomeClass.deprecated.error_lvl = 2
with pytest.raises(AttributeError): with pytest.raises(AttributeError):
_ = s.deprecated _ = s.deprecated
def test_fnmatch_multiple():
regex, groups = llnl.util.lang.fnmatch_translate_multiple(["libf*o.so", "libb*r.so"])
a = regex.match("libfoo.so")
assert a and a.group(groups[0]) == "libfoo.so"
b = regex.match("libbar.so")
assert b and b.group(groups[1]) == "libbar.so"
assert not regex.match("libfoo.so.1")
assert not regex.match("libbar.so.1")
assert not regex.match("libfoo.solibbar.so")
assert not regex.match("libbaz.so")