Revert "filesystem.py: add max_depth argument to find (#41945)"

This reverts commit 38c8069ab4.
This commit is contained in:
Harmen Stoppels 2024-11-07 20:53:29 +01:00 committed by Peter Scheibel
parent 4fbdf2f2c0
commit ed916ffe6c
4 changed files with 77 additions and 292 deletions

View File

@ -1673,20 +1673,16 @@ def find_first(root: str, files: Union[Iterable[str], str], bfs_depth: int = 2)
return FindFirstFile(root, *files, bfs_depth=bfs_depth).find() return FindFirstFile(root, *files, bfs_depth=bfs_depth).find()
def find(root, files, recursive=True, max_depth: Optional[int] = None): def find(root, files, recursive=True):
"""Search for ``files`` starting from the ``root`` directory. """Search for ``files`` starting from the ``root`` directory.
Like GNU/BSD find but written entirely in Python. Like GNU/BSD find but written entirely in Python.
Specifically this behaves like `find -type f`: it only returns
results that are files. When searching recursively, this behaves
as `find` with the `-L` option (follows symlinks).
Examples: Examples:
.. code-block:: console .. code-block:: console
$ find -L /usr -name python $ find /usr -name python
is equivalent to: is equivalent to:
@ -1716,8 +1712,6 @@ def find(root, files, recursive=True, max_depth: Optional[int] = None):
files (str or collections.abc.Sequence): Library name(s) to search for files (str or collections.abc.Sequence): Library name(s) to search for
recursive (bool): if False search only root folder, recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to True. if True descends top-down from the root. Defaults to True.
max_depth (int): if set, don't search below this depth. Cannot be set
if recursive is False
Returns: Returns:
list: The files that have been found list: The files that have been found
@ -1725,135 +1719,59 @@ def find(root, files, recursive=True, max_depth: Optional[int] = None):
if isinstance(files, str): if isinstance(files, str):
files = [files] files = [files]
# If recursive is false, max_depth can only be None or 0 if recursive:
if max_depth and not recursive: tty.debug(f"Find (recursive): {root} {str(files)}")
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False") result = _find_recursive(root, files)
else:
if not recursive: tty.debug(f"Find (not recursive): {root} {str(files)}")
max_depth = 0 result = _find_non_recursive(root, files)
elif max_depth is None:
max_depth = sys.maxsize
tty.debug(f"Find (max depth = {max_depth}): {root} {str(files)}")
result = find_max_depth(root, files, max_depth)
tty.debug(f"Find complete: {root} {str(files)}") tty.debug(f"Find complete: {root} {str(files)}")
return result return result
@system_path_filter(arg_slice=slice(1)) @system_path_filter
def find_max_depth(root, globs, max_depth: Optional[int] = None): def _find_recursive(root, search_files):
"""Given a set of non-recursive glob file patterns, finds all # The variable here is **on purpose** a defaultdict. The idea is that
files matching those patterns up to a maximum specified depth. # we want to poke the filesystem as little as possible, but still maintain
# stability in the order of the answer. Thus we are recording each library
If a directory has a name which matches an input pattern, it will # found in a key, and reconstructing the stable order later.
not be included in the results.
If ``max_depth`` is specified, does not search below that depth.
If ``globs`` is a list, files matching earlier entries are placed
in the return value before files matching later entries.
"""
# If root doesn't exist, then we say we found nothing. If it
# exists but is not a dir, we assume the user would want to
# know; likewise if it exists but we do not have permission to
# access it.
try:
stat_root = os.stat(root)
except OSError as e:
if e.errno == errno.ENOENT:
return []
else:
raise
if not stat.S_ISDIR(stat_root.st_mode):
raise ValueError(f"{root} is not a directory")
if max_depth is None:
max_depth = sys.maxsize
if isinstance(globs, str):
globs = [globs]
# Apply normcase to regular expressions and to the filenames:
# this respects case-sensitivity semantics of different OSes
# (e.g. file search is typically case-insensitive on Windows)
regexes = [re.compile(fnmatch.translate(os.path.normcase(x))) for x in globs]
# Note later calls to os.scandir etc. return abspaths if the
# input is absolute, see https://docs.python.org/3/library/os.html#os.DirEntry.path
root = os.path.abspath(root)
found_files = collections.defaultdict(list) found_files = collections.defaultdict(list)
def _dir_id(stat_info): # Make the path absolute to have os.walk also return an absolute path
# Note: on windows, st_ino is the file index and st_dev root = os.path.abspath(root)
# is the volume serial number. See for path, _, list_files in os.walk(root):
# https://github.com/python/cpython/blob/3.9/Python/fileutils.c for search_file in search_files:
return (stat_info.st_ino, stat_info.st_dev) matches = glob.glob(os.path.join(path, search_file))
matches = [os.path.join(path, x) for x in matches]
found_files[search_file].extend(matches)
def _log_file_access_issue(e): answer = []
errno_name = errno.errorcode.get(e.errno, "UNKNOWN") for search_file in search_files:
tty.debug(f"find must skip {dir_entry.path}: {errno_name} {str(e)}") answer.extend(found_files[search_file])
visited_dirs = set([_dir_id(stat_root)]) return answer
# Each queue item stores the depth and path
# This achieves a consistent traversal order by iterating through
# each directory in alphabetical order.
# This also traverses in BFS order to ensure finding the shortest
# path to any file (or one of the shortest paths, if there are
# several - the one returned will be consistent given the prior
# point).
dir_queue = collections.deque([(0, root)])
while dir_queue:
depth, next_dir = dir_queue.pop()
try:
dir_iter = os.scandir(next_dir)
except OSError:
# Most commonly, this would be a permissions issue, for
# example if we are scanning an external directory like /usr
continue
with dir_iter: @system_path_filter
ordered_entries = sorted(dir_iter, key=lambda x: x.name) def _find_non_recursive(root, search_files):
for dir_entry in ordered_entries: # The variable here is **on purpose** a defaultdict as os.listdir
try: # can return files in any order (does not preserve stability)
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True) found_files = collections.defaultdict(list)
except OSError as e:
# Possible permission issue, or a symlink that cannot
# be resolved (ELOOP).
_log_file_access_issue(e)
continue
if it_is_a_dir and (depth < max_depth): # Make the path absolute to have absolute path returned
try: root = os.path.abspath(root)
# The stat should be performed in a try/except block.
# We repeat that here vs. moving to the above block
# because we only want to call `stat` if we haven't
# exceeded our max_depth
if sys.platform == "win32":
# Note: st_ino/st_dev on DirEntry.stat are not set on
# Windows, so we have to call os.stat
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
else:
stat_info = dir_entry.stat(follow_symlinks=True)
except OSError as e:
_log_file_access_issue(e)
continue
dir_id = _dir_id(stat_info) for search_file in search_files:
if dir_id not in visited_dirs: matches = glob.glob(os.path.join(root, search_file))
dir_queue.appendleft((depth + 1, dir_entry.path)) matches = [os.path.join(root, x) for x in matches]
visited_dirs.add(dir_id) found_files[search_file].extend(matches)
else:
fname = os.path.basename(dir_entry.path)
for pattern in regexes:
if pattern.match(os.path.normcase(fname)):
found_files[pattern].append(os.path.join(next_dir, fname))
# TODO: for fully-recursive searches, we can print a warning after answer = []
# having searched everything up to some fixed depth for search_file in search_files:
answer.extend(found_files[search_file])
return list(itertools.chain(*[found_files[x] for x in regexes])) return answer
# Utilities for libraries and headers # Utilities for libraries and headers
@ -2292,9 +2210,7 @@ def find_system_libraries(libraries, shared=True):
return libraries_found return libraries_found
def find_libraries( def find_libraries(libraries, root, shared=True, recursive=False, runtime=True):
libraries, root, shared=True, recursive=False, runtime=True, max_depth: Optional[int] = None
):
"""Returns an iterable of full paths to libraries found in a root dir. """Returns an iterable of full paths to libraries found in a root dir.
Accepts any glob characters accepted by fnmatch: Accepts any glob characters accepted by fnmatch:
@ -2315,8 +2231,6 @@ def find_libraries(
otherwise for static. Defaults to True. otherwise for static. Defaults to True.
recursive (bool): if False search only root folder, recursive (bool): if False search only root folder,
if True descends top-down from the root. Defaults to False. if True descends top-down from the root. Defaults to False.
max_depth (int): if set, don't search below this depth. Cannot be set
if recursive is False
runtime (bool): Windows only option, no-op elsewhere. If true, runtime (bool): Windows only option, no-op elsewhere. If true,
search for runtime shared libs (.DLL), otherwise, search search for runtime shared libs (.DLL), otherwise, search
for .Lib files. If shared is false, this has no meaning. for .Lib files. If shared is false, this has no meaning.
@ -2325,7 +2239,6 @@ def find_libraries(
Returns: Returns:
LibraryList: The libraries that have been found LibraryList: The libraries that have been found
""" """
if isinstance(libraries, str): if isinstance(libraries, str):
libraries = [libraries] libraries = [libraries]
elif not isinstance(libraries, collections.abc.Sequence): elif not isinstance(libraries, collections.abc.Sequence):
@ -2358,10 +2271,8 @@ def find_libraries(
libraries = ["{0}.{1}".format(lib, suffix) for lib in libraries for suffix in suffixes] libraries = ["{0}.{1}".format(lib, suffix) for lib in libraries for suffix in suffixes]
if not recursive: if not recursive:
if max_depth:
raise ValueError(f"max_depth ({max_depth}) cannot be set if recursive is False")
# If not recursive, look for the libraries directly in root # If not recursive, look for the libraries directly in root
return LibraryList(find(root, libraries, recursive=False)) return LibraryList(find(root, libraries, False))
# To speedup the search for external packages configured e.g. in /usr, # To speedup the search for external packages configured e.g. in /usr,
# perform first non-recursive search in root/lib then in root/lib64 and # perform first non-recursive search in root/lib then in root/lib64 and
@ -2379,7 +2290,7 @@ def find_libraries(
if found_libs: if found_libs:
break break
else: else:
found_libs = find(root, libraries, recursive=True, max_depth=max_depth) found_libs = find(root, libraries, True)
return LibraryList(found_libs) return LibraryList(found_libs)

View File

@ -9,7 +9,7 @@
import pytest import pytest
from llnl.util.filesystem import HeaderList, LibraryList, find_headers, find_libraries from llnl.util.filesystem import HeaderList, LibraryList, find, find_headers, find_libraries
import spack.paths import spack.paths
@ -324,3 +324,33 @@ def test_searching_order(search_fn, search_list, root, kwargs):
# List should be empty here # List should be empty here
assert len(rlist) == 0 assert len(rlist) == 0
@pytest.mark.parametrize(
"root,search_list,kwargs,expected",
[
(
search_dir,
"*/*bar.tx?",
{"recursive": False},
[
os.path.join(search_dir, os.path.join("a", "foobar.txt")),
os.path.join(search_dir, os.path.join("b", "bar.txp")),
os.path.join(search_dir, os.path.join("c", "bar.txt")),
],
),
(
search_dir,
"*/*bar.tx?",
{"recursive": True},
[
os.path.join(search_dir, os.path.join("a", "foobar.txt")),
os.path.join(search_dir, os.path.join("b", "bar.txp")),
os.path.join(search_dir, os.path.join("c", "bar.txt")),
],
),
],
)
def test_find_with_globbing(root, search_list, kwargs, expected):
matches = find(root, search_list, **kwargs)
assert sorted(matches) == sorted(expected)

View File

@ -6,7 +6,6 @@
"""Tests for ``llnl/util/filesystem.py``""" """Tests for ``llnl/util/filesystem.py``"""
import filecmp import filecmp
import os import os
import pathlib
import shutil import shutil
import stat import stat
import sys import sys
@ -15,8 +14,7 @@
import pytest import pytest
import llnl.util.filesystem as fs import llnl.util.filesystem as fs
import llnl.util.symlink from llnl.util.symlink import islink, readlink, symlink
from llnl.util.symlink import _windows_can_symlink, islink, readlink, symlink
import spack.paths import spack.paths
@ -1037,157 +1035,3 @@ def test_windows_sfn(tmpdir):
assert "d\\LONGER~1" in fs.windows_sfn(d) assert "d\\LONGER~1" in fs.windows_sfn(d)
assert "d\\LONGER~2" in fs.windows_sfn(e) assert "d\\LONGER~2" in fs.windows_sfn(e)
shutil.rmtree(tmpdir.join("d")) shutil.rmtree(tmpdir.join("d"))
@pytest.fixture
def dir_structure_with_things_to_find(tmpdir):
"""
<root>/
dir_one/
file_one
dir_two/
dir_three/
dir_four/
file_two
file_three
file_four
"""
dir_one = tmpdir.join("dir_one").ensure(dir=True)
tmpdir.join("dir_two").ensure(dir=True)
dir_three = tmpdir.join("dir_three").ensure(dir=True)
dir_four = dir_three.join("dir_four").ensure(dir=True)
locations = {}
locations["file_one"] = str(dir_one.join("file_one").ensure())
locations["file_two"] = str(dir_four.join("file_two").ensure())
locations["file_three"] = str(dir_three.join("file_three").ensure())
locations["file_four"] = str(tmpdir.join("file_four").ensure())
return str(tmpdir), locations
def test_find_max_depth(dir_structure_with_things_to_find):
root, locations = dir_structure_with_things_to_find
# Make sure the paths we use to verify are absolute
assert os.path.isabs(locations["file_one"])
assert set(fs.find_max_depth(root, "file_*", 0)) == {locations["file_four"]}
assert set(fs.find_max_depth(root, "file_*", 1)) == {
locations["file_one"],
locations["file_three"],
locations["file_four"],
}
assert set(fs.find_max_depth(root, "file_two", 2)) == {locations["file_two"]}
assert not set(fs.find_max_depth(root, "file_two", 1))
assert set(fs.find_max_depth(root, "file_two")) == {locations["file_two"]}
assert set(fs.find_max_depth(root, "file_*")) == set(locations.values())
def test_find_max_depth_relative(dir_structure_with_things_to_find):
"""find_max_depth should return absolute paths even if
the provided path is relative.
"""
root, locations = dir_structure_with_things_to_find
with fs.working_dir(root):
assert set(fs.find_max_depth(".", "file_*", 0)) == {locations["file_four"]}
assert set(fs.find_max_depth(".", "file_two", 2)) == {locations["file_two"]}
@pytest.mark.parametrize("recursive,max_depth", [(False, -1), (False, 1)])
def test_max_depth_and_recursive_errors(tmpdir, recursive, max_depth):
root = str(tmpdir)
error_str = "cannot be set if recursive is False"
with pytest.raises(ValueError, match=error_str):
fs.find(root, ["some_file"], recursive=recursive, max_depth=max_depth)
with pytest.raises(ValueError, match=error_str):
fs.find_libraries(["some_lib"], root, recursive=recursive, max_depth=max_depth)
def dir_structure_with_things_to_find_links(tmpdir, use_junctions=False):
"""
"lx-dy" means "level x, directory y"
"lx-fy" means "level x, file y"
"lx-sy" means "level x, symlink y"
<root>/
l1-d1/
l2-d1/
l3-s1 -> l1-d2 # points to directory above l2-d1
l3-d2/
l4-f1
l3-s3 -> l1-d1 # cyclic link
l3-d4/
l4-f2
l1-d2/
l2-f1
l2-d2/
l3-f3
l2-s3 -> l2-d2
l1-s3 -> l3-d4 # a link that "skips" a directory level
l1-s4 -> l2-s3 # a link to a link to a dir
"""
if sys.platform == "win32" and (not use_junctions) and (not _windows_can_symlink()):
pytest.skip("This Windows instance is not configured with symlink support")
l1_d1 = tmpdir.join("l1-d1").ensure(dir=True)
l2_d1 = l1_d1.join("l2-d1").ensure(dir=True)
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
if use_junctions:
link_fn = llnl.util.symlink._windows_create_junction
else:
link_fn = os.symlink
link_fn(l1_d2, pathlib.Path(l2_d1) / "l3-s1")
link_fn(l1_d1, pathlib.Path(l2_d1) / "l3-s3")
link_fn(l3_d4, pathlib.Path(tmpdir) / "l1-s3")
l2_s3 = pathlib.Path(l1_d2) / "l2-s3"
link_fn(l2_d2, l2_s3)
link_fn(l2_s3, pathlib.Path(tmpdir) / "l1-s4")
locations = {}
locations["l4-f1"] = str(l3_d2.join("l4-f1").ensure())
locations["l4-f2-full"] = str(l3_d4.join("l4-f2").ensure())
locations["l4-f2-link"] = str(pathlib.Path(tmpdir) / "l1-s3" / "l4-f2")
locations["l2-f1"] = str(l1_d2.join("l2-f1").ensure())
locations["l2-f1-link"] = str(pathlib.Path(tmpdir) / "l1-d1" / "l2-d1" / "l3-s1" / "l2-f1")
locations["l3-f3-full"] = str(l2_d2.join("l3-f3").ensure())
locations["l3-f3-link-l1"] = str(pathlib.Path(tmpdir) / "l1-s4" / "l3-f3")
return str(tmpdir), locations
def _check_find_links(root, locations):
root = pathlib.Path(root)
assert set(fs.find_max_depth(root, "l4-f1")) == {locations["l4-f1"]}
assert set(fs.find_max_depth(root / "l1-s3", "l4-f2", 0)) == {locations["l4-f2-link"]}
assert set(fs.find_max_depth(root / "l1-d1", "l2-f1")) == {locations["l2-f1-link"]}
# File is accessible via symlink and subdir, the link path will be
# searched first, and the directory will not be searched again when
# it is encountered the second time (via not-link) in the traversal
assert set(fs.find_max_depth(root, "l4-f2")) == {locations["l4-f2-link"]}
# File is accessible only via the dir, so the full file path should
# be reported
assert set(fs.find_max_depth(root / "l1-d1", "l4-f2")) == {locations["l4-f2-full"]}
# Check following links to links
assert set(fs.find_max_depth(root, "l3-f3")) == {locations["l3-f3-link-l1"]}
@pytest.mark.parametrize(
"use_junctions",
[
False,
pytest.param(
True,
marks=pytest.mark.skipif(sys.platform != "win32", reason="Only Windows has junctions"),
),
],
)
def test_find_max_depth_symlinks(tmpdir, use_junctions):
root, locations = dir_structure_with_things_to_find_links(tmpdir, use_junctions=use_junctions)
_check_find_links(root, locations)

View File

@ -44,7 +44,7 @@ def libs(self):
# Header provided by the bar virtual package # Header provided by the bar virtual package
@property @property
def bar_headers(self): def bar_headers(self):
return find_headers("bar", root=self.home.include, recursive=True) return find_headers("bar/bar", root=self.home.include, recursive=False)
# Library provided by the bar virtual package # Library provided by the bar virtual package
@property @property
@ -59,7 +59,7 @@ def baz_home(self):
# Header provided by the baz virtual package # Header provided by the baz virtual package
@property @property
def baz_headers(self): def baz_headers(self):
return find_headers("baz", root=self.baz_home.include, recursive=True) return find_headers("baz/baz", root=self.baz_home.include, recursive=False)
# Library provided by the baz virtual package # Library provided by the baz virtual package
@property @property