Add llnl.util.filesystem.find_first
(#36083)
Add a `find_first` function that locates one instance of a file matching a specified pattern by recursively searching a directory tree. Unlike the other `find` functions, it locates at most one file, so it can use optimizations that avoid searching the entire tree: the relevant files are typically at low depth, so it makes sense to locate them through iterative deepening with early exit.
This commit is contained in:
parent
c5b3fc6929
commit
5072e48dab
@ -5,18 +5,20 @@
|
|||||||
import collections
|
import collections
|
||||||
import collections.abc
|
import collections.abc
|
||||||
import errno
|
import errno
|
||||||
|
import fnmatch
|
||||||
import glob
|
import glob
|
||||||
import hashlib
|
import hashlib
|
||||||
import itertools
|
import itertools
|
||||||
import numbers
|
import numbers
|
||||||
import os
|
import os
|
||||||
|
import posixpath
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import stat
|
import stat
|
||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from typing import Callable, List, Match, Optional, Tuple, Union
|
from typing import Callable, Iterable, List, Match, Optional, Tuple, Union
|
||||||
|
|
||||||
from llnl.util import tty
|
from llnl.util import tty
|
||||||
from llnl.util.lang import dedupe, memoized
|
from llnl.util.lang import dedupe, memoized
|
||||||
@ -1671,6 +1673,38 @@ def fix_darwin_install_name(path):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
|
def find_first(root: str, files: Union[Iterable[str], str], bfs_depth: int = 2) -> Optional[str]:
    """Return the first file under ``root`` whose name matches a pattern.

    The shell command

    .. code-block:: console

       $ find /usr -name 'abc*' -o -name 'def*' -quit

    corresponds to:

    >>> find_first("/usr", ["abc*", "def*"])

    Patterns may use any glob syntax supported by fnmatch.

    Directories are visited breadth-first down to depth ``bfs_depth``;
    if nothing matched by then, the search continues depth-first.

    Parameters:
        root (str): The root directory to start searching from
        files (str or Iterable): File pattern(s) to search for; a single
            string is treated as one pattern
        bfs_depth (int): (advanced) depth at which the search switches
            from breadth-first to depth-first.

    Returns:
        str or None: The matching file or None when no file is found.
    """
    # A lone string is one pattern, not an iterable of characters.
    patterns = [files] if isinstance(files, str) else files
    searcher = FindFirstFile(root, *patterns, bfs_depth=bfs_depth)
    return searcher.find()
|
||||||
|
|
||||||
|
|
||||||
def find(root, files, recursive=True):
|
def find(root, files, recursive=True):
|
||||||
"""Search for ``files`` starting from the ``root`` directory.
|
"""Search for ``files`` starting from the ``root`` directory.
|
||||||
|
|
||||||
@ -2720,3 +2754,105 @@ def filesummary(path, print_bytes=16) -> Tuple[int, bytes]:
|
|||||||
return size, short_contents
|
return size, short_contents
|
||||||
except OSError:
|
except OSError:
|
||||||
return 0, b""
|
return 0, b""
|
||||||
|
|
||||||
|
|
||||||
|
class FindFirstFile:
    """Uses hybrid iterative deepening to locate the first matching
    file. Up to depth ``bfs_depth`` it uses iterative deepening, which
    mimics breadth-first with the same memory footprint as depth-first
    search, after which it switches to ordinary depth-first search using
    ``os.walk``.

    In both phases the patterns are matched against the name of every
    directory entry (regular files as well as directories), so the result
    does not depend on whether an entry lies above or below ``bfs_depth``.
    """

    def __init__(self, root: str, *file_patterns: str, bfs_depth: int = 2):
        """Prepare a search for the first entry matching any pattern.

        Args:
            root (str): directory in which to recursively search
            file_patterns (str): glob file patterns understood by fnmatch
            bfs_depth (int): until this depth breadth-first traversal is used,
                when no match is found, the mode is switched to depth-first search.
        """
        self.root = root
        self.bfs_depth = bfs_depth
        # Path of the match found by the most recent ``find()`` call, if any.
        self.file: Optional[str] = None
        self.match: Callable

        # Joining zero patterns would yield the empty regex, which matches
        # every name. Remember whether there is anything to look for at all.
        self._has_patterns = bool(file_patterns)

        # normcase is trivial on posix
        regex = re.compile("|".join(fnmatch.translate(os.path.normcase(p)) for p in file_patterns))

        # Where normcase is not the identity (os.path is not posixpath), the
        # patterns were normcase'd above, so names must be normcase'd as well.
        if os.path is posixpath:
            self.match = regex.match
        else:
            self.match = lambda p: regex.match(os.path.normcase(p))

    def find(self) -> Optional[str]:
        """Run the file search

        Returns:
            str or None: path of the matching file
        """
        self.file = None

        # Nothing to search for: avoid walking the tree (and avoid the
        # match-everything behavior of an empty regex).
        if not self._has_patterns:
            return None

        # First do iterative deepening (i.e. bfs through limited depth dfs)
        for i in range(self.bfs_depth + 1):
            if self._find_at_depth(self.root, i):
                return self.file

        # Then fall back to depth-first search
        self.file = self._find_dfs()
        return self.file

    def _find_at_depth(self, path: str, max_depth: int, depth: int = 0) -> bool:
        """Returns True when done. Notice search can be done
        either because a file was found, or because it recursed
        through all directories."""
        try:
            entries = os.scandir(path)
        except OSError:
            # Unreadable or missing directory: nothing more to explore here.
            return True

        done = True

        with entries:
            # At max depth we look for matching files.
            if depth == max_depth:
                for f in entries:
                    # Exit on match
                    if self.match(f.name):
                        self.file = os.path.join(path, f.name)
                        return True

                    # is_dir should not require a stat call, so it's a good optimization.
                    if self._is_dir(f):
                        # There is a deeper level left to visit in a later pass.
                        done = False
                return done

            # At lower depth only recurse into subdirs
            for f in entries:
                if not self._is_dir(f):
                    continue

                # If any subdir is not fully traversed, we're not done yet.
                if not self._find_at_depth(os.path.join(path, f.name), max_depth, depth + 1):
                    done = False

                # Early exit when we've found something.
                if self.file:
                    return True

        return done

    def _is_dir(self, f: os.DirEntry) -> bool:
        """Returns True when f is dir we can enter (and not a symlink)."""
        try:
            return f.is_dir(follow_symlinks=False)
        except OSError:
            return False

    def _find_dfs(self) -> Optional[str]:
        """Returns match or None.

        Directory names are matched as well as file names, so that an entry
        is found regardless of whether it is a file or a directory, exactly
        as in the iterative deepening phase."""
        for dirpath, dirnames, filenames in os.walk(self.root):
            for name in dirnames + filenames:
                if self.match(name):
                    return os.path.join(dirpath, name)
        return None
|
||||||
|
@ -871,3 +871,34 @@ def test_filesummary(tmpdir):
|
|||||||
assert fs.filesummary(p, print_bytes=8) == (26, b"abcdefgh...stuvwxyz")
|
assert fs.filesummary(p, print_bytes=8) == (26, b"abcdefgh...stuvwxyz")
|
||||||
assert fs.filesummary(p, print_bytes=13) == (26, b"abcdefghijklmnopqrstuvwxyz")
|
assert fs.filesummary(p, print_bytes=13) == (26, b"abcdefghijklmnopqrstuvwxyz")
|
||||||
assert fs.filesummary(p, print_bytes=100) == (26, b"abcdefghijklmnopqrstuvwxyz")
|
assert fs.filesummary(p, print_bytes=100) == (26, b"abcdefghijklmnopqrstuvwxyz")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("bfs_depth", [1, 2, 10])
def test_find_first_file(tmpdir, bfs_depth):
    """Check find_first on a small tree for several bfs_depth values."""
    # Layout: a/a/a/{file1,file2}, b/a, c/a, d/{a,file1}, e
    directories = [("a", "a", "a"), ("b", "a"), ("c", "a"), ("d", "a"), ("e",)]
    for parts in directories:
        tmpdir.join(*parts).ensure(dir=True)

    touched = [("a", "a", "a", "file1"), ("a", "a", "a", "file2"), ("d", "file1")]
    for parts in touched:
        fs.touch(tmpdir.join(*parts))

    root = str(tmpdir)

    # Iterative deepening: the shallow d/file1 is expected, not a/a/a/file1.
    shallow = fs.find_first(root, "file*", bfs_depth=bfs_depth)
    assert os.path.samefile(shallow, os.path.join(root, "d", "file1"))

    # A pattern without any match yields None.
    missing = fs.find_first(root, "nonexisting", bfs_depth=bfs_depth)
    assert missing is None

    # With several patterns, any one of them may produce the match.
    deep = fs.find_first(root, ["nonexisting", "file2"], bfs_depth=bfs_depth)
    assert os.path.samefile(deep, os.path.join(root, "a", "a", "a", "file2"))

    # Should find first dir
    first_dir = fs.find_first(root, "a", bfs_depth=bfs_depth)
    assert os.path.samefile(first_dir, os.path.join(root, "a"))
|
||||||
|
Loading…
Reference in New Issue
Block a user