filesystem.py find: return directories and improve performance (#47537)

Harmen Stoppels 2024-11-11 10:43:23 +01:00 committed by GitHub
parent 30db764449
commit a9e6074996
GPG Key ID: B5690EEEBB952194
2 changed files with 49 additions and 57 deletions


@@ -1693,11 +1693,11 @@ def find(
     recursive: bool = True,
     max_depth: Optional[int] = None,
 ) -> List[str]:
-    """Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
-    This function returns a deterministic result for the same input and directory structure when
-    run multiple times. Symlinked directories are followed, and unique directories are searched
-    only once. Each matching file is returned only once at lowest depth in case multiple paths
-    exist due to symlinked directories.
+    """Finds all files matching the patterns from ``files`` starting from ``root``. This function
+    returns a deterministic result for the same input and directory structure when run multiple
+    times. Symlinked directories are followed, and unique directories are searched only once. Each
+    matching file is returned only once at lowest depth in case multiple paths exist due to
+    symlinked directories.
 
     Accepts any glob characters accepted by fnmatch:
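
For context, a minimal usage sketch of the public ``find`` API this docstring describes (illustrative only, not part of the commit; ``llnl.util.filesystem`` is assumed to be importable from a Spack checkout, and the search roots and patterns below are made up):

# Illustrative only: filename-only patterns ("*.h") are matched against every entry,
# while patterns containing "/" ("pkgconfig/*.pc") are resolved with glob from each
# visited directory. After this commit, matching directories are returned as well.
from llnl.util.filesystem import find

headers = find("/usr/include", ["*.h"], max_depth=2)
build_files = find("/usr/lib", ["libz.*", "pkgconfig/*.pc"])

print(headers[:3])
print(build_files[:3])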
@@ -1830,54 +1830,58 @@ def _find_max_depth(
         # Use glob.glob for complex patterns.
         for pattern_name, pattern in complex_patterns.items():
             matched_paths[pattern_name].extend(
-                path
-                for path in glob.glob(os.path.join(curr_dir, pattern))
-                if not os.path.isdir(path)
+                path for path in glob.glob(os.path.join(curr_dir, pattern))
             )
 
+        # List of subdirectories by path and (inode, device) tuple
+        subdirs: List[Tuple[str, Tuple[int, int]]] = []
+
         with dir_iter:
-            ordered_entries = sorted(dir_iter, key=lambda x: x.name)
-            for dir_entry in ordered_entries:
-                try:
-                    it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
-                except OSError as e:
-                    # Possible permission issue, or a symlink that cannot be resolved (ELOOP).
-                    _log_file_access_issue(e, dir_entry.path)
-                    continue
-
-                if it_is_a_dir:
-                    if depth >= max_depth:
-                        continue
-                    try:
-                        # The stat should be performed in a try/except block. We repeat that here
-                        # vs. moving to the above block because we only want to call `stat` if we
-                        # haven't exceeded our max_depth
-                        if sys.platform == "win32":
-                            # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
-                            # have to call os.stat
-                            stat_info = os.stat(dir_entry.path, follow_symlinks=True)
-                        else:
-                            stat_info = dir_entry.stat(follow_symlinks=True)
-                    except OSError as e:
-                        _log_file_access_issue(e, dir_entry.path)
-                        continue
-
-                    dir_id = _file_id(stat_info)
-                    if dir_id not in visited_dirs:
-                        dir_queue.appendleft((depth + 1, dir_entry.path))
-                        visited_dirs.add(dir_id)
-                elif filename_only_patterns:
-                    m = regex.match(os.path.normcase(dir_entry.name))
-                    if not m:
-                        continue
-                    for pattern_name in filename_only_patterns:
-                        if m.group(pattern_name):
-                            matched_paths[pattern_name].append(dir_entry.path)
-                            break
+            for dir_entry in dir_iter:
+
+                # Match filename only patterns
+                if filename_only_patterns:
+                    m = regex.match(os.path.normcase(dir_entry.name))
+                    if m:
+                        for pattern_name in filename_only_patterns:
+                            if m.group(pattern_name):
+                                matched_paths[pattern_name].append(dir_entry.path)
+                                break
+
+                # Collect subdirectories
+                if depth >= max_depth:
+                    continue
+
+                try:
+                    if not dir_entry.is_dir(follow_symlinks=True):
+                        continue
+                    if sys.platform == "win32":
+                        # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
+                        # to call os.stat
+                        stat_info = os.stat(dir_entry.path, follow_symlinks=True)
+                    else:
+                        stat_info = dir_entry.stat(follow_symlinks=True)
+                except OSError as e:
+                    # Possible permission issue, or a symlink that cannot be resolved (ELOOP).
+                    _log_file_access_issue(e, dir_entry.path)
+                    continue
+
+                subdirs.append((dir_entry.path, _file_id(stat_info)))
+
+        # Enqueue subdirectories in a deterministic order
+        if subdirs:
+            subdirs.sort(key=lambda s: os.path.basename(s[0]))
+            for subdir, subdir_id in subdirs:
+                if subdir_id not in visited_dirs:
+                    dir_queue.appendleft((depth + 1, subdir))
+                    visited_dirs.add(subdir_id)
 
+    # Sort the matched paths for deterministic output
+    for paths in matched_paths.values():
+        paths.sort()
+
     all_matching_paths = [path for paths in matched_paths.values() for path in paths]
 
-    # we only dedupe files if we have any complex patterns, since only they can match the same file
+    # We only dedupe files if we have any complex patterns, since only they can match the same file
     # multiple times
     return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
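
The rewritten hunk changes the per-directory loop: instead of sorting the ``scandir`` iterator up front and stat-ing every entry inside the loop, it matches filename-only patterns first, collects subdirectories together with their ``(inode, device)`` id, and only then sorts and enqueues the ones not seen before, so the breadth-first walk stays deterministic and symlink cycles are broken. Below is a standalone sketch of that traversal pattern, illustrative only and standard library only; ``walk_once`` is an invented name, not a Spack API, and the Windows special case for ``DirEntry.stat`` is omitted.

import collections
import os
from typing import Deque, List, Set, Tuple


def walk_once(root: str, max_depth: int = 20) -> List[str]:
    """Breadth-first walk that follows symlinks but visits each physical directory
    only once, keyed by (st_ino, st_dev), enqueueing subdirectories in sorted order
    so the traversal is deterministic even in the presence of symlink cycles."""
    try:
        root_stat = os.stat(root, follow_symlinks=True)
    except OSError:
        return []

    visited: Set[Tuple[int, int]] = {(root_stat.st_ino, root_stat.st_dev)}
    queue: Deque[Tuple[int, str]] = collections.deque([(0, os.path.abspath(root))])
    found: List[str] = []

    while queue:
        depth, curr_dir = queue.pop()  # pop right, append left -> FIFO / BFS
        try:
            dir_iter = os.scandir(curr_dir)
        except OSError:
            continue  # permission problem, vanished directory, ...

        subdirs: List[Tuple[str, Tuple[int, int]]] = []
        with dir_iter:
            for entry in dir_iter:
                found.append(entry.path)  # files *and* directories, as in the new find()
                if depth >= max_depth:
                    continue
                try:
                    if not entry.is_dir(follow_symlinks=True):
                        continue
                    info = entry.stat(follow_symlinks=True)  # real code special-cases win32
                except OSError:
                    continue  # broken symlink, ELOOP, permissions
                subdirs.append((entry.path, (info.st_ino, info.st_dev)))

        # Deterministic order for directories discovered at the same depth
        subdirs.sort(key=lambda s: os.path.basename(s[0]))
        for path, dir_id in subdirs:
            if dir_id not in visited:
                visited.add(dir_id)
                queue.appendleft((depth + 1, path))

    found.sort()  # deterministic output regardless of scandir order
    return found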


@@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
     <root>/
         l1-d1/
             l2-d1/
-                l3-s1 -> l1-d2 # points to directory above l2-d1
                 l3-d2/
                     l4-f1
-                l3-s3 -> l1-d1 # cyclic link
                 l3-d4/
                     l4-f2
+                l3-s1 -> l1-d2 # points to directory above l2-d1
+                l3-s3 -> l1-d1 # cyclic link
         l1-d2/
-            l2-f1
             l2-d2/
                 l3-f3
+            l2-f1
             l2-s3 -> l2-d2
         l1-s3 -> l3-d4 # a link that "skips" a directory level
         l1-s4 -> l2-s3 # a link to a link to a dir
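
This fixture's tree of symlinked and cyclic directories is what the docstring guarantees above ("unique directories are searched only once", "each matching file is returned only once") are exercised against. A sketch of that property, illustrative only and not one of the tests in this commit (``fs`` is the test module's ``llnl.util.filesystem`` alias, ``root`` the fixture root):

# l4-f1 is also reachable through the cyclic link l3-s3 -> l1-d1, but every physical
# directory is visited only once, so exactly one path comes back for it.
paths = fs.find(root, "l4-f1")
assert len(paths) == 1  # found once, at the lowest depth, despite the cycle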
@@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
     l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
     l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
     l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
-    l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
+    l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)
 
     if use_junctions:
         link_fn = llnl.util.symlink._windows_create_junction
@@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure
 
 def test_multiple_patterns(complex_dir_structure):
     root, _ = complex_dir_structure
-    paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
+    paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
     # There shouldn't be duplicate results with multiple, overlapping patterns
     assert len(set(paths)) == len(paths)
     # All files should be found
@@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):
     with pytest.raises(TypeError):
         fs.find(1, "file.txt")  # type: ignore
-
-
-def test_find_only_finds_files(tmp_path: pathlib.Path):
-    """ensure that find only returns files even at max_depth"""
-    (tmp_path / "subdir").mkdir()
-    (tmp_path / "subdir" / "dir").mkdir()
-    (tmp_path / "subdir" / "file.txt").write_text("")
-    assert (
-        fs.find(tmp_path, "*", max_depth=1)
-        == fs.find(tmp_path, "*/*", max_depth=1)
-        == [str(tmp_path / "subdir" / "file.txt")]
-    )
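
The deleted test pinned the old files-only behavior; it is dropped because ``find`` now returns matching directories as well (the ``os.path.isdir`` filter removed in the first hunk). A sketch of the new behavior, illustrative only and not part of the commit, reusing the same ``fs`` alias and pytest ``tmp_path`` fixture as the deleted test:

(tmp_path / "subdir").mkdir()
(tmp_path / "subdir" / "dir").mkdir()
(tmp_path / "subdir" / "file.txt").write_text("")

# With the isdir() filter gone, the directory "subdir/dir" now matches "*/*" alongside
# the file instead of being silently dropped from the results.
results = fs.find(tmp_path, "*/*", max_depth=1)
assert str(tmp_path / "subdir" / "dir") in results
assert str(tmp_path / "subdir" / "file.txt") in results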