filesystem.py find: return directories and improve performance (#47537)
This commit is contained in:
parent
30db764449
commit
a9e6074996
@ -1693,11 +1693,11 @@ def find(
|
||||
recursive: bool = True,
|
||||
max_depth: Optional[int] = None,
|
||||
) -> List[str]:
|
||||
"""Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
|
||||
This function returns a deterministic result for the same input and directory structure when
|
||||
run multiple times. Symlinked directories are followed, and unique directories are searched
|
||||
only once. Each matching file is returned only once at lowest depth in case multiple paths
|
||||
exist due to symlinked directories.
|
||||
"""Finds all files matching the patterns from ``files`` starting from ``root``. This function
|
||||
returns a deterministic result for the same input and directory structure when run multiple
|
||||
times. Symlinked directories are followed, and unique directories are searched only once. Each
|
||||
matching file is returned only once at lowest depth in case multiple paths exist due to
|
||||
symlinked directories.
|
||||
|
||||
Accepts any glob characters accepted by fnmatch:
|
||||
|
||||
@ -1830,54 +1830,58 @@ def _find_max_depth(
|
||||
# Use glob.glob for complex patterns.
|
||||
for pattern_name, pattern in complex_patterns.items():
|
||||
matched_paths[pattern_name].extend(
|
||||
path
|
||||
for path in glob.glob(os.path.join(curr_dir, pattern))
|
||||
if not os.path.isdir(path)
|
||||
path for path in glob.glob(os.path.join(curr_dir, pattern))
|
||||
)
|
||||
|
||||
# List of subdirectories by path and (inode, device) tuple
|
||||
subdirs: List[Tuple[str, Tuple[int, int]]] = []
|
||||
|
||||
with dir_iter:
|
||||
ordered_entries = sorted(dir_iter, key=lambda x: x.name)
|
||||
for dir_entry in ordered_entries:
|
||||
try:
|
||||
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
|
||||
except OSError as e:
|
||||
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
|
||||
_log_file_access_issue(e, dir_entry.path)
|
||||
continue
|
||||
for dir_entry in dir_iter:
|
||||
|
||||
if it_is_a_dir:
|
||||
if depth >= max_depth:
|
||||
continue
|
||||
try:
|
||||
# The stat should be performed in a try/except block. We repeat that here
|
||||
# vs. moving to the above block because we only want to call `stat` if we
|
||||
# haven't exceeded our max_depth
|
||||
if sys.platform == "win32":
|
||||
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
|
||||
# have to call os.stat
|
||||
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
|
||||
else:
|
||||
stat_info = dir_entry.stat(follow_symlinks=True)
|
||||
except OSError as e:
|
||||
_log_file_access_issue(e, dir_entry.path)
|
||||
continue
|
||||
|
||||
dir_id = _file_id(stat_info)
|
||||
if dir_id not in visited_dirs:
|
||||
dir_queue.appendleft((depth + 1, dir_entry.path))
|
||||
visited_dirs.add(dir_id)
|
||||
elif filename_only_patterns:
|
||||
# Match filename only patterns
|
||||
if filename_only_patterns:
|
||||
m = regex.match(os.path.normcase(dir_entry.name))
|
||||
if not m:
|
||||
continue
|
||||
if m:
|
||||
for pattern_name in filename_only_patterns:
|
||||
if m.group(pattern_name):
|
||||
matched_paths[pattern_name].append(dir_entry.path)
|
||||
break
|
||||
|
||||
# Collect subdirectories
|
||||
if depth >= max_depth:
|
||||
continue
|
||||
|
||||
try:
|
||||
if not dir_entry.is_dir(follow_symlinks=True):
|
||||
continue
|
||||
if sys.platform == "win32":
|
||||
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
|
||||
# to call os.stat
|
||||
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
|
||||
else:
|
||||
stat_info = dir_entry.stat(follow_symlinks=True)
|
||||
except OSError as e:
|
||||
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
|
||||
_log_file_access_issue(e, dir_entry.path)
|
||||
continue
|
||||
|
||||
subdirs.append((dir_entry.path, _file_id(stat_info)))
|
||||
|
||||
# Enqueue subdirectories in a deterministic order
|
||||
if subdirs:
|
||||
subdirs.sort(key=lambda s: os.path.basename(s[0]))
|
||||
for subdir, subdir_id in subdirs:
|
||||
if subdir_id not in visited_dirs:
|
||||
dir_queue.appendleft((depth + 1, subdir))
|
||||
visited_dirs.add(subdir_id)
|
||||
|
||||
# Sort the matched paths for deterministic output
|
||||
for paths in matched_paths.values():
|
||||
paths.sort()
|
||||
all_matching_paths = [path for paths in matched_paths.values() for path in paths]
|
||||
|
||||
# we only dedupe files if we have any complex patterns, since only they can match the same file
|
||||
# We only dedupe files if we have any complex patterns, since only they can match the same file
|
||||
# multiple times
|
||||
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
|
||||
|
||||
|
@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
|
||||
<root>/
|
||||
l1-d1/
|
||||
l2-d1/
|
||||
l3-s1 -> l1-d2 # points to directory above l2-d1
|
||||
l3-d2/
|
||||
l4-f1
|
||||
l3-s3 -> l1-d1 # cyclic link
|
||||
l3-d4/
|
||||
l4-f2
|
||||
l3-s1 -> l1-d2 # points to directory above l2-d1
|
||||
l3-s3 -> l1-d1 # cyclic link
|
||||
l1-d2/
|
||||
l2-f1
|
||||
l2-d2/
|
||||
l3-f3
|
||||
l2-f1
|
||||
l2-s3 -> l2-d2
|
||||
l1-s3 -> l3-d4 # a link that "skips" a directory level
|
||||
l1-s4 -> l2-s3 # a link to a link to a dir
|
||||
@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
|
||||
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
|
||||
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
|
||||
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
|
||||
l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
|
||||
l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)
|
||||
|
||||
if use_junctions:
|
||||
link_fn = llnl.util.symlink._windows_create_junction
|
||||
@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure
|
||||
|
||||
def test_multiple_patterns(complex_dir_structure):
|
||||
root, _ = complex_dir_structure
|
||||
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
|
||||
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
|
||||
# There shouldn't be duplicate results with multiple, overlapping patterns
|
||||
assert len(set(paths)) == len(paths)
|
||||
# All files should be found
|
||||
@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
fs.find(1, "file.txt") # type: ignore
|
||||
|
||||
|
||||
def test_find_only_finds_files(tmp_path: pathlib.Path):
|
||||
"""ensure that find only returns files even at max_depth"""
|
||||
(tmp_path / "subdir").mkdir()
|
||||
(tmp_path / "subdir" / "dir").mkdir()
|
||||
(tmp_path / "subdir" / "file.txt").write_text("")
|
||||
assert (
|
||||
fs.find(tmp_path, "*", max_depth=1)
|
||||
== fs.find(tmp_path, "*/*", max_depth=1)
|
||||
== [str(tmp_path / "subdir" / "file.txt")]
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user