filesystem.py find: return directories and improve performance (#47537)
This commit is contained in:
parent
30db764449
commit
a9e6074996
@ -1693,11 +1693,11 @@ def find(
|
|||||||
recursive: bool = True,
|
recursive: bool = True,
|
||||||
max_depth: Optional[int] = None,
|
max_depth: Optional[int] = None,
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
|
"""Finds all files matching the patterns from ``files`` starting from ``root``. This function
|
||||||
This function returns a deterministic result for the same input and directory structure when
|
returns a deterministic result for the same input and directory structure when run multiple
|
||||||
run multiple times. Symlinked directories are followed, and unique directories are searched
|
times. Symlinked directories are followed, and unique directories are searched only once. Each
|
||||||
only once. Each matching file is returned only once at lowest depth in case multiple paths
|
matching file is returned only once at lowest depth in case multiple paths exist due to
|
||||||
exist due to symlinked directories.
|
symlinked directories.
|
||||||
|
|
||||||
Accepts any glob characters accepted by fnmatch:
|
Accepts any glob characters accepted by fnmatch:
|
||||||
|
|
||||||
@ -1830,54 +1830,58 @@ def _find_max_depth(
|
|||||||
# Use glob.glob for complex patterns.
|
# Use glob.glob for complex patterns.
|
||||||
for pattern_name, pattern in complex_patterns.items():
|
for pattern_name, pattern in complex_patterns.items():
|
||||||
matched_paths[pattern_name].extend(
|
matched_paths[pattern_name].extend(
|
||||||
path
|
path for path in glob.glob(os.path.join(curr_dir, pattern))
|
||||||
for path in glob.glob(os.path.join(curr_dir, pattern))
|
|
||||||
if not os.path.isdir(path)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# List of subdirectories by path and (inode, device) tuple
|
||||||
|
subdirs: List[Tuple[str, Tuple[int, int]]] = []
|
||||||
|
|
||||||
with dir_iter:
|
with dir_iter:
|
||||||
ordered_entries = sorted(dir_iter, key=lambda x: x.name)
|
for dir_entry in dir_iter:
|
||||||
for dir_entry in ordered_entries:
|
|
||||||
try:
|
|
||||||
it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
|
|
||||||
except OSError as e:
|
|
||||||
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
|
|
||||||
_log_file_access_issue(e, dir_entry.path)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if it_is_a_dir:
|
# Match filename only patterns
|
||||||
if depth >= max_depth:
|
if filename_only_patterns:
|
||||||
continue
|
|
||||||
try:
|
|
||||||
# The stat should be performed in a try/except block. We repeat that here
|
|
||||||
# vs. moving to the above block because we only want to call `stat` if we
|
|
||||||
# haven't exceeded our max_depth
|
|
||||||
if sys.platform == "win32":
|
|
||||||
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
|
|
||||||
# have to call os.stat
|
|
||||||
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
|
|
||||||
else:
|
|
||||||
stat_info = dir_entry.stat(follow_symlinks=True)
|
|
||||||
except OSError as e:
|
|
||||||
_log_file_access_issue(e, dir_entry.path)
|
|
||||||
continue
|
|
||||||
|
|
||||||
dir_id = _file_id(stat_info)
|
|
||||||
if dir_id not in visited_dirs:
|
|
||||||
dir_queue.appendleft((depth + 1, dir_entry.path))
|
|
||||||
visited_dirs.add(dir_id)
|
|
||||||
elif filename_only_patterns:
|
|
||||||
m = regex.match(os.path.normcase(dir_entry.name))
|
m = regex.match(os.path.normcase(dir_entry.name))
|
||||||
if not m:
|
if m:
|
||||||
continue
|
|
||||||
for pattern_name in filename_only_patterns:
|
for pattern_name in filename_only_patterns:
|
||||||
if m.group(pattern_name):
|
if m.group(pattern_name):
|
||||||
matched_paths[pattern_name].append(dir_entry.path)
|
matched_paths[pattern_name].append(dir_entry.path)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Collect subdirectories
|
||||||
|
if depth >= max_depth:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not dir_entry.is_dir(follow_symlinks=True):
|
||||||
|
continue
|
||||||
|
if sys.platform == "win32":
|
||||||
|
# Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
|
||||||
|
# to call os.stat
|
||||||
|
stat_info = os.stat(dir_entry.path, follow_symlinks=True)
|
||||||
|
else:
|
||||||
|
stat_info = dir_entry.stat(follow_symlinks=True)
|
||||||
|
except OSError as e:
|
||||||
|
# Possible permission issue, or a symlink that cannot be resolved (ELOOP).
|
||||||
|
_log_file_access_issue(e, dir_entry.path)
|
||||||
|
continue
|
||||||
|
|
||||||
|
subdirs.append((dir_entry.path, _file_id(stat_info)))
|
||||||
|
|
||||||
|
# Enqueue subdirectories in a deterministic order
|
||||||
|
if subdirs:
|
||||||
|
subdirs.sort(key=lambda s: os.path.basename(s[0]))
|
||||||
|
for subdir, subdir_id in subdirs:
|
||||||
|
if subdir_id not in visited_dirs:
|
||||||
|
dir_queue.appendleft((depth + 1, subdir))
|
||||||
|
visited_dirs.add(subdir_id)
|
||||||
|
|
||||||
|
# Sort the matched paths for deterministic output
|
||||||
|
for paths in matched_paths.values():
|
||||||
|
paths.sort()
|
||||||
all_matching_paths = [path for paths in matched_paths.values() for path in paths]
|
all_matching_paths = [path for paths in matched_paths.values() for path in paths]
|
||||||
|
|
||||||
# we only dedupe files if we have any complex patterns, since only they can match the same file
|
# We only dedupe files if we have any complex patterns, since only they can match the same file
|
||||||
# multiple times
|
# multiple times
|
||||||
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
|
return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
|
||||||
|
|
||||||
|
@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
|
|||||||
<root>/
|
<root>/
|
||||||
l1-d1/
|
l1-d1/
|
||||||
l2-d1/
|
l2-d1/
|
||||||
l3-s1 -> l1-d2 # points to directory above l2-d1
|
|
||||||
l3-d2/
|
l3-d2/
|
||||||
l4-f1
|
l4-f1
|
||||||
l3-s3 -> l1-d1 # cyclic link
|
|
||||||
l3-d4/
|
l3-d4/
|
||||||
l4-f2
|
l4-f2
|
||||||
|
l3-s1 -> l1-d2 # points to directory above l2-d1
|
||||||
|
l3-s3 -> l1-d1 # cyclic link
|
||||||
l1-d2/
|
l1-d2/
|
||||||
l2-f1
|
|
||||||
l2-d2/
|
l2-d2/
|
||||||
l3-f3
|
l3-f3
|
||||||
|
l2-f1
|
||||||
l2-s3 -> l2-d2
|
l2-s3 -> l2-d2
|
||||||
l1-s3 -> l3-d4 # a link that "skips" a directory level
|
l1-s3 -> l3-d4 # a link that "skips" a directory level
|
||||||
l1-s4 -> l2-s3 # a link to a link to a dir
|
l1-s4 -> l2-s3 # a link to a link to a dir
|
||||||
@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
|
|||||||
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
|
l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
|
||||||
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
|
l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
|
||||||
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
|
l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
|
||||||
l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
|
l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)
|
||||||
|
|
||||||
if use_junctions:
|
if use_junctions:
|
||||||
link_fn = llnl.util.symlink._windows_create_junction
|
link_fn = llnl.util.symlink._windows_create_junction
|
||||||
@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure
|
|||||||
|
|
||||||
def test_multiple_patterns(complex_dir_structure):
|
def test_multiple_patterns(complex_dir_structure):
|
||||||
root, _ = complex_dir_structure
|
root, _ = complex_dir_structure
|
||||||
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
|
paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
|
||||||
# There shouldn't be duplicate results with multiple, overlapping patterns
|
# There shouldn't be duplicate results with multiple, overlapping patterns
|
||||||
assert len(set(paths)) == len(paths)
|
assert len(set(paths)) == len(paths)
|
||||||
# All files should be found
|
# All files should be found
|
||||||
@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):
|
|||||||
|
|
||||||
with pytest.raises(TypeError):
|
with pytest.raises(TypeError):
|
||||||
fs.find(1, "file.txt") # type: ignore
|
fs.find(1, "file.txt") # type: ignore
|
||||||
|
|
||||||
|
|
||||||
def test_find_only_finds_files(tmp_path: pathlib.Path):
|
|
||||||
"""ensure that find only returns files even at max_depth"""
|
|
||||||
(tmp_path / "subdir").mkdir()
|
|
||||||
(tmp_path / "subdir" / "dir").mkdir()
|
|
||||||
(tmp_path / "subdir" / "file.txt").write_text("")
|
|
||||||
assert (
|
|
||||||
fs.find(tmp_path, "*", max_depth=1)
|
|
||||||
== fs.find(tmp_path, "*/*", max_depth=1)
|
|
||||||
== [str(tmp_path / "subdir" / "file.txt")]
|
|
||||||
)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user