filesystem.py find: return directories and improve performance (#47537)
This commit is contained in:
		@@ -1693,11 +1693,11 @@ def find(
 | 
			
		||||
    recursive: bool = True,
 | 
			
		||||
    max_depth: Optional[int] = None,
 | 
			
		||||
) -> List[str]:
 | 
			
		||||
    """Finds all non-directory files matching the patterns from ``files`` starting from ``root``.
 | 
			
		||||
    This function returns a deterministic result for the same input and directory structure when
 | 
			
		||||
    run multiple times. Symlinked directories are followed, and unique directories are searched
 | 
			
		||||
    only once. Each matching file is returned only once at lowest depth in case multiple paths
 | 
			
		||||
    exist due to symlinked directories.
 | 
			
		||||
    """Finds all files matching the patterns from ``files`` starting from ``root``. This function
 | 
			
		||||
    returns a deterministic result for the same input and directory structure when run multiple
 | 
			
		||||
    times. Symlinked directories are followed, and unique directories are searched only once. Each
 | 
			
		||||
    matching file is returned only once at lowest depth in case multiple paths exist due to
 | 
			
		||||
    symlinked directories.
 | 
			
		||||
 | 
			
		||||
    Accepts any glob characters accepted by fnmatch:
 | 
			
		||||
 | 
			
		||||
@@ -1830,54 +1830,58 @@ def _find_max_depth(
 | 
			
		||||
        # Use glob.glob for complex patterns.
 | 
			
		||||
        for pattern_name, pattern in complex_patterns.items():
 | 
			
		||||
            matched_paths[pattern_name].extend(
 | 
			
		||||
                path
 | 
			
		||||
                for path in glob.glob(os.path.join(curr_dir, pattern))
 | 
			
		||||
                if not os.path.isdir(path)
 | 
			
		||||
                path for path in glob.glob(os.path.join(curr_dir, pattern))
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # List of subdirectories by path and (inode, device) tuple
 | 
			
		||||
        subdirs: List[Tuple[str, Tuple[int, int]]] = []
 | 
			
		||||
 | 
			
		||||
        with dir_iter:
 | 
			
		||||
            ordered_entries = sorted(dir_iter, key=lambda x: x.name)
 | 
			
		||||
            for dir_entry in ordered_entries:
 | 
			
		||||
            for dir_entry in dir_iter:
 | 
			
		||||
 | 
			
		||||
                # Match filename only patterns
 | 
			
		||||
                if filename_only_patterns:
 | 
			
		||||
                    m = regex.match(os.path.normcase(dir_entry.name))
 | 
			
		||||
                    if m:
 | 
			
		||||
                        for pattern_name in filename_only_patterns:
 | 
			
		||||
                            if m.group(pattern_name):
 | 
			
		||||
                                matched_paths[pattern_name].append(dir_entry.path)
 | 
			
		||||
                                break
 | 
			
		||||
 | 
			
		||||
                # Collect subdirectories
 | 
			
		||||
                if depth >= max_depth:
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                try:
 | 
			
		||||
                    it_is_a_dir = dir_entry.is_dir(follow_symlinks=True)
 | 
			
		||||
                    if not dir_entry.is_dir(follow_symlinks=True):
 | 
			
		||||
                        continue
 | 
			
		||||
                    if sys.platform == "win32":
 | 
			
		||||
                        # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we have
 | 
			
		||||
                        # to call os.stat
 | 
			
		||||
                        stat_info = os.stat(dir_entry.path, follow_symlinks=True)
 | 
			
		||||
                    else:
 | 
			
		||||
                        stat_info = dir_entry.stat(follow_symlinks=True)
 | 
			
		||||
                except OSError as e:
 | 
			
		||||
                    # Possible permission issue, or a symlink that cannot be resolved (ELOOP).
 | 
			
		||||
                    _log_file_access_issue(e, dir_entry.path)
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                if it_is_a_dir:
 | 
			
		||||
                    if depth >= max_depth:
 | 
			
		||||
                        continue
 | 
			
		||||
                    try:
 | 
			
		||||
                        # The stat should be performed in a try/except block. We repeat that here
 | 
			
		||||
                        # vs. moving to the above block because we only want to call `stat` if we
 | 
			
		||||
                        # haven't exceeded our max_depth
 | 
			
		||||
                        if sys.platform == "win32":
 | 
			
		||||
                            # Note: st_ino/st_dev on DirEntry.stat are not set on Windows, so we
 | 
			
		||||
                            # have to call os.stat
 | 
			
		||||
                            stat_info = os.stat(dir_entry.path, follow_symlinks=True)
 | 
			
		||||
                        else:
 | 
			
		||||
                            stat_info = dir_entry.stat(follow_symlinks=True)
 | 
			
		||||
                    except OSError as e:
 | 
			
		||||
                        _log_file_access_issue(e, dir_entry.path)
 | 
			
		||||
                        continue
 | 
			
		||||
                subdirs.append((dir_entry.path, _file_id(stat_info)))
 | 
			
		||||
 | 
			
		||||
                    dir_id = _file_id(stat_info)
 | 
			
		||||
                    if dir_id not in visited_dirs:
 | 
			
		||||
                        dir_queue.appendleft((depth + 1, dir_entry.path))
 | 
			
		||||
                        visited_dirs.add(dir_id)
 | 
			
		||||
                elif filename_only_patterns:
 | 
			
		||||
                    m = regex.match(os.path.normcase(dir_entry.name))
 | 
			
		||||
                    if not m:
 | 
			
		||||
                        continue
 | 
			
		||||
                    for pattern_name in filename_only_patterns:
 | 
			
		||||
                        if m.group(pattern_name):
 | 
			
		||||
                            matched_paths[pattern_name].append(dir_entry.path)
 | 
			
		||||
                            break
 | 
			
		||||
        # Enqueue subdirectories in a deterministic order
 | 
			
		||||
        if subdirs:
 | 
			
		||||
            subdirs.sort(key=lambda s: os.path.basename(s[0]))
 | 
			
		||||
            for subdir, subdir_id in subdirs:
 | 
			
		||||
                if subdir_id not in visited_dirs:
 | 
			
		||||
                    dir_queue.appendleft((depth + 1, subdir))
 | 
			
		||||
                    visited_dirs.add(subdir_id)
 | 
			
		||||
 | 
			
		||||
    # Sort the matched paths for deterministic output
 | 
			
		||||
    for paths in matched_paths.values():
 | 
			
		||||
        paths.sort()
 | 
			
		||||
    all_matching_paths = [path for paths in matched_paths.values() for path in paths]
 | 
			
		||||
 | 
			
		||||
    # we only dedupe files if we have any complex patterns, since only they can match the same file
 | 
			
		||||
    # We only dedupe files if we have any complex patterns, since only they can match the same file
 | 
			
		||||
    # multiple times
 | 
			
		||||
    return _dedupe_files(all_matching_paths) if complex_patterns else all_matching_paths
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1130,16 +1130,16 @@ def complex_dir_structure(request, tmpdir):
 | 
			
		||||
    <root>/
 | 
			
		||||
        l1-d1/
 | 
			
		||||
            l2-d1/
 | 
			
		||||
                l3-s1 -> l1-d2 # points to directory above l2-d1
 | 
			
		||||
                l3-d2/
 | 
			
		||||
                    l4-f1
 | 
			
		||||
                l3-s3 -> l1-d1 # cyclic link
 | 
			
		||||
                l3-d4/
 | 
			
		||||
                    l4-f2
 | 
			
		||||
                l3-s1 -> l1-d2 # points to directory above l2-d1
 | 
			
		||||
                l3-s3 -> l1-d1 # cyclic link
 | 
			
		||||
        l1-d2/
 | 
			
		||||
            l2-f1
 | 
			
		||||
            l2-d2/
 | 
			
		||||
                l3-f3
 | 
			
		||||
            l2-f1
 | 
			
		||||
            l2-s3 -> l2-d2
 | 
			
		||||
        l1-s3 -> l3-d4 # a link that "skips" a directory level
 | 
			
		||||
        l1-s4 -> l2-s3 # a link to a link to a dir
 | 
			
		||||
@@ -1155,7 +1155,7 @@ def complex_dir_structure(request, tmpdir):
 | 
			
		||||
    l3_d2 = l2_d1.join("l3-d2").ensure(dir=True)
 | 
			
		||||
    l3_d4 = l2_d1.join("l3-d4").ensure(dir=True)
 | 
			
		||||
    l1_d2 = tmpdir.join("l1-d2").ensure(dir=True)
 | 
			
		||||
    l2_d2 = l1_d2.join("l1-d2").ensure(dir=True)
 | 
			
		||||
    l2_d2 = l1_d2.join("l2-d2").ensure(dir=True)
 | 
			
		||||
 | 
			
		||||
    if use_junctions:
 | 
			
		||||
        link_fn = llnl.util.symlink._windows_create_junction
 | 
			
		||||
@@ -1216,7 +1216,7 @@ def test_find_max_depth_multiple_and_repeated_entry_points(complex_dir_structure
 | 
			
		||||
 | 
			
		||||
def test_multiple_patterns(complex_dir_structure):
 | 
			
		||||
    root, _ = complex_dir_structure
 | 
			
		||||
    paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*", "*/*"])
 | 
			
		||||
    paths = fs.find(root, ["l2-f1", "l*-d*/l3-f3", "*-f*", "*/*-f*"])
 | 
			
		||||
    # There shouldn't be duplicate results with multiple, overlapping patterns
 | 
			
		||||
    assert len(set(paths)) == len(paths)
 | 
			
		||||
    # All files should be found
 | 
			
		||||
@@ -1249,15 +1249,3 @@ def test_find_input_types(tmp_path: pathlib.Path):
 | 
			
		||||
 | 
			
		||||
    with pytest.raises(TypeError):
 | 
			
		||||
        fs.find(1, "file.txt")  # type: ignore
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_find_only_finds_files(tmp_path: pathlib.Path):
 | 
			
		||||
    """ensure that find only returns files even at max_depth"""
 | 
			
		||||
    (tmp_path / "subdir").mkdir()
 | 
			
		||||
    (tmp_path / "subdir" / "dir").mkdir()
 | 
			
		||||
    (tmp_path / "subdir" / "file.txt").write_text("")
 | 
			
		||||
    assert (
 | 
			
		||||
        fs.find(tmp_path, "*", max_depth=1)
 | 
			
		||||
        == fs.find(tmp_path, "*/*", max_depth=1)
 | 
			
		||||
        == [str(tmp_path / "subdir" / "file.txt")]
 | 
			
		||||
    )
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user