Relocation regex single pass (#33496)
Instead of looping over multiple regexes and the entire text file contents, create a giant regex with all literal prefixes and do a single pass over files to detect prefixes. Not only is a single pass faster, it's also likely that the regex is compiled better, given that most prefixes share a common ... prefix.
This commit is contained in:
		@@ -42,7 +42,7 @@
 | 
				
			|||||||
import spack.util.url as url_util
 | 
					import spack.util.url as url_util
 | 
				
			||||||
import spack.util.web as web_util
 | 
					import spack.util.web as web_util
 | 
				
			||||||
from spack.caches import misc_cache_location
 | 
					from spack.caches import misc_cache_location
 | 
				
			||||||
from spack.relocate import utf8_path_to_binary_regex
 | 
					from spack.relocate import utf8_paths_to_single_binary_regex
 | 
				
			||||||
from spack.spec import Spec
 | 
					from spack.spec import Spec
 | 
				
			||||||
from spack.stage import Stage
 | 
					from spack.stage import Stage
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -694,13 +694,10 @@ def before_visit_symlinked_dir(self, root, rel_path, depth):
 | 
				
			|||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def file_matches_any_binary_regex(path, regexes):
 | 
					def file_matches(path, regex):
 | 
				
			||||||
    with open(path, "rb") as f:
 | 
					    with open(path, "rb") as f:
 | 
				
			||||||
        contents = f.read()
 | 
					        contents = f.read()
 | 
				
			||||||
    for regex in regexes:
 | 
					    return bool(regex.search(contents))
 | 
				
			||||||
        if regex.search(contents):
 | 
					 | 
				
			||||||
            return True
 | 
					 | 
				
			||||||
    return False
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def get_buildfile_manifest(spec):
 | 
					def get_buildfile_manifest(spec):
 | 
				
			||||||
@@ -734,8 +731,8 @@ def get_buildfile_manifest(spec):
 | 
				
			|||||||
    prefixes.append(spack.hooks.sbang.sbang_install_path())
 | 
					    prefixes.append(spack.hooks.sbang.sbang_install_path())
 | 
				
			||||||
    prefixes.append(str(spack.store.layout.root))
 | 
					    prefixes.append(str(spack.store.layout.root))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Create a list regexes matching collected prefixes
 | 
					    # Create a giant regex that matches all prefixes
 | 
				
			||||||
    compiled_prefixes = [utf8_path_to_binary_regex(prefix) for prefix in prefixes]
 | 
					    regex = utf8_paths_to_single_binary_regex(prefixes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Symlinks.
 | 
					    # Symlinks.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -767,10 +764,9 @@ def get_buildfile_manifest(spec):
 | 
				
			|||||||
                data["binary_to_relocate_fullpath"].append(abs_path)
 | 
					                data["binary_to_relocate_fullpath"].append(abs_path)
 | 
				
			||||||
                continue
 | 
					                continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        elif relocate.needs_text_relocation(m_type, m_subtype):
 | 
					        elif relocate.needs_text_relocation(m_type, m_subtype) and file_matches(abs_path, regex):
 | 
				
			||||||
            if file_matches_any_binary_regex(abs_path, compiled_prefixes):
 | 
					            data["text_to_relocate"].append(rel_path)
 | 
				
			||||||
                data["text_to_relocate"].append(rel_path)
 | 
					            continue
 | 
				
			||||||
                continue
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data["other"].append(abs_path)
 | 
					        data["other"].append(abs_path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -747,9 +747,13 @@ def relocate_links(links, orig_layout_root, orig_install_prefix, new_install_pre
 | 
				
			|||||||
def utf8_path_to_binary_regex(prefix):
 | 
					def utf8_path_to_binary_regex(prefix):
 | 
				
			||||||
    """Create a (binary) regex that matches the input path in utf8"""
 | 
					    """Create a (binary) regex that matches the input path in utf8"""
 | 
				
			||||||
    prefix_bytes = re.escape(prefix).encode("utf-8")
 | 
					    prefix_bytes = re.escape(prefix).encode("utf-8")
 | 
				
			||||||
    prefix_rexp = re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
 | 
					    return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return prefix_rexp
 | 
					
 | 
				
			||||||
 | 
					def utf8_paths_to_single_binary_regex(prefixes):
 | 
				
			||||||
 | 
					    """Create a (binary) regex that matches any input path in utf8"""
 | 
				
			||||||
 | 
					    all_prefixes = b"|".join(re.escape(prefix).encode("utf-8") for prefix in prefixes)
 | 
				
			||||||
 | 
					    return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)(%s)([\\w\\-_/]*)" % all_prefixes)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def unsafe_relocate_text(files, prefixes, concurrency=32):
 | 
					def unsafe_relocate_text(files, prefixes, concurrency=32):
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -20,7 +20,7 @@
 | 
				
			|||||||
import spack.store
 | 
					import spack.store
 | 
				
			||||||
import spack.tengine
 | 
					import spack.tengine
 | 
				
			||||||
import spack.util.executable
 | 
					import spack.util.executable
 | 
				
			||||||
from spack.relocate import utf8_path_to_binary_regex
 | 
					from spack.relocate import utf8_path_to_binary_regex, utf8_paths_to_single_binary_regex
 | 
				
			||||||
 | 
					
 | 
				
			||||||
pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows")
 | 
					pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -480,9 +480,24 @@ def test_fixup_macos_rpaths(make_dylib, make_object_file):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_text_relocation_regex_is_safe():
 | 
					def test_text_relocation_regex_is_safe():
 | 
				
			||||||
    assert (
 | 
					    # Test whether prefix regex is properly escaped
 | 
				
			||||||
        utf8_path_to_binary_regex("/[a-z]/")
 | 
					    string = b"This does not match /a/, but this does: /[a-z]/."
 | 
				
			||||||
        .search(b"This does not match /a/, but this does: /[a-z]/.")
 | 
					    assert utf8_path_to_binary_regex("/[a-z]/").search(string).group(0) == b"/[a-z]/"
 | 
				
			||||||
        .group(0)
 | 
					
 | 
				
			||||||
        == b"/[a-z]/"
 | 
					
 | 
				
			||||||
    )
 | 
					def test_utf8_paths_to_single_binary_regex():
 | 
				
			||||||
 | 
					    regex = utf8_paths_to_single_binary_regex(["/first/path", "/second/path", "/safe/[a-z]"])
 | 
				
			||||||
 | 
					    # Match nothing
 | 
				
			||||||
 | 
					    assert not regex.search(b"text /neither/first/path text /the/second/path text")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Match first
 | 
				
			||||||
 | 
					    string = b"contains both /first/path/subdir and /second/path/sub"
 | 
				
			||||||
 | 
					    assert regex.search(string).group(0) == b"/first/path/subdir"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Match second
 | 
				
			||||||
 | 
					    string = b"contains both /not/first/path/subdir but /second/path/subdir"
 | 
				
			||||||
 | 
					    assert regex.search(string).group(0) == b"/second/path/subdir"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # Match "unsafe" dir name
 | 
				
			||||||
 | 
					    string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
 | 
				
			||||||
 | 
					    assert regex.search(string).group(0) == b"/safe/[a-z]/file"
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user