Relocation regex single pass (#33496)
Instead of looping over multiple regexes and the entire text file contents, create a giant regex with all literal prefixes and do a single pass over files to detect prefixes. Not only is a single pass faster, it's also likely that the regex is compiled better, given that most prefixes share a common ... prefix.
This commit is contained in:
parent
a2520e80c0
commit
b92bdf8c72
@ -42,7 +42,7 @@
|
||||
import spack.util.url as url_util
|
||||
import spack.util.web as web_util
|
||||
from spack.caches import misc_cache_location
|
||||
from spack.relocate import utf8_path_to_binary_regex
|
||||
from spack.relocate import utf8_paths_to_single_binary_regex
|
||||
from spack.spec import Spec
|
||||
from spack.stage import Stage
|
||||
|
||||
@ -694,13 +694,10 @@ def before_visit_symlinked_dir(self, root, rel_path, depth):
|
||||
return False
|
||||
|
||||
|
||||
def file_matches_any_binary_regex(path, regexes):
|
||||
def file_matches(path, regex):
|
||||
with open(path, "rb") as f:
|
||||
contents = f.read()
|
||||
for regex in regexes:
|
||||
if regex.search(contents):
|
||||
return True
|
||||
return False
|
||||
return bool(regex.search(contents))
|
||||
|
||||
|
||||
def get_buildfile_manifest(spec):
|
||||
@ -734,8 +731,8 @@ def get_buildfile_manifest(spec):
|
||||
prefixes.append(spack.hooks.sbang.sbang_install_path())
|
||||
prefixes.append(str(spack.store.layout.root))
|
||||
|
||||
# Create a list regexes matching collected prefixes
|
||||
compiled_prefixes = [utf8_path_to_binary_regex(prefix) for prefix in prefixes]
|
||||
# Create a giant regex that matches all prefixes
|
||||
regex = utf8_paths_to_single_binary_regex(prefixes)
|
||||
|
||||
# Symlinks.
|
||||
|
||||
@ -767,10 +764,9 @@ def get_buildfile_manifest(spec):
|
||||
data["binary_to_relocate_fullpath"].append(abs_path)
|
||||
continue
|
||||
|
||||
elif relocate.needs_text_relocation(m_type, m_subtype):
|
||||
if file_matches_any_binary_regex(abs_path, compiled_prefixes):
|
||||
data["text_to_relocate"].append(rel_path)
|
||||
continue
|
||||
elif relocate.needs_text_relocation(m_type, m_subtype) and file_matches(abs_path, regex):
|
||||
data["text_to_relocate"].append(rel_path)
|
||||
continue
|
||||
|
||||
data["other"].append(abs_path)
|
||||
|
||||
|
@ -747,9 +747,13 @@ def relocate_links(links, orig_layout_root, orig_install_prefix, new_install_pre
|
||||
def utf8_path_to_binary_regex(prefix):
|
||||
"""Create a (binary) regex that matches the input path in utf8"""
|
||||
prefix_bytes = re.escape(prefix).encode("utf-8")
|
||||
prefix_rexp = re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
|
||||
return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
|
||||
|
||||
return prefix_rexp
|
||||
|
||||
def utf8_paths_to_single_binary_regex(prefixes):
|
||||
"""Create a (binary) regex that matches any input path in utf8"""
|
||||
all_prefixes = b"|".join(re.escape(prefix).encode("utf-8") for prefix in prefixes)
|
||||
return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)(%s)([\\w\\-_/]*)" % all_prefixes)
|
||||
|
||||
|
||||
def unsafe_relocate_text(files, prefixes, concurrency=32):
|
||||
|
@ -20,7 +20,7 @@
|
||||
import spack.store
|
||||
import spack.tengine
|
||||
import spack.util.executable
|
||||
from spack.relocate import utf8_path_to_binary_regex
|
||||
from spack.relocate import utf8_path_to_binary_regex, utf8_paths_to_single_binary_regex
|
||||
|
||||
pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows")
|
||||
|
||||
@ -480,9 +480,24 @@ def test_fixup_macos_rpaths(make_dylib, make_object_file):
|
||||
|
||||
|
||||
def test_text_relocation_regex_is_safe():
|
||||
assert (
|
||||
utf8_path_to_binary_regex("/[a-z]/")
|
||||
.search(b"This does not match /a/, but this does: /[a-z]/.")
|
||||
.group(0)
|
||||
== b"/[a-z]/"
|
||||
)
|
||||
# Test whether prefix regex is properly escaped
|
||||
string = b"This does not match /a/, but this does: /[a-z]/."
|
||||
assert utf8_path_to_binary_regex("/[a-z]/").search(string).group(0) == b"/[a-z]/"
|
||||
|
||||
|
||||
def test_utf8_paths_to_single_binary_regex():
|
||||
regex = utf8_paths_to_single_binary_regex(["/first/path", "/second/path", "/safe/[a-z]"])
|
||||
# Match nothing
|
||||
assert not regex.search(b"text /neither/first/path text /the/second/path text")
|
||||
|
||||
# Match first
|
||||
string = b"contains both /first/path/subdir and /second/path/sub"
|
||||
assert regex.search(string).group(0) == b"/first/path/subdir"
|
||||
|
||||
# Match second
|
||||
string = b"contains both /not/first/path/subdir but /second/path/subdir"
|
||||
assert regex.search(string).group(0) == b"/second/path/subdir"
|
||||
|
||||
# Match "unsafe" dir name
|
||||
string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
|
||||
assert regex.search(string).group(0) == b"/safe/[a-z]/file"
|
||||
|
Loading…
Reference in New Issue
Block a user