Relocation regex single pass (#33496)
Instead of looping over multiple regexes and the entire text file contents, create a giant regex with all literal prefixes and do a single pass over files to detect prefixes. Not only is a single pass faster, it's also likely that the regex is compiled better, given that most prefixes share a common ... prefix.
This commit is contained in:
parent
a2520e80c0
commit
b92bdf8c72
@ -42,7 +42,7 @@
|
|||||||
import spack.util.url as url_util
|
import spack.util.url as url_util
|
||||||
import spack.util.web as web_util
|
import spack.util.web as web_util
|
||||||
from spack.caches import misc_cache_location
|
from spack.caches import misc_cache_location
|
||||||
from spack.relocate import utf8_path_to_binary_regex
|
from spack.relocate import utf8_paths_to_single_binary_regex
|
||||||
from spack.spec import Spec
|
from spack.spec import Spec
|
||||||
from spack.stage import Stage
|
from spack.stage import Stage
|
||||||
|
|
||||||
@ -694,13 +694,10 @@ def before_visit_symlinked_dir(self, root, rel_path, depth):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def file_matches_any_binary_regex(path, regexes):
|
def file_matches(path, regex):
|
||||||
with open(path, "rb") as f:
|
with open(path, "rb") as f:
|
||||||
contents = f.read()
|
contents = f.read()
|
||||||
for regex in regexes:
|
return bool(regex.search(contents))
|
||||||
if regex.search(contents):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_buildfile_manifest(spec):
|
def get_buildfile_manifest(spec):
|
||||||
@ -734,8 +731,8 @@ def get_buildfile_manifest(spec):
|
|||||||
prefixes.append(spack.hooks.sbang.sbang_install_path())
|
prefixes.append(spack.hooks.sbang.sbang_install_path())
|
||||||
prefixes.append(str(spack.store.layout.root))
|
prefixes.append(str(spack.store.layout.root))
|
||||||
|
|
||||||
# Create a list regexes matching collected prefixes
|
# Create a giant regex that matches all prefixes
|
||||||
compiled_prefixes = [utf8_path_to_binary_regex(prefix) for prefix in prefixes]
|
regex = utf8_paths_to_single_binary_regex(prefixes)
|
||||||
|
|
||||||
# Symlinks.
|
# Symlinks.
|
||||||
|
|
||||||
@ -767,10 +764,9 @@ def get_buildfile_manifest(spec):
|
|||||||
data["binary_to_relocate_fullpath"].append(abs_path)
|
data["binary_to_relocate_fullpath"].append(abs_path)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
elif relocate.needs_text_relocation(m_type, m_subtype):
|
elif relocate.needs_text_relocation(m_type, m_subtype) and file_matches(abs_path, regex):
|
||||||
if file_matches_any_binary_regex(abs_path, compiled_prefixes):
|
data["text_to_relocate"].append(rel_path)
|
||||||
data["text_to_relocate"].append(rel_path)
|
continue
|
||||||
continue
|
|
||||||
|
|
||||||
data["other"].append(abs_path)
|
data["other"].append(abs_path)
|
||||||
|
|
||||||
|
@ -747,9 +747,13 @@ def relocate_links(links, orig_layout_root, orig_install_prefix, new_install_pre
|
|||||||
def utf8_path_to_binary_regex(prefix):
|
def utf8_path_to_binary_regex(prefix):
|
||||||
"""Create a (binary) regex that matches the input path in utf8"""
|
"""Create a (binary) regex that matches the input path in utf8"""
|
||||||
prefix_bytes = re.escape(prefix).encode("utf-8")
|
prefix_bytes = re.escape(prefix).encode("utf-8")
|
||||||
prefix_rexp = re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
|
return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)%s([\\w\\-_/]*)" % prefix_bytes)
|
||||||
|
|
||||||
return prefix_rexp
|
|
||||||
|
def utf8_paths_to_single_binary_regex(prefixes):
|
||||||
|
"""Create a (binary) regex that matches any input path in utf8"""
|
||||||
|
all_prefixes = b"|".join(re.escape(prefix).encode("utf-8") for prefix in prefixes)
|
||||||
|
return re.compile(b"(?<![\\w\\-_/])([\\w\\-_]*?)(%s)([\\w\\-_/]*)" % all_prefixes)
|
||||||
|
|
||||||
|
|
||||||
def unsafe_relocate_text(files, prefixes, concurrency=32):
|
def unsafe_relocate_text(files, prefixes, concurrency=32):
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
import spack.store
|
import spack.store
|
||||||
import spack.tengine
|
import spack.tengine
|
||||||
import spack.util.executable
|
import spack.util.executable
|
||||||
from spack.relocate import utf8_path_to_binary_regex
|
from spack.relocate import utf8_path_to_binary_regex, utf8_paths_to_single_binary_regex
|
||||||
|
|
||||||
pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows")
|
pytestmark = pytest.mark.skipif(sys.platform == "win32", reason="Tests fail on Windows")
|
||||||
|
|
||||||
@ -480,9 +480,24 @@ def test_fixup_macos_rpaths(make_dylib, make_object_file):
|
|||||||
|
|
||||||
|
|
||||||
def test_text_relocation_regex_is_safe():
|
def test_text_relocation_regex_is_safe():
|
||||||
assert (
|
# Test whether prefix regex is properly escaped
|
||||||
utf8_path_to_binary_regex("/[a-z]/")
|
string = b"This does not match /a/, but this does: /[a-z]/."
|
||||||
.search(b"This does not match /a/, but this does: /[a-z]/.")
|
assert utf8_path_to_binary_regex("/[a-z]/").search(string).group(0) == b"/[a-z]/"
|
||||||
.group(0)
|
|
||||||
== b"/[a-z]/"
|
|
||||||
)
|
def test_utf8_paths_to_single_binary_regex():
|
||||||
|
regex = utf8_paths_to_single_binary_regex(["/first/path", "/second/path", "/safe/[a-z]"])
|
||||||
|
# Match nothing
|
||||||
|
assert not regex.search(b"text /neither/first/path text /the/second/path text")
|
||||||
|
|
||||||
|
# Match first
|
||||||
|
string = b"contains both /first/path/subdir and /second/path/sub"
|
||||||
|
assert regex.search(string).group(0) == b"/first/path/subdir"
|
||||||
|
|
||||||
|
# Match second
|
||||||
|
string = b"contains both /not/first/path/subdir but /second/path/subdir"
|
||||||
|
assert regex.search(string).group(0) == b"/second/path/subdir"
|
||||||
|
|
||||||
|
# Match "unsafe" dir name
|
||||||
|
string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
|
||||||
|
assert regex.search(string).group(0) == b"/safe/[a-z]/file"
|
||||||
|
Loading…
Reference in New Issue
Block a user