_replace_prefix_bin performance improvements (#33590)
- single pass over the binary data matching all prefixes - collect offsets and replacement strings - do in-place updates with `fseek` / `fwrite`, since typically our replacement touch O(few bytes) while the file is O(many megabytes) - be nice: leave the file untouched if some string can't be replaced
This commit is contained in:
		@@ -467,40 +467,35 @@ def _replace_prefix_bin(filename, byte_prefixes):
 | 
			
		||||
    """Replace all the occurrences of the old install prefix with a
 | 
			
		||||
    new install prefix in binary files.
 | 
			
		||||
 | 
			
		||||
    The new install prefix is prefixed with ``os.sep`` until the
 | 
			
		||||
    The new install prefix is prefixed with ``b'/'`` until the
 | 
			
		||||
    lengths of the prefixes are the same.
 | 
			
		||||
 | 
			
		||||
    Args:
 | 
			
		||||
        filename (str): target binary file
 | 
			
		||||
        byte_prefixes (OrderedDict): OrderedDictionary where the keys are
 | 
			
		||||
        precompiled regex of the old prefixes and the values are the new
 | 
			
		||||
        prefixes (uft-8 encoded)
 | 
			
		||||
        binary strings of the old prefixes and the values are the new
 | 
			
		||||
        binary prefixes
 | 
			
		||||
    """
 | 
			
		||||
    all_prefixes = re.compile(b"|".join(re.escape(prefix) for prefix in byte_prefixes.keys()))
 | 
			
		||||
 | 
			
		||||
    def padded_replacement(old):
 | 
			
		||||
        new = byte_prefixes[old]
 | 
			
		||||
        pad = len(old) - len(new)
 | 
			
		||||
        if pad < 0:
 | 
			
		||||
            raise BinaryTextReplaceError(old, new)
 | 
			
		||||
        return new + b"/" * pad
 | 
			
		||||
 | 
			
		||||
    with open(filename, "rb+") as f:
 | 
			
		||||
        data = f.read()
 | 
			
		||||
        f.seek(0)
 | 
			
		||||
        for orig_bytes, new_bytes in byte_prefixes.items():
 | 
			
		||||
            original_data_len = len(data)
 | 
			
		||||
            # Skip this hassle if not found
 | 
			
		||||
            if orig_bytes not in data:
 | 
			
		||||
                continue
 | 
			
		||||
            # We only care about this problem if we are about to replace
 | 
			
		||||
            length_compatible = len(new_bytes) <= len(orig_bytes)
 | 
			
		||||
            if not length_compatible:
 | 
			
		||||
                tty.debug("Binary failing to relocate is %s" % filename)
 | 
			
		||||
                raise BinaryTextReplaceError(orig_bytes, new_bytes)
 | 
			
		||||
            pad_length = len(orig_bytes) - len(new_bytes)
 | 
			
		||||
            padding = os.sep * pad_length
 | 
			
		||||
            padding = padding.encode("utf-8")
 | 
			
		||||
            data = data.replace(orig_bytes, new_bytes + padding)
 | 
			
		||||
            # Really needs to be the same length
 | 
			
		||||
            if not len(data) == original_data_len:
 | 
			
		||||
                print("Length of pad:", pad_length, "should be", len(padding))
 | 
			
		||||
                print(new_bytes, "was to replace", orig_bytes)
 | 
			
		||||
                raise BinaryStringReplacementError(filename, original_data_len, len(data))
 | 
			
		||||
        f.write(data)
 | 
			
		||||
        f.truncate()
 | 
			
		||||
        # Register what replacement string to put on what offsets in the file.
 | 
			
		||||
        replacements_at_offset = [
 | 
			
		||||
            (padded_replacement(m.group(0)), m.start())
 | 
			
		||||
            for m in re.finditer(all_prefixes, f.read())
 | 
			
		||||
        ]
 | 
			
		||||
 | 
			
		||||
        # Apply the replacements
 | 
			
		||||
        for replacement, offset in replacements_at_offset:
 | 
			
		||||
            f.seek(offset)
 | 
			
		||||
            f.write(replacement)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def relocate_macho_binaries(
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,7 @@
 | 
			
		||||
import re
 | 
			
		||||
import shutil
 | 
			
		||||
import sys
 | 
			
		||||
from collections import OrderedDict
 | 
			
		||||
 | 
			
		||||
import pytest
 | 
			
		||||
 | 
			
		||||
@@ -501,3 +502,32 @@ def test_utf8_paths_to_single_binary_regex():
 | 
			
		||||
    # Match "unsafe" dir name
 | 
			
		||||
    string = b"don't match /safe/a/path but do match /safe/[a-z]/file"
 | 
			
		||||
    assert regex.search(string).group(0) == b"/safe/[a-z]/file"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_ordered_replacement(tmpdir):
 | 
			
		||||
    # This tests whether binary text replacement respects order, so that
 | 
			
		||||
    # a long package prefix is replaced before a shorter sub-prefix like
 | 
			
		||||
    # the root of the spack store (as a fallback).
 | 
			
		||||
    def replace_and_expect(prefix_map, before, after):
 | 
			
		||||
        file = str(tmpdir.join("file"))
 | 
			
		||||
        with open(file, "wb") as f:
 | 
			
		||||
            f.write(before)
 | 
			
		||||
        spack.relocate._replace_prefix_bin(file, prefix_map)
 | 
			
		||||
        with open(file, "rb") as f:
 | 
			
		||||
            assert f.read() == after
 | 
			
		||||
 | 
			
		||||
    replace_and_expect(
 | 
			
		||||
        OrderedDict(
 | 
			
		||||
            [(b"/old-spack/opt/specific-package", b"/first"), (b"/old-spack/opt", b"/second")]
 | 
			
		||||
        ),
 | 
			
		||||
        b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
 | 
			
		||||
        b"Binary with /first///////////////////////// and /second///////",
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    replace_and_expect(
 | 
			
		||||
        OrderedDict(
 | 
			
		||||
            [(b"/old-spack/opt", b"/second"), (b"/old-spack/opt/specific-package", b"/first")]
 | 
			
		||||
        ),
 | 
			
		||||
        b"Binary with /old-spack/opt/specific-package and /old-spack/opt",
 | 
			
		||||
        b"Binary with /second////////specific-package and /second///////",
 | 
			
		||||
    )
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user