From 719d31682f1df0ca990b36e4ba2a2ab7ea68d90b Mon Sep 17 00:00:00 2001 From: Greg Becker Date: Thu, 2 Jun 2022 08:17:21 +0200 Subject: [PATCH] speed up relocation using memoized offsets refactor to do scanning in a single pass parallelize new relocate method with threadpool relocate_by_offsets can recompute if offsets not memoized --- lib/spack/spack/binary_distribution.py | 56 ++++--- lib/spack/spack/filesystem_view.py | 5 +- lib/spack/spack/relocate.py | 201 ++++++++++++++----------- lib/spack/spack/rewiring.py | 6 +- lib/spack/spack/test/relocate.py | 42 ++++-- 5 files changed, 186 insertions(+), 124 deletions(-) diff --git a/lib/spack/spack/binary_distribution.py b/lib/spack/spack/binary_distribution.py index a0b0e75b830..e2970ec55ab 100644 --- a/lib/spack/spack/binary_distribution.py +++ b/lib/spack/spack/binary_distribution.py @@ -624,10 +624,14 @@ def get_buildfile_manifest(spec): """ data = {"text_to_relocate": [], "binary_to_relocate": [], "link_to_relocate": [], "other": [], - "binary_to_relocate_fullpath": []} + "binary_to_relocate_fullpath": [], "offsets": {}} blacklist = (".spack", "man") + # Get all the paths we will want to relocate in binaries + paths_to_relocate = [s.prefix for s in spec.traverse(root=True)] + paths_to_relocate.append(spack.store.layout.root) + # Do this at during tarball creation to save time when tarball unpacked. # Used by make_package_relative to determine binaries to change. for root, dirs, files in os.walk(spec.prefix, topdown=True): @@ -662,6 +666,11 @@ def get_buildfile_manifest(spec): (m_subtype in ('x-mach-binary') and sys.platform == 'darwin') or (not filename.endswith('.o'))): + + # Last path to relocate is the layout root, which is a substring + # of the others + indices = relocate.compute_indices(path_name, paths_to_relocate) + data['offsets'][rel_path_name] = indices data['binary_to_relocate'].append(rel_path_name) data['binary_to_relocate_fullpath'].append(path_name) added = True @@ -700,6 +709,7 @@ def write_buildinfo_file(spec, workdir, rel=False): buildinfo['relocate_binaries'] = manifest['binary_to_relocate'] buildinfo['relocate_links'] = manifest['link_to_relocate'] buildinfo['prefix_to_hash'] = prefix_to_hash + buildinfo['offsets'] = manifest['offsets'] filename = buildinfo_file_name(workdir) with open(filename, 'w') as outfile: outfile.write(syaml.dump(buildinfo, default_flow_style=True)) @@ -1473,11 +1483,25 @@ def is_backup_file(file): # If we are not installing back to the same install tree do the relocation if old_prefix != new_prefix: - files_to_relocate = [os.path.join(workdir, filename) - for filename in buildinfo.get('relocate_binaries') - ] + # Relocate links to the new install prefix + links = [link for link in buildinfo.get('relocate_links', [])] + relocate.relocate_links( + links, old_layout_root, old_prefix, new_prefix + ) + + # For all buildcaches + # relocate the install prefixes in text files including dependencies + relocate.relocate_text(text_names, prefix_to_prefix_text) + # If the buildcache was not created with relativized rpaths - # do the relocation of path in binaries + # do the relocation of rpaths in binaries + # TODO: Is this necessary? How are null-terminated strings handled + # in the rpath header? + files_to_relocate = [ + os.path.join(workdir, filename) + for filename in buildinfo.get('relocate_binaries') + ] + platform = spack.platforms.by_name(spec.platform) if 'macho' in platform.binary_formats: relocate.relocate_macho_binaries(files_to_relocate, @@ -1493,25 +1517,11 @@ def is_backup_file(file): prefix_to_prefix_bin, rel, old_prefix, new_prefix) - # Relocate links to the new install prefix - links = [link for link in buildinfo.get('relocate_links', [])] - relocate.relocate_links( - links, old_layout_root, old_prefix, new_prefix - ) - # For all buildcaches - # relocate the install prefixes in text files including dependencies - relocate.relocate_text(text_names, prefix_to_prefix_text) - - paths_to_relocate = [old_prefix, old_layout_root] - paths_to_relocate.extend(prefix_to_hash.keys()) - files_to_relocate = list(filter( - lambda pathname: not relocate.file_is_relocatable( - pathname, paths_to_relocate=paths_to_relocate), - map(lambda filename: os.path.join(workdir, filename), - buildinfo['relocate_binaries']))) - # relocate the install prefixes in binary files including dependencies - relocate.relocate_text_bin(files_to_relocate, prefix_to_prefix_bin) + # If offsets is None, we will recompute offsets when needed + offsets = buildinfo.get('offsets', None) + relocate.relocate_text_bin( + files_to_relocate, prefix_to_prefix_bin, offsets, workdir) # If we are installing back to the same location # relocate the sbang location if the spack directory changed diff --git a/lib/spack/spack/filesystem_view.py b/lib/spack/spack/filesystem_view.py index 1a4a492d2fb..e7d8145eaee 100644 --- a/lib/spack/spack/filesystem_view.py +++ b/lib/spack/spack/filesystem_view.py @@ -95,10 +95,7 @@ def view_copy(src, dst, view, spec=None): view.get_projection_for_spec(dep) if spack.relocate.is_binary(dst): - spack.relocate.relocate_text_bin( - binaries=[dst], - prefixes=prefix_to_projection - ) + spack.relocate.relocate_text_bin([dst], prefix_to_projection) else: prefix_to_projection[spack.store.layout.root] = view._root prefix_to_projection[orig_sbang] = new_sbang diff --git a/lib/spack/spack/relocate.py b/lib/spack/spack/relocate.py index bbb5e8025a2..3f70a03e257 100644 --- a/lib/spack/spack/relocate.py +++ b/lib/spack/spack/relocate.py @@ -469,47 +469,6 @@ def _replace_prefix_text(filename, compiled_prefixes): f.truncate() -def _replace_prefix_bin(filename, byte_prefixes): - """Replace all the occurrences of the old install prefix with a - new install prefix in binary files. - - The new install prefix is prefixed with ``os.sep`` until the - lengths of the prefixes are the same. - - Args: - filename (str): target binary file - byte_prefixes (OrderedDict): OrderedDictionary where the keys are - precompiled regex of the old prefixes and the values are the new - prefixes (uft-8 encoded) - """ - - with open(filename, 'rb+') as f: - data = f.read() - f.seek(0) - for orig_bytes, new_bytes in byte_prefixes.items(): - original_data_len = len(data) - # Skip this hassle if not found - if orig_bytes not in data: - continue - # We only care about this problem if we are about to replace - length_compatible = len(new_bytes) <= len(orig_bytes) - if not length_compatible: - tty.debug('Binary failing to relocate is %s' % filename) - raise BinaryTextReplaceError(orig_bytes, new_bytes) - pad_length = len(orig_bytes) - len(new_bytes) - padding = os.sep * pad_length - padding = padding.encode('utf-8') - data = data.replace(orig_bytes, new_bytes + padding) - # Really needs to be the same length - if not len(data) == original_data_len: - print('Length of pad:', pad_length, 'should be', len(padding)) - print(new_bytes, 'was to replace', orig_bytes) - raise BinaryStringReplacementError( - filename, original_data_len, len(data)) - f.write(data) - f.truncate() - - def relocate_macho_binaries(path_names, old_layout_root, new_layout_root, prefix_to_prefix, rel, old_prefix, new_prefix): """ @@ -817,49 +776,6 @@ def relocate_text(files, prefixes, concurrency=32): tp.join() -def relocate_text_bin(binaries, prefixes, concurrency=32): - """Replace null terminated path strings hard coded into binaries. - - The new install prefix must be shorter than the original one. - - Args: - binaries (list): binaries to be relocated - prefixes (OrderedDict): String prefixes which need to be changed. - concurrency (int): Desired degree of parallelism. - - Raises: - BinaryTextReplaceError: when the new path is longer than the old path - """ - byte_prefixes = collections.OrderedDict({}) - - for orig_prefix, new_prefix in prefixes.items(): - if orig_prefix != new_prefix: - if isinstance(orig_prefix, bytes): - orig_bytes = orig_prefix - else: - orig_bytes = orig_prefix.encode('utf-8') - if isinstance(new_prefix, bytes): - new_bytes = new_prefix - else: - new_bytes = new_prefix.encode('utf-8') - byte_prefixes[orig_bytes] = new_bytes - - # Do relocations on text in binaries that refers to the install tree - # multiprocesing.ThreadPool.map requires single argument - args = [] - - for binary in binaries: - args.append((binary, byte_prefixes)) - - tp = multiprocessing.pool.ThreadPool(processes=concurrency) - - try: - tp.map(llnl.util.lang.star(_replace_prefix_bin), args) - finally: - tp.terminate() - tp.join() - - def is_relocatable(spec): """Returns True if an installed spec is relocatable. @@ -1126,3 +1042,120 @@ def fixup_macos_rpaths(spec): )) else: tty.debug('No rpath fixup needed for ' + specname) + + +def compute_indices(filename, paths_to_relocate): + """ + Compute the indices in filename at which each of paths_to_relocate occurs. + + Arguments: + filename (str): file to compute indices for + paths_to_relocate (List[str]): paths to find indices of + Returns: + Dict + """ + with open(filename, 'rb') as f: + contents = f.read() + + substring_prefix = os.path.commonprefix(paths_to_relocate).encode('utf-8') + + indices = {} + index = 0 + max_length = max(len(path) for path in paths_to_relocate) + while True: + try: + # We search for the smallest substring of all paths we relocate + # In practice, this is the spack install root, and we relocate + # prefixes in the root and the root itself + index = contents.index(substring_prefix, index) + except ValueError: + # The string isn't found in the rest of the binary + break + else: + # only copy the smallest portion of the binary for comparisons + substring_to_check = contents[index:index + max_length] + for path in paths_to_relocate: + # We guarantee any substring in the list comes after any superstring + p = path.encode('utf-8') + if substring_to_check.startswith(p): + indices[index] = str(path) + index += len(path) + break + else: + index += 1 + return indices + + +def _relocate_binary_text(filename, offsets, prefix_to_prefix): + """ + Relocate the text of a single binary file, given the offsets at which the + replacements need to be made + + Arguments: + filename (str): file to modify + offsets (Dict[int, str]): locations of the strings to replace + prefix_to_prefix (Dict[str, str]): strings to replace and their replacements + """ + with open(filename, 'rb+') as f: + for index, prefix in offsets.items(): + replacement = prefix_to_prefix[prefix].encode('utf-8') + if len(replacement) > len(prefix): + raise BinaryTextReplaceError(prefix, replacement) + + # read forward until we find the end of the string including + # the prefix and compute the replacement as we go + f.seek(index + len(prefix)) + c = f.read(1) + while c not in (None, b'\x00'): + replacement += c + c = f.read(1) + + # seek back to the index position and write the replacement in + # and add null-terminator + f.seek(index) + f.write(replacement) + f.write(b'\x00') + + +def relocate_text_bin( + files_to_relocate, prefix_to_prefix, offsets=None, + relative_root=None, concurrency=32 +): + """ + For each file given, replace all keys in the given translation dict with + the associated values. Optionally executes using precomputed memoized offsets + for the substitutions. + + Arguments: + files_to_relocate (List[str]): The files to modify + prefix_to_prefix (Dict[str, str]): keys are strings to replace, values are + replacements + offsets (Dict[str, Dict[int, str]): (optional) Mapping from relative filenames to + a mapping from indices to strings to replace found at each index + relative_root (str): (optional) prefix for relative paths in offsets + """ + # defaults to the common prefix of all input files + rel_root = relative_root or os.path.commonprefix(files_to_relocate) + + if offsets is None: + offsets = {} + for filename in files_to_relocate: + indices = compute_indices( + filename, + list(prefix_to_prefix.keys()), + ) + relpath = os.path.relpath(filename, rel_root) + offsets[relpath] = indices + + args = [ + (filename, offsets[os.path.relpath(filename, rel_root)], prefix_to_prefix) + for filename in files_to_relocate + ] + + tp = multiprocessing.pool.ThreadPool(processes=concurrency) + + try: + tp.map(llnl.util.lang.star(_relocate_binary_text), args) + finally: + tp.terminate() + tp.join() diff --git a/lib/spack/spack/rewiring.py b/lib/spack/spack/rewiring.py index c91485219b4..260fb4c54e4 100644 --- a/lib/spack/spack/rewiring.py +++ b/lib/spack/spack/rewiring.py @@ -93,8 +93,10 @@ def rewire_node(spec, explicit): False, spec.build_spec.prefix, spec.prefix) - relocate.relocate_text_bin(binaries=bins_to_relocate, - prefixes=prefix_to_prefix) + + # Relocate text strings of prefixes embedded in binaries + relocate.relocate_text_bin(bins_to_relocate, prefix_to_prefix) + # Copy package into place, except for spec.json (because spec.json # describes the old spec and not the new spliced spec). shutil.copytree(os.path.join(tempdir, spec.dag_hash()), spec.prefix, diff --git a/lib/spack/spack/test/relocate.py b/lib/spack/spack/test/relocate.py index b5c0151420f..c2e4bc0c381 100644 --- a/lib/spack/spack/test/relocate.py +++ b/lib/spack/spack/test/relocate.py @@ -294,14 +294,37 @@ def test_set_elf_rpaths_warning(mock_patchelf): assert output is None +def test_relocate_binary_text(tmpdir): + filename = str(tmpdir.join('binary')) + with open(filename, 'wb') as f: + f.write(b'somebinarytext') + f.write(b'/usr/relpath') + f.write(b'\0') + f.write(b'morebinarytext') + + spack.relocate._relocate_binary_text(filename, {14: '/usr'}, {'/usr': '/foo'}) + with open(filename, 'rb') as f: + contents = f.read() + assert b'/foo/relpath\0' in contents + + @pytest.mark.requires_executables('patchelf', 'strings', 'file', 'gcc') @skip_unless_linux -def test_replace_prefix_bin(hello_world): +def test_relocate_binary(hello_world): # Compile an "Hello world!" executable and set RPATHs executable = hello_world(rpaths=['/usr/lib', '/usr/lib64']) + with open(str(executable), 'rb') as f: + contents = f.read() + index_0 = contents.index(b'/usr') + index_1 = contents.index(b'/usr', index_0 + 1) + offsets = {index_0: '/usr/lib', index_1: '/usr/lib64'} + # Relocate the RPATHs - spack.relocate._replace_prefix_bin(str(executable), {b'/usr': b'/foo'}) + spack.relocate._relocate_binary_text( + str(executable), offsets, + {'/usr/lib': '/foo/lib', '/usr/lib64': '/foo/lib64'} + ) # Some compilers add rpaths so ensure changes included in final result assert '/foo/lib:/foo/lib64' in rpaths_for(executable) @@ -390,13 +413,11 @@ def test_relocate_text_bin(hello_world, copy_binary, tmpdir): assert not text_in_bin(str(new_binary.dirpath()), new_binary) # Check this call succeed - orig_path_bytes = str(orig_binary.dirpath()).encode('utf-8') - new_path_bytes = str(new_binary.dirpath()).encode('utf-8') + orig_path_bytes = str(orig_binary.dirpath()) + new_path_bytes = str(new_binary.dirpath()) spack.relocate.relocate_text_bin( - [str(new_binary)], - {orig_path_bytes: new_path_bytes} - ) + [str(new_binary)], {orig_path_bytes: new_path_bytes}) # Check original directory is not there anymore and it was # substituted with the new one @@ -405,15 +426,14 @@ def test_relocate_text_bin(hello_world, copy_binary, tmpdir): def test_relocate_text_bin_raise_if_new_prefix_is_longer(tmpdir): - short_prefix = b'/short' - long_prefix = b'/much/longer' + short_prefix = '/short' + long_prefix = '/much/longer' fpath = str(tmpdir.join('fakebin')) with open(fpath, 'w') as f: f.write('/short') with pytest.raises(spack.relocate.BinaryTextReplaceError): spack.relocate.relocate_text_bin( - [fpath], {short_prefix: long_prefix} - ) + [fpath], {short_prefix: long_prefix}) @pytest.mark.requires_executables('install_name_tool', 'file', 'cc')