Add elf parsing utility function (#33628)

Introduces `spack.util.elf.parse_elf(file_handle)`
This commit is contained in:
Harmen Stoppels 2022-11-01 20:42:06 +01:00 committed by GitHub
parent 6b3ea94630
commit 230e96fbb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 635 additions and 46 deletions

View File

@ -1789,3 +1789,39 @@ def _spider(*args, **kwargs):
@pytest.fixture(scope="function")
def mock_tty_stdout(monkeypatch):
    """Make ``sys.stdout.isatty()`` report ``True`` while the test runs."""

    def _always_a_tty():
        return True

    monkeypatch.setattr(sys.stdout, "isatty", _always_a_tty)
@pytest.fixture()
def binary_with_rpaths(tmpdir):
    """Return a factory that compiles a small ELF executable with given RPATHs.

    The factory writes ``main.c`` into ``tmpdir``, compiles it with ``gcc``
    into ``main.x`` next to it, and returns the path of that executable.
    Absolute RPATH entries are used as given; relative ones get ``$ORIGIN``
    prepended.
    """
    c_template = """
#include <stdio.h>
int main(){{
printf("{0}");
}}
"""

    def _factory(rpaths, message="Hello world!"):
        c_file = tmpdir.join("main.c")
        c_file.write(c_template.format(message))
        binary = c_file.dirpath("main.x")

        # Encode relative RPATHs using `$ORIGIN` as the root prefix
        encoded = []
        for entry in rpaths:
            if not os.path.isabs(entry):
                entry = os.path.join("$ORIGIN", entry)
            encoded.append(entry)

        compiler = spack.util.executable.which("gcc")
        compiler(
            "-Wl,--disable-new-dtags",
            "-Wl,-rpath={0}".format(":".join(encoded)),
            str(c_file),
            "-o",
            str(binary),
        )
        return binary

    return _factory

View File

@ -82,42 +82,6 @@ def _factory(output):
return _factory
@pytest.fixture()
def hello_world(tmpdir):
    """Factory fixture producing a compiled ELF executable with custom RPATHs.

    Calling the factory compiles ``main.c`` (created in ``tmpdir``) to
    ``main.x`` and returns the executable's path. Relative RPATH entries are
    anchored at ``$ORIGIN``; absolute entries are left untouched.
    """

    def _origin_relative(rpath):
        # Absolute entries are kept; relative ones are rooted at `$ORIGIN`
        if os.path.isabs(rpath):
            return rpath
        return os.path.join("$ORIGIN", rpath)

    def _factory(rpaths, message="Hello world!"):
        source = tmpdir.join("main.c")
        program = """
#include <stdio.h>
int main(){{
printf("{0}");
}}
""".format(
            message
        )
        source.write(program)

        executable = source.dirpath("main.x")
        link_rpath = ":".join([_origin_relative(x) for x in rpaths])

        gcc = spack.util.executable.which("gcc")
        gcc(
            "-Wl,--disable-new-dtags",
            "-Wl,-rpath={0}".format(link_rpath),
            str(source),
            "-o",
            str(executable),
        )
        return executable

    return _factory
@pytest.fixture()
def make_dylib(tmpdir_factory):
"""Create a shared library with unfriendly qualities.
@ -315,9 +279,9 @@ def test_set_elf_rpaths_warning(mock_patchelf):
@pytest.mark.requires_executables("patchelf", "strings", "file", "gcc")
@skip_unless_linux
def test_replace_prefix_bin(hello_world):
def test_replace_prefix_bin(binary_with_rpaths):
# Compile an "Hello world!" executable and set RPATHs
executable = hello_world(rpaths=["/usr/lib", "/usr/lib64"])
executable = binary_with_rpaths(rpaths=["/usr/lib", "/usr/lib64"])
# Relocate the RPATHs
spack.relocate._replace_prefix_bin(str(executable), {b"/usr": b"/foo"})
@ -328,9 +292,9 @@ def test_replace_prefix_bin(hello_world):
@pytest.mark.requires_executables("patchelf", "strings", "file", "gcc")
@skip_unless_linux
def test_relocate_elf_binaries_absolute_paths(hello_world, copy_binary, tmpdir):
def test_relocate_elf_binaries_absolute_paths(binary_with_rpaths, copy_binary, tmpdir):
# Create an executable, set some RPATHs, copy it to another location
orig_binary = hello_world(rpaths=[str(tmpdir.mkdir("lib")), "/usr/lib64"])
orig_binary = binary_with_rpaths(rpaths=[str(tmpdir.mkdir("lib")), "/usr/lib64"])
new_binary = copy_binary(orig_binary)
spack.relocate.relocate_elf_binaries(
@ -350,9 +314,9 @@ def test_relocate_elf_binaries_absolute_paths(hello_world, copy_binary, tmpdir):
@pytest.mark.requires_executables("patchelf", "strings", "file", "gcc")
@skip_unless_linux
def test_relocate_elf_binaries_relative_paths(hello_world, copy_binary):
def test_relocate_elf_binaries_relative_paths(binary_with_rpaths, copy_binary):
# Create an executable, set some RPATHs, copy it to another location
orig_binary = hello_world(rpaths=["lib", "lib64", "/opt/local/lib"])
orig_binary = binary_with_rpaths(rpaths=["lib", "lib64", "/opt/local/lib"])
new_binary = copy_binary(orig_binary)
spack.relocate.relocate_elf_binaries(
@ -371,8 +335,8 @@ def test_relocate_elf_binaries_relative_paths(hello_world, copy_binary):
@pytest.mark.requires_executables("patchelf", "strings", "file", "gcc")
@skip_unless_linux
def test_make_elf_binaries_relative(hello_world, copy_binary, tmpdir):
orig_binary = hello_world(
def test_make_elf_binaries_relative(binary_with_rpaths, copy_binary, tmpdir):
orig_binary = binary_with_rpaths(
rpaths=[str(tmpdir.mkdir("lib")), str(tmpdir.mkdir("lib64")), "/opt/local/lib"]
)
new_binary = copy_binary(orig_binary)
@ -393,8 +357,8 @@ def test_raise_if_not_relocatable(monkeypatch):
@pytest.mark.requires_executables("patchelf", "strings", "file", "gcc")
@skip_unless_linux
def test_relocate_text_bin(hello_world, copy_binary, tmpdir):
orig_binary = hello_world(
def test_relocate_text_bin(binary_with_rpaths, copy_binary, tmpdir):
orig_binary = binary_with_rpaths(
rpaths=[str(tmpdir.mkdir("lib")), str(tmpdir.mkdir("lib64")), "/opt/local/lib"],
message=str(tmpdir),
)

View File

@ -0,0 +1,130 @@
# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import io
import pytest
import llnl.util.filesystem as fs
import spack.platforms
import spack.util.elf as elf
import spack.util.executable
# The ELF parser itself is platform independent, but creating ELF files with
# system tools is somewhat harder on non-Linux hosts, so tests that need to
# compile a binary are restricted to Linux.
def skip_unless_linux(f):
    """Decorate test ``f`` so it is skipped unless the real host is Linux."""
    host_is_linux = str(spack.platforms.real_host()) == "linux"
    marker = pytest.mark.skipif(
        not host_is_linux,
        reason="implementation currently requires linux",
    )
    return marker(f)
@pytest.mark.requires_executables("gcc")
@skip_unless_linux
def test_elf_get_rpaths(binary_with_rpaths):
    """``elf.get_rpaths`` returns the RPATH entries the binary was linked with."""
    expected = ["/very/long/prefix/x", "/very/long/prefix/y"]

    # Build an "Hello world!" executable carrying exactly these RPATHs
    binary = binary_with_rpaths(rpaths=expected)

    assert elf.get_rpaths(str(binary)) == expected
@pytest.mark.requires_executables("gcc")
@skip_unless_linux
@pytest.mark.parametrize(
    "linker_flag,is_runpath",
    [
        ("-Wl,--disable-new-dtags", False),
        ("-Wl,--enable-new-dtags", True),
    ],
)
def test_elf_parsing_shared_linking(linker_flag, is_runpath, tmpdir):
    """Parse a shared library and an executable linked against it.

    Arguments:
        linker_flag (str): linker option selecting old or new dynamic tags
        is_runpath (bool): whether the search path is expected to be parsed
            as a runpath (``True``) or as an rpath (``False``)
        tmpdir: pytest temporary directory in which the binaries are built
    """
    gcc = spack.util.executable.which("gcc")

    with fs.working_dir(str(tmpdir)):
        # Create a library to link to so we can force a dynamic section in an ELF file
        with open("foo.c", "w") as f:
            f.write("int foo(){return 0;}")
        with open("bar.c", "w") as f:
            f.write("int foo(); int _start(){return foo();}")

        # Create library and executable linking to it.
        gcc("-shared", "-o", "libfoo.so", "-Wl,-soname,libfoo.so.1", "-nostdlib", "foo.c")
        gcc(
            "-o",
            "bar",
            linker_flag,
            "-Wl,-rpath,/first",
            "-Wl,-rpath,/second",
            "-Wl,--no-as-needed",
            "-nostdlib",
            "libfoo.so",
            "bar.c",
        )

        with open("libfoo.so", "rb") as f:
            foo_parsed = elf.parse_elf(f, interpreter=True, dynamic_section=True)

        # The library has a soname, but no interpreter, rpath or dependencies
        assert not foo_parsed.has_pt_interp
        assert foo_parsed.has_pt_dynamic
        assert not foo_parsed.has_rpath
        assert not foo_parsed.has_needed
        assert foo_parsed.has_soname
        assert foo_parsed.dt_soname_str == b"libfoo.so.1"

        with open("bar", "rb") as f:
            bar_parsed = elf.parse_elf(f, interpreter=True, dynamic_section=True)

        # The executable has an interpreter, a search path and one dependency
        assert bar_parsed.has_pt_interp
        assert bar_parsed.has_pt_dynamic
        assert bar_parsed.has_rpath
        assert bar_parsed.has_needed
        assert not bar_parsed.has_soname
        assert bar_parsed.dt_rpath_str == b"/first:/second"
        assert bar_parsed.dt_needed_strs == [b"libfoo.so.1"]

        # Check the parametrized expectation: rpath vs. runpath
        assert bar_parsed.is_runpath == is_runpath
def test_broken_elf():
    """Malformed inputs are rejected with a descriptive ``ElfParsingError``."""
    cases = [
        # No elf magic
        (b"x", "Not an ELF file"),
        # Incomplete ELF header
        (b"\x7fELF", "Not an ELF file"),
        # Invalid class
        (b"\x7fELF\x09\x01" + b"\x00" * 10, "Invalid class"),
        # Invalid data type
        (b"\x7fELF\x01\x09" + b"\x00" * 10, "Invalid data type"),
        # 64-bit needs at least 64 bytes of header; this is only 56 bytes
        (b"\x7fELF\x02\x01" + b"\x00" * 50, "ELF header malformed"),
        # 32-bit needs at least 52 bytes of header; this is only 46 bytes
        (b"\x7fELF\x01\x01" + b"\x00" * 40, "ELF header malformed"),
        # Not a ET_DYN/ET_EXEC on a 32-bit LE ELF
        (
            b"\x7fELF\x01\x01" + (b"\x00" * 10) + b"\x09" + (b"\x00" * 35),
            "Not an ET_DYN or ET_EXEC",
        ),
    ]

    for payload, expected_message in cases:
        with pytest.raises(elf.ElfParsingError, match=expected_message):
            elf.parse_elf(io.BytesIO(payload))
def test_parser_doesnt_deal_with_nonzero_offset():
    """Parsing is refused when the file handle is not positioned at offset 0."""
    # There is currently no logic to parse an ELF file that starts at a nonzero
    # offset within a file. That could be useful, e.g. to modify an ELF file
    # inside a tarball, but it is not supported for now.
    stream = io.BytesIO(b"\x00\x7fELF\x01\x01" + b"\x00" * 10)

    # Move past the leading byte so the ELF magic starts at the current position
    stream.seek(1)

    with pytest.raises(elf.ElfParsingError, match="Cannot parse at a nonzero offset"):
        elf.parse_elf(stream)

459
lib/spack/spack/util/elf.py Normal file
View File

@ -0,0 +1,459 @@
# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import bisect
import struct
import sys
from collections import namedtuple
from struct import calcsize, unpack, unpack_from
# Fields of the ELF header that follow the 16-byte ``e_ident`` array, in the
# order in which they are unpacked. Names follow the ELF specification.
ElfHeader = namedtuple(
    "ElfHeader",
    (
        "e_type",  # object file type, compared against ET_EXEC / ET_DYN
        "e_machine",
        "e_version",
        "e_entry",
        "e_phoff",  # file offset of the program headers
        "e_shoff",  # file offset of the section headers
        "e_flags",
        "e_ehsize",
        "e_phentsize",
        "e_phnum",  # number of program headers
        "e_shentsize",
        "e_shnum",  # number of section headers
        "e_shstrndx",
    ),
)
# One entry of the section header table, in unpacking order. Names follow the
# ELF specification.
SectionHeader = namedtuple(
    "SectionHeader",
    (
        "sh_name",
        "sh_type",  # section type, compared against SHT_STRTAB
        "sh_flags",
        "sh_addr",
        "sh_offset",  # file offset of the section contents
        "sh_size",  # size of the section contents in bytes
        "sh_link",
        "sh_info",
        "sh_addralign",
        "sh_entsize",
    ),
)
# One program header of a 32-bit ELF file, in unpacking order. Note that
# ``p_flags`` sits near the end here, unlike in the 64-bit layout.
ProgramHeader32 = namedtuple(
    "ProgramHeader32",
    (
        "p_type",  # segment type, compared against the PT_* constants
        "p_offset",  # file offset of the segment
        "p_vaddr",  # virtual address of the segment
        "p_paddr",
        "p_filesz",  # size of the segment in the file
        "p_memsz",
        "p_flags",
        "p_align",
    ),
)
# One program header of a 64-bit ELF file, in unpacking order. Note that
# ``p_flags`` directly follows ``p_type`` here, unlike in the 32-bit layout.
ProgramHeader64 = namedtuple(
    "ProgramHeader64",
    (
        "p_type",  # segment type, compared against the PT_* constants
        "p_flags",
        "p_offset",  # file offset of the segment
        "p_vaddr",  # virtual address of the segment
        "p_paddr",
        "p_filesz",  # size of the segment in the file
        "p_memsz",
        "p_align",
    ),
)
class ELF_CONSTANTS:
    """Numeric constants of the ELF format that the parser compares against."""

    # First four bytes of every ELF file
    MAGIC = b"\x7fELF"

    # Values of the class byte of e_ident (32-bit vs. 64-bit)
    CLASS32 = 1
    CLASS64 = 2

    # Values of the data byte of e_ident (little vs. big endian)
    DATA2LSB = 1
    DATA2MSB = 2

    # Supported values of e_type
    ET_EXEC = 2
    ET_DYN = 3

    # Program header types (p_type)
    PT_LOAD = 1
    PT_DYNAMIC = 2
    PT_INTERP = 3

    # Tags of entries in the dynamic section
    DT_NULL = 0
    DT_NEEDED = 1
    DT_STRTAB = 5
    DT_SONAME = 14
    DT_RPATH = 15
    DT_RUNPATH = 29

    # Section header type (sh_type) of a string table
    SHT_STRTAB = 3
def get_byte_at(byte_array, idx):
    """Return the byte at position ``idx`` of ``byte_array`` as an integer.

    On Python 2 indexing yields a one-character string, which is converted
    with ``ord``; on Python 3 the indexed value is returned as is.
    """
    value = byte_array[idx]
    return value if sys.version_info[0] >= 3 else ord(value)
class ElfParsingError(Exception):
    """Raised when a file cannot be parsed as a supported ELF file."""
class ElfFile(object):
    """Container for the data collected while parsing an ELF file.

    The constructor only initializes the ``has_*`` flags and the two list
    attributes. Every other slot is left unset and is only assigned by the
    parsing functions once the corresponding data has been found.
    """

    __slots__ = [
        # general properties and headers
        "is_64_bit",
        "is_little_endian",
        "byte_order",
        "elf_hdr",
        "pt_load",
        # PT_INTERP segment
        "has_pt_interp",
        "pt_interp_p_offset",
        "pt_interp_p_filesz",
        "pt_interp_str",
        # PT_DYNAMIC segment
        "has_pt_dynamic",
        "pt_dynamic_p_offset",
        "pt_dynamic_p_filesz",
        # rpath / runpath
        "has_rpath",
        "dt_rpath_offset",
        "dt_rpath_str",
        "rpath_strtab_offset",
        "is_runpath",
        # needed libraries
        "has_needed",
        "dt_needed_strtab_offsets",
        "dt_needed_strs",
        # soname
        "has_soname",
        "dt_soname_strtab_offset",
        "dt_soname_str",
    ]

    def __init__(self):
        # Nothing has been found yet
        self.has_pt_interp = self.has_pt_dynamic = False
        self.has_rpath = self.has_needed = self.has_soname = False

        # Lists that the parsing functions append to
        self.pt_load = []
        self.dt_needed_strtab_offsets = []
def parse_c_string(byte_string, start=0):
    """
    Extract the null-terminated string that begins at ``start``

    Arguments:
        byte_string (bytes): Data to search in
        start (int): Position at which the C-string begins

    Returns:
        bytes: The bytes from ``start`` up to, but not including, the next null byte

    Raises:
        ElfParsingError: if there is no null byte at or after ``start``
    """
    terminator = byte_string.find(b"\0", start)
    if terminator != -1:
        return byte_string[start:terminator]
    raise ElfParsingError("C-string is not null terminated")
def read_exactly(f, num_bytes, msg):
    """
    Read ``num_bytes`` bytes from the current position of ``f``, and fail
    with a parsing error if fewer bytes are available.

    Arguments:
        f: file handle
        num_bytes (int): Number of bytes that must be read
        msg (str): Message of the error raised on a short read

    Returns:
        bytes: the ``num_bytes`` bytes that were read.

    Raises:
        ElfParsingError: if the read returned a different number of bytes
    """
    chunk = f.read(num_bytes)
    if len(chunk) == num_bytes:
        return chunk
    raise ElfParsingError(msg)
def parse_program_headers(f, elf):
    """
    Read the program headers and record the segments the parser cares about

    PT_LOAD segments are collected in ``elf.pt_load`` as ``(offset, vaddr)``
    pairs sorted by virtual address; the location and size of the PT_INTERP
    and PT_DYNAMIC segments are stored together with their ``has_*`` flags.

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
    """
    # The field order and field widths depend on the ELF class
    if elf.is_64_bit:
        header_cls, layout = ProgramHeader64, "LLQQQQQQ"
    else:
        header_cls, layout = ProgramHeader32, "LLLLLLLL"
    entry_fmt = elf.byte_order + layout
    entry_size = calcsize(entry_fmt)
    table_size = elf.elf_hdr.e_phnum * entry_size

    # Jump to the program header table and read it in a single call
    f.seek(elf.elf_hdr.e_phoff)
    table = read_exactly(f, table_size, "Malformed program header")

    for position in range(0, table_size, entry_size):
        header = header_cls._make(unpack_from(entry_fmt, table, position))

        # Segments of size 0 are ignored: no distinction is made between a
        # missing and an empty segment. An empty PT_DYNAMIC segment has been
        # seen in an ELF file that contained debug data.
        if header.p_filesz == 0:
            continue

        segment_type = header.p_type
        if segment_type == ELF_CONSTANTS.PT_LOAD:
            # Remember file offset and virtual address of loaded segments, so
            # that virtual addresses can be mapped to offsets in the file.
            elf.pt_load.append((header.p_offset, header.p_vaddr))
        elif segment_type == ELF_CONSTANTS.PT_INTERP:
            elf.pt_interp_p_offset = header.p_offset
            elf.pt_interp_p_filesz = header.p_filesz
            elf.has_pt_interp = True
        elif segment_type == ELF_CONSTANTS.PT_DYNAMIC:
            elf.pt_dynamic_p_offset = header.p_offset
            elf.pt_dynamic_p_filesz = header.p_filesz
            elf.has_pt_dynamic = True

    # The linker sorts PT_LOAD segments by vaddr, but sort anyway to be sure:
    # patchelf for example has a flag to leave them in an arbitrary order.
    elf.pt_load.sort(key=lambda segment: segment[1])
def parse_pt_interp(f, elf):
    """
    Read the interpreter (i.e. absolute path to the dynamic linker) from the
    PT_INTERP segment and store it in ``elf.pt_interp_str``

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
    """
    f.seek(elf.pt_interp_p_offset)
    segment = read_exactly(f, elf.pt_interp_p_filesz, "Malformed PT_INTERP entry")

    # Keep only the bytes in front of the terminating null byte
    elf.pt_interp_str = parse_c_string(segment)
def find_strtab_size_at_offset(f, elf, offset):
    """
    Look up the size of the string table section that starts at a known offset

    The section headers are read one by one until a string table section
    whose ``sh_offset`` equals ``offset`` is found.

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
        offset (int): offset of the section in the file (i.e. ``sh_offset``)

    Returns:
        int: the size of the string table in bytes

    Raises:
        ElfParsingError: if a section header cannot be read, or if no
            matching string table section exists
    """
    layout = "LLQQQQLLQQ" if elf.is_64_bit else "LLLLLLLLLL"
    header_fmt = elf.byte_order + layout
    header_size = calcsize(header_fmt)

    f.seek(elf.elf_hdr.e_shoff)
    for _ in range(elf.elf_hdr.e_shnum):
        raw = read_exactly(f, header_size, "Malformed section header")
        section = SectionHeader(*unpack(header_fmt, raw))
        is_strtab = section.sh_type == ELF_CONSTANTS.SHT_STRTAB
        if is_strtab and section.sh_offset == offset:
            return section.sh_size

    raise ElfParsingError("Could not determine strtab size")
def retrieve_strtab(f, elf, offset):
    """
    Read the complete string table located at the given file offset. Its
    size is looked up in the section headers.

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data
        offset (int): offset of the string table in the file

    Returns:
        bytes: the contents of the string table
    """
    strtab_size = find_strtab_size_at_offset(f, elf, offset)

    # The lookup above moved the file position, so seek before reading
    f.seek(offset)
    return read_exactly(f, strtab_size, "Could not read string table")
def vaddr_to_offset(elf, vaddr):
    """
    Given a virtual address, find the corresponding offset in the ELF file itself.

    The address is mapped through the PT_LOAD segment with the largest
    virtual address that does not exceed ``vaddr``. ``elf.pt_load`` must be
    sorted by virtual address.

    Arguments:
        elf (ElfFile): ELF file parser data
        vaddr (int): virtual address

    Returns:
        int: offset in the file that corresponds to ``vaddr``

    Raises:
        ElfParsingError: if there is no PT_LOAD segment starting at or below
            ``vaddr`` (which includes the case of no PT_LOAD segments at all)
    """
    load_vaddrs = [p_vaddr for (_, p_vaddr) in elf.pt_load]
    idx = bisect.bisect_right(load_vaddrs, vaddr) - 1

    # Without this check an index of -1 would silently select the *last*
    # segment and produce a meaningless offset.
    if idx < 0:
        raise ElfParsingError("Virtual address is not covered by any PT_LOAD segment")

    p_offset, p_vaddr = elf.pt_load[idx]
    return p_offset - p_vaddr + vaddr
def parse_pt_dynamic(f, elf):
    """
    Parse the dynamic section of an ELF file

    Records rpath/runpath, soname and needed-library entries on ``elf`` and,
    if any of them is present, resolves them to strings from the string table
    referenced by the dynamic section.

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parse data

    Raises:
        ElfParsingError: if an entry cannot be read, if more than one
            rpath/runpath entry exists, or if there is not exactly one
            string table entry
    """
    entry_fmt = elf.byte_order + ("qQ" if elf.is_64_bit else "lL")
    entry_size = calcsize(entry_fmt)
    section_start = elf.pt_dynamic_p_offset

    num_rpath = num_runpath = num_strtab = 0

    f.seek(section_start)

    # In case of broken ELF files, don't read beyond the advertized size.
    for index in range(elf.pt_dynamic_p_filesz // entry_size):
        raw = read_exactly(f, entry_size, "Malformed dynamic array entry")
        tag, val = unpack(entry_fmt, raw)

        if tag == ELF_CONSTANTS.DT_NULL:
            break

        if tag in (ELF_CONSTANTS.DT_RPATH, ELF_CONSTANTS.DT_RUNPATH):
            uses_runpath = tag == ELF_CONSTANTS.DT_RUNPATH
            if uses_runpath:
                num_runpath += 1
            else:
                num_rpath += 1
            elf.rpath_strtab_offset = val
            # File offset of this very entry of the dynamic array
            elf.dt_rpath_offset = section_start + index * entry_size
            elf.is_runpath = uses_runpath
            elf.has_rpath = True
        elif tag == ELF_CONSTANTS.DT_STRTAB:
            num_strtab += 1
            strtab_vaddr = val
        elif tag == ELF_CONSTANTS.DT_NEEDED:
            elf.has_needed = True
            elf.dt_needed_strtab_offsets.append(val)
        elif tag == ELF_CONSTANTS.DT_SONAME:
            elf.has_soname = True
            elf.dt_soname_strtab_offset = val

    # Having no rpath/runpath at all is fine, having several is not.
    num_search_paths = num_rpath + num_runpath
    if num_search_paths == 0:
        elf.has_rpath = False
    elif num_search_paths != 1:
        raise ElfParsingError("Could not find a unique rpath/runpath.")

    if num_strtab != 1:
        raise ElfParsingError("Could not find a unique strtab of for the dynamic section strings")

    # Nothing to retrieve, so don't bother getting the string table.
    if not any((elf.has_rpath, elf.has_soname, elf.has_needed)):
        return

    strings = retrieve_strtab(f, elf, vaddr_to_offset(elf, strtab_vaddr))

    if elf.has_needed:
        elf.dt_needed_strs = [
            parse_c_string(strings, needed) for needed in elf.dt_needed_strtab_offsets
        ]

    if elf.has_soname:
        elf.dt_soname_str = parse_c_string(strings, elf.dt_soname_strtab_offset)

    if elf.has_rpath:
        elf.dt_rpath_str = parse_c_string(strings, elf.rpath_strtab_offset)
def parse_header(f, elf):
    """
    Parse and validate the ELF header at the current position of ``f``

    Sets ``is_64_bit``, ``is_little_endian``, ``byte_order`` and ``elf_hdr``
    on ``elf``.

    Arguments:
        f: file handle
        elf (ElfFile): ELF file parser data

    Raises:
        ElfParsingError: if the magic bytes, class or data encoding are
            invalid, or if the header is truncated
    """
    # The first 16 bytes do not depend on the 32/64 bit class
    ident = f.read(16)

    # Require ELF magic bytes.
    if not (len(ident) == 16 and ident[:4] == ELF_CONSTANTS.MAGIC):
        raise ElfParsingError("Not an ELF file")

    # Defensively require a valid class and data.
    ident_class = get_byte_at(ident, 4)
    ident_data = get_byte_at(ident, 5)

    if ident_class not in (ELF_CONSTANTS.CLASS32, ELF_CONSTANTS.CLASS64):
        raise ElfParsingError("Invalid class found")

    if ident_data not in (ELF_CONSTANTS.DATA2LSB, ELF_CONSTANTS.DATA2MSB):
        raise ElfParsingError("Invalid data type")

    elf.is_64_bit = ident_class == ELF_CONSTANTS.CLASS64
    elf.is_little_endian = ident_data == ELF_CONSTANTS.DATA2LSB

    # Byte order prefix used in all struct format strings from here on
    elf.byte_order = "<" if elf.is_little_endian else ">"

    # Parse the rest of the header
    layout = "HHLQQQLHHHHHH" if elf.is_64_bit else "HHLLLLLHHHHHH"
    header_fmt = elf.byte_order + layout
    raw = read_exactly(f, calcsize(header_fmt), "ELF header malformed")
    elf.elf_hdr = ElfHeader._make(unpack(header_fmt, raw))
def _do_parse_elf(f, interpreter=True, dynamic_section=True):
    """
    Parse the ELF file behind ``f`` and return the collected data

    Arguments:
        f: file handle, positioned at offset 0
        interpreter (bool): whether to parse the PT_INTERP segment if present
        dynamic_section (bool): whether to parse the PT_DYNAMIC segment if present

    Returns:
        ElfFile: the parsed data

    Raises:
        ElfParsingError: if ``f`` is not at offset 0, or the file is not a
            valid ELF executable or shared library
    """
    # Parsing ELF files at a nonzero offset is not (yet?) allowed: offsets
    # specified in the ELF file are used as absolute positions in ``f``.
    if f.tell() != 0:
        raise ElfParsingError("Cannot parse at a nonzero offset")

    parsed = ElfFile()
    parse_header(f, parsed)

    # We don't handle anything but executables and shared libraries now.
    supported_types = (ELF_CONSTANTS.ET_EXEC, ELF_CONSTANTS.ET_DYN)
    if parsed.elf_hdr.e_type not in supported_types:
        raise ElfParsingError("Not an ET_DYN or ET_EXEC type")

    parse_program_headers(f, parsed)

    # Parse PT_INTERP section
    if interpreter and parsed.has_pt_interp:
        parse_pt_interp(f, parsed)

    # Parse PT_DYNAMIC section; this requires at least one PT_LOAD segment
    if dynamic_section and parsed.has_pt_dynamic and parsed.pt_load:
        parse_pt_dynamic(f, parsed)

    return parsed
def parse_elf(f, interpreter=False, dynamic_section=False):
    """Given a file handle f for an ELF file opened in binary mode, return an
    ElfFile object that stores the data parsed from it.

    Arguments:
        f: file handle opened in binary mode
        interpreter (bool): whether to parse the PT_INTERP segment
        dynamic_section (bool): whether to parse the PT_DYNAMIC segment

    Returns:
        ElfFile: the parsed data

    Raises:
        ElfParsingError: if the file cannot be parsed
    """
    # According to the docs old versions of Python can throw DeprecationWarning
    # instead of struct.error.
    unpack_failures = (DeprecationWarning, struct.error)

    try:
        parsed = _do_parse_elf(f, interpreter, dynamic_section)
    except unpack_failures:
        raise ElfParsingError("Malformed ELF file")
    return parsed
def get_rpaths(path):
    """Return the rpaths of the file at ``path`` as a list of strings, or None
    if the file cannot be parsed as ELF or has no rpath entry.

    On Python 3 the rpath is decoded as UTF-8 before it is split on ``:``.
    """
    try:
        with open(path, "rb") as f:
            parsed = parse_elf(f, interpreter=False, dynamic_section=True)
    except ElfParsingError:
        return None

    if not parsed.has_rpath:
        return None

    # Split the colon-separated rpath string into its components
    raw = parsed.dt_rpath_str
    text = raw.decode("utf-8") if sys.version_info[0] >= 3 else raw
    return text.split(":")