log.py: improve utf-8 handling, and non-utf-8 output (#48005)

This commit is contained in:
Harmen Stoppels 2024-12-11 10:54:17 +01:00 committed by GitHub
parent 03525528d6
commit e9d2732e00
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 18 deletions

View File

@ -879,10 +879,13 @@ def _writer_daemon(
write_fd.close() write_fd.close()
# 1. Use line buffering (3rd param = 1) since Python 3 has a bug # 1. Use line buffering (3rd param = 1) since Python 3 has a bug
# that prevents unbuffered text I/O. # that prevents unbuffered text I/O. [needs citation]
# 2. Python 3.x before 3.7 does not open with UTF-8 encoding by default # 2. Enforce a UTF-8 interpretation of build process output with errors replaced by '?'.
# The downside is that the log file will not contain the exact output of the build process.
# 3. closefd=False because Connection has "ownership" # 3. closefd=False because Connection has "ownership"
read_file = os.fdopen(read_fd.fileno(), "r", 1, encoding="utf-8", closefd=False) read_file = os.fdopen(
read_fd.fileno(), "r", 1, encoding="utf-8", errors="replace", closefd=False
)
if stdin_fd: if stdin_fd:
stdin_file = os.fdopen(stdin_fd.fileno(), closefd=False) stdin_file = os.fdopen(stdin_fd.fileno(), closefd=False)
@ -928,11 +931,7 @@ def _writer_daemon(
try: try:
while line_count < 100: while line_count < 100:
# Handle output from the calling process. # Handle output from the calling process.
try: line = _retry(read_file.readline)()
line = _retry(read_file.readline)()
except UnicodeDecodeError:
# installs like --test=root gpgme produce non-UTF8 logs
line = "<line lost: output was not encoded as UTF-8>\n"
if not line: if not line:
return return
@ -946,6 +945,13 @@ def _writer_daemon(
output_line = clean_line output_line = clean_line
if filter_fn: if filter_fn:
output_line = filter_fn(clean_line) output_line = filter_fn(clean_line)
enc = sys.stdout.encoding
if enc != "utf-8":
# On Python 3.6 and 3.7-3.14 with non-{utf-8,C} locale stdout
# may not be able to handle utf-8 output. We do an inefficient
# dance of re-encoding with errors replaced, so stdout.write
# does not raise.
output_line = output_line.encode(enc, "replace").decode(enc)
sys.stdout.write(output_line) sys.stdout.write(output_line)
# Stripped output to log file. # Stripped output to log file.

View File

@ -57,18 +57,16 @@ def test_log_python_output_without_echo(capfd, tmpdir):
assert capfd.readouterr()[0] == "" assert capfd.readouterr()[0] == ""
def test_log_python_output_with_invalid_utf8(capfd, tmpdir): def test_log_python_output_with_invalid_utf8(capfd, tmp_path):
with tmpdir.as_cwd(): tmp_file = str(tmp_path / "foo.txt")
with log.log_output("foo.txt"): with log.log_output(tmp_file, echo=True):
sys.stdout.buffer.write(b"\xc3\x28\n") sys.stdout.buffer.write(b"\xc3helloworld\n")
expected = b"<line lost: output was not encoded as UTF-8>\n" # we should be able to read this as valid utf-8
with open("foo.txt", "rb") as f: with open(tmp_file, "r", encoding="utf-8") as f:
written = f.read() assert f.read() == "<EFBFBD>helloworld\n"
assert written == expected
# nothing on stdout or stderr assert capfd.readouterr().out == "<EFBFBD>helloworld\n"
assert capfd.readouterr()[0] == ""
def test_log_python_output_and_echo_output(capfd, tmpdir): def test_log_python_output_and_echo_output(capfd, tmpdir):