log.py: improve utf-8 handling, and non-utf-8 output (#48005)

This commit is contained in:
Harmen Stoppels 2024-12-11 10:54:17 +01:00 committed by GitHub
parent 03525528d6
commit e9d2732e00
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 18 deletions

View File

@ -879,10 +879,13 @@ def _writer_daemon(
write_fd.close()
# 1. Use line buffering (3rd param = 1) since Python 3 has a bug
# that prevents unbuffered text I/O.
# 2. Python 3.x before 3.7 does not open with UTF-8 encoding by default
# that prevents unbuffered text I/O. [needs citation]
# 2. Enforce a UTF-8 interpretation of build process output with errors replaced by '?'.
# The downside is that the log file will not contain the exact output of the build process.
# 3. closefd=False because Connection has "ownership"
read_file = os.fdopen(read_fd.fileno(), "r", 1, encoding="utf-8", closefd=False)
read_file = os.fdopen(
read_fd.fileno(), "r", 1, encoding="utf-8", errors="replace", closefd=False
)
if stdin_fd:
stdin_file = os.fdopen(stdin_fd.fileno(), closefd=False)
@ -928,11 +931,7 @@ def _writer_daemon(
try:
while line_count < 100:
# Handle output from the calling process.
try:
line = _retry(read_file.readline)()
except UnicodeDecodeError:
# installs like --test=root gpgme produce non-UTF8 logs
line = "<line lost: output was not encoded as UTF-8>\n"
line = _retry(read_file.readline)()
if not line:
return
@ -946,6 +945,13 @@ def _writer_daemon(
output_line = clean_line
if filter_fn:
output_line = filter_fn(clean_line)
enc = sys.stdout.encoding
if enc != "utf-8":
# On Python 3.6 and 3.7-3.14 with non-{utf-8,C} locale stdout
# may not be able to handle utf-8 output. We do an inefficient
# dance of re-encoding with errors replaced, so stdout.write
# does not raise.
output_line = output_line.encode(enc, "replace").decode(enc)
sys.stdout.write(output_line)
# Stripped output to log file.

View File

@ -57,18 +57,16 @@ def test_log_python_output_without_echo(capfd, tmpdir):
assert capfd.readouterr()[0] == ""
def test_log_python_output_with_invalid_utf8(capfd, tmpdir):
with tmpdir.as_cwd():
with log.log_output("foo.txt"):
sys.stdout.buffer.write(b"\xc3\x28\n")
def test_log_python_output_with_invalid_utf8(capfd, tmp_path):
tmp_file = str(tmp_path / "foo.txt")
with log.log_output(tmp_file, echo=True):
sys.stdout.buffer.write(b"\xc3helloworld\n")
expected = b"<line lost: output was not encoded as UTF-8>\n"
with open("foo.txt", "rb") as f:
written = f.read()
assert written == expected
# we should be able to read this as valid utf-8
with open(tmp_file, "r", encoding="utf-8") as f:
assert f.read() == "<EFBFBD>helloworld\n"
# nothing on stdout or stderr
assert capfd.readouterr()[0] == ""
assert capfd.readouterr().out == "<EFBFBD>helloworld\n"
def test_log_python_output_and_echo_output(capfd, tmpdir):