spack_yaml: add anchorify function (#44995)

This adds spack.util.spack_yaml.anchorify, which takes a non-cyclic
dict/list structure, and replaces identical values with (back)
references to the first instance, so that yaml serialization will use
anchors.

`repr` is used to identify sub-DAGs. In principle this is quadratic in
the depth of the graph, but in practice the depth is O(1), so this
should not matter.

Then this is used in CI to reduce the size of generated YAML files to
30% of their original size.
This commit is contained in:
Harmen Stoppels 2024-07-02 14:00:19 +02:00 committed by GitHub
parent 0ca1ee8b91
commit 2e8b4e660e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 80 additions and 3 deletions

View File

@ -22,6 +22,8 @@
from urllib.parse import urlencode from urllib.parse import urlencode
from urllib.request import HTTPHandler, Request, build_opener from urllib.request import HTTPHandler, Request, build_opener
import ruamel.yaml
import llnl.util.filesystem as fs import llnl.util.filesystem as fs
import llnl.util.tty as tty import llnl.util.tty as tty
from llnl.util.lang import memoized from llnl.util.lang import memoized
@ -1310,8 +1312,11 @@ def main_script_replacements(cmd):
if not rebuild_everything: if not rebuild_everything:
sys.exit(1) sys.exit(1)
with open(output_file, "w") as outf: # Minimize yaml output size through use of anchors
outf.write(syaml.dump(sorted_output, default_flow_style=True)) syaml.anchorify(sorted_output)
with open(output_file, "w") as f:
ruamel.yaml.YAML().dump(sorted_output, f)
def _url_encode_string(input_string): def _url_encode_string(input_string):

View File

@ -13,10 +13,12 @@
import collections.abc import collections.abc
import gzip import gzip
import inspect import inspect
import io
import json import json
import os import os
import pytest import pytest
import ruamel.yaml
import spack.hash_types as ht import spack.hash_types as ht
import spack.paths import spack.paths
@ -505,3 +507,50 @@ def test_load_json_specfiles(specfile, expected_hash, reader_cls):
# JSON or YAML file, not a list # JSON or YAML file, not a list
for edge in s2.traverse_edges(): for edge in s2.traverse_edges():
assert isinstance(edge.virtuals, tuple), edge assert isinstance(edge.virtuals, tuple), edge
def test_anchorify_1():
    """Duplicate list values must collapse into a single shared object after anchorify, and
    dumping the result must produce a YAML anchor followed by an alias."""
    original = {"a": [1, 2, 3], "b": [1, 2, 3]}
    data = {"a": [1, 2, 3], "b": [1, 2, 3]}
    syaml.anchorify(data)

    # Contents are unchanged, but both keys now point at the very same list object
    assert original == data
    assert data["a"] is data["b"]

    # The shared list is written once with an anchor and referenced by alias afterwards
    expected = """\
a: &id001
- 1
- 2
- 3
b: *id001
"""
    stream = io.StringIO()
    ruamel.yaml.YAML().dump(data, stream)
    assert stream.getvalue() == expected
def test_anchorify_2():
    """Test that anchorify also deduplicates identical dicts that sit at different nesting
    levels, and that the nested anchors show up in the output YAML."""
    before = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
    after = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
    syaml.anchorify(after)
    assert before == after
    assert after["a"] is after["d"]
    assert after["a"]["b"] is after["e"]

    # Check if anchors are used; nested mappings are indented by two spaces per level
    out = io.StringIO()
    ruamel.yaml.YAML().dump(after, out)
    assert (
        out.getvalue()
        == """\
a: &id001
  b: &id002
    c: true
d: *id001
e: *id002
"""
    )

View File

@ -20,7 +20,7 @@
import functools import functools
import io import io
import re import re
from typing import IO, List, Optional from typing import IO, Any, Callable, Dict, List, Optional, Union
import ruamel.yaml import ruamel.yaml
from ruamel.yaml import comments, constructor, emitter, error, representer from ruamel.yaml import comments, constructor, emitter, error, representer
@ -493,6 +493,29 @@ def name_mark(name):
return error.StringMark(name, None, None, None, None, None) return error.StringMark(name, None, None, None, None, None)
def anchorify(data: Union[dict, list], identifier: Callable[[Any], str] = repr) -> None:
    """Replace identical dict/list branches in a tree with references to earlier instances.

    The structure is modified in place: whenever a dict/list value has the same identifier as
    a branch that was already encountered, the value is replaced by that earlier object. A YAML
    serializer can then generate anchors and aliases for the shared objects, resulting in
    smaller YAML files.

    Args:
        data: nested dict/list structure to deduplicate; it is mutated, nothing is returned.
            It is expected to be non-cyclic.
        identifier: maps a dict/list branch to a string; branches that map to the same string
            are treated as identical. Defaults to ``repr``.
    """
    anchors: Dict[str, Union[dict, list]] = {}
    queue: List[Union[dict, list]] = [data]

    while queue:
        item = queue.pop()

        # Dicts are visited by key, lists by index, so that item[key] can be reassigned below
        for key, value in item.items() if isinstance(item, dict) else enumerate(item):
            # Only containers are shared; scalars are left untouched
            if not isinstance(value, (dict, list)):
                continue

            ident = identifier(value)
            anchor = anchors.get(ident)

            if anchor is None:
                # First time this branch is seen: remember it and descend into it later
                anchors[ident] = value
                queue.append(value)
            else:
                item[key] = anchor  # replace with reference
class SpackYAMLError(spack.error.SpackError): class SpackYAMLError(spack.error.SpackError):
"""Raised when there are issues with YAML parsing.""" """Raised when there are issues with YAML parsing."""