spack_yaml: add anchorify function (#44995)
This adds spack.util.spack_yaml.anchorify, which takes a non-cyclic dict/list structure and replaces identical values with (back) references to the first instance, so that YAML serialization will use anchors. `repr` is used to identify sub-DAGs, which in principle has quadratic complexity in the depth of the graph, but in practice the depth is O(1), so this should not matter. This is then used in CI to reduce the size of generated YAML files to 30% of their original size.
This commit is contained in:
parent
0ca1ee8b91
commit
2e8b4e660e
@ -22,6 +22,8 @@
|
|||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from urllib.request import HTTPHandler, Request, build_opener
|
from urllib.request import HTTPHandler, Request, build_opener
|
||||||
|
|
||||||
|
import ruamel.yaml
|
||||||
|
|
||||||
import llnl.util.filesystem as fs
|
import llnl.util.filesystem as fs
|
||||||
import llnl.util.tty as tty
|
import llnl.util.tty as tty
|
||||||
from llnl.util.lang import memoized
|
from llnl.util.lang import memoized
|
||||||
@ -1310,8 +1312,11 @@ def main_script_replacements(cmd):
|
|||||||
if not rebuild_everything:
|
if not rebuild_everything:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
with open(output_file, "w") as outf:
|
# Minimize yaml output size through use of anchors
|
||||||
outf.write(syaml.dump(sorted_output, default_flow_style=True))
|
syaml.anchorify(sorted_output)
|
||||||
|
|
||||||
|
with open(output_file, "w") as f:
|
||||||
|
ruamel.yaml.YAML().dump(sorted_output, f)
|
||||||
|
|
||||||
|
|
||||||
def _url_encode_string(input_string):
|
def _url_encode_string(input_string):
|
||||||
|
@ -13,10 +13,12 @@
|
|||||||
import collections.abc
|
import collections.abc
|
||||||
import gzip
|
import gzip
|
||||||
import inspect
|
import inspect
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
import ruamel.yaml
|
||||||
|
|
||||||
import spack.hash_types as ht
|
import spack.hash_types as ht
|
||||||
import spack.paths
|
import spack.paths
|
||||||
@ -505,3 +507,50 @@ def test_load_json_specfiles(specfile, expected_hash, reader_cls):
|
|||||||
# JSON or YAML file, not a list
|
# JSON or YAML file, not a list
|
||||||
for edge in s2.traverse_edges():
|
for edge in s2.traverse_edges():
|
||||||
assert isinstance(edge.virtuals, tuple), edge
|
assert isinstance(edge.virtuals, tuple), edge
|
||||||
|
|
||||||
|
|
||||||
|
def test_anchorify_1():
    """Check that anchorify makes equal branches share a single object, and that dumping the
    result produces a YAML anchor plus an alias instead of two copies."""
    original = {"a": [1, 2, 3], "b": [1, 2, 3]}
    deduplicated = {"a": [1, 2, 3], "b": [1, 2, 3]}
    syaml.anchorify(deduplicated)

    # The contents are unchanged, but both keys now refer to the very same list object.
    assert original == deduplicated
    assert deduplicated["a"] is deduplicated["b"]

    # Shared objects are what makes the serializer emit an anchor (&id001) and alias (*id001).
    expected = """\
a: &id001
- 1
- 2
- 3
b: *id001
"""
    stream = io.StringIO()
    ruamel.yaml.YAML().dump(deduplicated, stream)
    assert stream.getvalue() == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_anchorify_2():
    """Check that anchorify deduplicates nested branches at different depths: the two equal
    top-level dicts share one object, and the inner dict is shared with an equal top-level one."""
    before = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
    after = {"a": {"b": {"c": True}}, "d": {"b": {"c": True}}, "e": {"c": True}}
    syaml.anchorify(after)

    # Contents are unchanged; only object identity differs.
    assert before == after
    assert after["a"] is after["d"]
    assert after["a"]["b"] is after["e"]

    # Check if anchors are used. Nested mappings are indented by two spaces per level.
    out = io.StringIO()
    ruamel.yaml.YAML().dump(after, out)
    assert (
        out.getvalue()
        == """\
a: &id001
  b: &id002
    c: true
d: *id001
e: *id002
"""
    )
|
||||||
|
@ -20,7 +20,7 @@
|
|||||||
import functools
|
import functools
|
||||||
import io
|
import io
|
||||||
import re
|
import re
|
||||||
from typing import IO, List, Optional
|
from typing import IO, Any, Callable, Dict, List, Optional, Union
|
||||||
|
|
||||||
import ruamel.yaml
|
import ruamel.yaml
|
||||||
from ruamel.yaml import comments, constructor, emitter, error, representer
|
from ruamel.yaml import comments, constructor, emitter, error, representer
|
||||||
@ -493,6 +493,29 @@ def name_mark(name):
|
|||||||
return error.StringMark(name, None, None, None, None, None)
|
return error.StringMark(name, None, None, None, None, None)
|
||||||
|
|
||||||
|
|
||||||
|
def anchorify(data: Union[dict, list], identifier: Callable[[Any], str] = repr) -> None:
    """Replace identical dict/list branches in a tree with references to earlier instances.

    The YAML serializer generates anchors for objects that are referenced more than once, which
    results in smaller YAML files.

    Args:
        data: nested structure of dicts and lists (expected to be non-cyclic). It is modified in
            place; nothing is returned.
        identifier: maps a dict/list branch to a string key. Branches that map to the same key
            are treated as identical, and each later one is replaced by the instance that was
            encountered first during traversal.
    """
    anchors: Dict[str, Union[dict, list]] = {}
    # Worklist of containers still to be scanned; used as a stack (LIFO).
    stack: List[Union[dict, list]] = [data]

    while stack:
        item = stack.pop()

        for key, value in item.items() if isinstance(item, dict) else enumerate(item):
            # Scalars are never anchored, only dict/list branches.
            if not isinstance(value, (dict, list)):
                continue

            branch_id = identifier(value)
            anchor = anchors.get(branch_id)

            if anchor is None:
                # First occurrence: remember it and scan its children later.
                anchors[branch_id] = value
                stack.append(value)
            else:
                # Duplicate: point at the first instance. Only the value of an existing
                # key/index is reassigned, so the container's size does not change mid-iteration.
                item[key] = anchor
|
||||||
|
|
||||||
|
|
||||||
class SpackYAMLError(spack.error.SpackError):
|
class SpackYAMLError(spack.error.SpackError):
|
||||||
"""Raised when there are issues with YAML parsing."""
|
"""Raised when there are issues with YAML parsing."""
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user