Pre-CI optimization (#16372)
* add initial optimization script
* integrate optimization in spack ci
* make optimization opt-in
* fix import error
* flake8 fixes
* update command completion
* work around vermin errors
* fix sphinx errors
This commit is contained in: parent 42f2c168d2, commit 224dc95159
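The optimization is opt-in via a new --optimize flag on the generate subcommand. A minimal invocation sketch (run with an active Spack environment; the output path here is hypothetical):

    $ spack ci generate --output-file /tmp/.gitlab-ci.yml --optimize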
lib/spack/spack/ci.py
@@ -449,7 +449,8 @@ def format_job_needs(phase_name, strip_compilers, dep_jobs,
 
 def generate_gitlab_ci_yaml(env, print_summary, output_file,
-                            custom_spack_repo=None, custom_spack_ref=None):
+                            custom_spack_repo=None, custom_spack_ref=None,
+                            run_optimizer=False):
     # FIXME: What's the difference between one that opens with 'spack'
     # and one that opens with 'env'? This will only handle the former.
     with spack.concretize.disable_compiler_existence_check():
@@ -788,6 +789,11 @@ def generate_gitlab_ci_yaml(env, print_summary, output_file,
     for output_key, output_value in sorted(output_object.items()):
         sorted_output[output_key] = output_value
 
+    # TODO(opadron): remove this or refactor
+    if run_optimizer:
+        import spack.ci_optimization as ci_opt
+        sorted_output = ci_opt.optimizer(sorted_output)
+
     with open(output_file, 'w') as outf:
         outf.write(syaml.dump_config(sorted_output, default_flow_style=True))
lib/spack/spack/ci_optimization.py (new file, 377 lines)

# Copyright 2013-2020 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)

import collections

try:
    # dynamically import to keep vermin from complaining; note that
    # __import__ returns the top-level package, so pull out the "abc"
    # submodule explicitly
    collections_abc = __import__('collections.abc').abc
except ImportError:
    collections_abc = collections

import copy
import hashlib

import spack.util.spack_yaml as syaml

def matches(obj, proto):
    """Returns True if the test object "obj" matches the prototype object
    "proto".

    If obj and proto are mappings, obj matches proto if (key in obj) and
    (obj[key] matches proto[key]) for every key in proto.

    If obj and proto are sequences, obj matches proto if they are of the
    same length and (a matches b) for every (a, b) in zip(obj, proto).

    Otherwise, obj matches proto if obj == proto.

    Precondition: proto must not have any reference cycles
    """
    if isinstance(obj, collections_abc.Mapping):
        if not isinstance(proto, collections_abc.Mapping):
            return False

        return all(
            (key in obj and matches(obj[key], val))
            for key, val in proto.items()
        )

    if (isinstance(obj, collections_abc.Sequence) and
            not isinstance(obj, str)):

        if not (isinstance(proto, collections_abc.Sequence) and
                not isinstance(proto, str)):
            return False

        if len(obj) != len(proto):
            return False

        return all(
            matches(obj[index], val)
            for index, val in enumerate(proto)
        )

    return obj == proto

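A few spot checks of the matching semantics described above; a sketch, assuming spack.ci_optimization is importable (e.g. a Spack checkout's lib/spack directory on sys.path):

    import spack.ci_optimization as ci_opt

    # every key of the prototype must be present and match; extra keys in
    # the test object are fine
    assert ci_opt.matches({'a': 1, 'b': 2}, {'a': 1})
    assert not ci_opt.matches({'b': 2}, {'a': 1})

    # sequences must agree element-wise and in length
    assert ci_opt.matches([1, [2, 3]], [1, [2, 3]])
    assert not ci_opt.matches([1, 2], [1, 2, 3])

    # anything else falls back to plain equality
    assert ci_opt.matches('x', 'x')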
def subkeys(obj, proto):
    """Returns the test mapping "obj" after factoring out the items it has
    in common with the prototype mapping "proto".

    Consider a recursive merge operation, merge(a, b), on mappings a and b
    that returns a mapping, m, whose keys are the union of the keys of a
    and b, and for every such key, "k", its corresponding value is:

      - merge(a[k], b[k]) if a[k] and b[k] are mappings, or
      - b[k] if (k in b) and not matches(a[k], b[k]), or
      - a[k] otherwise

    If obj and proto are mappings, the returned object is the smallest
    object, "a", such that merge(a, proto) matches obj.

    Otherwise, obj is returned.
    """
    if not (isinstance(obj, collections_abc.Mapping) and
            isinstance(proto, collections_abc.Mapping)):
        return obj

    new_obj = {}
    for key, value in obj.items():
        if key not in proto:
            new_obj[key] = value
            continue

        if (matches(value, proto[key]) and
                matches(proto[key], value)):
            continue

        if isinstance(value, collections_abc.Mapping):
            new_obj[key] = subkeys(value, proto[key])
            continue

        new_obj[key] = value

    return new_obj

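Illustrating the factoring just described; a sketch with fabricated job contents:

    import spack.ci_optimization as ci_opt

    job = {'stage': 'stage-0',
           'tags': ['spack-k8s'],
           'script': ['spack ci rebuild']}
    proto = {'tags': ['spack-k8s'],
             'script': ['spack ci rebuild']}

    # items shared with the prototype are factored away; the rest survive
    assert ci_opt.subkeys(job, proto) == {'stage': 'stage-0'}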
def add_extends(yaml, key):
    """Modifies the given object "yaml" so that it includes an "extends"
    key whose value features "key".

    If "extends" is not in yaml, then yaml is modified such that
    yaml["extends"] == key.

    If yaml["extends"] is a str, then yaml is modified such that
    yaml["extends"] == [yaml["extends"], key]

    If yaml["extends"] is a list that does not include key, then key is
    appended to the list.

    Otherwise, yaml is left unchanged.
    """

    has_key = ('extends' in yaml)
    extends = yaml.get('extends')

    if has_key and not isinstance(extends, (str, collections_abc.Sequence)):
        return

    if extends is None:
        yaml['extends'] = key
        return

    if isinstance(extends, str):
        if extends != key:
            yaml['extends'] = [extends, key]
        return

    if key not in extends:
        extends.append(key)

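The three mutation cases from the docstring, in order; a sketch:

    import spack.ci_optimization as ci_opt

    job = {}
    ci_opt.add_extends(job, '.c0')     # no "extends" yet: set a str
    assert job == {'extends': '.c0'}

    ci_opt.add_extends(job, '.c1')     # str already there: promote to list
    assert job == {'extends': ['.c0', '.c1']}

    ci_opt.add_extends(job, '.c1')     # already listed: left unchanged
    assert job == {'extends': ['.c0', '.c1']}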
def common_subobject(yaml, sub):
    """Factor prototype object "sub" out of the values of mapping "yaml".

    Consider a modified copy of yaml, "new", where for each key, "key" in
    yaml:

      - If yaml[key] matches sub, then new[key] = subkeys(yaml[key], sub).
      - Otherwise, new[key] = yaml[key].

    If no key matches, then (yaml, None) is returned and the yaml object is
    left unchanged.

    Otherwise, each matching value in new is modified as in
    add_extends(new[key], common_key), and then new[common_key] is set to
    sub. The common_key value is chosen such that it does not match any
    preexisting key in new. In this case, (new, common_key) is returned.
    """
    match_list = set(k for k, v in yaml.items() if matches(v, sub))

    if not match_list:
        return yaml, None

    common_prefix = '.c'
    common_index = 0

    while True:
        common_key = ''.join((common_prefix, str(common_index)))
        if common_key not in yaml:
            break
        common_index += 1

    new_yaml = {}

    for key, val in yaml.items():
        new_yaml[key] = copy.deepcopy(val)

        if not matches(val, sub):
            continue

        new_yaml[key] = subkeys(new_yaml[key], sub)
        add_extends(new_yaml[key], common_key)

    new_yaml[common_key] = sub

    return new_yaml, common_key

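Putting matches/subkeys/add_extends together; a sketch in which '.c0' is generated because no job already uses that name:

    import spack.ci_optimization as ci_opt

    jobs = {
        'job-a': {'tags': ['docker'], 'stage': 'stage-0'},
        'job-b': {'tags': ['docker'], 'stage': 'stage-1'},
    }
    new, key = ci_opt.common_subobject(jobs, {'tags': ['docker']})

    assert key == '.c0'
    assert new['.c0'] == {'tags': ['docker']}
    assert new['job-a'] == {'stage': 'stage-0', 'extends': '.c0'}
    assert new['job-b'] == {'stage': 'stage-1', 'extends': '.c0'}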
def print_delta(name, old, new, applied=None):
    """Print a before/after size summary for a single optimization pass."""
    delta = new - old
    reldelta = (1000 * delta) // old
    reldelta = (reldelta // 10, reldelta % 10)

    if applied is None:
        applied = (new <= old)

    print('\n'.join((
        '{} {}:',
        '  before: {: 10d}',
        '  after : {: 10d}',
        '  delta : {:+10d} ({:=+3d}.{}%)',
    )).format(
        name,
        ('+' if applied else 'x'),
        old,
        new,
        delta,
        reldelta[0],
        reldelta[1]
    ))

def try_optimization_pass(name, yaml, optimization_pass, *args, **kwargs):
    """Try applying an optimization pass and return information about the
    result.

    "name" is a string describing the nature of the pass. If it is a
    non-empty string, summary statistics are also printed to stdout.

    "yaml" is the object to apply the pass to.

    "optimization_pass" is the function implementing the pass to be applied.

    "args" and "kwargs" are the additional arguments to pass to the
    optimization pass. The pass is applied as

    >>> (new_yaml, *other_results) = optimization_pass(yaml, *args, **kwargs)

    The pass's results are greedily rejected if it does not modify the
    original yaml document, or if it produces a yaml document that
    serializes to a larger string.

    Returns (new_yaml, yaml, applied, other_results) if applied, or
    (yaml, new_yaml, applied, other_results) otherwise.
    """
    result = optimization_pass(yaml, *args, **kwargs)
    new_yaml, other_results = result[0], result[1:]

    if new_yaml is yaml:
        # pass was not applied
        return (yaml, new_yaml, False, other_results)

    pre_size = len(syaml.dump_config(yaml, default_flow_style=True))
    post_size = len(syaml.dump_config(new_yaml, default_flow_style=True))

    # greedily reject the pass if it makes the serialized size worse
    applied = (post_size <= pre_size)
    if applied:
        yaml, new_yaml = new_yaml, yaml

    if name:
        print_delta(name, pre_size, post_size, applied)

    return (yaml, new_yaml, applied, other_results)

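The greedy size check in action; a sketch. For a document this small, factoring out {'x': 1} actually grows the serialized form, so the pass is rejected and the original object comes back in the first slot:

    import spack.ci_optimization as ci_opt

    doc = {'a': {'x': 1}, 'b': {'x': 1}}
    kept, rejected, applied, rest = ci_opt.try_optimization_pass(
        '', doc, ci_opt.common_subobject, {'x': 1})   # '' suppresses printing

    assert kept is doc and not applied
    assert rest == ('.c0',)   # common_subobject's generated key, passed through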
def build_histogram(iterator, key):
    """Builds a histogram of values given an iterable of mappings and a key.

    For each mapping "m" with key "key" in iterator, the value m[key] is
    considered.

    Returns a list of tuples (hash, count, proportion, value), where

      - "hash" is a sha1sum hash of the value.
      - "count" is the number of occurrences of values that hash to "hash".
      - "proportion" is the proportion of all values considered above that
        hash to "hash".
      - "value" is one of the values considered above that hash to "hash".
        Which value is chosen when multiple values hash to the same "hash"
        is undefined.

    The list is sorted in descending order by count, yielding the most
    frequently occurring hashes first.
    """
    buckets = collections.defaultdict(int)
    values = {}

    num_objects = 0
    for obj in iterator:
        num_objects += 1

        try:
            val = obj[key]
        except (KeyError, TypeError):
            continue

        value_hash = hashlib.sha1()
        value_hash.update(syaml.dump_config(val).encode())
        value_hash = value_hash.hexdigest()

        buckets[value_hash] += 1
        values[value_hash] = val

    return [(h, buckets[h], float(buckets[h]) / num_objects, values[h])
            for h in sorted(buckets.keys(), key=lambda k: -buckets[k])]

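A small histogram run; a sketch. Note that objects lacking the key still count toward the denominator:

    import spack.ci_optimization as ci_opt

    objs = [{'tags': ['a']}, {'tags': ['a']}, {'tags': ['b']}, {'other': 1}]
    _, count, proportion, value = ci_opt.build_histogram(objs, 'tags')[0]

    assert count == 2 and value == ['a']
    assert proportion == 2.0 / 4.0   # four objects seen, two carried ['a']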
def optimizer(yaml):
    original_size = len(syaml.dump_config(yaml, default_flow_style=True))

    # try factoring out commonly repeated portions
    common_job = {
        'variables': {
            'SPACK_COMPILER_ACTION': 'NONE',
            'SPACK_RELATED_BUILDS_CDASH': ''
        },

        'after_script': ['rm -rf "./spack"'],

        'artifacts': {
            'paths': ['jobs_scratch_dir', 'cdash_report'],
            'when': 'always'
        },
    }

    # look for a list of tags that appear frequently
    _, count, proportion, tags = next(iter(
        build_histogram(yaml.values(), 'tags')),
        (None,) * 4)

    # If a list of tags is found, more than one job uses it, *and* the jobs
    # that use it represent at least 70% of all jobs, then add the list to
    # the prototype object.
    if tags and count > 1 and proportion >= 0.70:
        common_job['tags'] = tags

    # apply common object factorization
    yaml, other, applied, rest = try_optimization_pass(
        'general common object factorization',
        yaml, common_subobject, common_job)

    # look for a common script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'script factorization',
            yaml, common_subobject, {'script': script})

    # look for a common before_script, and try factoring that out
    _, count, proportion, script = next(iter(
        build_histogram(yaml.values(), 'before_script')),
        (None,) * 4)

    if script and count > 1 and proportion >= 0.70:
        yaml, other, applied, rest = try_optimization_pass(
            'before_script factorization',
            yaml, common_subobject, {'before_script': script})

    # Look specifically for the SPACK_ROOT_SPEC environment variables.
    # Try to factor them out.
    h = build_histogram((
        getattr(val, 'get', lambda *args: {})('variables')
        for val in yaml.values()), 'SPACK_ROOT_SPEC')

    # In this case, we try to factor out *all* instances of the
    # SPACK_ROOT_SPEC environment variable; not just the one that appears
    # with the greatest frequency. We only require that more than 1 job
    # uses a given instance's value, because we expect the value to be very
    # large, and so expect even few-to-one factorizations to yield large
    # space savings.
    counter = 0
    for _, count, proportion, spec in h:
        if count <= 1:
            continue

        counter += 1

        yaml, other, applied, rest = try_optimization_pass(
            'SPACK_ROOT_SPEC factorization ({count})'.format(count=counter),
            yaml,
            common_subobject,
            {'variables': {'SPACK_ROOT_SPEC': spec}})

    new_size = len(syaml.dump_config(yaml, default_flow_style=True))

    print('\n')
    print_delta('overall summary', original_size, new_size)
    print('\n')
    return yaml
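An end-to-end sketch of what the optimizer buys on a repetitive document; all job contents are fabricated for illustration, and per-pass summaries are printed to stdout as it runs:

    import spack.ci_optimization as ci_opt
    import spack.util.spack_yaml as syaml

    jobs = {
        'pkg-%d' % i: {
            'script': ['spack ci rebuild'],
            'variables': {'SPACK_ROOT_SPEC': 'x' * 2000},  # huge shared value
        }
        for i in range(10)
    }

    before = len(syaml.dump_config(jobs, default_flow_style=True))
    optimized = ci_opt.optimizer(jobs)
    after = len(syaml.dump_config(optimized, default_flow_style=True))

    # the shared script and SPACK_ROOT_SPEC each collapse into one '.cN'
    # prototype job that the ten real jobs now merely extend
    assert after < before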
lib/spack/spack/cmd/ci.py
@@ -54,6 +54,11 @@ def setup_parser(subparser):
         help="Provide a git branch or tag if a custom spack branch " +
              "should be checked out as a step in each generated job. " +
              "This argument is ignored if no --spack-repo is provided.")
+    generate.add_argument(
+        '--optimize', action='store_true',
+        help="(Experimental) run the generated document through a series of "
+             "optimization passes designed to reduce the size of the "
+             "generated file.")
     generate.set_defaults(func=ci_generate)
 
     # Check a spec against mirror. Rebuild, create buildcache and push to
@@ -75,6 +80,7 @@ def ci_generate(args):
     copy_yaml_to = args.copy_to
     spack_repo = args.spack_repo
     spack_ref = args.spack_ref
+    run_optimizer = args.optimize
 
     if not output_file:
         gen_ci_dir = os.getcwd()
@@ -86,7 +92,8 @@ def ci_generate(args):
 
     # Generate the jobs
     spack_ci.generate_gitlab_ci_yaml(
-        env, True, output_file, spack_repo, spack_ref)
+        env, True, output_file, spack_repo, spack_ref,
+        run_optimizer=run_optimizer)
 
     if copy_yaml_to:
         copy_to_dir = os.path.dirname(copy_yaml_to)
share/spack/spack-completion.bash
@@ -474,7 +474,7 @@ _spack_ci() {
 }
 
 _spack_ci_generate() {
-    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref"
+    SPACK_COMPREPLY="-h --help --output-file --copy-to --spack-repo --spack-ref --optimize"
 }
 
 _spack_ci_rebuild() {