repo: refactor indexes to be more extensible

- The virtual provider cache and the tag index were previously
  generated by nearly identical but separate methods.

- factor out an Indexer interface for updating repository caches, and
  provide implementations for each type of index (TagIndex,
  ProviderIndex) so that more can be added if needed.

- Among other things, this allows all indexes to be updated in a single
  pass. Loading package files is the real overhead; once the packages
  are loaded, building the indexes is trivial. Regenerating every index
  together therefore avoids repeated bulk read-ins.

- This can be extended later for dependents (reverse dependencies) and
  patches; a hypothetical sketch of such an extension follows below.
Todd Gamblin 2018-12-21 09:10:09 -08:00
parent 527ff860f0
commit c1d7adaaac


@@ -3,6 +3,7 @@
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)

+import abc
 import collections
 import os
 import stat
@@ -14,7 +15,7 @@
 import traceback
 import json
 from contextlib import contextmanager
-from six import string_types
+from six import string_types, add_metaclass

 try:
     from collections.abc import Mapping
@@ -230,111 +231,153 @@ def update_package(self, pkg_name):
         self._tag_dict[tag].append(package.name)

-@llnl.util.lang.memoized
-def make_provider_index_cache(packages_path, namespace):
-    """Lazily updates the provider index cache associated with a repository,
-    if need be, then returns it. Caches results for later look-ups.
-
-    Args:
-        packages_path: path of the repository
-        namespace: namespace of the repository
-
-    Returns:
-        instance of ProviderIndex
-    """
-    # Map that goes from package names to stat info
-    fast_package_checker = FastPackageChecker(packages_path)
-
-    # Filename of the provider index cache
-    cache_filename = 'providers/{0}-index.yaml'.format(namespace)
-
-    # Compute which packages needs to be updated in the cache
-    misc_cache = spack.caches.misc_cache
-    index_mtime = misc_cache.mtime(cache_filename)
-
-    needs_update = [
-        x for x, sinfo in fast_package_checker.items()
-        if sinfo.st_mtime > index_mtime
-    ]
-
-    # Read the old ProviderIndex, or make a new one.
-    index_existed = misc_cache.init_entry(cache_filename)
-
-    if index_existed and not needs_update:
-        # If the provider index exists and doesn't need an update
-        # just read from it
-        with misc_cache.read_transaction(cache_filename) as f:
-            index = ProviderIndex.from_yaml(f)
-
-    else:
-        # Otherwise we need a write transaction to update it
-        with misc_cache.write_transaction(cache_filename) as (old, new):
-            index = ProviderIndex.from_yaml(old) if old else ProviderIndex()
-
-            for pkg_name in needs_update:
-                namespaced_name = '{0}.{1}'.format(namespace, pkg_name)
-                index.remove_provider(namespaced_name)
-                index.update(namespaced_name)
-
-            index.to_yaml(new)
-
-    return index
-
-
-@llnl.util.lang.memoized
-def make_tag_index_cache(packages_path, namespace):
-    """Lazily updates the tag index cache associated with a repository,
-    if need be, then returns it. Caches results for later look-ups.
-
-    Args:
-        packages_path: path of the repository
-        namespace: namespace of the repository
-
-    Returns:
-        instance of TagIndex
-    """
-    # Map that goes from package names to stat info
-    fast_package_checker = FastPackageChecker(packages_path)

-    # Filename of the provider index cache
-    cache_filename = 'tags/{0}-index.json'.format(namespace)
-
-    # Compute which packages needs to be updated in the cache
-    misc_cache = spack.caches.misc_cache
-    index_mtime = misc_cache.mtime(cache_filename)
-
-    needs_update = [
-        x for x, sinfo in fast_package_checker.items()
-        if sinfo.st_mtime > index_mtime
-    ]
-
-    # Read the old ProviderIndex, or make a new one.
-    index_existed = misc_cache.init_entry(cache_filename)
-
-    if index_existed and not needs_update:
-        # If the provider index exists and doesn't need an update
-        # just read from it
-        with misc_cache.read_transaction(cache_filename) as f:
-            index = TagIndex.from_json(f)
-
-    else:
-        # Otherwise we need a write transaction to update it
-        with misc_cache.write_transaction(cache_filename) as (old, new):
-            index = TagIndex.from_json(old) if old else TagIndex()
-
-            for pkg_name in needs_update:
-                namespaced_name = '{0}.{1}'.format(namespace, pkg_name)
-                index.update_package(namespaced_name)
-
-            index.to_json(new)
-
-    return index
"""Build all the indexes at once.
for pkg_name in needs_update: We regenerate *all* indexes whenever *any* index needs an update,
namespaced_name = '{0}.{1}'.format(namespace, pkg_name) because the main bottleneck here is loading all the packages. It
index.remove_provider(namespaced_name) can take tens of seconds to regenerate sequentially, and we'd
index.update(namespaced_name) rather only pay that cost once rather than on several
invocations.
index.to_yaml(new) """
for name, indexer in self.indexers.items():
self.indexes[name] = self._build_index(name, indexer)
return index def _build_index(self, name, indexer):
"""Determine which packages need an update, and update indexes."""
# Filename of the provider index cache (we assume they're all json)
cache_filename = '{0}/{1}-index.json'.format(name, self.namespace)
@llnl.util.lang.memoized # Compute which packages needs to be updated in the cache
def make_tag_index_cache(packages_path, namespace): misc_cache = spack.caches.misc_cache
"""Lazily updates the tag index cache associated with a repository, index_mtime = misc_cache.mtime(cache_filename)
if need be, then returns it. Caches results for later look-ups.
Args: needs_update = [
packages_path: path of the repository x for x, sinfo in self.checker.items()
namespace: namespace of the repository if sinfo.st_mtime > index_mtime
]
Returns: index_existed = misc_cache.init_entry(cache_filename)
instance of TagIndex if index_existed and not needs_update:
""" # If the index exists and doesn't need an update, read it
# Map that goes from package names to stat info with misc_cache.read_transaction(cache_filename) as f:
fast_package_checker = FastPackageChecker(packages_path) indexer.read(f)
# Filename of the provider index cache else:
cache_filename = 'tags/{0}-index.json'.format(namespace) # Otherwise update it and rewrite the cache file
with misc_cache.write_transaction(cache_filename) as (old, new):
indexer.read(old) if old else indexer.create()
# Compute which packages needs to be updated in the cache for pkg_name in needs_update:
misc_cache = spack.caches.misc_cache namespaced_name = '%s.%s' % (self.namespace, pkg_name)
index_mtime = misc_cache.mtime(cache_filename) indexer.update(namespaced_name)
needs_update = [ indexer.write(new)
x for x, sinfo in fast_package_checker.items()
if sinfo.st_mtime > index_mtime
]
# Read the old ProviderIndex, or make a new one. return indexer.index
index_existed = misc_cache.init_entry(cache_filename)
if index_existed and not needs_update:
# If the provider index exists and doesn't need an update
# just read from it
with misc_cache.read_transaction(cache_filename) as f:
index = TagIndex.from_json(f)
else:
# Otherwise we need a write transaction to update it
with misc_cache.write_transaction(cache_filename) as (old, new):
index = TagIndex.from_json(old) if old else TagIndex()
for pkg_name in needs_update:
namespaced_name = '{0}.{1}'.format(namespace, pkg_name)
index.update_package(namespaced_name)
index.to_json(new)
return index
class RepoPath(object): class RepoPath(object):
@@ -658,11 +701,8 @@ def check(condition, msg):
         # Maps that goes from package name to corresponding file stat
         self._fast_package_checker = None

-        # Index of virtual dependencies, computed lazily
-        self._provider_index = None
-
-        # Index of tags, computed lazily
-        self._tag_index = None
+        # Indexes for this repository, computed lazily
+        self._repo_index = None

         # make sure the namespace for packages in this repo exists.
         self._create_namespace()
@@ -847,27 +887,24 @@ def purge(self):
         """Clear entire package instance cache."""
         self._instances.clear()

+    @property
+    def index(self):
+        """Construct the index for this repo lazily."""
+        if self._repo_index is None:
+            self._repo_index = RepoIndex(self._pkg_checker, self.namespace)
+            self._repo_index.add_indexer('providers', ProviderIndexer())
+            self._repo_index.add_indexer('tags', TagIndexer())
+        return self._repo_index
+
     @property
     def provider_index(self):
         """A provider index with names *specific* to this repo."""
-
-        if self._provider_index is None:
-            self._provider_index = make_provider_index_cache(
-                self.packages_path, self.namespace
-            )
-        return self._provider_index
+        return self.index['providers']

     @property
     def tag_index(self):
-        """A provider index with names *specific* to this repo."""
-
-        if self._tag_index is None:
-            self._tag_index = make_tag_index_cache(
-                self.packages_path, self.namespace
-            )
-        return self._tag_index
+        """Index of tags and which packages they're defined on."""
+        return self.index['tags']

     @_autospec
     def providers_for(self, vpkg_spec):
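
For reference, a minimal sketch of how the refactored pieces fit together
at runtime. This is not code from the commit; it hand-wires what the new
``index`` property does lazily, assuming a configured ``Repo`` instance
named ``repo`` (``_pkg_checker`` is the attribute the diff itself uses):

    # Build a RepoIndex by hand, the way Repo.index does it lazily.
    repo_index = RepoIndex(repo._pkg_checker, repo.namespace)
    repo_index.add_indexer('providers', ProviderIndexer())
    repo_index.add_indexer('tags', TagIndexer())

    # The first lookup triggers _build_all_indexes(), which reads each
    # cached index or regenerates all of them in one pass over packages.
    providers = repo_index['providers']

    # Subsequent lookups are served from memory.
    tags = repo_index['tags']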