Add options for sparse checkout in GitFetcher (#45473)

* Add options for sparse checkout in GitFetcher

Newer versions of git have a beta feature called sparse checkout
that allows users to check out a portion of a large repo.

This feature will be ideal for monolithic repo projects that want to
model their infrastructure via spack.  This PR implements an addition
to the GitFetcher that allows users to add a `git_sparse_paths`
attribute to package classes or versions which will then use sparse
checkout on those directories/files for the package.

* Style

* Split git clone into multiple functions

* Add sparse-checkout impl

* Internalize src clone functions

* Docs

* Adding sparse clone test

* Add test for partial clone

* [@spackbot] updating style on behalf of psakievich

* Small fixes

* Restore default branch status

* Fix attributes for package

* Update lib/spack/docs/packaging_guide.rst

Co-authored-by: Matthew Mosby <44072882+mdmosby@users.noreply.github.com>

* Extend unit test to multiple git versions

* style

---------

Co-authored-by: psakievich <psakievich@users.noreply.github.com>
Co-authored-by: Matthew Mosby <44072882+mdmosby@users.noreply.github.com>
This commit is contained in:
psakievich 2024-08-14 23:28:34 -06:00 committed by GitHub
parent 55b1b0f3f0
commit 1b82779087
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 248 additions and 25 deletions

View File

@ -1263,6 +1263,11 @@ Git fetching supports the following parameters to ``version``:
option ``--depth 1`` will be used if the version of git and the specified
transport protocol support it, and ``--single-branch`` will be used if the
version of git supports it.
* ``git_sparse_paths``: Use ``sparse-checkout`` to only clone these relative paths.
This feature requires ``git`` to be version ``2.25.0`` or later but is useful for
large repositories that have separate portions that can be built independently.
If paths provided are directories then all the subdirectories and associated files
will also be cloned.
Only one of ``tag``, ``branch``, or ``commit`` can be used at a time.
@ -1361,6 +1366,41 @@ Submodules
For more information about git submodules see the manpage of git: ``man
git-submodule``.
Sparse-Checkout
You can supply ``git_sparse_paths`` at the package or version level to utilize git's
sparse-checkout feature. This will only clone the paths that are specified in the
``git_sparse_paths`` attribute for the package along with the files in the top level directory.
This feature allows you to only clone what you need from a large repository.
Note that this is a newer feature in git and requires git ``2.25.0`` or greater.
If ``git_sparse_paths`` is supplied and the git version is too old
then a warning will be issued and that package will use the standard cloning operations instead.
``git_sparse_paths`` should be supplied as a list of paths, a callable function for versions,
or a more complex package attribute using the ``@property`` decorator. The return value should be
a list for a callable implementation of ``git_sparse_paths``.
.. code-block:: python
def sparse_path_function(package):
    """A callable that can be used inside a version to compute its sparse paths.

    It receives the package and returns the list of relative paths to check out.
    """
    # paths can be directories or files; all subdirectories and files
    # below a listed directory are included
    paths = ["doe", "rae", "me/file.cpp"]
    # versions newer than 1.2.0 need one more directory
    if package.spec.version > Version("1.2.0"):
        paths.extend(["fae"])
    return paths
class MyPackage(Package):
    # can also be a package attribute that will be used if not specified in versions
    git_sparse_paths = ["doe", "rae"]

    # use the package attribute
    version("1.0.0")
    version("1.1.0")
    # use the function
    version("1.1.5", git_sparse_paths=sparse_path_function)
    version("1.2.0", git_sparse_paths=sparse_path_function)
    version("1.2.5", git_sparse_paths=sparse_path_function)
    version("1.3.0", git_sparse_paths=sparse_path_function)
.. _github-fetch:
^^^^^^

View File

@ -720,6 +720,7 @@ class GitFetchStrategy(VCSFetchStrategy):
"submodules",
"get_full_repo",
"submodules_delete",
"git_sparse_paths",
]
git_version_re = r"git version (\S+)"
@ -735,6 +736,7 @@ def __init__(self, **kwargs):
self.submodules = kwargs.get("submodules", False)
self.submodules_delete = kwargs.get("submodules_delete", False)
self.get_full_repo = kwargs.get("get_full_repo", False)
self.git_sparse_paths = kwargs.get("git_sparse_paths", None)
@property
def git_version(self):
@ -802,38 +804,50 @@ def fetch(self):
tty.debug("Already fetched {0}".format(self.stage.source_path))
return
self.clone(commit=self.commit, branch=self.branch, tag=self.tag)
if self.git_sparse_paths:
self._sparse_clone_src(commit=self.commit, branch=self.branch, tag=self.tag)
else:
self._clone_src(commit=self.commit, branch=self.branch, tag=self.tag)
self.submodule_operations()
def clone(self, dest=None, commit=None, branch=None, tag=None, bare=False):
def bare_clone(self, dest):
"""
Clone a repository to a path.
Execute a bare clone for metadata only
This method handles cloning from git, but does not require a stage.
Arguments:
dest (str or None): The path into which the code is cloned. If None,
requires a stage and uses the stage's source path.
commit (str or None): A commit to fetch from the remote. Only one of
commit, branch, and tag may be non-None.
branch (str or None): A branch to fetch from the remote.
tag (str or None): A tag to fetch from the remote.
bare (bool): Execute a "bare" git clone (--bare option to git)
Requires a destination since bare cloning does not provide source
and shouldn't be used for staging.
"""
# Default to spack source path
dest = dest or self.stage.source_path
tty.debug("Cloning git repository: {0}".format(self._repo_info()))
git = self.git
debug = spack.config.get("config:debug")
if bare:
# We don't need to worry about which commit/branch/tag is checked out
clone_args = ["clone", "--bare"]
if not debug:
clone_args.append("--quiet")
clone_args.extend([self.url, dest])
git(*clone_args)
elif commit:
# We don't need to worry about which commit/branch/tag is checked out
clone_args = ["clone", "--bare"]
if not debug:
clone_args.append("--quiet")
clone_args.extend([self.url, dest])
git(*clone_args)
def _clone_src(self, commit=None, branch=None, tag=None):
"""
Clone a repository to a path using git.
Arguments:
commit (str or None): A commit to fetch from the remote. Only one of
commit, branch, and tag may be non-None.
branch (str or None): A branch to fetch from the remote.
tag (str or None): A tag to fetch from the remote.
"""
# Default to spack source path
dest = self.stage.source_path
tty.debug("Cloning git repository: {0}".format(self._repo_info()))
git = self.git
debug = spack.config.get("config:debug")
if commit:
# Need to do a regular clone and check out everything if
# they asked for a particular commit.
clone_args = ["clone", self.url]
@ -912,6 +926,85 @@ def clone(self, dest=None, commit=None, branch=None, tag=None, bare=False):
git(*pull_args, ignore_errors=1)
git(*co_args)
def _sparse_clone_src(self, commit=None, branch=None, tag=None, **kwargs):
    """
    Clone only portions of a git repository via git's sparse-checkout feature.

    If the available git is older than 2.25.0 a warning is issued and the
    repository is cloned in full through ``_clone_src`` instead.

    Arguments:
        commit (str or None): A commit to fetch from the remote. Only one of
            commit, branch, and tag may be non-None.
        branch (str or None): A branch to fetch from the remote.
        tag (str or None): A tag to fetch from the remote.
        **kwargs: ``depth`` overrides the clone depth (2 when not given).
    """
    source_dir = self.stage.source_path
    run_git = self.git

    if self.git_version < spack.version.Version("2.25.0.0"):
        # The package is not set on every code path, so start from the URL and
        # append the package name when it is known; the warning then always
        # identifies what was configured for sparse checkout.
        label = str(self.url)
        if self.package:
            label += f" ({self.package.name})"
        tty.warn(
            f"{label} is configured for git sparse-checkout "
            "but the git version is too old to support sparse cloning. "
            "Cloning the full repository instead."
        )
        self._clone_src(commit, branch, tag)
        return

    # default to depth=2 to allow for retention of some git properties
    history_depth = kwargs.get("depth", 2)
    named_ref = branch or tag
    checkout_ref = named_ref or commit
    assert checkout_ref

    clone_cmd = ["clone"]
    if named_ref:
        # a branch or tag is requested directly at clone time
        if self.get_full_repo:
            branch_mode = "--no-single-branch"
        else:
            branch_mode = "--single-branch"
        clone_cmd += ["--branch", checkout_ref, branch_mode]
    clone_cmd += [f"--depth={history_depth}", "--no-checkout", "--filter=blob:none", self.url]

    # git_sparse_paths is either an iterable of paths or a callable, which is
    # invoked without arguments and whose result supplies the paths
    if callable(self.git_sparse_paths):
        requested_paths = self.git_sparse_paths()
    else:
        requested_paths = self.git_sparse_paths
    sparse_cmd = ["sparse-checkout", "set"]
    sparse_cmd.extend(requested_paths)
    sparse_cmd.append("--cone")

    checkout_cmd = ["checkout", checkout_ref]

    if not spack.config.get("config:debug"):
        # keep git silent unless spack runs in debug mode
        for cmd in (clone_cmd, checkout_cmd):
            cmd.insert(1, "--quiet")

    # clone inside a temporary working directory, then move the result to
    # the stage's source path
    with temp_cwd():
        run_git(*clone_cmd)
        cloned_dir = get_single_file(".")
        if self.stage:
            self.stage.srcdir = cloned_dir
        shutil.move(cloned_dir, source_dir)

    # nothing was checked out by the clone (--no-checkout): select the sparse
    # paths first, then check out the requested ref
    with working_dir(source_dir):
        run_git(*sparse_cmd)
        run_git(*checkout_cmd)
def submodule_operations(self):
dest = self.stage.source_path
git = self.git
if self.submodules_delete:
with working_dir(dest):
for submodule_to_delete in self.submodules_delete:
@ -1541,8 +1634,11 @@ def _from_merged_attrs(fetcher, pkg, version):
attrs["fetch_options"] = pkg.fetch_options
attrs.update(pkg.versions[version])
if fetcher.url_attr == "git" and hasattr(pkg, "submodules"):
attrs.setdefault("submodules", pkg.submodules)
if fetcher.url_attr == "git":
pkg_attr_list = ["submodules", "git_sparse_paths"]
for pkg_attr in pkg_attr_list:
if hasattr(pkg, pkg_attr):
attrs.setdefault(pkg_attr, getattr(pkg, pkg_attr))
return fetcher(**attrs)

View File

@ -1418,6 +1418,24 @@ def mock_git_repository(git, tmpdir_factory):
r1 = rev_hash(branch)
r1_file = branch_file
multiple_directories_branch = "many_dirs"
num_dirs = 3
num_files = 2
dir_files = []
for i in range(num_dirs):
for j in range(num_files):
dir_files.append(f"dir{i}/file{j}")
git("checkout", "-b", multiple_directories_branch)
for f in dir_files:
repodir.ensure(f, file=True)
git("add", f)
git("-c", "commit.gpgsign=false", "commit", "-m", "many_dirs add files")
# restore default
git("checkout", default_branch)
# Map of version -> bunch. Each bunch includes; all the args
# that must be specified as part of a version() declaration (used to
# manufacture a version for the 'git-test' package); the associated
@ -1437,6 +1455,11 @@ def mock_git_repository(git, tmpdir_factory):
"default-no-per-version-git": Bunch(
revision=default_branch, file=r0_file, args={"branch": default_branch}
),
"many-directories": Bunch(
revision=multiple_directories_branch,
file=dir_files[0],
args={"git": url, "branch": multiple_directories_branch},
),
}
t = Bunch(

View File

@ -390,3 +390,38 @@ def submodules_callback(package):
assert not os.path.isfile(file_path)
file_path = os.path.join(s.package.stage.source_path, "third_party/submodule1/r0_file_1")
assert not os.path.isfile(file_path)
@pytest.mark.disable_clean_stage_check
def test_git_sparse_paths_partial_clone(
    mock_git_repository, git_version, default_mock_concretization, mutable_mock_repo, monkeypatch
):
    """
    Stage the ``git-test`` package with ``git_sparse_paths`` set and verify
    which directories of the mock repository end up in the source tree
    """
    check = mock_git_repository.checks["many-directories"]
    requested_dirs = ["dir0"]
    other_dirs = ["dir1", "dir2"]

    version_args = copy.copy(check.args)
    version_args["git_sparse_paths"] = requested_dirs

    spec = default_mock_concretization("git-test")
    monkeypatch.setitem(spec.package.versions, Version("git"), version_args)
    spec.package.do_stage()

    # older versions of git should fall back to a full clone, in which case
    # the directories outside the sparse paths are present as well
    full_clone_expected = git_version < Version("2.25.0.0")

    with working_dir(spec.package.stage.source_path):
        # top level directory files are cloned via sparse-checkout
        assert os.path.isfile("r0_file")

        for directory in requested_dirs:
            assert os.path.isdir(directory)

        for directory in other_dirs:
            if full_clone_expected:
                assert os.path.isdir(directory)
            else:
                assert not os.path.isdir(directory)

        # fixture file is in the sparse-path expansion tree
        assert os.path.isfile(check.file)

View File

@ -259,6 +259,7 @@ def test_git_url_top_level_git_versions(version_str, tag, commit, branch):
assert fetcher.tag == tag
assert fetcher.commit == commit
assert fetcher.branch == branch
assert fetcher.url == pkg_factory("git-url-top-level").git
@pytest.mark.usefixtures("mock_packages", "config")
@ -319,3 +320,14 @@ def test_package_deprecated_version(mock_packages, mock_fetch, mock_stage):
assert spack.package_base.deprecated_version(pkg_cls, "1.1.0")
assert not spack.package_base.deprecated_version(pkg_cls, "1.0.0")
def test_package_can_have_sparse_checkout_properties(mock_packages, mock_fetch, mock_stage):
    """A package-level ``git_sparse_paths`` attribute is carried over to the git fetcher."""
    spec = Spec("git-sparsepaths-pkg")
    package_class = spack.repo.PATH.get_pkg_class(spec.name)
    assert hasattr(package_class, "git_sparse_paths")

    # build the fetcher for the only declared version of the mock package
    package = package_class(spec)
    fetcher = spack.fetch_strategy.for_package_version(package, "1.0")
    assert isinstance(fetcher, spack.fetch_strategy.GitFetchStrategy)
    assert hasattr(fetcher, "git_sparse_paths")
    assert fetcher.git_sparse_paths == package_class.git_sparse_paths

View File

@ -138,7 +138,7 @@ def lookup_ref(self, ref) -> Tuple[Optional[str], int]:
# Only clone if we don't have it!
if not os.path.exists(dest):
self.fetcher.clone(dest, bare=True)
self.fetcher.bare_clone(dest)
# Lookup commit info
with working_dir(dest):

View File

@ -0,0 +1,17 @@
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
from spack.package import *
class GitSparsepathsPkg(Package):
    """Mock package that declares a package-level git_sparse_paths attribute"""

    homepage = "http://www.git-fetch-example.com"
    git = "https://a/really.com/big/repo.git"

    version("1.0", tag="v1.0")

    # relative paths requested for sparse checkout; the version above does not
    # override them
    git_sparse_paths = ["foo", "bar", "bing/bang"]