Group primitive url/path handling functions together (#40028)

Massimiliano Culpo, 2023-09-15 15:43:23 +02:00, committed by GitHub
parent bc02453f6d
commit fb9e5fcc4f
21 changed files with 903 additions and 888 deletions

lib/spack/llnl/url.py (new file)

@@ -0,0 +1,459 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""URL primitives that just require Python standard library."""
import itertools
import os.path
import re
from typing import Optional, Set, Tuple
from urllib.parse import urlsplit, urlunsplit
# Archive extensions allowed in Spack
PREFIX_EXTENSIONS = ("tar", "TAR")
EXTENSIONS = ("gz", "bz2", "xz", "Z")
NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz")
# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz
ALLOWED_ARCHIVE_TYPES = (
tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS))
+ PREFIX_EXTENSIONS
+ EXTENSIONS
+ NO_TAR_EXTENSIONS
)
CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
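# Illustrative consequence of the ordering (hypothetical filename, not part of the change):
# because the compound extensions produced above come first, extension_from_path("foo.tar.gz")
# returns "tar.gz" rather than "tar" or "gz".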
def find_list_urls(url: str) -> Set[str]:
r"""Find good list URLs for the supplied URL.
By default, returns the dirname of the archive path.
Provides special treatment for the following websites, which have a
unique list URL different from the dirname of the download URL:
========= =======================================================
GitHub https://github.com/<repo>/<name>/releases
GitLab https://gitlab.\*/<repo>/<name>/tags
BitBucket https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
CRAN https://\*.r-project.org/src/contrib/Archive/<name>
PyPI https://pypi.org/simple/<name>/
LuaRocks https://luarocks.org/modules/<repo>/<name>
========= =======================================================
Note: this function is called by `spack versions`, `spack checksum`,
and `spack create`, but not by `spack fetch` or `spack install`.
Parameters:
url (str): The download URL for the package
Returns:
set: One or more list URLs for the package
"""
url_types = [
# GitHub
# e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
(r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
# GitLab API endpoint
# e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
(
r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
),
# GitLab non-API endpoint
# e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
(r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
# BitBucket
# e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
(r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
# CRAN
# e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
# e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
(
r"(.*\.r-project\.org/src/contrib)/([^_]+)",
lambda m: m.group(1) + "/Archive/" + m.group(2),
),
# PyPI
# e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
(
r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
),
# LuaRocks
# e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
# e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
(
r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
+ r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
lambda m: "https://luarocks.org/modules/"
+ m.group("org")
+ "/"
+ m.group("name")
+ "/",
),
]
list_urls = {os.path.dirname(url)}
for pattern, fun in url_types:
match = re.search(pattern, url)
if match:
list_urls.add(fun(match))
return list_urls
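# Illustrative example (package URL taken from the comment above): for a GitHub download
# link the result contains both the dirname and the releases page, e.g.
#   find_list_urls("https://github.com/llnl/callpath/archive/v1.0.1.tar.gz") ==
#   {"https://github.com/llnl/callpath/archive", "https://github.com/llnl/callpath/releases"}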
def strip_query_and_fragment(url: str) -> Tuple[str, str]:
"""Strips query and fragment from a url, then returns the base url and the suffix.
Args:
url: URL to be stripped
Raises:
ValueError: when there is any error parsing the URL
"""
components = urlsplit(url)
stripped = components[:3] + (None, None)
query, frag = components[3:5]
suffix = ""
if query:
suffix += "?" + query
if frag:
suffix += "#" + frag
return urlunsplit(stripped), suffix
SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$")
def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]:
"""If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise,
returns the input URL and an empty string.
"""
match = SOURCEFORGE_RE.search(url)
if match is not None:
return match.groups()
return url, ""
def has_extension(path_or_url: str, ext: str) -> bool:
"""Returns true if the extension in input is present in path, false otherwise."""
prefix, _ = split_url_on_sourceforge_suffix(path_or_url)
if not ext.startswith(r"\."):
ext = rf"\.{ext}$"
if re.search(ext, prefix):
return True
return False
def extension_from_path(path_or_url: Optional[str]) -> Optional[str]:
"""Tries to match an allowed archive extension to the input. Returns the first match,
or None if no match was found.
Raises:
ValueError: if the input is None
"""
if path_or_url is None:
raise ValueError("Can't call extension() on None")
for t in ALLOWED_ARCHIVE_TYPES:
if has_extension(path_or_url, t):
return t
return None
def remove_extension(path_or_url: str, *, extension: str) -> str:
"""Returns the input with the extension removed"""
suffix = rf"\.{extension}$"
return re.sub(suffix, "", path_or_url)
def check_and_remove_ext(path: str, *, extension: str) -> str:
"""Returns the input path with the extension removed, if the extension is present in path.
Otherwise, returns the input unchanged.
"""
if not has_extension(path, extension):
return path
path, _ = split_url_on_sourceforge_suffix(path)
return remove_extension(path, extension=extension)
def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str:
"""If a path contains the extension in input, returns the path stripped of the extension.
Otherwise, returns the input path.
If extension is None, attempts to strip any allowed extension from path.
"""
if extension is None:
for t in ALLOWED_ARCHIVE_TYPES:
if has_extension(path_or_url, ext=t):
extension = t
break
else:
return path_or_url
return check_and_remove_ext(path_or_url, extension=extension)
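# Illustrative examples (hypothetical filenames): strip_extension("foo.tar.gz") returns
# "foo", while strip_extension("foo.tgz", extension="zip") returns the input unchanged
# because the requested extension is not present.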
def split_url_extension(url: str) -> Tuple[str, ...]:
"""Some URLs have a query string, e.g.:
1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0
In (1), the query string needs to be stripped to get at the
extension, but in (2) & (3), the filename is IN a single final query
argument.
This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
The suffix contains anything that was stripped off the URL to
get at the file extension. In (1), it will be ``'?raw=true'``, but
in (2), it will be empty. In (3) the suffix is a parameter that follows
after the file extension, e.g.:
1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')``
2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)``
3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')``
"""
# Strip off sourceforge download suffix.
# e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
prefix, suffix = split_url_on_sourceforge_suffix(url)
ext = extension_from_path(prefix)
if ext is not None:
prefix = strip_extension(prefix)
return prefix, ext, suffix
try:
prefix, suf = strip_query_and_fragment(prefix)
except ValueError:
# FIXME: tty.debug("Got error parsing path %s" % path)
# Ignore URL parse errors here
return url, ""
ext = extension_from_path(prefix)
prefix = strip_extension(prefix)
suffix = suf + suffix
if ext is None:
ext = ""
return prefix, ext, suffix
def strip_version_suffixes(path_or_url: str) -> str:
"""Some tarballs contain extraneous information after the version:
* ``bowtie2-2.2.5-source``
* ``libevent-2.0.21-stable``
* ``cuda_8.0.44_linux.run``
These strings are not part of the version number and should be ignored.
This function strips those suffixes off and returns the remaining string.
The goal is that the version is always the last thing in ``path``:
* ``bowtie2-2.2.5``
* ``libevent-2.0.21``
* ``cuda_8.0.44``
Args:
path_or_url: The filename or URL for the package
Returns:
The ``path`` with any extraneous suffixes removed
"""
# NOTE: This could be done with complicated regexes in parse_version_offset
# NOTE: The problem is that we would have to add these regexes to the end
# NOTE: of every single version regex. Easier to just strip them off
# NOTE: permanently
suffix_regexes = [
# Download type
r"[Ii]nstall",
r"all",
r"code",
r"[Ss]ources?",
r"file",
r"full",
r"single",
r"with[a-zA-Z_-]+",
r"rock",
r"src(_0)?",
r"public",
r"bin",
r"binary",
r"run",
r"[Uu]niversal",
r"jar",
r"complete",
r"dynamic",
r"oss",
r"gem",
r"tar",
r"sh",
# Download version
r"release",
r"bin",
r"stable",
r"[Ff]inal",
r"rel",
r"orig",
r"dist",
r"\+",
# License
r"gpl",
# Arch
# Needs to come before and after OS, appears in both orders
r"ia32",
r"intel",
r"amd64",
r"linux64",
r"x64",
r"64bit",
r"x86[_-]64",
r"i586_64",
r"x86",
r"i[36]86",
r"ppc64(le)?",
r"armv?(7l|6l|64)",
# Other
r"cpp",
r"gtk",
r"incubating",
# OS
r"[Ll]inux(_64)?",
r"LINUX",
r"[Uu]ni?x",
r"[Ss]un[Oo][Ss]",
r"[Mm]ac[Oo][Ss][Xx]?",
r"[Oo][Ss][Xx]",
r"[Dd]arwin(64)?",
r"[Aa]pple",
r"[Ww]indows",
r"[Ww]in(64|32)?",
r"[Cc]ygwin(64|32)?",
r"[Mm]ingw",
r"centos",
# Arch
# Needs to come before and after OS, appears in both orders
r"ia32",
r"intel",
r"amd64",
r"linux64",
r"x64",
r"64bit",
r"x86[_-]64",
r"i586_64",
r"x86",
r"i[36]86",
r"ppc64(le)?",
r"armv?(7l|6l|64)?",
# PyPI
r"[._-]py[23].*\.whl",
r"[._-]cp[23].*\.whl",
r"[._-]win.*\.exe",
]
for regex in suffix_regexes:
# Remove the suffix from the end of the path
# This may be done multiple times
path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url)
return path_or_url
def expand_contracted_extension(extension: str) -> str:
"""Returns the expanded version of a known contracted extension.
This function maps extensions like ".tgz" to ".tar.gz". On unknown extensions,
    returns the input unmodified.
"""
extension = extension.strip(".")
return CONTRACTION_MAP.get(extension, extension)
def expand_contracted_extension_in_path(
path_or_url: str, *, extension: Optional[str] = None
) -> str:
"""Returns the input path or URL with any contraction extension expanded.
Args:
path_or_url: path or URL to be expanded
extension: if specified, only attempt to expand that extension
"""
extension = extension or extension_from_path(path_or_url)
if extension is None:
return path_or_url
expanded = expand_contracted_extension(extension)
if expanded != extension:
return re.sub(rf"{extension}", rf"{expanded}", path_or_url)
return path_or_url
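# Illustrative example (hypothetical filename): expand_contracted_extension_in_path("foo.tgz")
# returns "foo.tar.gz", while an already expanded input such as "foo.tar.gz" comes back unchanged.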
def compression_ext_from_compressed_archive(extension: str) -> Optional[str]:
"""Returns compression extension for a compressed archive"""
extension = expand_contracted_extension(extension)
for ext in [*EXTENSIONS]:
if ext in extension:
return ext
return None
def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str:
"""Strips the compression extension from the input, and returns it. For instance,
"foo.tgz" becomes "foo.tar".
If no extension is given, try a default list of extensions.
Args:
path_or_url: input to be stripped
ext: if given, extension to be stripped
"""
if not extension_from_path(path_or_url):
return path_or_url
expanded_path = expand_contracted_extension_in_path(path_or_url)
candidates = [ext] if ext is not None else EXTENSIONS
for current_extension in candidates:
modified_path = check_and_remove_ext(expanded_path, extension=current_extension)
if modified_path != expanded_path:
return modified_path
return expanded_path
def allowed_archive(path_or_url: str) -> bool:
"""Returns true if the input is a valid archive, False otherwise."""
return (
False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
)
def determine_url_file_extension(path: str) -> str:
"""This returns the type of archive a URL refers to. This is
sometimes confusing because of URLs like:
(1) https://github.com/petdance/ack/tarball/1.93_02
Where the URL doesn't actually contain the filename. We need
to know what type it is so that we can appropriately name files
in mirrors.
"""
match = re.search(r"github.com/.+/(zip|tar)ball/", path)
if match:
if match.group(1) == "zip":
return "zip"
elif match.group(1) == "tar":
return "tar.gz"
prefix, ext, suffix = split_url_extension(path)
return ext
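Taken together, a minimal usage sketch of the new module (illustrative only; the example URL
and filenames below are made up or taken from the docstrings above, not part of the change):

import llnl.url

prefix, ext, suffix = llnl.url.split_url_extension(
    "https://example.org/downloads/foo-1.2.3.tar.gz?raw=true"
)
# prefix == "https://example.org/downloads/foo-1.2.3"
# ext == "tar.gz", suffix == "?raw=true"

llnl.url.strip_version_suffixes("cuda_8.0.44_linux.run")  # -> "cuda_8.0.44"
llnl.url.determine_url_file_extension(
    "https://github.com/petdance/ack/tarball/1.93_02"
)  # -> "tar.gz"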


@@ -822,7 +822,7 @@ def get_versions(args, name):
if args.url is not None and args.template != "bundle" and valid_url:
# Find available versions
try:
url_dict = spack.util.web.find_versions_of_archive(args.url)
url_dict = spack.url.find_versions_of_archive(args.url)
except UndetectableVersionError:
# Use fake versions
tty.warn("Couldn't detect version in: {0}".format(args.url))


@@ -12,6 +12,7 @@
import spack.fetch_strategy as fs
import spack.repo
import spack.spec
import spack.url
import spack.util.crypto as crypto
from spack.url import (
UndetectableNameError,
@@ -26,7 +27,6 @@
substitution_offsets,
)
from spack.util.naming import simplify_name
from spack.util.web import find_versions_of_archive
description = "debugging tool for url parsing"
section = "developer"
@@ -139,7 +139,7 @@ def url_parse(args):
if args.spider:
print()
tty.msg("Spidering for versions:")
versions = find_versions_of_archive(url)
versions = spack.url.find_versions_of_archive(url)
if not versions:
print(" Found no versions for {0}".format(name))


@@ -31,6 +31,7 @@
import urllib.parse
from typing import List, Optional
import llnl.url
import llnl.util
import llnl.util.filesystem as fs
import llnl.util.tty as tty
@@ -46,7 +47,7 @@
import spack.util.web as web_util
import spack.version
import spack.version.git_ref_lookup
from spack.util.compression import decompressor_for, extension_from_path
from spack.util.compression import decompressor_for
from spack.util.executable import CommandNotFoundError, which
from spack.util.string import comma_and, quote
@@ -441,7 +442,7 @@ def expand(self):
# TODO: replace this by mime check.
if not self.extension:
self.extension = spack.url.determine_url_file_extension(self.url)
self.extension = llnl.url.determine_url_file_extension(self.url)
if self.stage.expanded:
tty.debug("Source already staged to %s" % self.stage.source_path)
@@ -570,7 +571,7 @@ def expand(self):
@_needs_stage
def archive(self, destination, **kwargs):
assert extension_from_path(destination) == "tar.gz"
assert llnl.url.extension_from_path(destination) == "tar.gz"
assert self.stage.source_path.startswith(self.stage.path)
tar = which("tar", required=True)


@@ -1,28 +0,0 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import urllib.parse
import urllib.response
from urllib.error import URLError
from urllib.request import BaseHandler
def gcs_open(req, *args, **kwargs):
"""Open a reader stream to a blob object on GCS"""
import spack.util.gcs as gcs_util
url = urllib.parse.urlparse(req.get_full_url())
gcsblob = gcs_util.GCSBlob(url)
if not gcsblob.exists():
raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
stream = gcsblob.get_blob_byte_stream()
headers = gcsblob.get_blob_headers()
return urllib.response.addinfourl(stream, headers, url)
class GCSHandler(BaseHandler):
def gs_open(self, req):
return gcs_open(req)


@@ -30,7 +30,6 @@
import llnl.util.tty.color as color
from llnl.util.tty.log import log_output
import spack
import spack.cmd
import spack.config
import spack.environment as ev


@@ -20,6 +20,7 @@
import urllib.parse
from typing import Optional, Union
import llnl.url
import llnl.util.tty as tty
from llnl.util.filesystem import mkdirp
@@ -29,7 +30,6 @@
import spack.fetch_strategy as fs
import spack.mirror
import spack.spec
import spack.url as url
import spack.util.path
import spack.util.spack_json as sjson
import spack.util.spack_yaml as syaml
@@ -375,7 +375,7 @@ def _determine_extension(fetcher):
if isinstance(fetcher, fs.URLFetchStrategy):
if fetcher.expand_archive:
# If we fetch with a URLFetchStrategy, use URL's archive type
ext = url.determine_url_file_extension(fetcher.url)
ext = llnl.url.determine_url_file_extension(fetcher.url)
if ext:
# Remove any leading dots


@@ -2377,7 +2377,7 @@ def fetch_remote_versions(self, concurrency=128):
return {}
try:
return spack.util.web.find_versions_of_archive(
return spack.url.find_versions_of_archive(
self.all_urls, self.list_url, self.list_depth, concurrency, reference_package=self
)
except spack.util.web.NoNetworkConnectionError as e:


@@ -11,6 +11,7 @@
import llnl.util.filesystem
import llnl.util.lang
from llnl.url import allowed_archive
import spack
import spack.error
@@ -19,7 +20,6 @@
import spack.repo
import spack.stage
import spack.util.spack_json as sjson
from spack.util.compression import allowed_archive
from spack.util.crypto import Checker, checksum
from spack.util.executable import which, which_string


@@ -1,80 +0,0 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
from io import BufferedReader, BytesIO, IOBase
import spack.util.s3 as s3_util
# NOTE(opadron): Workaround issue in boto where its StreamingBody
# implementation is missing several APIs expected from IOBase. These missing
# APIs prevent the streams returned by boto from being passed as-are along to
# urllib.
#
# https://github.com/boto/botocore/issues/879
# https://github.com/python/cpython/pull/3249
class WrapStream(BufferedReader):
def __init__(self, raw):
# In botocore >=1.23.47, StreamingBody inherits from IOBase, so we
# only add missing attributes in older versions.
# https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784
if not isinstance(raw, IOBase):
raw.readable = lambda: True
raw.writable = lambda: False
raw.seekable = lambda: False
raw.closed = False
raw.flush = lambda: None
super().__init__(raw)
def detach(self):
self.raw = None
def read(self, *args, **kwargs):
return self.raw.read(*args, **kwargs)
def __getattr__(self, key):
return getattr(self.raw, key)
def _s3_open(url, method="GET"):
parsed = urllib.parse.urlparse(url)
s3 = s3_util.get_s3_session(url, method="fetch")
bucket = parsed.netloc
key = parsed.path
if key.startswith("/"):
key = key[1:]
if method not in ("GET", "HEAD"):
raise urllib.error.URLError(
"Only GET and HEAD verbs are currently supported for the s3:// scheme"
)
try:
if method == "GET":
obj = s3.get_object(Bucket=bucket, Key=key)
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
elif method == "HEAD":
obj = s3.head_object(Bucket=bucket, Key=key)
stream = BytesIO()
except s3.ClientError as e:
raise urllib.error.URLError(e) from e
headers = obj["ResponseMetadata"]["HTTPHeaders"]
return url, headers, stream
class UrllibS3Handler(urllib.request.BaseHandler):
def s3_open(self, req):
orig_url = req.get_full_url()
url, headers, stream = _s3_open(orig_url, method=req.get_method())
return urllib.response.addinfourl(stream, headers, url)


@@ -0,0 +1,167 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""Tests for llnl.url functions"""
import itertools
import pytest
import llnl.url
@pytest.fixture(params=llnl.url.ALLOWED_ARCHIVE_TYPES)
def archive_and_expected(request):
archive_name = ".".join(["Foo", request.param])
return archive_name, request.param
def test_get_extension(archive_and_expected):
"""Tests that we can predict correctly known extensions for simple cases."""
archive, expected = archive_and_expected
result = llnl.url.extension_from_path(archive)
assert result == expected
def test_get_bad_extension():
"""Tests that a bad extension returns None"""
result = llnl.url.extension_from_path("Foo.cxx")
assert result is None
@pytest.mark.parametrize(
"url,expected",
[
# No suffix
("rgb-1.0.6", "rgb-1.0.6"),
# Misleading prefix
("jpegsrc.v9b", "jpegsrc.v9b"),
("turbolinux702", "turbolinux702"),
("converge_install_2.3.16", "converge_install_2.3.16"),
# Download type - code, source
("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"),
# Download type - src
("apache-ant-1.9.7-src", "apache-ant-1.9.7"),
("go1.7.4.src", "go1.7.4"),
# Download type - source
("bowtie2-2.2.5-source", "bowtie2-2.2.5"),
("grib_api-1.17.0-Source", "grib_api-1.17.0"),
# Download type - full
("julia-0.4.3-full", "julia-0.4.3"),
# Download type - bin
("apache-maven-3.3.9-bin", "apache-maven-3.3.9"),
# Download type - binary
("Jmol-14.8.0-binary", "Jmol-14.8.0"),
# Download type - gem
("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"),
# Download type - tar
("gromacs-4.6.1-tar", "gromacs-4.6.1"),
# Download type - sh
("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"),
# Download version - release
("v1.0.4-release", "v1.0.4"),
# Download version - stable
("libevent-2.0.21-stable", "libevent-2.0.21"),
# Download version - final
("2.6.7-final", "2.6.7"),
# Download version - rel
("v1.9.5.1rel", "v1.9.5.1"),
# Download version - orig
("dash_0.5.5.1.orig", "dash_0.5.5.1"),
# Download version - plus
("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"),
# License
("cppad-20170114.gpl", "cppad-20170114"),
# Arch
("pcraster-4.1.0_x86-64", "pcraster-4.1.0"),
("dislin-11.0.linux.i586_64", "dislin-11.0"),
("PAGIT.V1.01.64bit", "PAGIT.V1.01"),
# OS - linux
("astyle_2.04_linux", "astyle_2.04"),
# OS - unix
("install-tl-unx", "install-tl"),
# OS - macos
("astyle_1.23_macosx", "astyle_1.23"),
("haxe-2.08-osx", "haxe-2.08"),
# PyPI - wheel
("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"),
(
"numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel."
"macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl",
"numpy-1.12.0",
),
# PyPI - exe
("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"),
# Combinations of multiple patterns - bin, release
("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"),
# Combinations of multiple patterns - all
("p7zip_9.04_src_all", "p7zip_9.04"),
# Combinations of multiple patterns - run
("cuda_8.0.44_linux.run", "cuda_8.0.44"),
# Combinations of multiple patterns - file
("ack-2.14-single-file", "ack-2.14"),
# Combinations of multiple patterns - jar
("antlr-3.4-complete.jar", "antlr-3.4"),
# Combinations of multiple patterns - oss
("tbb44_20160128oss_src_0", "tbb44_20160128"),
# Combinations of multiple patterns - darwin
("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"),
("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"),
# Combinations of multiple patterns - centos
("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"),
# Combinations of multiple patterns - arch
(
"VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install",
"VizGlow_v2.2alpha17-R21November2016",
),
("jdk-8u92-linux-x64", "jdk-8u92"),
("cuda_6.5.14_linux_64.run", "cuda_6.5.14"),
("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"),
("trf407b.linux64", "trf407b"),
# Combinations of multiple patterns - with
("mafft-7.221-with-extensions-src", "mafft-7.221"),
("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"),
("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"),
# Combinations of multiple patterns - rock
("bitlib-23-2.src.rock", "bitlib-23-2"),
# Combinations of multiple patterns - public
("dakota-6.3-public.src", "dakota-6.3"),
# Combinations of multiple patterns - universal
("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"),
# Combinations of multiple patterns - dynamic
("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"),
# Combinations of multiple patterns - other
("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"),
("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"),
("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"),
],
)
def test_url_strip_version_suffixes(url, expected):
stripped = llnl.url.strip_version_suffixes(url)
assert stripped == expected
def test_strip_compression_extension(archive_and_expected):
archive, extension = archive_and_expected
stripped = llnl.url.strip_compression_extension(archive)
if extension == "zip":
assert stripped == "Foo.zip"
stripped = llnl.url.strip_compression_extension(archive, "zip")
assert stripped == "Foo"
elif (
extension.lower() == "tar"
or extension in llnl.url.CONTRACTION_MAP
or extension
in [
".".join(ext)
for ext in itertools.product(llnl.url.PREFIX_EXTENSIONS, llnl.url.EXTENSIONS)
]
):
assert stripped == "Foo.tar" or stripped == "Foo.TAR"
else:
assert stripped == "Foo"
def test_allowed_archive(archive_and_expected):
archive, _ = archive_and_expected
assert llnl.url.allowed_archive(archive)


@@ -17,124 +17,11 @@
parse_name_offset,
parse_version_offset,
strip_name_suffixes,
strip_version_suffixes,
substitute_version,
)
from spack.version import Version
@pytest.mark.parametrize(
"url,expected",
[
# No suffix
("rgb-1.0.6", "rgb-1.0.6"),
# Misleading prefix
("jpegsrc.v9b", "jpegsrc.v9b"),
("turbolinux702", "turbolinux702"),
("converge_install_2.3.16", "converge_install_2.3.16"),
# Download type - code, source
("cistem-1.0.0-beta-source-code", "cistem-1.0.0-beta"),
# Download type - src
("apache-ant-1.9.7-src", "apache-ant-1.9.7"),
("go1.7.4.src", "go1.7.4"),
# Download type - source
("bowtie2-2.2.5-source", "bowtie2-2.2.5"),
("grib_api-1.17.0-Source", "grib_api-1.17.0"),
# Download type - full
("julia-0.4.3-full", "julia-0.4.3"),
# Download type - bin
("apache-maven-3.3.9-bin", "apache-maven-3.3.9"),
# Download type - binary
("Jmol-14.8.0-binary", "Jmol-14.8.0"),
# Download type - gem
("rubysl-date-2.0.9.gem", "rubysl-date-2.0.9"),
# Download type - tar
("gromacs-4.6.1-tar", "gromacs-4.6.1"),
# Download type - sh
("Miniconda2-4.3.11-Linux-x86_64.sh", "Miniconda2-4.3.11"),
# Download version - release
("v1.0.4-release", "v1.0.4"),
# Download version - stable
("libevent-2.0.21-stable", "libevent-2.0.21"),
# Download version - final
("2.6.7-final", "2.6.7"),
# Download version - rel
("v1.9.5.1rel", "v1.9.5.1"),
# Download version - orig
("dash_0.5.5.1.orig", "dash_0.5.5.1"),
# Download version - plus
("ncbi-blast-2.6.0+-src", "ncbi-blast-2.6.0"),
# License
("cppad-20170114.gpl", "cppad-20170114"),
# Arch
("pcraster-4.1.0_x86-64", "pcraster-4.1.0"),
("dislin-11.0.linux.i586_64", "dislin-11.0"),
("PAGIT.V1.01.64bit", "PAGIT.V1.01"),
# OS - linux
("astyle_2.04_linux", "astyle_2.04"),
# OS - unix
("install-tl-unx", "install-tl"),
# OS - macos
("astyle_1.23_macosx", "astyle_1.23"),
("haxe-2.08-osx", "haxe-2.08"),
# PyPI - wheel
("entrypoints-0.2.2-py2.py3-none-any.whl", "entrypoints-0.2.2"),
(
"numpy-1.12.0-cp27-cp27m-macosx_10_6_intel.macosx_10_9_intel."
"macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl",
"numpy-1.12.0",
),
# PyPI - exe
("PyYAML-3.12.win-amd64-py3.5.exe", "PyYAML-3.12"),
# Combinations of multiple patterns - bin, release
("rocketmq-all-4.5.2-bin-release", "rocketmq-all-4.5.2"),
# Combinations of multiple patterns - all
("p7zip_9.04_src_all", "p7zip_9.04"),
# Combinations of multiple patterns - run
("cuda_8.0.44_linux.run", "cuda_8.0.44"),
# Combinations of multiple patterns - file
("ack-2.14-single-file", "ack-2.14"),
# Combinations of multiple patterns - jar
("antlr-3.4-complete.jar", "antlr-3.4"),
# Combinations of multiple patterns - oss
("tbb44_20160128oss_src_0", "tbb44_20160128"),
# Combinations of multiple patterns - darwin
("ghc-7.0.4-x86_64-apple-darwin", "ghc-7.0.4"),
("ghc-7.0.4-i386-apple-darwin", "ghc-7.0.4"),
# Combinations of multiple patterns - centos
("sratoolkit.2.8.2-1-centos_linux64", "sratoolkit.2.8.2-1"),
# Combinations of multiple patterns - arch
(
"VizGlow_v2.2alpha17-R21November2016-Linux-x86_64-Install",
"VizGlow_v2.2alpha17-R21November2016",
),
("jdk-8u92-linux-x64", "jdk-8u92"),
("cuda_6.5.14_linux_64.run", "cuda_6.5.14"),
("Mathematica_12.0.0_LINUX.sh", "Mathematica_12.0.0"),
("trf407b.linux64", "trf407b"),
# Combinations of multiple patterns - with
("mafft-7.221-with-extensions-src", "mafft-7.221"),
("spark-2.0.0-bin-without-hadoop", "spark-2.0.0"),
("conduit-v0.3.0-src-with-blt", "conduit-v0.3.0"),
# Combinations of multiple patterns - rock
("bitlib-23-2.src.rock", "bitlib-23-2"),
# Combinations of multiple patterns - public
("dakota-6.3-public.src", "dakota-6.3"),
# Combinations of multiple patterns - universal
("synergy-1.3.6p2-MacOSX-Universal", "synergy-1.3.6p2"),
# Combinations of multiple patterns - dynamic
("snptest_v2.5.2_linux_x86_64_dynamic", "snptest_v2.5.2"),
# Combinations of multiple patterns - other
("alglib-3.11.0.cpp.gpl", "alglib-3.11.0"),
("hpcviewer-2019.08-linux.gtk.x86_64", "hpcviewer-2019.08"),
("apache-mxnet-src-1.3.0-incubating", "apache-mxnet-src-1.3.0"),
],
)
def test_url_strip_version_suffixes(url, expected):
stripped = strip_version_suffixes(url)
assert stripped == expected
@pytest.mark.parametrize(
"url,version,expected",
[


@@ -10,6 +10,7 @@
import pytest
import llnl.url
from llnl.util.filesystem import working_dir
from spack.paths import spack_root
@@ -21,7 +22,7 @@
ext_archive = {}
[
ext_archive.update({ext: ".".join(["Foo", ext])})
for ext in scomp.ALLOWED_ARCHIVE_TYPES
for ext in llnl.url.ALLOWED_ARCHIVE_TYPES
if "TAR" not in ext
]
# Spack does not use Python native handling for tarballs or zip
@@ -95,38 +96,3 @@ def test_unallowed_extension():
bad_ext_archive = "Foo.cxx"
with pytest.raises(CommandNotFoundError):
scomp.decompressor_for(bad_ext_archive)
@pytest.mark.parametrize("archive", ext_archive.values())
def test_get_extension(archive):
ext = scomp.extension_from_path(archive)
assert ext_archive[ext] == archive
def test_get_bad_extension():
archive = "Foo.cxx"
ext = scomp.extension_from_path(archive)
assert ext is None
@pytest.mark.parametrize("path", ext_archive.values())
def test_allowed_archive(path):
assert scomp.allowed_archive(path)
@pytest.mark.parametrize("ext_path", ext_archive.items())
def test_strip_compression_extension(ext_path):
ext, path = ext_path
stripped = scomp.strip_compression_extension(path)
if ext == "zip":
assert stripped == "Foo.zip"
stripped = scomp.strip_compression_extension(path, "zip")
assert stripped == "Foo"
elif (
ext == "tar"
or ext in scomp.CONTRACTION_MAP.keys()
or ext in [".".join(ext) for ext in product(scomp.PRE_EXTS, scomp.EXTS)]
):
assert stripped == "Foo.tar" or stripped == "Foo.TAR"
else:
assert stripped == "Foo"


@@ -15,6 +15,7 @@
import spack.config
import spack.mirror
import spack.paths
import spack.url
import spack.util.path
import spack.util.s3
import spack.util.url as url_util
@@ -102,31 +103,31 @@ def test_spider_no_response(monkeypatch):
def test_find_versions_of_archive_0():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=0)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=0)
assert Version("0.0.0") in versions
def test_find_versions_of_archive_1():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=1)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=1)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
def test_find_versions_of_archive_2():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
assert Version("2.0.0") in versions
def test_find_exotic_versions_of_archive_2():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=2)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=2)
# up for grabs to make this better.
assert Version("2.0.0b2") in versions
def test_find_versions_of_archive_3():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3)
assert Version("0.0.0") in versions
assert Version("1.0.0") in versions
assert Version("2.0.0") in versions
@@ -135,16 +136,14 @@ def test_find_versions_of_archive_3():
def test_find_exotic_versions_of_archive_3():
versions = spack.util.web.find_versions_of_archive(root_tarball, root, list_depth=3)
versions = spack.url.find_versions_of_archive(root_tarball, root, list_depth=3)
assert Version("2.0.0b2") in versions
assert Version("3.0a1") in versions
assert Version("4.5-rc5") in versions
def test_find_versions_of_archive_with_fragment():
versions = spack.util.web.find_versions_of_archive(
root_tarball, root_with_fragment, list_depth=0
)
versions = spack.url.find_versions_of_archive(root_tarball, root_with_fragment, list_depth=0)
assert Version("5.0.0") in versions
@@ -311,7 +310,7 @@ def test_remove_s3_url(monkeypatch, capfd):
def get_s3_session(url, method="fetch"):
return MockS3Client()
monkeypatch.setattr(spack.util.s3, "get_s3_session", get_s3_session)
monkeypatch.setattr(spack.util.web, "get_s3_session", get_s3_session)
current_debug_level = tty.debug_level()
tty.set_debug(1)


@@ -27,246 +27,22 @@
"""
import io
import os
import pathlib
import re
from urllib.parse import urlsplit, urlunsplit
import llnl.util.tty as tty
import llnl.url
from llnl.util.tty.color import cescape, colorize
import spack.error
import spack.util.compression as comp
import spack.util.path as spath
import spack.util.web
import spack.version
from spack.util.path import convert_to_posix_path
#
# Note: We call the input to most of these functions a "path" but the functions
# work on paths and URLs. There's not a good word for both of these, but
# "path" seemed like the most generic term.
#
def find_list_urls(url):
r"""Find good list URLs for the supplied URL.
By default, returns the dirname of the archive path.
Provides special treatment for the following websites, which have a
unique list URL different from the dirname of the download URL:
========= =======================================================
GitHub https://github.com/<repo>/<name>/releases
GitLab https://gitlab.\*/<repo>/<name>/tags
BitBucket https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
CRAN https://\*.r-project.org/src/contrib/Archive/<name>
PyPI https://pypi.org/simple/<name>/
LuaRocks https://luarocks.org/modules/<repo>/<name>
========= =======================================================
Note: this function is called by `spack versions`, `spack checksum`,
and `spack create`, but not by `spack fetch` or `spack install`.
Parameters:
url (str): The download URL for the package
Returns:
set: One or more list URLs for the package
"""
url_types = [
# GitHub
# e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
(r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
# GitLab API endpoint
# e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
(
r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
),
# GitLab non-API endpoint
# e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
(r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
# BitBucket
# e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
(r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
# CRAN
# e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
# e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
(
r"(.*\.r-project\.org/src/contrib)/([^_]+)",
lambda m: m.group(1) + "/Archive/" + m.group(2),
),
# PyPI
# e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
# e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
(
r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
),
# LuaRocks
# e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
# e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
(
r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
+ r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
lambda m: "https://luarocks.org/modules/"
+ m.group("org")
+ "/"
+ m.group("name")
+ "/",
),
]
list_urls = set([os.path.dirname(url)])
for pattern, fun in url_types:
match = re.search(pattern, url)
if match:
list_urls.add(fun(match))
return list_urls
def strip_query_and_fragment(path):
try:
components = urlsplit(path)
stripped = components[:3] + (None, None)
query, frag = components[3:5]
suffix = ""
if query:
suffix += "?" + query
if frag:
suffix += "#" + frag
return (urlunsplit(stripped), suffix)
except ValueError:
tty.debug("Got error parsing path %s" % path)
return (path, "") # Ignore URL parse errors here
def strip_version_suffixes(path):
"""Some tarballs contain extraneous information after the version:
* ``bowtie2-2.2.5-source``
* ``libevent-2.0.21-stable``
* ``cuda_8.0.44_linux.run``
These strings are not part of the version number and should be ignored.
This function strips those suffixes off and returns the remaining string.
The goal is that the version is always the last thing in ``path``:
* ``bowtie2-2.2.5``
* ``libevent-2.0.21``
* ``cuda_8.0.44``
Args:
path (str): The filename or URL for the package
Returns:
str: The ``path`` with any extraneous suffixes removed
"""
# NOTE: This could be done with complicated regexes in parse_version_offset
# NOTE: The problem is that we would have to add these regexes to the end
# NOTE: of every single version regex. Easier to just strip them off
# NOTE: permanently
suffix_regexes = [
# Download type
r"[Ii]nstall",
r"all",
r"code",
r"[Ss]ources?",
r"file",
r"full",
r"single",
r"with[a-zA-Z_-]+",
r"rock",
r"src(_0)?",
r"public",
r"bin",
r"binary",
r"run",
r"[Uu]niversal",
r"jar",
r"complete",
r"dynamic",
r"oss",
r"gem",
r"tar",
r"sh",
# Download version
r"release",
r"bin",
r"stable",
r"[Ff]inal",
r"rel",
r"orig",
r"dist",
r"\+",
# License
r"gpl",
# Arch
# Needs to come before and after OS, appears in both orders
r"ia32",
r"intel",
r"amd64",
r"linux64",
r"x64",
r"64bit",
r"x86[_-]64",
r"i586_64",
r"x86",
r"i[36]86",
r"ppc64(le)?",
r"armv?(7l|6l|64)",
# Other
r"cpp",
r"gtk",
r"incubating",
# OS
r"[Ll]inux(_64)?",
r"LINUX",
r"[Uu]ni?x",
r"[Ss]un[Oo][Ss]",
r"[Mm]ac[Oo][Ss][Xx]?",
r"[Oo][Ss][Xx]",
r"[Dd]arwin(64)?",
r"[Aa]pple",
r"[Ww]indows",
r"[Ww]in(64|32)?",
r"[Cc]ygwin(64|32)?",
r"[Mm]ingw",
r"centos",
# Arch
# Needs to come before and after OS, appears in both orders
r"ia32",
r"intel",
r"amd64",
r"linux64",
r"x64",
r"64bit",
r"x86[_-]64",
r"i586_64",
r"x86",
r"i[36]86",
r"ppc64(le)?",
r"armv?(7l|6l|64)?",
# PyPI
r"[._-]py[23].*\.whl",
r"[._-]cp[23].*\.whl",
r"[._-]win.*\.exe",
]
for regex in suffix_regexes:
# Remove the suffix from the end of the path
# This may be done multiple times
path = re.sub(r"[._-]?" + regex + "$", "", path)
return path
def strip_name_suffixes(path, version):
@@ -341,69 +117,6 @@ def strip_name_suffixes(path, version):
return path
def split_url_extension(path):
"""Some URLs have a query string, e.g.:
1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0
In (1), the query string needs to be stripped to get at the
extension, but in (2) & (3), the filename is IN a single final query
argument.
This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
The suffix contains anything that was stripped off the URL to
get at the file extension. In (1), it will be ``'?raw=true'``, but
in (2), it will be empty. In (3) the suffix is a parameter that follows
after the file extension, e.g.:
1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', '.tgz', '?raw=true')``
2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', '.tar.gz', None)``
3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', '.tar.bz2', '?ref=v7.0.0')``
"""
prefix, ext, suffix = path, "", ""
# Strip off sourceforge download suffix.
# e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
prefix, suffix = spath.find_sourceforge_suffix(path)
ext = comp.extension_from_path(prefix)
if ext is not None:
prefix = comp.strip_extension(prefix)
else:
prefix, suf = strip_query_and_fragment(prefix)
ext = comp.extension_from_path(prefix)
prefix = comp.strip_extension(prefix)
suffix = suf + suffix
if ext is None:
ext = ""
return prefix, ext, suffix
def determine_url_file_extension(path):
"""This returns the type of archive a URL refers to. This is
sometimes confusing because of URLs like:
(1) https://github.com/petdance/ack/tarball/1.93_02
Where the URL doesn't actually contain the filename. We need
to know what type it is so that we can appropriately name files
in mirrors.
"""
match = re.search(r"github.com/.+/(zip|tar)ball/", path)
if match:
if match.group(1) == "zip":
return "zip"
elif match.group(1) == "tar":
return "tar.gz"
prefix, ext, suffix = split_url_extension(path)
return ext
def parse_version_offset(path):
"""Try to extract a version string from a filename or URL.
@@ -426,13 +139,13 @@ def parse_version_offset(path):
# path: The prefix of the URL, everything before the ext and suffix
# ext: The file extension
# suffix: Any kind of query string that begins with a '?'
path, ext, suffix = split_url_extension(path)
path, ext, suffix = llnl.url.split_url_extension(path)
# stem: Everything from path after the final '/'
original_stem = os.path.basename(path)
# Try to strip off anything after the version number
stem = strip_version_suffixes(original_stem)
stem = llnl.url.strip_version_suffixes(original_stem)
# Assumptions:
#
@@ -620,7 +333,7 @@ def parse_name_offset(path, v=None):
# path: The prefix of the URL, everything before the ext and suffix
# ext: The file extension
# suffix: Any kind of query string that begins with a '?'
path, ext, suffix = split_url_extension(path)
path, ext, suffix = llnl.url.split_url_extension(path)
# stem: Everything from path after the final '/'
original_stem = os.path.basename(path)
@@ -735,28 +448,6 @@ def parse_name_and_version(path):
return (name, ver)
def insensitize(string):
"""Change upper and lowercase letters to be case insensitive in
the provided string. e.g., 'a' becomes '[Aa]', 'B' becomes
'[bB]', etc. Use for building regexes."""
def to_ins(match):
char = match.group(1)
return "[%s%s]" % (char.lower(), char.upper())
return re.sub(r"([a-zA-Z])", to_ins, string)
def cumsum(elts, init=0, fn=lambda x: x):
"""Return cumulative sum of result of fn on each element in elts."""
sums = []
s = init
for i, e in enumerate(elts):
sums.append(s)
s += fn(e)
return sums
def find_all(substring, string):
"""Returns a list containing the indices of
every occurrence of substring in string."""
@@ -912,6 +603,122 @@ def color_url(path, **kwargs):
return colorize(out.getvalue())
def find_versions_of_archive(
archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
):
"""Scrape web pages for new versions of a tarball. This function prefers URLs in the
following order: links found on the scraped page that match a url generated by the
reference package, found and in the archive_urls list, found and derived from those
in the archive_urls list, and if none are found for a version then the item in the
archive_urls list is included for the version.
Args:
archive_urls (str or list or tuple): URL or sequence of URLs for
different versions of a package. Typically these are just the
tarballs from the package file itself. By default, this searches
the parent directories of archives.
list_url (str or None): URL for a listing of archives.
Spack will scrape these pages for download links that look
like the archive URL.
list_depth (int): max depth to follow links on list_url pages.
Defaults to 0.
concurrency (int): maximum number of concurrent requests
reference_package (spack.package_base.PackageBase or None): a spack package
used as a reference for url detection. Uses the url_for_version
method on the package to produce reference urls which, if found,
are preferred.
"""
if not isinstance(archive_urls, (list, tuple)):
archive_urls = [archive_urls]
# Generate a list of list_urls based on archive urls and any
# explicitly listed list_url in the package
list_urls = set()
if list_url is not None:
list_urls.add(list_url)
for aurl in archive_urls:
list_urls |= llnl.url.find_list_urls(aurl)
# Add '/' to the end of the URL. Some web servers require this.
additional_list_urls = set()
for lurl in list_urls:
if not lurl.endswith("/"):
additional_list_urls.add(lurl + "/")
list_urls |= additional_list_urls
# Grab some web pages to scrape.
pages, links = spack.util.web.spider(list_urls, depth=list_depth, concurrency=concurrency)
# Scrape them for archive URLs
regexes = []
for aurl in archive_urls:
# This creates a regex from the URL with a capture group for
# the version part of the URL. The capture group is converted
# to a generic wildcard, so we can use this to extract things
# on a page that look like archive URLs.
url_regex = wildcard_version(aurl)
# We'll be a bit more liberal and just look for the archive
# part, not the full path.
# this is a URL so it is a posixpath even on Windows
url_regex = pathlib.PurePosixPath(url_regex).name
# We need to add a / to the beginning of the regex to prevent
# Spack from picking up similarly named packages like:
# https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz
# https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz
# https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz
# https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz
url_regex = "/" + url_regex
# We need to add a $ anchor to the end of the regex to prevent
# Spack from picking up signature files like:
# .asc
# .md5
# .sha256
# .sig
# However, SourceForge downloads still need to end in '/download'.
url_regex += r"(\/download)?"
# PyPI adds #sha256=... to the end of the URL
url_regex += "(#sha256=.*)?"
url_regex += "$"
regexes.append(url_regex)
regexes = [re.compile(r) for r in regexes]
# Build a dict version -> URL from any links that match the wildcards.
# Walk through archive_url links first.
# Any conflicting versions will be overwritten by the list_url links.
versions = {}
matched = set()
for url in sorted(links):
url = convert_to_posix_path(url)
if any(r.search(url) for r in regexes):
try:
ver = parse_version(url)
if ver in matched:
continue
versions[ver] = url
# prevent this version from getting overwritten
if reference_package is not None:
if url == reference_package.url_for_version(ver):
matched.add(ver)
else:
extrapolated_urls = [substitute_version(u, ver) for u in archive_urls]
if url in extrapolated_urls:
matched.add(ver)
except UndetectableVersionError:
continue
for url in archive_urls:
url = convert_to_posix_path(url)
ver = parse_version(url)
if ver not in versions:
versions[ver] = url
return versions
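# Illustrative call (hypothetical URL, not part of the change):
#   find_versions_of_archive("https://example.org/releases/foo-1.2.3.tar.gz", list_depth=1)
# returns a dict mapping each Version detected on the scraped pages to the URL it was found at.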
class UrlParseError(spack.error.SpackError):
"""Raised when the URL module can't parse something correctly."""


@@ -9,27 +9,13 @@
import re
import shutil
import sys
from itertools import product
import llnl.url
from llnl.util import tty
import spack.util.path as spath
from spack.error import SpackError
from spack.util.executable import CommandNotFoundError, which
# Supported archive extensions.
PRE_EXTS = ["tar", "TAR"]
EXTS = ["gz", "bz2", "xz", "Z"]
NOTAR_EXTS = ["zip", "tgz", "tbz2", "tbz", "txz"]
CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
# Add PRE_EXTS and EXTS last so that .tar.gz is matched *before* .tar or .gz
ALLOWED_ARCHIVE_TYPES = (
[".".join(ext) for ext in product(PRE_EXTS, EXTS)] + PRE_EXTS + EXTS + NOTAR_EXTS
)
ALLOWED_SINGLE_EXT_ARCHIVE_TYPES = PRE_EXTS + EXTS + NOTAR_EXTS
try:
import bz2 # noqa
@@ -66,10 +52,6 @@ def is_bz2_supported():
return _bz2_support
def allowed_archive(path):
return False if not path else any(path.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
def _system_untar(archive_file, remove_archive_file=False):
"""Returns path to unarchived tar file.
Untars archive via system tar.
@@ -78,7 +60,7 @@ def _system_untar(archive_file, remove_archive_file=False):
archive_file (str): absolute path to the archive to be extracted.
Can be one of .tar(.[gz|bz2|xz|Z]) or .(tgz|tbz|tbz2|txz).
"""
archive_file_no_ext = strip_extension(archive_file)
archive_file_no_ext = llnl.url.strip_extension(archive_file)
outfile = os.path.basename(archive_file_no_ext)
if archive_file_no_ext == archive_file:
# the archive file has no extension. Tar on windows cannot untar onto itself
@@ -114,7 +96,7 @@ def _bunzip2(archive_file):
def _py_bunzip(archive_file):
"""Returns path to decompressed file.
Decompresses bz2 compressed archives/files via python's bz2 module"""
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
working_dir = os.getcwd()
archive_out = os.path.join(working_dir, decompressed_file)
f_bz = bz2.BZ2File(archive_file, mode="rb")
@@ -128,7 +110,7 @@ def _system_bunzip(archive_file):
"""Returns path to decompressed file.
Decompresses bz2 compressed archives/files via system bzip2 utility"""
compressed_file_name = os.path.basename(archive_file)
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "bz2"))
decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "bz2"))
working_dir = os.getcwd()
archive_out = os.path.join(working_dir, decompressed_file)
copy_path = os.path.join(working_dir, compressed_file_name)
@@ -158,7 +140,7 @@ def _gunzip(archive_file):
def _py_gunzip(archive_file):
"""Returns path to gunzip'd file
Decompresses `.gz` compressed archvies via python gzip module"""
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "gz"))
decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "gz"))
working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file)
f_in = gzip.open(archive_file, "rb")
@@ -171,7 +153,7 @@ def _py_gunzip(archive_file):
def _system_gunzip(archive_file):
"""Returns path to gunzip'd file
Decompresses `.gz` compressed files via system gzip"""
archive_file_no_ext = strip_compression_extension(archive_file)
archive_file_no_ext = llnl.url.strip_compression_extension(archive_file)
if archive_file_no_ext == archive_file:
# the zip file has no extension. On Unix gunzip cannot unzip onto itself
archive_file = archive_file + ".gz"
@@ -196,7 +178,7 @@ def _unzip(archive_file):
Args:
archive_file (str): absolute path of the file to be decompressed
"""
extracted_file = os.path.basename(strip_extension(archive_file, "zip"))
extracted_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="zip"))
if sys.platform == "win32":
return _system_untar(archive_file)
else:
@@ -259,7 +241,7 @@ def unarchive(archive_file):
def _py_lzma(archive_file):
"""Returns path to decompressed .xz files
Decompress lzma compressed .xz files via python lzma module"""
decompressed_file = os.path.basename(strip_compression_extension(archive_file, "xz"))
decompressed_file = os.path.basename(llnl.url.strip_compression_extension(archive_file, "xz"))
archive_out = os.path.join(os.getcwd(), decompressed_file)
with open(archive_out, "wb") as ar:
with lzma.open(archive_file) as lar:
@@ -272,7 +254,7 @@ def _xz(archive_file):
Decompress lzma compressed .xz files via xz command line
tool.
"""
decompressed_file = os.path.basename(strip_extension(archive_file, "xz"))
decompressed_file = os.path.basename(llnl.url.strip_extension(archive_file, extension="xz"))
working_dir = os.getcwd()
destination_abspath = os.path.join(working_dir, decompressed_file)
compressed_file = os.path.basename(archive_file)
@@ -297,13 +279,13 @@ def _system_7zip(archive_file):
Args:
archive_file (str): absolute path of file to be unarchived
"""
outfile = os.path.basename(strip_compression_extension(archive_file))
outfile = os.path.basename(llnl.url.strip_compression_extension(archive_file))
_7z = which("7z")
if not _7z:
raise CommandNotFoundError(
"7z unavailable,\
unable to extract %s files. 7z can be installed via Spack"
% extension_from_path(archive_file)
% llnl.url.extension_from_path(archive_file)
)
_7z.add_default_arg("e")
_7z(archive_file)
@@ -318,7 +300,7 @@ def decompressor_for(path, extension=None):
if not extension:
extension = extension_from_file(path, decompress=True)
if not allowed_archive(extension):
if not llnl.url.allowed_archive(extension):
raise CommandNotFoundError(
"Cannot extract archive, \
unrecognized file extension: '%s'"
@@ -394,7 +376,7 @@ def decompressor_for_win(extension):
path (str): path of the archive file requiring decompression
extension (str): extension
"""
extension = expand_contracted_extension(extension)
extension = llnl.url.expand_contracted_extension(extension)
# Windows native tar can handle .zip extensions, use standard
# unzip method
if re.match(r"zip$", extension):
@@ -415,7 +397,7 @@ def decompressor_for_win(extension):
# python based decompression strategy
# Expand extension from contracted extension i.e. tar.gz from .tgz
# no-op on non contracted extensions
compression_extension = compression_ext_from_compressed_archive(extension)
compression_extension = llnl.url.compression_ext_from_compressed_archive(extension)
decompressor = _determine_py_decomp_archive_strategy(compression_extension)
if not decompressor:
raise SpackError(
@@ -657,7 +639,7 @@ def extension_from_stream(stream, decompress=False):
"Cannot derive file extension from magic number;"
" falling back to regex path parsing."
)
return extension_from_path(stream.name)
return llnl.url.extension_from_path(stream.name)
resultant_ext = suffix_ext if not prefix_ext else ".".join([prefix_ext, suffix_ext])
tty.debug("File extension %s successfully derived by magic number." % resultant_ext)
return resultant_ext
@@ -693,114 +675,11 @@ def extension_from_file(file, decompress=False):
if ext and ext.startswith("tar."):
suf = ext.split(".")[1]
abbr = "t" + suf
if check_extension(file, abbr):
if llnl.url.has_extension(file, abbr):
return abbr
if not ext:
# If unable to parse extension from stream,
# attempt to fall back to string parsing
ext = extension_from_path(file)
ext = llnl.url.extension_from_path(file)
return ext
return None
def extension_from_path(path):
"""Returns the allowed archive extension for a path.
If path does not include a valid archive extension
(see`spack.util.compression.ALLOWED_ARCHIVE_TYPES`) return None
"""
if path is None:
raise ValueError("Can't call extension() on None")
for t in ALLOWED_ARCHIVE_TYPES:
if check_extension(path, t):
return t
return None
def strip_compression_extension(path, ext=None):
"""Returns path with last supported (can be combined with tar) or
provided archive extension stripped"""
path_ext = extension_from_path(path)
if path_ext:
path = expand_contracted_extension_in_path(path)
exts_to_check = EXTS
if ext:
exts_to_check = [ext]
for ext_check in exts_to_check:
mod_path = check_and_remove_ext(path, ext_check)
if mod_path != path:
return mod_path
return path
def strip_extension(path, ext=None):
"""Returns the part of a path that does not include extension.
If ext is given, only attempts to remove that extension. If no
extension given, attempts to strip any valid extension from path"""
if ext:
return check_and_remove_ext(path, ext)
for t in ALLOWED_ARCHIVE_TYPES:
mod_path = check_and_remove_ext(path, t)
if mod_path != path:
return mod_path
return path
def check_extension(path, ext):
"""Returns true if extension is present in path
false otherwise"""
# Strip sourceforge suffix.
prefix, _ = spath.find_sourceforge_suffix(path)
if not ext.startswith(r"\."):
ext = r"\.%s$" % ext
if re.search(ext, prefix):
return True
return False
def reg_remove_ext(path, ext):
"""Returns path with ext remove via regex"""
if path and ext:
suffix = r"\.%s$" % ext
return re.sub(suffix, "", path)
return path
def check_and_remove_ext(path, ext):
"""Returns path with extension removed if extension
is present in path. Otherwise just returns path"""
if check_extension(path, ext):
return reg_remove_ext(path, ext)
return path
def _substitute_extension(path, old_ext, new_ext):
"""Returns path with old_ext replaced with new_ext.
old_ext and new_ext can be extension strings or regexes"""
return re.sub(rf"{old_ext}", rf"{new_ext}", path)
def expand_contracted_extension_in_path(path, ext=None):
"""Returns path with any contraction extension (i.e. tgz) expanded
(i.e. tar.gz). If ext is specified, only attempt to expand that extension"""
if not ext:
ext = extension_from_path(path)
expanded_ext = expand_contracted_extension(ext)
if expanded_ext != ext:
return _substitute_extension(path, ext, expanded_ext)
return path
def expand_contracted_extension(extension):
"""Return expanded version of contracted extension
i.e. .tgz -> .tar.gz, no op on non contracted extensions"""
extension = extension.strip(".")
return CONTRACTION_MAP.get(extension, extension)
def compression_ext_from_compressed_archive(extension):
"""Returns compression extension for a compressed archive"""
extension = expand_contracted_extension(extension)
for ext in [*EXTS]:
if ext in extension:
return ext
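For orientation, here is a minimal doctest-style sketch of how these helpers compose once they live in llnl.url (the file added by this commit). The filenames are placeholders, and the expected values are inferred from the implementations above rather than from separate tests:

import llnl.url

# Contracted extensions expand to their tar.* equivalents.
assert llnl.url.expand_contracted_extension("tgz") == "tar.gz"
# The full archive extension is matched before its parts.
assert llnl.url.extension_from_path("foo-1.0.tar.gz") == "tar.gz"
# Stripping the compression extension leaves the .tar suffix in place...
assert llnl.url.strip_compression_extension("foo-1.0.tar.gz") == "foo-1.0.tar"
# ...while strip_extension removes the whole archive extension.
assert llnl.url.strip_extension("foo-1.0.tar.gz") == "foo-1.0"
# Only the compression part of a (possibly contracted) extension is reported.
assert llnl.url.compression_ext_from_compressed_archive("tgz") == "gz"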


@@ -10,6 +10,10 @@
import os
import sys
import urllib.parse
import urllib.response
from urllib.error import URLError
from urllib.request import BaseHandler
import llnl.util.tty as tty
@@ -222,3 +226,21 @@ def get_blob_headers(self):
}
return headers
def gcs_open(req, *args, **kwargs):
"""Open a reader stream to a blob object on GCS"""
url = urllib.parse.urlparse(req.get_full_url())
gcsblob = GCSBlob(url)
if not gcsblob.exists():
raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
stream = gcsblob.get_blob_byte_stream()
headers = gcsblob.get_blob_headers()
return urllib.response.addinfourl(stream, headers, url)
class GCSHandler(BaseHandler):
def gs_open(self, req):
return gcs_open(req)
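GCSHandler plugs into urllib's scheme dispatch: build_opener calls the gs_open method for any gs:// request. A rough usage sketch, assuming the handler's post-move home is spack.util.gcs (matching the relative import in web.py further below) and using a placeholder bucket; real GCS credentials are needed to run it:

import urllib.request

from spack.util.gcs import GCSHandler  # assumed post-move location

opener = urllib.request.build_opener(GCSHandler())
# urllib routes the gs:// scheme to GCSHandler.gs_open, which returns an
# addinfourl wrapping the blob's byte stream and headers.
response = opener.open("gs://my-bucket/path/to/object")  # placeholder URL
data = response.read()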


@@ -109,15 +109,6 @@ def win_exe_ext():
return ".exe"
def find_sourceforge_suffix(path):
"""find and match sourceforge filepath components
Return match object"""
match = re.search(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$", path)
if match:
return match.groups()
return path, ""
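A small sketch of its return values; the URLs are illustrative, and the helper is assumed to move to llnl.url along with the other primitives:

from llnl.url import find_sourceforge_suffix  # assumed post-move location

# SourceForge download links keep a trailing "/download" that the regex splits off.
prefix, suffix = find_sourceforge_suffix(
    "https://downloads.sourceforge.net/project/foo/foo-1.0.tar.gz/download"
)
assert suffix == "/download"

# Anything else comes back unchanged, with an empty suffix.
assert find_sourceforge_suffix("https://example.com/foo-1.0.tar.gz") == (
    "https://example.com/foo-1.0.tar.gz",
    "",
)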
def path_to_os_path(*pths):
"""
Takes an arbitrary number of positional parameters


@@ -3,10 +3,13 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import os
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
from io import BufferedReader, BytesIO, IOBase
from typing import Any, Dict, Tuple
import spack
import spack.config
#: Map (mirror name, method) tuples to s3 client instances.
@@ -114,4 +117,72 @@ def get_mirror_s3_connection_info(mirror, method):
if endpoint_url:
s3_client_args["endpoint_url"] = _parse_s3_endpoint_url(endpoint_url)
return (s3_connection, s3_client_args)
return s3_connection, s3_client_args
# NOTE(opadron): Workaround issue in boto where its StreamingBody
# implementation is missing several APIs expected from IOBase. These missing
# APIs prevent the streams returned by boto from being passed as-is along to
# urllib.
#
# https://github.com/boto/botocore/issues/879
# https://github.com/python/cpython/pull/3249
class WrapStream(BufferedReader):
def __init__(self, raw):
# In botocore >=1.23.47, StreamingBody inherits from IOBase, so we
# only add missing attributes in older versions.
# https://github.com/boto/botocore/commit/a624815eabac50442ed7404f3c4f2664cd0aa784
if not isinstance(raw, IOBase):
raw.readable = lambda: True
raw.writable = lambda: False
raw.seekable = lambda: False
raw.closed = False
raw.flush = lambda: None
super().__init__(raw)
def detach(self):
self.raw = None
def read(self, *args, **kwargs):
return self.raw.read(*args, **kwargs)
def __getattr__(self, key):
return getattr(self.raw, key)
def _s3_open(url, method="GET"):
parsed = urllib.parse.urlparse(url)
s3 = get_s3_session(url, method="fetch")
bucket = parsed.netloc
key = parsed.path
if key.startswith("/"):
key = key[1:]
if method not in ("GET", "HEAD"):
raise urllib.error.URLError(
"Only GET and HEAD verbs are currently supported for the s3:// scheme"
)
try:
if method == "GET":
obj = s3.get_object(Bucket=bucket, Key=key)
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
elif method == "HEAD":
obj = s3.head_object(Bucket=bucket, Key=key)
stream = BytesIO()
except s3.ClientError as e:
raise urllib.error.URLError(e) from e
headers = obj["ResponseMetadata"]["HTTPHeaders"]
return url, headers, stream
class UrllibS3Handler(urllib.request.BaseHandler):
def s3_open(self, req):
orig_url = req.get_full_url()
url, headers, stream = _s3_open(orig_url, method=req.get_method())
return urllib.response.addinfourl(stream, headers, url)
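The S3 handler registers the same way; GET wraps the boto body in WrapStream, while HEAD returns only headers with an empty body. A sketch, assuming the post-move module path spack.util.s3, placeholder bucket/key names, and a configured boto3/mirror environment:

import urllib.request

from spack.util.s3 import UrllibS3Handler  # assumed post-move location

opener = urllib.request.build_opener(UrllibS3Handler())

# HEAD: headers only, empty body (see _s3_open above).
head = opener.open(urllib.request.Request("s3://my-bucket/my-key", method="HEAD"))
print(dict(head.headers))

# GET: the object body, readable like any other urllib response.
body = opener.open("s3://my-bucket/my-key").read()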


@@ -21,23 +21,17 @@
from urllib.error import HTTPError, URLError
from urllib.request import HTTPSHandler, Request, build_opener
import llnl.util.lang
import llnl.util.tty as tty
import llnl.url
from llnl.util import lang, tty
from llnl.util.filesystem import mkdirp, rename, working_dir
import spack
import spack.config
import spack.error
import spack.gcs_handler
import spack.s3_handler
import spack.url
import spack.util.crypto
import spack.util.gcs as gcs_util
import spack.util.s3 as s3_util
import spack.util.url as url_util
from spack.util.compression import ALLOWED_ARCHIVE_TYPES
from spack.util.executable import CommandNotFoundError, which
from spack.util.path import convert_to_posix_path
from .executable import CommandNotFoundError, which
from .gcs import GCSBlob, GCSBucket, GCSHandler
from .s3 import UrllibS3Handler, get_s3_session
class DetailedHTTPError(HTTPError):
@@ -66,8 +60,8 @@ def http_error_default(self, req, fp, code, msg, hdrs):
def _urlopen():
s3 = spack.s3_handler.UrllibS3Handler()
gcs = spack.gcs_handler.GCSHandler()
s3 = UrllibS3Handler()
gcs = GCSHandler()
error_handler = SpackHTTPDefaultErrorHandler()
# One opener with HTTPS ssl enabled
@@ -90,7 +84,7 @@ def dispatch_open(fullurl, data=None, timeout=None):
#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
urlopen = llnl.util.lang.Singleton(_urlopen)
urlopen = lang.Singleton(_urlopen)
#: User-Agent used in Request objects
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
@@ -190,14 +184,14 @@ def push_to_url(local_file_path, remote_path, keep_original=True, extra_args=Non
while remote_path.startswith("/"):
remote_path = remote_path[1:]
s3 = s3_util.get_s3_session(remote_url, method="push")
s3 = get_s3_session(remote_url, method="push")
s3.upload_file(local_file_path, remote_url.netloc, remote_path, ExtraArgs=extra_args)
if not keep_original:
os.remove(local_file_path)
elif remote_url.scheme == "gs":
gcs = gcs_util.GCSBlob(remote_url)
gcs = GCSBlob(remote_url)
gcs.upload_to_blob(local_file_path)
if not keep_original:
os.remove(local_file_path)
@@ -427,7 +421,7 @@ def remove_url(url, recursive=False):
if url.scheme == "s3":
# Try to find a mirror for potential connection information
s3 = s3_util.get_s3_session(url, method="push")
s3 = get_s3_session(url, method="push")
bucket = url.netloc
if recursive:
# Because list_objects_v2 can only return up to 1000 items
@@ -460,10 +454,10 @@ def remove_url(url, recursive=False):
elif url.scheme == "gs":
if recursive:
bucket = gcs_util.GCSBucket(url)
bucket = GCSBucket(url)
bucket.destroy(recursive=recursive)
else:
blob = gcs_util.GCSBlob(url)
blob = GCSBlob(url)
blob.delete_blob()
return
@@ -538,14 +532,14 @@ def list_url(url, recursive=False):
]
if url.scheme == "s3":
s3 = s3_util.get_s3_session(url, method="fetch")
s3 = get_s3_session(url, method="fetch")
if recursive:
return list(_iter_s3_prefix(s3, url))
return list(set(key.split("/", 1)[0] for key in _iter_s3_prefix(s3, url)))
elif url.scheme == "gs":
gcs = gcs_util.GCSBucket(url)
gcs = GCSBucket(url)
return gcs.get_all_blobs(recursive=recursive)
@@ -636,7 +630,7 @@ def _spider(url, collect_nested):
links.add(abs_link)
# Skip stuff that looks like an archive
if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
if any(raw_link.endswith(s) for s in llnl.url.ALLOWED_ARCHIVE_TYPES):
continue
# Skip already-visited links
@@ -696,7 +690,7 @@ def _spider(url, collect_nested):
current_depth, depth, len(spider_args)
)
)
results = tp.map(llnl.util.lang.star(_spider), spider_args)
results = tp.map(lang.star(_spider), spider_args)
spider_args = []
collect = current_depth < depth
for sub_pages, sub_links, sub_spider_args in results:
@@ -713,123 +707,6 @@ def _spider(url, collect_nested):
return pages, links
def find_versions_of_archive(
archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
):
"""Scrape web pages for new versions of a tarball. This function prefers URLs in the
following order: links found on the scraped page that match a url generated by the
reference package, found and in the archive_urls list, found and derived from those
in the archive_urls list, and if none are found for a version then the item in the
archive_urls list is included for the version.
Args:
archive_urls (str or list or tuple): URL or sequence of URLs for
different versions of a package. Typically these are just the
tarballs from the package file itself. By default, this searches
the parent directories of archives.
list_url (str or None): URL for a listing of archives.
Spack will scrape these pages for download links that look
like the archive URL.
list_depth (int): max depth to follow links on list_url pages.
Defaults to 0.
concurrency (int): maximum number of concurrent requests
reference_package (spack.package_base.PackageBase or None): a spack package
used as a reference for url detection. Uses the url_for_version
method on the package to produce reference urls which, if found,
are preferred.
"""
if not isinstance(archive_urls, (list, tuple)):
archive_urls = [archive_urls]
# Generate a list of list_urls based on archive urls and any
# explicitly listed list_url in the package
list_urls = set()
if list_url is not None:
list_urls.add(list_url)
for aurl in archive_urls:
list_urls |= spack.url.find_list_urls(aurl)
# Add '/' to the end of the URL. Some web servers require this.
additional_list_urls = set()
for lurl in list_urls:
if not lurl.endswith("/"):
additional_list_urls.add(lurl + "/")
list_urls |= additional_list_urls
# Grab some web pages to scrape.
pages, links = spider(list_urls, depth=list_depth, concurrency=concurrency)
# Scrape them for archive URLs
regexes = []
for aurl in archive_urls:
# This creates a regex from the URL with a capture group for
# the version part of the URL. The capture group is converted
# to a generic wildcard, so we can use this to extract things
# on a page that look like archive URLs.
url_regex = spack.url.wildcard_version(aurl)
# We'll be a bit more liberal and just look for the archive
# part, not the full path.
# this is a URL so it is a posixpath even on Windows
url_regex = PurePosixPath(url_regex).name
# We need to add a / to the beginning of the regex to prevent
# Spack from picking up similarly named packages like:
# https://cran.r-project.org/src/contrib/pls_2.6-0.tar.gz
# https://cran.r-project.org/src/contrib/enpls_5.7.tar.gz
# https://cran.r-project.org/src/contrib/autopls_1.3.tar.gz
# https://cran.r-project.org/src/contrib/matrixpls_1.0.4.tar.gz
url_regex = "/" + url_regex
# We need to add a $ anchor to the end of the regex to prevent
# Spack from picking up signature files like:
# .asc
# .md5
# .sha256
# .sig
# However, SourceForge downloads still need to end in '/download'.
url_regex += r"(\/download)?"
# PyPI adds #sha256=... to the end of the URL
url_regex += "(#sha256=.*)?"
url_regex += "$"
regexes.append(url_regex)
# Build a dict version -> URL from any links that match the wildcards.
# Walk through archive_url links first.
# Any conflicting versions will be overwritten by the list_url links.
versions = {}
matched = set()
for url in sorted(links):
url = convert_to_posix_path(url)
if any(re.search(r, url) for r in regexes):
try:
ver = spack.url.parse_version(url)
if ver in matched:
continue
versions[ver] = url
# prevent this version from getting overwritten
if reference_package is not None:
if url == reference_package.url_for_version(ver):
matched.add(ver)
else:
extrapolated_urls = [
spack.url.substitute_version(u, ver) for u in archive_urls
]
if url in extrapolated_urls:
matched.add(ver)
except spack.url.UndetectableVersionError:
continue
for url in archive_urls:
url = convert_to_posix_path(url)
ver = spack.url.parse_version(url)
if ver not in versions:
versions[ver] = url
return versions
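The call-site change below (spack.url.find_versions_of_archive) suggests the function keeps its signature after the move; a minimal usage sketch with a placeholder URL (running it performs real web requests):

import spack.url

versions = spack.url.find_versions_of_archive(
    "https://example.com/downloads/mypkg-1.2.3.tar.gz", list_depth=0
)
# Maps parsed Version objects to the URL found for each version.
for version, url in sorted(versions.items()):
    print(version, url)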
def get_header(headers, header_name):
"""Looks up a dict of headers for the given header value.


@@ -3,7 +3,7 @@
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import spack.util.web
import spack.url
from spack.package import *
@@ -120,9 +120,7 @@ def fetch_remote_versions(self, *args, **kwargs):
return dict(
map(
lambda u: (u, self.url_for_version(u)),
spack.util.web.find_versions_of_archive(
self.all_urls, self.list_url, self.list_depth
),
spack.url.find_versions_of_archive(self.all_urls, self.list_url, self.list_depth),
)
)