Group primitive url/path handling functions together (#40028)

parent bc02453f6d
commit fb9e5fcc4f

lib/spack/llnl/url.py (new file, 459 lines added)
@@ -0,0 +1,459 @@
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
# Spack Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
"""URL primitives that just require Python standard library."""
import itertools
import os.path
import re
from typing import Optional, Set, Tuple
from urllib.parse import urlsplit, urlunsplit

# Archive extensions allowed in Spack
PREFIX_EXTENSIONS = ("tar", "TAR")
EXTENSIONS = ("gz", "bz2", "xz", "Z")
NO_TAR_EXTENSIONS = ("zip", "tgz", "tbz2", "tbz", "txz")

# Add PREFIX_EXTENSIONS and EXTENSIONS last so that .tar.gz is matched *before* .tar or .gz
ALLOWED_ARCHIVE_TYPES = (
    tuple(".".join(ext) for ext in itertools.product(PREFIX_EXTENSIONS, EXTENSIONS))
    + PREFIX_EXTENSIONS
    + EXTENSIONS
    + NO_TAR_EXTENSIONS
)
CONTRACTION_MAP = {"tgz": "tar.gz", "txz": "tar.xz", "tbz": "tar.bz2", "tbz2": "tar.bz2"}
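
# Editorial note (illustrative, not part of the committed file): with the tuples
# above, ALLOWED_ARCHIVE_TYPES holds the combined "tar.*"/"TAR.*" forms first,
# then the bare prefixes and compression suffixes, then the no-tar forms:
#
#   ("tar.gz", "tar.bz2", "tar.xz", "tar.Z", "TAR.gz", "TAR.bz2", "TAR.xz", "TAR.Z",
#    "tar", "TAR", "gz", "bz2", "xz", "Z", "zip", "tgz", "tbz2", "tbz", "txz")
#
# so extension matching tries the longer, combined extensions before the short ones.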


def find_list_urls(url: str) -> Set[str]:
    r"""Find good list URLs for the supplied URL.

    By default, returns the dirname of the archive path.

    Provides special treatment for the following websites, which have a
    unique list URL different from the dirname of the download URL:

    =========  =======================================================
    GitHub     https://github.com/<repo>/<name>/releases
    GitLab     https://gitlab.\*/<repo>/<name>/tags
    BitBucket  https://bitbucket.org/<repo>/<name>/downloads/?tab=tags
    CRAN       https://\*.r-project.org/src/contrib/Archive/<name>
    PyPI       https://pypi.org/simple/<name>/
    LuaRocks   https://luarocks.org/modules/<repo>/<name>
    =========  =======================================================

    Note: this function is called by `spack versions`, `spack checksum`,
    and `spack create`, but not by `spack fetch` or `spack install`.

    Parameters:
        url (str): The download URL for the package

    Returns:
        set: One or more list URLs for the package
    """

    url_types = [
        # GitHub
        # e.g. https://github.com/llnl/callpath/archive/v1.0.1.tar.gz
        (r"(.*github\.com/[^/]+/[^/]+)", lambda m: m.group(1) + "/releases"),
        # GitLab API endpoint
        # e.g. https://gitlab.dkrz.de/api/v4/projects/k202009%2Flibaec/repository/archive.tar.gz?sha=v1.0.2
        (
            r"(.*gitlab[^/]+)/api/v4/projects/([^/]+)%2F([^/]+)",
            lambda m: m.group(1) + "/" + m.group(2) + "/" + m.group(3) + "/tags",
        ),
        # GitLab non-API endpoint
        # e.g. https://gitlab.dkrz.de/k202009/libaec/uploads/631e85bcf877c2dcaca9b2e6d6526339/libaec-1.0.0.tar.gz
        (r"(.*gitlab[^/]+/(?!api/v4/projects)[^/]+/[^/]+)", lambda m: m.group(1) + "/tags"),
        # BitBucket
        # e.g. https://bitbucket.org/eigen/eigen/get/3.3.3.tar.bz2
        (r"(.*bitbucket.org/[^/]+/[^/]+)", lambda m: m.group(1) + "/downloads/?tab=tags"),
        # CRAN
        # e.g. https://cran.r-project.org/src/contrib/Rcpp_0.12.9.tar.gz
        # e.g. https://cloud.r-project.org/src/contrib/rgl_0.98.1.tar.gz
        (
            r"(.*\.r-project\.org/src/contrib)/([^_]+)",
            lambda m: m.group(1) + "/Archive/" + m.group(2),
        ),
        # PyPI
        # e.g. https://pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://www.pypi.io/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.python.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://files.pythonhosted.org/packages/source/n/numpy/numpy-1.19.4.zip
        # e.g. https://pypi.io/packages/py2.py3/o/opencensus-context/opencensus_context-0.1.1-py2.py3-none-any.whl
        (
            r"(?:pypi|pythonhosted)[^/]+/packages/[^/]+/./([^/]+)",
            lambda m: "https://pypi.org/simple/" + m.group(1) + "/",
        ),
        # LuaRocks
        # e.g. https://luarocks.org/manifests/gvvaughan/lpeg-1.0.2-1.src.rock
        # e.g. https://luarocks.org/manifests/openresty/lua-cjson-2.1.0-1.src.rock
        (
            r"luarocks[^/]+/(?:modules|manifests)/(?P<org>[^/]+)/"
            + r"(?P<name>.+?)-[0-9.-]*\.src\.rock",
            lambda m: "https://luarocks.org/modules/"
            + m.group("org")
            + "/"
            + m.group("name")
            + "/",
        ),
    ]

    list_urls = {os.path.dirname(url)}

    for pattern, fun in url_types:
        match = re.search(pattern, url)
        if match:
            list_urls.add(fun(match))

    return list_urls
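
# Editorial usage sketch (not part of the committed file): for a GitHub archive
# URL, both the dirname and the "/releases" page are suggested, assuming the
# module is importable as `llnl.url`:
#
#   >>> import llnl.url
#   >>> sorted(llnl.url.find_list_urls("https://github.com/llnl/callpath/archive/v1.0.1.tar.gz"))
#   ['https://github.com/llnl/callpath/archive', 'https://github.com/llnl/callpath/releases']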


def strip_query_and_fragment(url: str) -> Tuple[str, str]:
    """Strips query and fragment from a url, then returns the base url and the suffix.

    Args:
        url: URL to be stripped

    Raises:
        ValueError: when there is any error parsing the URL
    """
    components = urlsplit(url)
    stripped = components[:3] + (None, None)

    query, frag = components[3:5]
    suffix = ""
    if query:
        suffix += "?" + query
    if frag:
        suffix += "#" + frag

    return urlunsplit(stripped), suffix
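
# Editorial usage sketch (not part of the committed file): the stripped query
# and fragment are returned as a suffix so callers can re-append them later.
#
#   >>> strip_query_and_fragment("https://example.com/foo.tar.gz?raw=true#frag")
#   ('https://example.com/foo.tar.gz', '?raw=true#frag')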


SOURCEFORGE_RE = re.compile(r"(.*(?:sourceforge\.net|sf\.net)/.*)(/download)$")


def split_url_on_sourceforge_suffix(url: str) -> Tuple[str, ...]:
    """If the input is a sourceforge URL, returns base URL and "/download" suffix. Otherwise,
    returns the input URL and an empty string.
    """
    match = SOURCEFORGE_RE.search(url)
    if match is not None:
        return match.groups()
    return url, ""


def has_extension(path_or_url: str, ext: str) -> bool:
    """Returns True if the given extension is present in the input, False otherwise."""
    prefix, _ = split_url_on_sourceforge_suffix(path_or_url)
    if not ext.startswith(r"\."):
        ext = rf"\.{ext}$"

    if re.search(ext, prefix):
        return True
    return False
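
# Editorial usage sketch (not part of the committed file): the extension is
# matched as a suffix, anchored at the end of the path or URL.
#
#   >>> has_extension("mirrors/foo-1.0.tar.gz", "tar.gz")
#   True
#   >>> has_extension("mirrors/foo-1.0.tar.gz", "zip")
#   False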


def extension_from_path(path_or_url: Optional[str]) -> Optional[str]:
    """Tries to match an allowed archive extension to the input. Returns the first match,
    or None if no match was found.

    Raises:
        ValueError: if the input is None
    """
    if path_or_url is None:
        raise ValueError("Can't call extension() on None")

    for t in ALLOWED_ARCHIVE_TYPES:
        if has_extension(path_or_url, t):
            return t
    return None
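
# Editorial usage sketch (not part of the committed file): combined extensions
# are tried first, so "tar.gz" wins over plain "gz".
#
#   >>> extension_from_path("foo-1.0.tar.gz")
#   'tar.gz'
#   >>> extension_from_path("foo-1.0.tgz")
#   'tgz'
#   >>> extension_from_path("README.txt") is None
#   True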


def remove_extension(path_or_url: str, *, extension: str) -> str:
    """Returns the input with the extension removed"""
    suffix = rf"\.{extension}$"
    return re.sub(suffix, "", path_or_url)


def check_and_remove_ext(path: str, *, extension: str) -> str:
    """Returns the input path with the extension removed, if the extension is present in path.
    Otherwise, returns the input unchanged.
    """
    if not has_extension(path, extension):
        return path
    path, _ = split_url_on_sourceforge_suffix(path)
    return remove_extension(path, extension=extension)
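
# Editorial usage sketch (not part of the committed file):
#
#   >>> remove_extension("foo-1.0.tar.gz", extension="tar.gz")
#   'foo-1.0'
#   >>> check_and_remove_ext("foo-1.0.tar", extension="zip")  # extension not present
#   'foo-1.0.tar'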


def strip_extension(path_or_url: str, *, extension: Optional[str] = None) -> str:
    """If the path contains the given extension, returns the path stripped of that extension.
    Otherwise, returns the input path.

    If extension is None, attempts to strip any allowed extension from path.
    """
    if extension is None:
        for t in ALLOWED_ARCHIVE_TYPES:
            if has_extension(path_or_url, ext=t):
                extension = t
                break
        else:
            return path_or_url

    return check_and_remove_ext(path_or_url, extension=extension)
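
# Editorial usage sketch (not part of the committed file):
#
#   >>> strip_extension("https://example.com/releases/v1.2.3.tar.gz")
#   'https://example.com/releases/v1.2.3'
#   >>> strip_extension("archive.tar.gz", extension="gz")
#   'archive.tar'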


def split_url_extension(url: str) -> Tuple[str, ...]:
    """Some URLs have a query string, e.g.:

    1. https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7.tgz?raw=true
    2. http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin.tar.gz
    3. https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0

    In (1), the query string needs to be stripped to get at the extension;
    in (2) the filename is IN a single final query argument; and in (3) the
    query string follows the file extension.

    This strips the URL into three pieces: ``prefix``, ``ext``, and ``suffix``.
    The suffix contains anything that was stripped off the URL to
    get at the file extension. In (1), it will be ``'?raw=true'``, but
    in (2), it will be empty. In (3) the suffix is a parameter that follows
    after the file extension, e.g.:

    1. ``('https://github.com/losalamos/CLAMR/blob/packages/PowerParser_v2.0.7', 'tgz', '?raw=true')``
    2. ``('http://www.apache.org/dyn/closer.cgi?path=/cassandra/1.2.0/apache-cassandra-1.2.0-rc2-bin', 'tar.gz', '')``
    3. ``('https://gitlab.kitware.com/vtk/vtk/repository/archive', 'tar.bz2', '?ref=v7.0.0')``
    """
    # Strip off sourceforge download suffix.
    # e.g. https://sourceforge.net/projects/glew/files/glew/2.0.0/glew-2.0.0.tgz/download
    prefix, suffix = split_url_on_sourceforge_suffix(url)

    ext = extension_from_path(prefix)
    if ext is not None:
        prefix = strip_extension(prefix)
        return prefix, ext, suffix

    try:
        prefix, suf = strip_query_and_fragment(prefix)
    except ValueError:
        # FIXME: tty.debug("Got error parsing path %s" % path)
        # Ignore URL parse errors here
        return url, ""

    ext = extension_from_path(prefix)
    prefix = strip_extension(prefix)
    suffix = suf + suffix
    if ext is None:
        ext = ""

    return prefix, ext, suffix
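
# Editorial usage sketch (not part of the committed file): the extension comes
# back as a bare string (no leading dot), with the stripped query string
# returned separately.
#
#   >>> split_url_extension("https://gitlab.kitware.com/vtk/vtk/repository/archive.tar.bz2?ref=v7.0.0")
#   ('https://gitlab.kitware.com/vtk/vtk/repository/archive', 'tar.bz2', '?ref=v7.0.0')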


def strip_version_suffixes(path_or_url: str) -> str:
    """Some tarballs contain extraneous information after the version:

    * ``bowtie2-2.2.5-source``
    * ``libevent-2.0.21-stable``
    * ``cuda_8.0.44_linux.run``

    These strings are not part of the version number and should be ignored.
    This function strips those suffixes off and returns the remaining string.
    The goal is that the version is always the last thing in ``path``:

    * ``bowtie2-2.2.5``
    * ``libevent-2.0.21``
    * ``cuda_8.0.44``

    Args:
        path_or_url: The filename or URL for the package

    Returns:
        The ``path`` with any extraneous suffixes removed
    """
    # NOTE: This could be done with complicated regexes in parse_version_offset
    # NOTE: The problem is that we would have to add these regexes to the end
    # NOTE: of every single version regex. Easier to just strip them off
    # NOTE: permanently

    suffix_regexes = [
        # Download type
        r"[Ii]nstall",
        r"all",
        r"code",
        r"[Ss]ources?",
        r"file",
        r"full",
        r"single",
        r"with[a-zA-Z_-]+",
        r"rock",
        r"src(_0)?",
        r"public",
        r"bin",
        r"binary",
        r"run",
        r"[Uu]niversal",
        r"jar",
        r"complete",
        r"dynamic",
        r"oss",
        r"gem",
        r"tar",
        r"sh",
        # Download version
        r"release",
        r"bin",
        r"stable",
        r"[Ff]inal",
        r"rel",
        r"orig",
        r"dist",
        r"\+",
        # License
        r"gpl",
        # Arch
        # Needs to come before and after OS, appears in both orders
        r"ia32",
        r"intel",
        r"amd64",
        r"linux64",
        r"x64",
        r"64bit",
        r"x86[_-]64",
        r"i586_64",
        r"x86",
        r"i[36]86",
        r"ppc64(le)?",
        r"armv?(7l|6l|64)",
        # Other
        r"cpp",
        r"gtk",
        r"incubating",
        # OS
        r"[Ll]inux(_64)?",
        r"LINUX",
        r"[Uu]ni?x",
        r"[Ss]un[Oo][Ss]",
        r"[Mm]ac[Oo][Ss][Xx]?",
        r"[Oo][Ss][Xx]",
        r"[Dd]arwin(64)?",
        r"[Aa]pple",
        r"[Ww]indows",
        r"[Ww]in(64|32)?",
        r"[Cc]ygwin(64|32)?",
        r"[Mm]ingw",
        r"centos",
        # Arch
        # Needs to come before and after OS, appears in both orders
        r"ia32",
        r"intel",
        r"amd64",
        r"linux64",
        r"x64",
        r"64bit",
        r"x86[_-]64",
        r"i586_64",
        r"x86",
        r"i[36]86",
        r"ppc64(le)?",
        r"armv?(7l|6l|64)?",
        # PyPI
        r"[._-]py[23].*\.whl",
        r"[._-]cp[23].*\.whl",
        r"[._-]win.*\.exe",
    ]

    for regex in suffix_regexes:
        # Remove the suffix from the end of the path
        # This may be done multiple times
        path_or_url = re.sub(r"[._-]?" + regex + "$", "", path_or_url)

    return path_or_url
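
# Editorial usage sketch (not part of the committed file): each regex is applied
# once, anchored at the end of the string, so stacked suffixes are peeled off in
# the order in which they appear in the list above.
#
#   >>> strip_version_suffixes("cuda_8.0.44_linux.run")
#   'cuda_8.0.44'
#   >>> strip_version_suffixes("bowtie2-2.2.5-source")
#   'bowtie2-2.2.5'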


def expand_contracted_extension(extension: str) -> str:
    """Returns the expanded version of a known contracted extension.

    This function maps extensions like ".tgz" to ".tar.gz". On unknown extensions,
    return the input unmodified.
    """
    extension = extension.strip(".")
    return CONTRACTION_MAP.get(extension, extension)
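
# Editorial usage sketch (not part of the committed file): leading dots are
# stripped before the lookup, so the result is a bare extension string.
#
#   >>> expand_contracted_extension(".tgz")
#   'tar.gz'
#   >>> expand_contracted_extension("zip")
#   'zip'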


def expand_contracted_extension_in_path(
    path_or_url: str, *, extension: Optional[str] = None
) -> str:
    """Returns the input path or URL with any contracted extension expanded.

    Args:
        path_or_url: path or URL to be expanded
        extension: if specified, only attempt to expand that extension
    """
    extension = extension or extension_from_path(path_or_url)
    if extension is None:
        return path_or_url

    expanded = expand_contracted_extension(extension)
    if expanded != extension:
        return re.sub(rf"{extension}", rf"{expanded}", path_or_url)
    return path_or_url
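
# Editorial usage sketch (not part of the committed file):
#
#   >>> expand_contracted_extension_in_path("mirror/libfoo-1.2.tgz")
#   'mirror/libfoo-1.2.tar.gz'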


def compression_ext_from_compressed_archive(extension: str) -> Optional[str]:
    """Returns compression extension for a compressed archive"""
    extension = expand_contracted_extension(extension)
    for ext in [*EXTENSIONS]:
        if ext in extension:
            return ext
    return None
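
# Editorial usage sketch (not part of the committed file): contracted forms are
# expanded first, then matched against the known compression suffixes.
#
#   >>> compression_ext_from_compressed_archive("tgz")
#   'gz'
#   >>> compression_ext_from_compressed_archive("tar.bz2")
#   'bz2'
#   >>> compression_ext_from_compressed_archive("zip") is None
#   True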


def strip_compression_extension(path_or_url: str, ext: Optional[str] = None) -> str:
    """Strips the compression extension from the input and returns the resulting path.
    For instance, "foo.tgz" becomes "foo.tar".

    If no extension is given, try a default list of extensions.

    Args:
        path_or_url: input to be stripped
        ext: if given, extension to be stripped
    """
    if not extension_from_path(path_or_url):
        return path_or_url

    expanded_path = expand_contracted_extension_in_path(path_or_url)
    candidates = [ext] if ext is not None else EXTENSIONS
    for current_extension in candidates:
        modified_path = check_and_remove_ext(expanded_path, extension=current_extension)
        if modified_path != expanded_path:
            return modified_path
    return expanded_path
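
# Editorial usage sketch (not part of the committed file): contracted extensions
# are expanded before the compression suffix is removed.
#
#   >>> strip_compression_extension("foo.tgz")
#   'foo.tar'
#   >>> strip_compression_extension("foo.tar.bz2", ext="bz2")
#   'foo.tar'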


def allowed_archive(path_or_url: str) -> bool:
    """Returns True if the input ends with an allowed archive extension, False otherwise."""
    return (
        False if not path_or_url else any(path_or_url.endswith(t) for t in ALLOWED_ARCHIVE_TYPES)
    )
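
# Editorial usage sketch (not part of the committed file):
#
#   >>> allowed_archive("https://example.com/foo-1.0.tar.gz")
#   True
#   >>> allowed_archive("foo-1.0.txt")
#   False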


def determine_url_file_extension(path: str) -> str:
    """This returns the type of archive a URL refers to. This is
    sometimes confusing because of URLs like:

    (1) https://github.com/petdance/ack/tarball/1.93_02

    Where the URL doesn't actually contain the filename. We need
    to know what type it is so that we can appropriately name files
    in mirrors.
    """
    match = re.search(r"github.com/.+/(zip|tar)ball/", path)
    if match:
        if match.group(1) == "zip":
            return "zip"
        elif match.group(1) == "tar":
            return "tar.gz"

    prefix, ext, suffix = split_url_extension(path)
    return ext
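
# Editorial usage sketch (not part of the committed file): GitHub tarball/zipball
# URLs carry no filename, so the extension is inferred from the URL shape;
# everything else falls through to split_url_extension.
#
#   >>> determine_url_file_extension("https://github.com/petdance/ack/tarball/1.93_02")
#   'tar.gz'
#   >>> determine_url_file_extension("https://example.com/releases/foo-1.0.tar.bz2")
#   'tar.bz2'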