Revert "Revert "Use urllib handler for s3:// and gs://, improve url_exists through HEAD requests (#34324)"" (#34498)

This reverts commit 8035eeb36d.

It also removes the logic that made an additional HEAD request to avoid a
more expensive GET request on a wrong content type. Since large files are
typically attachments and only downloaded when the stream is read, the
optimization did not help much, and the logic was in fact broken: the GET
request was issued unconditionally anyway.
Harmen Stoppels 2022-12-14 23:47:11 +01:00, committed by GitHub
parent 43e38d0d12
commit ea029442e6
4 changed files with 91 additions and 158 deletions

lib/spack/spack/gcs_handler.py

@@ -4,8 +4,8 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import urllib.parse
 import urllib.response
-import spack.util.web as web_util
+from urllib.error import URLError
+from urllib.request import BaseHandler


 def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
     gcsblob = gcs_util.GCSBlob(url)

     if not gcsblob.exists():
-        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
     stream = gcsblob.get_blob_byte_stream()
     headers = gcsblob.get_blob_headers()

     return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
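For context on how this new handler gets used: urllib's OpenerDirector dispatches each request to any registered handler that defines a `<scheme>_open` method, so a `gs_open` method is all it takes for `gs://` URLs once the handler is passed to `build_opener` (see the web.py changes further down). A minimal, self-contained sketch of that dispatch mechanism, using a made-up `hello://` scheme rather than GCS:

```python
import io
import urllib.request
import urllib.response
from email.message import Message


class HelloHandler(urllib.request.BaseHandler):
    # OpenerDirector looks for a method named "<scheme>_open" on each handler,
    # which is exactly how GCSHandler.gs_open and UrllibS3Handler.s3_open are
    # found for gs:// and s3:// URLs.
    def hello_open(self, req):
        headers = Message()
        headers["Content-Type"] = "text/plain"
        body = io.BytesIO(b"hello from " + req.get_full_url().encode())
        return urllib.response.addinfourl(body, headers, req.get_full_url())


opener = urllib.request.build_opener(HelloHandler())
print(opener.open("hello://example").read())  # b'hello from hello://example'
```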

lib/spack/spack/s3_handler.py

@@ -7,7 +7,7 @@
 import urllib.parse
 import urllib.request
 import urllib.response
-from io import BufferedReader, IOBase
+from io import BufferedReader, BytesIO, IOBase

 import spack.util.s3 as s3_util
@@ -42,7 +42,7 @@ def __getattr__(self, key):
         return getattr(self.raw, key)


-def _s3_open(url):
+def _s3_open(url, method="GET"):
     parsed = urllib.parse.urlparse(url)
     s3 = s3_util.get_s3_session(url, method="fetch")
@@ -52,27 +52,29 @@ def _s3_open(url):
     if key.startswith("/"):
         key = key[1:]

-    obj = s3.get_object(Bucket=bucket, Key=key)
-
-    # NOTE(opadron): Apply workaround here (see above)
-    stream = WrapStream(obj["Body"])
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e

     headers = obj["ResponseMetadata"]["HTTPHeaders"]

     return url, headers, stream


-class UrllibS3Handler(urllib.request.HTTPSHandler):
+class UrllibS3Handler(urllib.request.BaseHandler):
     def s3_open(self, req):
         orig_url = req.get_full_url()
-        from botocore.exceptions import ClientError  # type: ignore[import]
-
-        try:
-            url, headers, stream = _s3_open(orig_url)
-            return urllib.response.addinfourl(stream, headers, url)
-        except ClientError as err:
-            raise urllib.error.URLError(err) from err
-
-
-S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
-
-open = S3OpenerDirector.open
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
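With UrllibS3Handler reduced to a plain BaseHandler, the request's HTTP verb picks the boto3 call: GET maps to get_object and wraps the streaming body, while HEAD maps to head_object and returns the headers with an empty BytesIO body, so existence checks never download the object. A hedged usage sketch, assuming working AWS credentials; the bucket and key are illustrative:

```python
import urllib.request

import spack.s3_handler

opener = urllib.request.build_opener(spack.s3_handler.UrllibS3Handler())

# HEAD: served by s3.head_object; the body is an empty BytesIO, the headers are still populated.
head = opener.open(urllib.request.Request("s3://my-bucket/subdirectory/my-file", method="HEAD"))
print(dict(head.headers))

# GET: served by s3.get_object; the body streams the object contents.
data = opener.open("s3://my-bucket/subdirectory/my-file").read()
```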

lib/spack/spack/test/web.py

@@ -224,7 +224,10 @@ def paginate(self, *args, **kwargs):

 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }


 class MockS3Client(object):
@@ -243,7 +246,13 @@ def delete_object(self, *args, **kwargs):
     def get_object(self, Bucket=None, Key=None):
         self.ClientError = MockClientError
         if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return True
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
+        raise self.ClientError
+
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
         raise self.ClientError
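The mock now returns the same ResponseMetadata/HTTPHeaders shape the handler reads, for both verbs: head_object backs the HEAD request that url_exists issues, get_object backs actual reads. A hypothetical pytest sketch of how such a mock could be wired in (the monkeypatch target exists, but the test name and wiring are illustrative, not copied from this test file):

```python
import spack.util.s3 as s3_util
import spack.util.web as web_util


def test_url_exists_via_head(monkeypatch):
    # Route every S3 session through the mock client defined above, so the
    # HEAD issued by url_exists lands in MockS3Client.head_object.
    monkeypatch.setattr(s3_util, "get_s3_session", lambda url, method="fetch": MockS3Client())

    assert web_util.url_exists("s3://my-bucket/subdirectory/my-file")
    assert not web_util.url_exists("s3://my-bucket/subdirectory/no-such-file")
```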

lib/spack/spack/util/web.py

@@ -18,7 +18,7 @@
 import urllib.parse
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPSHandler, Request, build_opener

 import llnl.util.lang
 import llnl.util.tty as tty
@@ -27,6 +27,8 @@
 import spack
 import spack.config
 import spack.error
+import spack.gcs_handler
+import spack.s3_handler
 import spack.url
 import spack.util.crypto
 import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path

+
+def _urlopen():
+    s3 = spack.s3_handler.UrllibS3Handler()
+    gcs = spack.gcs_handler.GCSHandler()
+
+    # One opener with HTTPS ssl enabled
+    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
+
+    # One opener with HTTPS ssl disabled
+    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
+
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
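The hunk above replaces per-call SSL context plumbing with two pre-built openers and a closure that picks one per call, wrapped in a llnl.util.lang.Singleton so nothing is constructed until the first request. A minimal, self-contained sketch of the same pattern, with a module-level flag standing in for config:verify_ssl and without the s3/gs handlers:

```python
import ssl
from urllib.request import HTTPSHandler, Request, build_opener

VERIFY_SSL = True  # stand-in for spack.config.get("config:verify_ssl", True)


def _urlopen():
    # Build both openers once; the closure re-reads the flag on every call,
    # so flipping it takes effect without rebuilding anything.
    with_ssl = build_opener(HTTPSHandler(context=ssl.create_default_context()))
    without_ssl = build_opener(HTTPSHandler(context=ssl._create_unverified_context()))

    def dispatch_open(*args, **kwargs):
        opener = with_ssl if VERIFY_SSL else without_ssl
        return opener.open(*args, **kwargs)

    return dispatch_open


urlopen = _urlopen()
response = urlopen(Request("https://example.com", method="HEAD"), timeout=10)
print(response.status, dict(response.headers))
```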
@@ -60,85 +84,32 @@ def handle_starttag(self, tag, attrs):
                     self.links.append(val)


-def uses_ssl(parsed_url):
-    if parsed_url.scheme == "https":
-        return True
-
-    if parsed_url.scheme == "s3":
-        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
-        if not endpoint_url:
-            return True
-
-        if urllib.parse.urlparse(endpoint_url).scheme == "https":
-            return True
-
-    elif parsed_url.scheme == "gs":
-        tty.debug("(uses_ssl) GCS Blob is https")
-        return True
-
-    return False
-
-
 def read_from_url(url, accept_content_type=None):
     if isinstance(url, str):
         url = urllib.parse.urlparse(url)

-    context = None
-
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)

-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if spack.config.get("config:verify_ssl"):
-            # User wants SSL verification, and it *can* be provided.
-            context = ssl.create_default_context()
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            context = ssl._create_unverified_context()
-
-    url_scheme = url.scheme
-    url = url_util.format(url)
-    if sys.platform == "win32" and url_scheme == "file":
-        url = convert_to_posix_path(url)
-
-    req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
-    content_type = None
-    is_web_url = url_scheme in ("http", "https")
-    if accept_content_type and is_web_url:
-        # Make a HEAD request first to check the content type. This lets
-        # us ignore tarballs and gigantic files.
-        # It would be nice to do this with the HTTP Accept header to avoid
-        # one round-trip. However, most servers seem to ignore the header
-        # if you ask for a tarball with Accept: text/html.
-        req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
-        content_type = get_header(resp.headers, "Content-type")
-
-        # Do the real GET request when we know it's just HTML.
-        req.get_method = lambda: "GET"
+    request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})

     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(request, timeout=timeout)
     except URLError as err:
-        raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
+        raise SpackWebError("Download failed: {}".format(str(err)))

-    if accept_content_type and not is_web_url:
-        content_type = get_header(response.headers, "Content-type")
-
-    reject_content_type = accept_content_type and (
-        content_type is None or not content_type.startswith(accept_content_type)
-    )
-
-    if reject_content_type:
-        tty.debug(
-            "ignoring page {0}{1}{2}".format(
-                url, " with content type " if content_type is not None else "", content_type or ""
-            )
-        )
-
-        return None, None, None
+    if accept_content_type:
+        try:
+            content_type = get_header(response.headers, "Content-type")
+            reject_content_type = not content_type.startswith(accept_content_type)
+        except KeyError:
+            content_type = None
+            reject_content_type = True
+
+        if reject_content_type:
+            msg = "ignoring page {}".format(url.geturl())
+            if content_type:
+                msg += " with content type {}".format(content_type)
+            tty.debug(msg)
+
+            return None, None, None

     return response.geturl(), response.headers, response
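The separate HEAD-then-GET dance is gone: read_from_url now issues a single GET through the shared opener and applies the content-type filter to that response, returning (None, None, None) on a mismatch so callers skip the page. A hedged usage sketch; the URL is illustrative:

```python
import spack.util.web as web_util

# With accept_content_type set, anything that is not text/html (a tarball, say)
# comes back as (None, None, None) instead of being handed to the caller.
url, headers, response = web_util.read_from_url(
    "https://example.com/downloads/index.html", accept_content_type="text/html"
)
if response is not None:
    page = response.read().decode("utf-8", errors="replace")
```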
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
     Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
     method defined by `config:url_fetch_method` is used.

-    If the method is `curl`, it also uses the following configuration option:
-
-    * config:verify_ssl (str): Perform SSL verification
-
-    Otherwise, `urllib` will be used.
-
     Arguments:
         url (str): URL whose existence is being checked
         curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
     tty.debug("Checking existence of {0}".format(url))
     url_result = urllib.parse.urlparse(url)

-    # Check if a local file
-    local_path = url_util.local_file_path(url_result)
-    if local_path:
-        return os.path.exists(local_path)
-
-    # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
-    if url_result.scheme == "s3":
-        # Check for URL-specific connection information
-        s3 = s3_util.get_s3_session(url_result, method="fetch")
-
-        try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
-            return True
-        except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
-                return False
-            raise err
-
-    # Check if Google Storage .. urllib-based fetch
-    if url_result.scheme == "gs":
-        gcs = gcs_util.GCSBlob(url_result)
-        return gcs.exists()
-
-    # Otherwise, use the configured fetch method
-    if spack.config.get("config:url_fetch_method") == "curl":
+    # Use curl if configured to do so
+    use_curl = spack.config.get(
+        "config:url_fetch_method", "urllib"
+    ) == "curl" and url_result.scheme not in ("gs", "s3")
+
+    if use_curl:
         curl_exe = _curl(curl)
         if not curl_exe:
             return False
@@ -402,13 +347,14 @@ def url_exists(url, curl=None):
         _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
         return curl_exe.returncode == 0

-    # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # Otherwise use urllib.
     try:
-        read_from_url(url)
+        urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
         return True
-    except (SpackWebError, URLError) as e:
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
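url_exists now answers every non-curl check with a single HEAD request through the shared opener, so http(s), s3:// and gs:// all take the same code path instead of scheme-specific client code. A hedged usage sketch; the URLs are illustrative:

```python
import spack.util.web as web_util

# Each call issues one HEAD request via the scheme-aware opener;
# curl is only used when config:url_fetch_method is set to "curl".
for url in ("https://example.com/archive.tar.gz", "s3://my-bucket/subdirectory/my-file"):
    print(url, web_util.url_exists(url))
```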
@@ -691,35 +637,6 @@ def _spider(url, collect_nested):
     return pages, links


-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
-    url = req
-    try:
-        url = url.get_full_url()
-    except AttributeError:
-        pass
-
-    del kwargs["context"]
-
-    opener = urlopen
-    if urllib.parse.urlparse(url).scheme == "s3":
-        import spack.s3_handler
-
-        opener = spack.s3_handler.open
-    elif urllib.parse.urlparse(url).scheme == "gs":
-        import spack.gcs_handler
-
-        opener = spack.gcs_handler.gcs_open
-
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-            return opener(req, *args, **kwargs)
-
-
 def find_versions_of_archive(
     archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
 ):