Revert "Revert "Use urllib handler for s3:// and gs://, improve url_exists through HEAD requests (#34324)"" (#34498)

This reverts commit 8035eeb36d.

It also removes the logic that issued an additional HEAD request to avoid
a more expensive GET request when the content type is wrong. Since large files
are typically attachments and only downloaded when reading the stream, the
optimization does not help much, and in fact the logic was broken: the GET
request was issued unconditionally anyway.
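
For illustration only, a minimal sketch of the simplified flow this message describes: a single GET through Spack's shared opener, with the content type checked on the response instead of probing it with a separate HEAD request first. The helper name fetch_page_if_html and the text/html filter are assumptions for the example, not part of this commit:

    from urllib.request import Request

    import spack.util.web as web_util


    def fetch_page_if_html(url, timeout=10):
        # One GET request; the opener's handler chain dispatches on the scheme
        # (http(s)://, s3://, gs://).
        request = Request(url, headers={"User-Agent": web_util.SPACK_USER_AGENT})
        response = web_util.urlopen(request, timeout=timeout)

        # Inspect the Content-type of the response we already have rather than
        # issuing an up-front HEAD request.
        content_type = response.headers.get("Content-type")
        if content_type is None or not content_type.startswith("text/html"):
            return None
        return response.read()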
Harmen Stoppels 2022-12-14 23:47:11 +01:00 committed by GitHub
parent 43e38d0d12
commit ea029442e6
4 changed files with 91 additions and 158 deletions

View File

@@ -4,8 +4,8 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import urllib.parse
import urllib.response
import spack.util.web as web_util
from urllib.error import URLError
from urllib.request import BaseHandler
def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
gcsblob = gcs_util.GCSBlob(url)
if not gcsblob.exists():
raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
stream = gcsblob.get_blob_byte_stream()
headers = gcsblob.get_blob_headers()
return urllib.response.addinfourl(stream, headers, url)
class GCSHandler(BaseHandler):
def gs_open(self, req):
return gcs_open(req)

View File

@@ -7,7 +7,7 @@
import urllib.parse
import urllib.request
import urllib.response
from io import BufferedReader, IOBase
from io import BufferedReader, BytesIO, IOBase
import spack.util.s3 as s3_util
@@ -42,7 +42,7 @@ def __getattr__(self, key):
return getattr(self.raw, key)
def _s3_open(url):
def _s3_open(url, method="GET"):
parsed = urllib.parse.urlparse(url)
s3 = s3_util.get_s3_session(url, method="fetch")
@@ -52,27 +52,29 @@ def _s3_open(url):
if key.startswith("/"):
key = key[1:]
obj = s3.get_object(Bucket=bucket, Key=key)
if method not in ("GET", "HEAD"):
raise urllib.error.URLError(
"Only GET and HEAD verbs are currently supported for the s3:// scheme"
)
try:
if method == "GET":
obj = s3.get_object(Bucket=bucket, Key=key)
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
elif method == "HEAD":
obj = s3.head_object(Bucket=bucket, Key=key)
stream = BytesIO()
except s3.ClientError as e:
raise urllib.error.URLError(e) from e
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
headers = obj["ResponseMetadata"]["HTTPHeaders"]
return url, headers, stream
class UrllibS3Handler(urllib.request.HTTPSHandler):
class UrllibS3Handler(urllib.request.BaseHandler):
def s3_open(self, req):
orig_url = req.get_full_url()
from botocore.exceptions import ClientError # type: ignore[import]
try:
url, headers, stream = _s3_open(orig_url)
return urllib.response.addinfourl(stream, headers, url)
except ClientError as err:
raise urllib.error.URLError(err) from err
S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
open = S3OpenerDirector.open
url, headers, stream = _s3_open(orig_url, method=req.get_method())
return urllib.response.addinfourl(stream, headers, url)
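
As a rough usage sketch (not part of the diff above), the handler can be composed into an opener so that HEAD requests translate to head_object and GET requests to get_object; the bucket and key are placeholders and boto3 credentials are assumed to be configured:

    import urllib.request

    from spack.s3_handler import UrllibS3Handler

    opener = urllib.request.build_opener(UrllibS3Handler())

    # HEAD: metadata/existence check, no body is streamed.
    head = opener.open(urllib.request.Request("s3://my-bucket/path/to/file", method="HEAD"))
    print(head.headers)

    # GET: the object body is only transferred when the stream is read.
    get = opener.open(urllib.request.Request("s3://my-bucket/path/to/file", method="GET"))
    data = get.read()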

View File

@@ -224,7 +224,10 @@ def paginate(self, *args, **kwargs):
class MockClientError(Exception):
def __init__(self):
self.response = {"Error": {"Code": "NoSuchKey"}}
self.response = {
"Error": {"Code": "NoSuchKey"},
"ResponseMetadata": {"HTTPStatusCode": 404},
}
class MockS3Client(object):
@@ -243,7 +246,13 @@ def delete_object(self, *args, **kwargs):
def get_object(self, Bucket=None, Key=None):
self.ClientError = MockClientError
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
return True
return {"ResponseMetadata": {"HTTPHeaders": {}}}
raise self.ClientError
def head_object(self, Bucket=None, Key=None):
self.ClientError = MockClientError
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
return {"ResponseMetadata": {"HTTPHeaders": {}}}
raise self.ClientError

View File

@@ -18,7 +18,7 @@
import urllib.parse
from html.parser import HTMLParser
from urllib.error import URLError
from urllib.request import Request, urlopen
from urllib.request import HTTPSHandler, Request, build_opener
import llnl.util.lang
import llnl.util.tty as tty
@@ -27,6 +27,8 @@
import spack
import spack.config
import spack.error
import spack.gcs_handler
import spack.s3_handler
import spack.url
import spack.util.crypto
import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@
from spack.util.executable import CommandNotFoundError, which
from spack.util.path import convert_to_posix_path
def _urlopen():
s3 = spack.s3_handler.UrllibS3Handler()
gcs = spack.gcs_handler.GCSHandler()
# One opener with HTTPS ssl enabled
with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
# One opener with HTTPS ssl disabled
without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
# And dynamically dispatch based on the config:verify_ssl.
def dispatch_open(*args, **kwargs):
opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
return opener.open(*args, **kwargs)
return dispatch_open
#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
urlopen = llnl.util.lang.Singleton(_urlopen)
#: User-Agent used in Request objects
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
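
A brief usage sketch of the opener singleton above (not part of the diff): because dispatch_open re-reads config:verify_ssl on every call, toggling the setting takes effect without rebuilding the openers. The override context manager is used here as in Spack's test suite, and the URL is a placeholder:

    from urllib.request import Request

    import spack.config
    import spack.util.web as web_util

    # SSL verification is decided per call, so this request goes through the
    # unverified-context opener while the override is active.
    with spack.config.override("config:verify_ssl", False):
        web_util.urlopen(Request("https://self-signed.example.com/"), timeout=10)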
@@ -60,86 +84,33 @@ def handle_starttag(self, tag, attrs):
self.links.append(val)
def uses_ssl(parsed_url):
if parsed_url.scheme == "https":
return True
if parsed_url.scheme == "s3":
endpoint_url = os.environ.get("S3_ENDPOINT_URL")
if not endpoint_url:
return True
if urllib.parse.urlparse(endpoint_url).scheme == "https":
return True
elif parsed_url.scheme == "gs":
tty.debug("(uses_ssl) GCS Blob is https")
return True
return False
def read_from_url(url, accept_content_type=None):
if isinstance(url, str):
url = urllib.parse.urlparse(url)
context = None
# Timeout in seconds for web requests
timeout = spack.config.get("config:connect_timeout", 10)
# Don't even bother with a context unless the URL scheme is one that uses
# SSL certs.
if uses_ssl(url):
if spack.config.get("config:verify_ssl"):
# User wants SSL verification, and it *can* be provided.
context = ssl.create_default_context()
else:
# User has explicitly indicated that they do not want SSL
# verification.
context = ssl._create_unverified_context()
url_scheme = url.scheme
url = url_util.format(url)
if sys.platform == "win32" and url_scheme == "file":
url = convert_to_posix_path(url)
req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
content_type = None
is_web_url = url_scheme in ("http", "https")
if accept_content_type and is_web_url:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
# It would be nice to do this with the HTTP Accept header to avoid
# one round-trip. However, most servers seem to ignore the header
# if you ask for a tarball with Accept: text/html.
req.get_method = lambda: "HEAD"
resp = _urlopen(req, timeout=timeout, context=context)
content_type = get_header(resp.headers, "Content-type")
# Do the real GET request when we know it's just HTML.
req.get_method = lambda: "GET"
request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})
try:
response = _urlopen(req, timeout=timeout, context=context)
response = urlopen(request, timeout=timeout)
except URLError as err:
raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
raise SpackWebError("Download failed: {}".format(str(err)))
if accept_content_type and not is_web_url:
content_type = get_header(response.headers, "Content-type")
if accept_content_type:
try:
content_type = get_header(response.headers, "Content-type")
reject_content_type = not content_type.startswith(accept_content_type)
except KeyError:
content_type = None
reject_content_type = True
reject_content_type = accept_content_type and (
content_type is None or not content_type.startswith(accept_content_type)
)
if reject_content_type:
tty.debug(
"ignoring page {0}{1}{2}".format(
url, " with content type " if content_type is not None else "", content_type or ""
)
)
return None, None, None
if reject_content_type:
msg = "ignoring page {}".format(url.geturl())
if content_type:
msg += " with content type {}".format(content_type)
tty.debug(msg)
return None, None, None
return response.geturl(), response.headers, response
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
method defined by `config:url_fetch_method` is used.
If the method is `curl`, it also uses the following configuration option:
* config:verify_ssl (str): Perform SSL verification
Otherwise, `urllib` will be used.
Arguments:
url (str): URL whose existence is being checked
curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
tty.debug("Checking existence of {0}".format(url))
url_result = urllib.parse.urlparse(url)
# Check if a local file
local_path = url_util.local_file_path(url_result)
if local_path:
return os.path.exists(local_path)
# Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
if url_result.scheme == "s3":
# Check for URL-specific connection information
s3 = s3_util.get_s3_session(url_result, method="fetch")
try:
s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
return True
except s3.ClientError as err:
if err.response["Error"]["Code"] == "NoSuchKey":
return False
raise err
# Check if Google Storage .. urllib-based fetch
if url_result.scheme == "gs":
gcs = gcs_util.GCSBlob(url_result)
return gcs.exists()
# Otherwise, use the configured fetch method
if spack.config.get("config:url_fetch_method") == "curl":
# Use curl if configured to do so
use_curl = spack.config.get(
"config:url_fetch_method", "urllib"
) == "curl" and url_result.scheme not in ("gs", "s3")
if use_curl:
curl_exe = _curl(curl)
if not curl_exe:
return False
@@ -402,13 +347,14 @@ def url_exists(url, curl=None):
_ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
return curl_exe.returncode == 0
# If we get here, then the only other fetch method option is urllib.
# So try to "read" from the URL and assume that *any* non-throwing
# response contains the resource represented by the URL.
# Otherwise use urllib.
try:
read_from_url(url)
urlopen(
Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
timeout=spack.config.get("config:connect_timeout", 10),
)
return True
except (SpackWebError, URLError) as e:
except URLError as e:
tty.debug("Failure reading URL: " + str(e))
return False
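
For illustration (not part of the diff), the unified urllib path means a single HEAD request decides existence for every remote scheme; a missing object surfaces as URLError and is reported as False. The bucket and object names below are placeholders:

    import spack.util.web as web_util

    # One HEAD request through the scheme-aware opener; a missing object
    # surfaces as URLError inside url_exists and is reported as False.
    if web_util.url_exists("s3://my-bucket/subdirectory/my-file"):
        print("object is present")
    else:
        print("object is missing")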
@@ -691,35 +637,6 @@ def _spider(url, collect_nested):
return pages, links
def _urlopen(req, *args, **kwargs):
"""Wrapper for compatibility with old versions of Python."""
url = req
try:
url = url.get_full_url()
except AttributeError:
pass
del kwargs["context"]
opener = urlopen
if urllib.parse.urlparse(url).scheme == "s3":
import spack.s3_handler
opener = spack.s3_handler.open
elif urllib.parse.urlparse(url).scheme == "gs":
import spack.gcs_handler
opener = spack.gcs_handler.gcs_open
try:
return opener(req, *args, **kwargs)
except TypeError as err:
# If the above fails because of 'context', call without 'context'.
if "context" in kwargs and "context" in str(err):
del kwargs["context"]
return opener(req, *args, **kwargs)
def find_versions_of_archive(
archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
):