Revert "Revert "Use urllib handler for s3:// and gs://, improve url_exists through HEAD requests (#34324)"" (#34498)

This reverts commit 8035eeb36d.

It also removes the logic that issued an additional HEAD request to avoid
a more expensive GET request when the content type is wrong. Since large files
are typically attachments and only downloaded when reading the stream, the
optimization does not help much, and in fact the logic was broken: the GET
request was issued unconditionally anyway.
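
For illustration only, a minimal sketch of the simplified flow this message describes: a single GET through Spack's shared opener, with the content type checked on the response instead of probing it with a separate HEAD request first. The helper name fetch_page_if_html and the text/html filter are assumptions for the example, not part of this commit:

    from urllib.request import Request

    import spack.util.web as web_util


    def fetch_page_if_html(url, timeout=10):
        # One GET request; the opener's handler chain dispatches on the scheme
        # (http(s)://, s3://, gs://).
        request = Request(url, headers={"User-Agent": web_util.SPACK_USER_AGENT})
        response = web_util.urlopen(request, timeout=timeout)

        # Inspect the Content-type of the response we already have rather than
        # issuing an up-front HEAD request.
        content_type = response.headers.get("Content-type")
        if content_type is None or not content_type.startswith("text/html"):
            return None
        return response.read()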
Harmen Stoppels 2022-12-14 23:47:11 +01:00 committed by GitHub
parent 43e38d0d12
commit ea029442e6
4 changed files with 91 additions and 158 deletions

View File

@@ -4,8 +4,8 @@
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
import urllib.parse
import urllib.response
import spack.util.web as web_util
from urllib.error import URLError
from urllib.request import BaseHandler
def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
gcsblob = gcs_util.GCSBlob(url)
if not gcsblob.exists():
raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
stream = gcsblob.get_blob_byte_stream()
headers = gcsblob.get_blob_headers()
return urllib.response.addinfourl(stream, headers, url)
class GCSHandler(BaseHandler):
def gs_open(self, req):
return gcs_open(req)

View File

@@ -7,7 +7,7 @@
import urllib.parse
import urllib.request
import urllib.response
from io import BufferedReader, IOBase
from io import BufferedReader, BytesIO, IOBase
import spack.util.s3 as s3_util
@@ -42,7 +42,7 @@ def __getattr__(self, key):
return getattr(self.raw, key)
def _s3_open(url):
def _s3_open(url, method="GET"):
parsed = urllib.parse.urlparse(url)
s3 = s3_util.get_s3_session(url, method="fetch")
@@ -52,27 +52,29 @@ def _s3_open(url):
if key.startswith("/"):
key = key[1:]
obj = s3.get_object(Bucket=bucket, Key=key)
if method not in ("GET", "HEAD"):
raise urllib.error.URLError(
"Only GET and HEAD verbs are currently supported for the s3:// scheme"
)
try:
if method == "GET":
obj = s3.get_object(Bucket=bucket, Key=key)
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
elif method == "HEAD":
obj = s3.head_object(Bucket=bucket, Key=key)
stream = BytesIO()
except s3.ClientError as e:
raise urllib.error.URLError(e) from e
# NOTE(opadron): Apply workaround here (see above)
stream = WrapStream(obj["Body"])
headers = obj["ResponseMetadata"]["HTTPHeaders"]
return url, headers, stream
class UrllibS3Handler(urllib.request.HTTPSHandler):
class UrllibS3Handler(urllib.request.BaseHandler):
def s3_open(self, req):
orig_url = req.get_full_url()
from botocore.exceptions import ClientError # type: ignore[import]
try:
url, headers, stream = _s3_open(orig_url)
return urllib.response.addinfourl(stream, headers, url)
except ClientError as err:
raise urllib.error.URLError(err) from err
S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
open = S3OpenerDirector.open
url, headers, stream = _s3_open(orig_url, method=req.get_method())
return urllib.response.addinfourl(stream, headers, url)
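
As a rough usage sketch (not part of the diff above), the handler can be composed into an opener so that HEAD requests translate to head_object and GET requests to get_object; the bucket and key are placeholders and boto3 credentials are assumed to be configured:

    import urllib.request

    from spack.s3_handler import UrllibS3Handler

    opener = urllib.request.build_opener(UrllibS3Handler())

    # HEAD: metadata/existence check, no body is streamed.
    head = opener.open(urllib.request.Request("s3://my-bucket/path/to/file", method="HEAD"))
    print(head.headers)

    # GET: the object body is only transferred when the stream is read.
    get = opener.open(urllib.request.Request("s3://my-bucket/path/to/file", method="GET"))
    data = get.read()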

View File

@@ -224,7 +224,10 @@ def paginate(self, *args, **kwargs):
class MockClientError(Exception):
def __init__(self):
self.response = {"Error": {"Code": "NoSuchKey"}}
self.response = {
"Error": {"Code": "NoSuchKey"},
"ResponseMetadata": {"HTTPStatusCode": 404},
}
class MockS3Client(object):
@@ -243,7 +246,13 @@ def delete_object(self, *args, **kwargs):
def get_object(self, Bucket=None, Key=None):
self.ClientError = MockClientError
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
return True
return {"ResponseMetadata": {"HTTPHeaders": {}}}
raise self.ClientError
def head_object(self, Bucket=None, Key=None):
self.ClientError = MockClientError
if Bucket == "my-bucket" and Key == "subdirectory/my-file":
return {"ResponseMetadata": {"HTTPHeaders": {}}}
raise self.ClientError

View File

@@ -18,7 +18,7 @@
import urllib.parse
from html.parser import HTMLParser
from urllib.error import URLError
from urllib.request import Request, urlopen
from urllib.request import HTTPSHandler, Request, build_opener
import llnl.util.lang
import llnl.util.tty as tty
@@ -27,6 +27,8 @@
import spack
import spack.config
import spack.error
import spack.gcs_handler
import spack.s3_handler
import spack.url
import spack.util.crypto
import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@
from spack.util.executable import CommandNotFoundError, which
from spack.util.path import convert_to_posix_path
def _urlopen():
s3 = spack.s3_handler.UrllibS3Handler()
gcs = spack.gcs_handler.GCSHandler()
# One opener with HTTPS ssl enabled
with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
# One opener with HTTPS ssl disabled
without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
# And dynamically dispatch based on the config:verify_ssl.
def dispatch_open(*args, **kwargs):
opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
return opener.open(*args, **kwargs)
return dispatch_open
#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
urlopen = llnl.util.lang.Singleton(_urlopen)
#: User-Agent used in Request objects
SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
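
A brief usage sketch of the opener singleton above (not part of the diff): because dispatch_open re-reads config:verify_ssl on every call, toggling the setting takes effect without rebuilding the openers. The override context manager is used here as in Spack's test suite, and the URL is a placeholder:

    from urllib.request import Request

    import spack.config
    import spack.util.web as web_util

    # SSL verification is decided per call, so this request goes through the
    # unverified-context opener while the override is active.
    with spack.config.override("config:verify_ssl", False):
        web_util.urlopen(Request("https://self-signed.example.com/"), timeout=10)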
@@ -60,86 +84,33 @@ def handle_starttag(self, tag, attrs):
self.links.append(val)
def uses_ssl(parsed_url):
if parsed_url.scheme == "https":
return True
if parsed_url.scheme == "s3":
endpoint_url = os.environ.get("S3_ENDPOINT_URL")
if not endpoint_url:
return True
if urllib.parse.urlparse(endpoint_url).scheme == "https":
return True
elif parsed_url.scheme == "gs":
tty.debug("(uses_ssl) GCS Blob is https")
return True
return False
def read_from_url(url, accept_content_type=None):
if isinstance(url, str):
url = urllib.parse.urlparse(url)
context = None
# Timeout in seconds for web requests
timeout = spack.config.get("config:connect_timeout", 10)
# Don't even bother with a context unless the URL scheme is one that uses
# SSL certs.
if uses_ssl(url):
if spack.config.get("config:verify_ssl"):
# User wants SSL verification, and it *can* be provided.
context = ssl.create_default_context()
else:
# User has explicitly indicated that they do not want SSL
# verification.
context = ssl._create_unverified_context()
url_scheme = url.scheme
url = url_util.format(url)
if sys.platform == "win32" and url_scheme == "file":
url = convert_to_posix_path(url)
req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
content_type = None
is_web_url = url_scheme in ("http", "https")
if accept_content_type and is_web_url:
# Make a HEAD request first to check the content type. This lets
# us ignore tarballs and gigantic files.
# It would be nice to do this with the HTTP Accept header to avoid
# one round-trip. However, most servers seem to ignore the header
# if you ask for a tarball with Accept: text/html.
req.get_method = lambda: "HEAD"
resp = _urlopen(req, timeout=timeout, context=context)
content_type = get_header(resp.headers, "Content-type")
# Do the real GET request when we know it's just HTML.
req.get_method = lambda: "GET"
request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})
try:
response = _urlopen(req, timeout=timeout, context=context)
response = urlopen(request, timeout=timeout)
except URLError as err:
raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
raise SpackWebError("Download failed: {}".format(str(err)))
if accept_content_type and not is_web_url:
content_type = get_header(response.headers, "Content-type")
if accept_content_type:
try:
content_type = get_header(response.headers, "Content-type")
reject_content_type = not content_type.startswith(accept_content_type)
except KeyError:
content_type = None
reject_content_type = True
reject_content_type = accept_content_type and (
content_type is None or not content_type.startswith(accept_content_type)
)
if reject_content_type:
tty.debug(
"ignoring page {0}{1}{2}".format(
url, " with content type " if content_type is not None else "", content_type or ""
)
)
return None, None, None
if reject_content_type:
msg = "ignoring page {}".format(url.geturl())
if content_type:
msg += " with content type {}".format(content_type)
tty.debug(msg)
return None, None, None
return response.geturl(), response.headers, response
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
method defined by `config:url_fetch_method` is used.
If the method is `curl`, it also uses the following configuration option:
* config:verify_ssl (str): Perform SSL verification
Otherwise, `urllib` will be used.
Arguments:
url (str): URL whose existence is being checked
curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
tty.debug("Checking existence of {0}".format(url))
url_result = urllib.parse.urlparse(url)
# Check if a local file
local_path = url_util.local_file_path(url_result)
if local_path:
return os.path.exists(local_path)
# Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
if url_result.scheme == "s3":
# Check for URL-specific connection information
s3 = s3_util.get_s3_session(url_result, method="fetch")
try:
s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
return True
except s3.ClientError as err:
if err.response["Error"]["Code"] == "NoSuchKey":
return False
raise err
# Check if Google Storage .. urllib-based fetch
if url_result.scheme == "gs":
gcs = gcs_util.GCSBlob(url_result)
return gcs.exists()
# Otherwise, use the configured fetch method
if spack.config.get("config:url_fetch_method") == "curl":
# Use curl if configured to do so
use_curl = spack.config.get(
"config:url_fetch_method", "urllib"
) == "curl" and url_result.scheme not in ("gs", "s3")
if use_curl:
curl_exe = _curl(curl)
if not curl_exe:
return False
@@ -402,13 +347,14 @@ def url_exists(url, curl=None):
_ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
return curl_exe.returncode == 0
# If we get here, then the only other fetch method option is urllib.
# So try to "read" from the URL and assume that *any* non-throwing
# response contains the resource represented by the URL.
# Otherwise use urllib.
try:
read_from_url(url)
urlopen(
Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
timeout=spack.config.get("config:connect_timeout", 10),
)
return True
except (SpackWebError, URLError) as e:
except URLError as e:
tty.debug("Failure reading URL: " + str(e))
return False
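
For illustration (not part of the diff), the unified urllib path means a single HEAD request decides existence for every remote scheme; a missing object surfaces as URLError and is reported as False. The bucket and object names below are placeholders:

    import spack.util.web as web_util

    # One HEAD request through the scheme-aware opener; a missing object
    # surfaces as URLError inside url_exists and is reported as False.
    if web_util.url_exists("s3://my-bucket/subdirectory/my-file"):
        print("object is present")
    else:
        print("object is missing")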
@@ -691,35 +637,6 @@ def _spider(url, collect_nested):
return pages, links
def _urlopen(req, *args, **kwargs):
"""Wrapper for compatibility with old versions of Python."""
url = req
try:
url = url.get_full_url()
except AttributeError:
pass
del kwargs["context"]
opener = urlopen
if urllib.parse.urlparse(url).scheme == "s3":
import spack.s3_handler
opener = spack.s3_handler.open
elif urllib.parse.urlparse(url).scheme == "gs":
import spack.gcs_handler
opener = spack.gcs_handler.gcs_open
try:
return opener(req, *args, **kwargs)
except TypeError as err:
# If the above fails because of 'context', call without 'context'.
if "context" in kwargs and "context" in str(err):
del kwargs["context"]
return opener(req, *args, **kwargs)
def find_versions_of_archive(
archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
):