Revert "Revert "Use urllib handler for s3:// and gs://, improve url_exists through HEAD requests (#34324)"" (#34498)

This reverts commit 8035eeb36d.

It also removes the logic that made an additional HEAD request to avoid a
more expensive GET request on a wrong content type. Since large files are
typically attachments and only downloaded when the stream is read, the
optimization did not help much, and the logic was in fact broken: the GET
request was issued unconditionally anyway.
Harmen Stoppels 2022-12-14 23:47:11 +01:00, committed by GitHub
parent 43e38d0d12
commit ea029442e6
4 changed files with 91 additions and 158 deletions

lib/spack/spack/gcs_handler.py

@@ -4,8 +4,8 @@
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)
 import urllib.parse
 import urllib.response
-import spack.util.web as web_util
+from urllib.error import URLError
+from urllib.request import BaseHandler


 def gcs_open(req, *args, **kwargs):
@@ -16,8 +16,13 @@ def gcs_open(req, *args, **kwargs):
     gcsblob = gcs_util.GCSBlob(url)

     if not gcsblob.exists():
-        raise web_util.SpackWebError("GCS blob {0} does not exist".format(gcsblob.blob_path))
+        raise URLError("GCS blob {0} does not exist".format(gcsblob.blob_path))
     stream = gcsblob.get_blob_byte_stream()
     headers = gcsblob.get_blob_headers()

     return urllib.response.addinfourl(stream, headers, url)
+
+
+class GCSHandler(BaseHandler):
+    def gs_open(self, req):
+        return gcs_open(req)
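For context on how this new handler gets used: urllib's OpenerDirector dispatches each request to any registered handler that defines a `<scheme>_open` method, so a `gs_open` method is all it takes for `gs://` URLs once the handler is passed to `build_opener` (see the web.py changes further down). A minimal, self-contained sketch of that dispatch mechanism, using a made-up `hello://` scheme rather than GCS:

```python
import io
import urllib.request
import urllib.response
from email.message import Message


class HelloHandler(urllib.request.BaseHandler):
    # OpenerDirector looks for a method named "<scheme>_open" on each handler,
    # which is exactly how GCSHandler.gs_open and UrllibS3Handler.s3_open are
    # found for gs:// and s3:// URLs.
    def hello_open(self, req):
        headers = Message()
        headers["Content-Type"] = "text/plain"
        body = io.BytesIO(b"hello from " + req.get_full_url().encode())
        return urllib.response.addinfourl(body, headers, req.get_full_url())


opener = urllib.request.build_opener(HelloHandler())
print(opener.open("hello://example").read())  # b'hello from hello://example'
```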

lib/spack/spack/s3_handler.py

@@ -7,7 +7,7 @@
 import urllib.parse
 import urllib.request
 import urllib.response
-from io import BufferedReader, IOBase
+from io import BufferedReader, BytesIO, IOBase

 import spack.util.s3 as s3_util
@@ -42,7 +42,7 @@ def __getattr__(self, key):
         return getattr(self.raw, key)


-def _s3_open(url):
+def _s3_open(url, method="GET"):
     parsed = urllib.parse.urlparse(url)
     s3 = s3_util.get_s3_session(url, method="fetch")
@@ -52,27 +52,29 @@ def _s3_open(url):
     if key.startswith("/"):
         key = key[1:]

-    obj = s3.get_object(Bucket=bucket, Key=key)
-
-    # NOTE(opadron): Apply workaround here (see above)
-    stream = WrapStream(obj["Body"])
+    if method not in ("GET", "HEAD"):
+        raise urllib.error.URLError(
+            "Only GET and HEAD verbs are currently supported for the s3:// scheme"
+        )
+
+    try:
+        if method == "GET":
+            obj = s3.get_object(Bucket=bucket, Key=key)
+            # NOTE(opadron): Apply workaround here (see above)
+            stream = WrapStream(obj["Body"])
+        elif method == "HEAD":
+            obj = s3.head_object(Bucket=bucket, Key=key)
+            stream = BytesIO()
+    except s3.ClientError as e:
+        raise urllib.error.URLError(e) from e

     headers = obj["ResponseMetadata"]["HTTPHeaders"]

     return url, headers, stream


-class UrllibS3Handler(urllib.request.HTTPSHandler):
+class UrllibS3Handler(urllib.request.BaseHandler):
     def s3_open(self, req):
         orig_url = req.get_full_url()
-        from botocore.exceptions import ClientError  # type: ignore[import]
-
-        try:
-            url, headers, stream = _s3_open(orig_url)
-            return urllib.response.addinfourl(stream, headers, url)
-        except ClientError as err:
-            raise urllib.error.URLError(err) from err
-
-
-S3OpenerDirector = urllib.request.build_opener(UrllibS3Handler())
-
-open = S3OpenerDirector.open
+        url, headers, stream = _s3_open(orig_url, method=req.get_method())
+        return urllib.response.addinfourl(stream, headers, url)
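With UrllibS3Handler reduced to a plain BaseHandler, the request's HTTP verb picks the boto3 call: GET maps to get_object and wraps the streaming body, while HEAD maps to head_object and returns the headers with an empty BytesIO body, so existence checks never download the object. A hedged usage sketch, assuming working AWS credentials; the bucket and key are illustrative:

```python
import urllib.request

import spack.s3_handler

opener = urllib.request.build_opener(spack.s3_handler.UrllibS3Handler())

# HEAD: served by s3.head_object; the body is an empty BytesIO, the headers are still populated.
head = opener.open(urllib.request.Request("s3://my-bucket/subdirectory/my-file", method="HEAD"))
print(dict(head.headers))

# GET: served by s3.get_object; the body streams the object contents.
data = opener.open("s3://my-bucket/subdirectory/my-file").read()
```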

lib/spack/spack/test/web.py

@@ -224,7 +224,10 @@ def paginate(self, *args, **kwargs):

 class MockClientError(Exception):
     def __init__(self):
-        self.response = {"Error": {"Code": "NoSuchKey"}}
+        self.response = {
+            "Error": {"Code": "NoSuchKey"},
+            "ResponseMetadata": {"HTTPStatusCode": 404},
+        }


 class MockS3Client(object):
@@ -243,7 +246,13 @@ def delete_object(self, *args, **kwargs):
     def get_object(self, Bucket=None, Key=None):
         self.ClientError = MockClientError
         if Bucket == "my-bucket" and Key == "subdirectory/my-file":
-            return True
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
+        raise self.ClientError
+
+    def head_object(self, Bucket=None, Key=None):
+        self.ClientError = MockClientError
+        if Bucket == "my-bucket" and Key == "subdirectory/my-file":
+            return {"ResponseMetadata": {"HTTPHeaders": {}}}
         raise self.ClientError
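The mock now returns the same ResponseMetadata/HTTPHeaders shape the handler reads, for both verbs: head_object backs the HEAD request that url_exists issues, get_object backs actual reads. A hypothetical pytest sketch of how such a mock could be wired in (the monkeypatch target exists, but the test name and wiring are illustrative, not copied from this test file):

```python
import spack.util.s3 as s3_util
import spack.util.web as web_util


def test_url_exists_via_head(monkeypatch):
    # Route every S3 session through the mock client defined above, so the
    # HEAD issued by url_exists lands in MockS3Client.head_object.
    monkeypatch.setattr(s3_util, "get_s3_session", lambda url, method="fetch": MockS3Client())

    assert web_util.url_exists("s3://my-bucket/subdirectory/my-file")
    assert not web_util.url_exists("s3://my-bucket/subdirectory/no-such-file")
```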

lib/spack/spack/util/web.py

@@ -18,7 +18,7 @@
 import urllib.parse
 from html.parser import HTMLParser
 from urllib.error import URLError
-from urllib.request import Request, urlopen
+from urllib.request import HTTPSHandler, Request, build_opener

 import llnl.util.lang
 import llnl.util.tty as tty
@@ -27,6 +27,8 @@
 import spack
 import spack.config
 import spack.error
+import spack.gcs_handler
+import spack.s3_handler
 import spack.url
 import spack.util.crypto
 import spack.util.gcs as gcs_util
@@ -36,6 +38,28 @@
 from spack.util.executable import CommandNotFoundError, which
 from spack.util.path import convert_to_posix_path

+
+def _urlopen():
+    s3 = spack.s3_handler.UrllibS3Handler()
+    gcs = spack.gcs_handler.GCSHandler()
+
+    # One opener with HTTPS ssl enabled
+    with_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl.create_default_context()))
+
+    # One opener with HTTPS ssl disabled
+    without_ssl = build_opener(s3, gcs, HTTPSHandler(context=ssl._create_unverified_context()))
+
+    # And dynamically dispatch based on the config:verify_ssl.
+    def dispatch_open(*args, **kwargs):
+        opener = with_ssl if spack.config.get("config:verify_ssl", True) else without_ssl
+        return opener.open(*args, **kwargs)
+
+    return dispatch_open
+
+
+#: Dispatches to the correct OpenerDirector.open, based on Spack configuration.
+urlopen = llnl.util.lang.Singleton(_urlopen)
+
+
 #: User-Agent used in Request objects
 SPACK_USER_AGENT = "Spackbot/{0}".format(spack.spack_version)
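The hunk above replaces per-call SSL context plumbing with two pre-built openers and a closure that picks one per call, wrapped in a llnl.util.lang.Singleton so nothing is constructed until the first request. A minimal, self-contained sketch of the same pattern, with a module-level flag standing in for config:verify_ssl and without the s3/gs handlers:

```python
import ssl
from urllib.request import HTTPSHandler, Request, build_opener

VERIFY_SSL = True  # stand-in for spack.config.get("config:verify_ssl", True)


def _urlopen():
    # Build both openers once; the closure re-reads the flag on every call,
    # so flipping it takes effect without rebuilding anything.
    with_ssl = build_opener(HTTPSHandler(context=ssl.create_default_context()))
    without_ssl = build_opener(HTTPSHandler(context=ssl._create_unverified_context()))

    def dispatch_open(*args, **kwargs):
        opener = with_ssl if VERIFY_SSL else without_ssl
        return opener.open(*args, **kwargs)

    return dispatch_open


urlopen = _urlopen()
response = urlopen(Request("https://example.com", method="HEAD"), timeout=10)
print(response.status, dict(response.headers))
```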
@@ -60,85 +84,32 @@ def handle_starttag(self, tag, attrs):
                     self.links.append(val)


-def uses_ssl(parsed_url):
-    if parsed_url.scheme == "https":
-        return True
-
-    if parsed_url.scheme == "s3":
-        endpoint_url = os.environ.get("S3_ENDPOINT_URL")
-        if not endpoint_url:
-            return True
-
-        if urllib.parse.urlparse(endpoint_url).scheme == "https":
-            return True
-
-    elif parsed_url.scheme == "gs":
-        tty.debug("(uses_ssl) GCS Blob is https")
-        return True
-
-    return False
-
-
 def read_from_url(url, accept_content_type=None):
     if isinstance(url, str):
         url = urllib.parse.urlparse(url)

-    context = None
-
     # Timeout in seconds for web requests
     timeout = spack.config.get("config:connect_timeout", 10)

-    # Don't even bother with a context unless the URL scheme is one that uses
-    # SSL certs.
-    if uses_ssl(url):
-        if spack.config.get("config:verify_ssl"):
-            # User wants SSL verification, and it *can* be provided.
-            context = ssl.create_default_context()
-        else:
-            # User has explicitly indicated that they do not want SSL
-            # verification.
-            context = ssl._create_unverified_context()
-
-    url_scheme = url.scheme
-    url = url_util.format(url)
-    if sys.platform == "win32" and url_scheme == "file":
-        url = convert_to_posix_path(url)
-
-    req = Request(url, headers={"User-Agent": SPACK_USER_AGENT})
-    content_type = None
-    is_web_url = url_scheme in ("http", "https")
-    if accept_content_type and is_web_url:
-        # Make a HEAD request first to check the content type. This lets
-        # us ignore tarballs and gigantic files.
-        # It would be nice to do this with the HTTP Accept header to avoid
-        # one round-trip. However, most servers seem to ignore the header
-        # if you ask for a tarball with Accept: text/html.
-        req.get_method = lambda: "HEAD"
-        resp = _urlopen(req, timeout=timeout, context=context)
-        content_type = get_header(resp.headers, "Content-type")
-
-        # Do the real GET request when we know it's just HTML.
-        req.get_method = lambda: "GET"
+    request = Request(url.geturl(), headers={"User-Agent": SPACK_USER_AGENT})

     try:
-        response = _urlopen(req, timeout=timeout, context=context)
+        response = urlopen(request, timeout=timeout)
     except URLError as err:
-        raise SpackWebError("Download failed: {ERROR}".format(ERROR=str(err)))
+        raise SpackWebError("Download failed: {}".format(str(err)))

-    if accept_content_type and not is_web_url:
-        content_type = get_header(response.headers, "Content-type")
-
-    reject_content_type = accept_content_type and (
-        content_type is None or not content_type.startswith(accept_content_type)
-    )
-
-    if reject_content_type:
-        tty.debug(
-            "ignoring page {0}{1}{2}".format(
-                url, " with content type " if content_type is not None else "", content_type or ""
-            )
-        )
-
-        return None, None, None
+    if accept_content_type:
+        try:
+            content_type = get_header(response.headers, "Content-type")
+            reject_content_type = not content_type.startswith(accept_content_type)
+        except KeyError:
+            content_type = None
+            reject_content_type = True
+
+        if reject_content_type:
+            msg = "ignoring page {}".format(url.geturl())
+            if content_type:
+                msg += " with content type {}".format(content_type)
+            tty.debug(msg)
+
+            return None, None, None

     return response.geturl(), response.headers, response
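The separate HEAD-then-GET dance is gone: read_from_url now issues a single GET through the shared opener and applies the content-type filter to that response, returning (None, None, None) on a mismatch so callers skip the page. A hedged usage sketch; the URL is illustrative:

```python
import spack.util.web as web_util

# With accept_content_type set, anything that is not text/html (a tarball, say)
# comes back as (None, None, None) instead of being handed to the caller.
url, headers, response = web_util.read_from_url(
    "https://example.com/downloads/index.html", accept_content_type="text/html"
)
if response is not None:
    page = response.read().decode("utf-8", errors="replace")
```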
@@ -349,12 +320,6 @@ def url_exists(url, curl=None):
     Simple Storage Service (`s3`) URLs; otherwise, the configured fetch
     method defined by `config:url_fetch_method` is used.

-    If the method is `curl`, it also uses the following configuration option:
-
-    * config:verify_ssl (str): Perform SSL verification
-
-    Otherwise, `urllib` will be used.
-
     Arguments:
         url (str): URL whose existence is being checked
         curl (spack.util.executable.Executable or None): (optional) curl
@@ -365,31 +330,11 @@ def url_exists(url, curl=None):
     tty.debug("Checking existence of {0}".format(url))
     url_result = urllib.parse.urlparse(url)

-    # Check if a local file
-    local_path = url_util.local_file_path(url_result)
-    if local_path:
-        return os.path.exists(local_path)
-
-    # Check if Amazon Simple Storage Service (S3) .. urllib-based fetch
-    if url_result.scheme == "s3":
-        # Check for URL-specific connection information
-        s3 = s3_util.get_s3_session(url_result, method="fetch")
-
-        try:
-            s3.get_object(Bucket=url_result.netloc, Key=url_result.path.lstrip("/"))
-            return True
-        except s3.ClientError as err:
-            if err.response["Error"]["Code"] == "NoSuchKey":
-                return False
-            raise err
-
-    # Check if Google Storage .. urllib-based fetch
-    if url_result.scheme == "gs":
-        gcs = gcs_util.GCSBlob(url_result)
-        return gcs.exists()
-
-    # Otherwise, use the configured fetch method
-    if spack.config.get("config:url_fetch_method") == "curl":
+    # Use curl if configured to do so
+    use_curl = spack.config.get(
+        "config:url_fetch_method", "urllib"
+    ) == "curl" and url_result.scheme not in ("gs", "s3")
+
+    if use_curl:
         curl_exe = _curl(curl)
         if not curl_exe:
             return False
@@ -402,13 +347,14 @@ def url_exists(url, curl=None):
         _ = curl_exe(*curl_args, fail_on_error=False, output=os.devnull)
         return curl_exe.returncode == 0

-    # If we get here, then the only other fetch method option is urllib.
-    # So try to "read" from the URL and assume that *any* non-throwing
-    # response contains the resource represented by the URL.
+    # Otherwise use urllib.
     try:
-        read_from_url(url)
+        urlopen(
+            Request(url, method="HEAD", headers={"User-Agent": SPACK_USER_AGENT}),
+            timeout=spack.config.get("config:connect_timeout", 10),
+        )
         return True
-    except (SpackWebError, URLError) as e:
+    except URLError as e:
         tty.debug("Failure reading URL: " + str(e))
         return False
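url_exists now answers every non-curl check with a single HEAD request through the shared opener, so http(s), s3:// and gs:// all take the same code path instead of scheme-specific client code. A hedged usage sketch; the URLs are illustrative:

```python
import spack.util.web as web_util

# Each call issues one HEAD request via the scheme-aware opener;
# curl is only used when config:url_fetch_method is set to "curl".
for url in ("https://example.com/archive.tar.gz", "s3://my-bucket/subdirectory/my-file"):
    print(url, web_util.url_exists(url))
```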
@@ -691,35 +637,6 @@ def _spider(url, collect_nested):
     return pages, links


-def _urlopen(req, *args, **kwargs):
-    """Wrapper for compatibility with old versions of Python."""
-    url = req
-    try:
-        url = url.get_full_url()
-    except AttributeError:
-        pass
-
-    del kwargs["context"]
-
-    opener = urlopen
-    if urllib.parse.urlparse(url).scheme == "s3":
-        import spack.s3_handler
-
-        opener = spack.s3_handler.open
-    elif urllib.parse.urlparse(url).scheme == "gs":
-        import spack.gcs_handler
-
-        opener = spack.gcs_handler.gcs_open
-
-    try:
-        return opener(req, *args, **kwargs)
-    except TypeError as err:
-        # If the above fails because of 'context', call without 'context'.
-        if "context" in kwargs and "context" in str(err):
-            del kwargs["context"]
-            return opener(req, *args, **kwargs)
-
-
 def find_versions_of_archive(
     archive_urls, list_url=None, list_depth=0, concurrency=32, reference_package=None
 ):