bugfix: spack.util.url.join() now handles absolute paths correctly (#13488)
* fix issue where spack.util.url.join() failed to correctly handle absolute path components * add url util tests
This commit is contained in:
parent
3bdab6f686
commit
0f816561db
303
lib/spack/spack/test/util/util_url.py
Normal file
303
lib/spack/spack/test/util/util_url.py
Normal file
@ -0,0 +1,303 @@
|
|||||||
|
# Copyright 2013-2019 Lawrence Livermore National Security, LLC and other
|
||||||
|
# Spack Project Developers. See the top-level COPYRIGHT file for details.
|
||||||
|
#
|
||||||
|
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
|
||||||
|
|
||||||
|
"""Test Spack's URL handling utility functions."""
|
||||||
|
import os
|
||||||
|
import os.path
|
||||||
|
import spack.util.url as url_util
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_parse():
|
||||||
|
parsed = url_util.parse('/path/to/resource')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
assert(parsed.path == '/path/to/resource')
|
||||||
|
|
||||||
|
parsed = url_util.parse('/path/to/resource', scheme='fake')
|
||||||
|
assert(parsed.scheme == 'fake')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
assert(parsed.path == '/path/to/resource')
|
||||||
|
|
||||||
|
parsed = url_util.parse('file:///path/to/resource')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
assert(parsed.path == '/path/to/resource')
|
||||||
|
|
||||||
|
parsed = url_util.parse('file:///path/to/resource', scheme='fake')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
assert(parsed.path == '/path/to/resource')
|
||||||
|
|
||||||
|
parsed = url_util.parse('file://path/to/resource')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
expected = os.path.abspath(os.path.join('path', 'to', 'resource'))
|
||||||
|
assert(parsed.path == expected)
|
||||||
|
|
||||||
|
parsed = url_util.parse('https://path/to/resource')
|
||||||
|
assert(parsed.scheme == 'https')
|
||||||
|
assert(parsed.netloc == 'path')
|
||||||
|
assert(parsed.path == '/to/resource')
|
||||||
|
|
||||||
|
spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
|
||||||
|
parsed = url_util.parse('$spack')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
assert(parsed.path == spack_root)
|
||||||
|
|
||||||
|
parsed = url_util.parse('/a/b/c/$spack')
|
||||||
|
assert(parsed.scheme == 'file')
|
||||||
|
assert(parsed.netloc == '')
|
||||||
|
expected = os.path.abspath(os.path.join(
|
||||||
|
'/', 'a', 'b', 'c', './' + spack_root))
|
||||||
|
assert(parsed.path == expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_local_file_path():
|
||||||
|
spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('/a/b/c.txt')
|
||||||
|
assert(lfp == '/a/b/c.txt')
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('file:///a/b/c.txt')
|
||||||
|
assert(lfp == '/a/b/c.txt')
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('file://a/b/c.txt')
|
||||||
|
expected = os.path.abspath(os.path.join('a', 'b', 'c.txt'))
|
||||||
|
assert(lfp == expected)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('$spack/a/b/c.txt')
|
||||||
|
expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
|
||||||
|
assert(lfp == expected)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('file:///$spack/a/b/c.txt')
|
||||||
|
expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
|
||||||
|
assert(lfp == expected)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('file://$spack/a/b/c.txt')
|
||||||
|
expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt'))
|
||||||
|
assert(lfp == expected)
|
||||||
|
|
||||||
|
# not a file:// URL - so no local file path
|
||||||
|
lfp = url_util.local_file_path('http:///a/b/c.txt')
|
||||||
|
assert(lfp is None)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('http://a/b/c.txt')
|
||||||
|
assert(lfp is None)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('http:///$spack/a/b/c.txt')
|
||||||
|
assert(lfp is None)
|
||||||
|
|
||||||
|
lfp = url_util.local_file_path('http://$spack/a/b/c.txt')
|
||||||
|
assert(lfp is None)
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_join_local_paths():
|
||||||
|
# Resolve local link against page URL
|
||||||
|
|
||||||
|
# wrong:
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
's3://bucket/index.html',
|
||||||
|
'../other-bucket/document.txt')
|
||||||
|
==
|
||||||
|
's3://bucket/other-bucket/document.txt')
|
||||||
|
|
||||||
|
# correct - need to specify resolve_href=True:
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
's3://bucket/index.html',
|
||||||
|
'../other-bucket/document.txt',
|
||||||
|
resolve_href=True)
|
||||||
|
==
|
||||||
|
's3://other-bucket/document.txt')
|
||||||
|
|
||||||
|
# same as above: make sure several components are joined together correctly
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
# with resolve_href=True, first arg is the base url; can not be
|
||||||
|
# broken up
|
||||||
|
's3://bucket/index.html',
|
||||||
|
|
||||||
|
# with resolve_href=True, remaining arguments are the components of
|
||||||
|
# the local href that needs to be resolved
|
||||||
|
'..', 'other-bucket', 'document.txt',
|
||||||
|
resolve_href=True)
|
||||||
|
==
|
||||||
|
's3://other-bucket/document.txt')
|
||||||
|
|
||||||
|
# Append local path components to prefix URL
|
||||||
|
|
||||||
|
# wrong:
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
'https://mirror.spack.io/build_cache',
|
||||||
|
'my-package',
|
||||||
|
resolve_href=True)
|
||||||
|
==
|
||||||
|
'https://mirror.spack.io/my-package')
|
||||||
|
|
||||||
|
# correct - Need to specify resolve_href=False:
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
'https://mirror.spack.io/build_cache',
|
||||||
|
'my-package',
|
||||||
|
resolve_href=False)
|
||||||
|
==
|
||||||
|
'https://mirror.spack.io/build_cache/my-package')
|
||||||
|
|
||||||
|
# same as above; make sure resolve_href=False is default
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
'https://mirror.spack.io/build_cache',
|
||||||
|
'my-package')
|
||||||
|
==
|
||||||
|
'https://mirror.spack.io/build_cache/my-package')
|
||||||
|
|
||||||
|
# same as above: make sure several components are joined together correctly
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
# with resolve_href=False, first arg is just a prefix. No
|
||||||
|
# resolution is done. So, there should be no difference between
|
||||||
|
# join('/a/b/c', 'd/e'),
|
||||||
|
# join('/a/b', 'c', 'd/e'),
|
||||||
|
# join('/a', 'b/c', 'd', 'e'), etc.
|
||||||
|
'https://mirror.spack.io',
|
||||||
|
'build_cache',
|
||||||
|
'my-package')
|
||||||
|
==
|
||||||
|
'https://mirror.spack.io/build_cache/my-package')
|
||||||
|
|
||||||
|
# file:// URL path components are *NOT* canonicalized
|
||||||
|
spack_root = os.path.abspath(os.environ['SPACK_ROOT'])
|
||||||
|
|
||||||
|
join_result = url_util.join('/a/b/c', '$spack')
|
||||||
|
assert(join_result == 'file:///a/b/c/$spack') # not canonicalized
|
||||||
|
format_result = url_util.format(join_result)
|
||||||
|
# canoncalize by hand
|
||||||
|
expected = url_util.format(os.path.abspath(os.path.join(
|
||||||
|
'/', 'a', 'b', 'c', '.' + spack_root)))
|
||||||
|
assert(format_result == expected)
|
||||||
|
|
||||||
|
# see test_url_join_absolute_paths() for more on absolute path components
|
||||||
|
join_result = url_util.join('/a/b/c', '/$spack')
|
||||||
|
assert(join_result == 'file:///$spack') # not canonicalized
|
||||||
|
format_result = url_util.format(join_result)
|
||||||
|
expected = url_util.format(spack_root)
|
||||||
|
assert(format_result == expected)
|
||||||
|
|
||||||
|
# For s3:// URLs, the "netloc" (bucket) is considered part of the path.
|
||||||
|
# Make sure join() can cross bucket boundaries in this case.
|
||||||
|
args = ['s3://bucket/a/b', 'new-bucket', 'c']
|
||||||
|
assert(url_util.join(*args) == 's3://bucket/a/b/new-bucket/c')
|
||||||
|
|
||||||
|
args.insert(1, '..')
|
||||||
|
assert(url_util.join(*args) == 's3://bucket/a/new-bucket/c')
|
||||||
|
|
||||||
|
args.insert(1, '..')
|
||||||
|
assert(url_util.join(*args) == 's3://bucket/new-bucket/c')
|
||||||
|
|
||||||
|
# new-bucket is now the "netloc" (bucket name)
|
||||||
|
args.insert(1, '..')
|
||||||
|
assert(url_util.join(*args) == 's3://new-bucket/c')
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_join_absolute_paths():
|
||||||
|
# Handling absolute path components is a little tricky. To this end, we
|
||||||
|
# distinguish "absolute path components", from the more-familiar concept of
|
||||||
|
# "absolute paths" as they are understood for local filesystem paths.
|
||||||
|
#
|
||||||
|
# - All absolute paths are absolute path components. Joining a URL with
|
||||||
|
# these components has the effect of completely replacing the path of the
|
||||||
|
# URL with the absolute path. These components do not specify a URL
|
||||||
|
# scheme, so the scheme of the URL procuced when joining them depend on
|
||||||
|
# those provided by components that came before it (file:// assumed if no
|
||||||
|
# such scheme is provided).
|
||||||
|
|
||||||
|
# For eaxmple:
|
||||||
|
p = '/path/to/resource'
|
||||||
|
# ...is an absolute path
|
||||||
|
|
||||||
|
# http:// URL
|
||||||
|
assert(
|
||||||
|
url_util.join('http://example.com/a/b/c', p)
|
||||||
|
== 'http://example.com/path/to/resource')
|
||||||
|
|
||||||
|
# s3:// URL
|
||||||
|
# also notice how the netloc is treated as part of the path for s3:// URLs
|
||||||
|
assert(
|
||||||
|
url_util.join('s3://example.com/a/b/c', p)
|
||||||
|
== 's3://path/to/resource')
|
||||||
|
|
||||||
|
# - URL components that specify a scheme are always absolute path
|
||||||
|
# components. Joining a base URL with these components effectively
|
||||||
|
# discards the base URL and "resets" the joining logic starting at the
|
||||||
|
# component in question and using it as the new base URL.
|
||||||
|
|
||||||
|
# For eaxmple:
|
||||||
|
p = 'http://example.com/path/to'
|
||||||
|
# ...is an http:// URL
|
||||||
|
|
||||||
|
join_result = url_util.join(p, 'resource')
|
||||||
|
assert(join_result == 'http://example.com/path/to/resource')
|
||||||
|
|
||||||
|
# works as if everything before the http:// URL was left out
|
||||||
|
assert(
|
||||||
|
url_util.join(
|
||||||
|
'literally', 'does', 'not', 'matter',
|
||||||
|
p, 'resource')
|
||||||
|
== join_result)
|
||||||
|
|
||||||
|
# It's important to keep in mind that this logic applies even if the
|
||||||
|
# component's path is not an absolute path!
|
||||||
|
|
||||||
|
# For eaxmple:
|
||||||
|
p = './d'
|
||||||
|
# ...is *NOT* an absolute path
|
||||||
|
# ...is also *NOT* an absolute path component
|
||||||
|
|
||||||
|
u = 'file://./d'
|
||||||
|
# ...is a URL
|
||||||
|
# The path of this URL is *NOT* an absolute path
|
||||||
|
# HOWEVER, the URL, itself, *is* an absolute path component
|
||||||
|
|
||||||
|
# (We just need...
|
||||||
|
cwd = os.getcwd()
|
||||||
|
# ...to work out what resource it points to)
|
||||||
|
|
||||||
|
# So, even though parse() assumes "file://" URL, the scheme is still
|
||||||
|
# significant in URL path components passed to join(), even if the base
|
||||||
|
# is a file:// URL.
|
||||||
|
|
||||||
|
path_join_result = 'file:///a/b/c/d'
|
||||||
|
assert(url_util.join('/a/b/c', p) == path_join_result)
|
||||||
|
assert(url_util.join('file:///a/b/c', p) == path_join_result)
|
||||||
|
|
||||||
|
url_join_result = 'file://{CWD}/d'.format(CWD=cwd)
|
||||||
|
assert(url_util.join('/a/b/c', u) == url_join_result)
|
||||||
|
assert(url_util.join('file:///a/b/c', u) == url_join_result)
|
||||||
|
|
||||||
|
# Finally, resolve_href should have no effect for how absolute path
|
||||||
|
# components are handled because local hrefs can not be absolute path
|
||||||
|
# components.
|
||||||
|
args = ['s3://does', 'not', 'matter',
|
||||||
|
'http://example.com',
|
||||||
|
'also', 'does', 'not', 'matter',
|
||||||
|
'/path']
|
||||||
|
|
||||||
|
expected = 'http://example.com/path'
|
||||||
|
assert(url_util.join(*args, resolve_href=True) == expected)
|
||||||
|
assert(url_util.join(*args, resolve_href=False) == expected)
|
||||||
|
|
||||||
|
# resolve_href only matters for the local path components at the end of the
|
||||||
|
# argument list.
|
||||||
|
args[-1] = '/path/to/page'
|
||||||
|
args.extend(('..', '..', 'resource'))
|
||||||
|
|
||||||
|
assert(url_util.join(*args, resolve_href=True) ==
|
||||||
|
'http://example.com/resource')
|
||||||
|
|
||||||
|
assert(url_util.join(*args, resolve_href=False) ==
|
||||||
|
'http://example.com/path/resource')
|
@ -51,7 +51,7 @@ def local_file_path(url):
|
|||||||
|
|
||||||
|
|
||||||
def parse(url, scheme='file'):
|
def parse(url, scheme='file'):
|
||||||
"""Parse a mirror url.
|
"""Parse a url.
|
||||||
|
|
||||||
For file:// URLs, the netloc and path components are concatenated and
|
For file:// URLs, the netloc and path components are concatenated and
|
||||||
passed through spack.util.path.canoncalize_path().
|
passed through spack.util.path.canoncalize_path().
|
||||||
@ -105,6 +105,9 @@ def join(base_url, path, *extra, **kwargs):
|
|||||||
If resolve_href is False (default), then the URL path components are joined
|
If resolve_href is False (default), then the URL path components are joined
|
||||||
as in os.path.join().
|
as in os.path.join().
|
||||||
|
|
||||||
|
Note: file:// URL path components are not canonicalized as part of this
|
||||||
|
operation. To canonicalize, pass the joined url to format().
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
base_url = 's3://bucket/index.html'
|
base_url = 's3://bucket/index.html'
|
||||||
body = fetch_body(prefix)
|
body = fetch_body(prefix)
|
||||||
@ -127,7 +130,79 @@ def join(base_url, path, *extra, **kwargs):
|
|||||||
# correct - simply append additional URL path components
|
# correct - simply append additional URL path components
|
||||||
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
|
spack.util.url.join(prefix, 'my-package', resolve_href=False) # default
|
||||||
'https://mirror.spack.io/build_cache/my-package'
|
'https://mirror.spack.io/build_cache/my-package'
|
||||||
|
|
||||||
|
# For canonicalizing file:// URLs, take care to explicitly differentiate
|
||||||
|
# between absolute and relative join components.
|
||||||
|
|
||||||
|
# '$spack' is not an absolute path component
|
||||||
|
join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result
|
||||||
|
'file:///a/b/c/$spack'
|
||||||
|
spack.util.url.format(join_result)
|
||||||
|
'file:///a/b/c/opt/spack'
|
||||||
|
|
||||||
|
# '/$spack' *is* an absolute path component
|
||||||
|
join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result
|
||||||
|
'file:///$spack'
|
||||||
|
spack.util.url.format(join_result)
|
||||||
|
'file:///opt/spack'
|
||||||
"""
|
"""
|
||||||
|
paths = [
|
||||||
|
(x if isinstance(x, string_types) else x.geturl())
|
||||||
|
for x in itertools.chain((base_url, path), extra)]
|
||||||
|
n = len(paths)
|
||||||
|
last_abs_component = None
|
||||||
|
scheme = None
|
||||||
|
for i in range(n - 1, -1, -1):
|
||||||
|
obj = urllib_parse.urlparse(
|
||||||
|
paths[i], scheme=None, allow_fragments=False)
|
||||||
|
|
||||||
|
scheme = obj.scheme
|
||||||
|
|
||||||
|
# in either case the component is absolute
|
||||||
|
if scheme is not None or obj.path.startswith('/'):
|
||||||
|
if scheme is None:
|
||||||
|
# Without a scheme, we have to go back looking for the
|
||||||
|
# next-last component that specifies a scheme.
|
||||||
|
for j in range(i - 1, -1, -1):
|
||||||
|
obj = urllib_parse.urlparse(
|
||||||
|
paths[j], scheme=None, allow_fragments=False)
|
||||||
|
|
||||||
|
if obj.scheme:
|
||||||
|
paths[i] = '{SM}://{NL}{PATH}'.format(
|
||||||
|
SM=obj.scheme,
|
||||||
|
NL=(
|
||||||
|
(obj.netloc + '/')
|
||||||
|
if obj.scheme != 's3' else ''),
|
||||||
|
PATH=paths[i][1:])
|
||||||
|
break
|
||||||
|
|
||||||
|
last_abs_component = i
|
||||||
|
break
|
||||||
|
|
||||||
|
if last_abs_component is not None:
|
||||||
|
paths = paths[last_abs_component:]
|
||||||
|
if len(paths) == 1:
|
||||||
|
result = urllib_parse.urlparse(
|
||||||
|
paths[0], scheme='file', allow_fragments=False)
|
||||||
|
|
||||||
|
# another subtlety: If the last argument to join() is an absolute
|
||||||
|
# file:// URL component with a relative path, the relative path
|
||||||
|
# needs to be resolved.
|
||||||
|
if result.scheme == 'file' and result.netloc:
|
||||||
|
result = urllib_parse.ParseResult(
|
||||||
|
scheme=result.scheme,
|
||||||
|
netloc='',
|
||||||
|
path=os.path.abspath(result.netloc + result.path),
|
||||||
|
params=result.params,
|
||||||
|
query=result.query,
|
||||||
|
fragment=None)
|
||||||
|
|
||||||
|
return result.geturl()
|
||||||
|
|
||||||
|
return _join(*paths, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _join(base_url, path, *extra, **kwargs):
|
||||||
base_url = parse(base_url)
|
base_url = parse(base_url)
|
||||||
resolve_href = kwargs.get('resolve_href', False)
|
resolve_href = kwargs.get('resolve_href', False)
|
||||||
|
|
||||||
|
@ -477,7 +477,6 @@ def spider(root, depth=0):
|
|||||||
performance over a sequential fetch.
|
performance over a sequential fetch.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
root = url_util.parse(root)
|
root = url_util.parse(root)
|
||||||
pages, links = _spider(root, set(), root, 0, depth, False)
|
pages, links = _spider(root, set(), root, 0, depth, False)
|
||||||
return pages, links
|
return pages, links
|
||||||
|
Loading…
Reference in New Issue
Block a user