bugfix: spack.util.url.join() now handles absolute paths correctly (#13488)
* fix issue where spack.util.url.join() failed to correctly handle absolute path components * add url util tests
This commit is contained in:
		 Omar Padron
					Omar Padron
				
			
				
					committed by
					
						 Todd Gamblin
						Todd Gamblin
					
				
			
			
				
	
			
			
			 Todd Gamblin
						Todd Gamblin
					
				
			
						parent
						
							2a9d6b9fbf
						
					
				
				
					commit
					01a0d554f5
				
			
							
								
								
									
										303
									
								
								lib/spack/spack/test/util/util_url.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										303
									
								
								lib/spack/spack/test/util/util_url.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,303 @@ | |||||||
|  | # Copyright 2013-2019 Lawrence Livermore National Security, LLC and other | ||||||
|  | # Spack Project Developers. See the top-level COPYRIGHT file for details. | ||||||
|  | # | ||||||
|  | # SPDX-License-Identifier: (Apache-2.0 OR MIT) | ||||||
|  |  | ||||||
|  | """Test Spack's URL handling utility functions.""" | ||||||
|  | import os | ||||||
|  | import os.path | ||||||
|  | import spack.util.url as url_util | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_url_parse(): | ||||||
|  |     parsed = url_util.parse('/path/to/resource') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     assert(parsed.path == '/path/to/resource') | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('/path/to/resource', scheme='fake') | ||||||
|  |     assert(parsed.scheme == 'fake') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     assert(parsed.path == '/path/to/resource') | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('file:///path/to/resource') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     assert(parsed.path == '/path/to/resource') | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('file:///path/to/resource', scheme='fake') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     assert(parsed.path == '/path/to/resource') | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('file://path/to/resource') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     expected = os.path.abspath(os.path.join('path', 'to', 'resource')) | ||||||
|  |     assert(parsed.path == expected) | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('https://path/to/resource') | ||||||
|  |     assert(parsed.scheme == 'https') | ||||||
|  |     assert(parsed.netloc == 'path') | ||||||
|  |     assert(parsed.path == '/to/resource') | ||||||
|  |  | ||||||
|  |     spack_root = os.path.abspath(os.environ['SPACK_ROOT']) | ||||||
|  |     parsed = url_util.parse('$spack') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     assert(parsed.path == spack_root) | ||||||
|  |  | ||||||
|  |     parsed = url_util.parse('/a/b/c/$spack') | ||||||
|  |     assert(parsed.scheme == 'file') | ||||||
|  |     assert(parsed.netloc == '') | ||||||
|  |     expected = os.path.abspath(os.path.join( | ||||||
|  |         '/', 'a', 'b', 'c', './' + spack_root)) | ||||||
|  |     assert(parsed.path == expected) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_url_local_file_path(): | ||||||
|  |     spack_root = os.path.abspath(os.environ['SPACK_ROOT']) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('/a/b/c.txt') | ||||||
|  |     assert(lfp == '/a/b/c.txt') | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('file:///a/b/c.txt') | ||||||
|  |     assert(lfp == '/a/b/c.txt') | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('file://a/b/c.txt') | ||||||
|  |     expected = os.path.abspath(os.path.join('a', 'b', 'c.txt')) | ||||||
|  |     assert(lfp == expected) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('$spack/a/b/c.txt') | ||||||
|  |     expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) | ||||||
|  |     assert(lfp == expected) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('file:///$spack/a/b/c.txt') | ||||||
|  |     expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) | ||||||
|  |     assert(lfp == expected) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('file://$spack/a/b/c.txt') | ||||||
|  |     expected = os.path.abspath(os.path.join(spack_root, 'a', 'b', 'c.txt')) | ||||||
|  |     assert(lfp == expected) | ||||||
|  |  | ||||||
|  |     # not a file:// URL - so no local file path | ||||||
|  |     lfp = url_util.local_file_path('http:///a/b/c.txt') | ||||||
|  |     assert(lfp is None) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('http://a/b/c.txt') | ||||||
|  |     assert(lfp is None) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('http:///$spack/a/b/c.txt') | ||||||
|  |     assert(lfp is None) | ||||||
|  |  | ||||||
|  |     lfp = url_util.local_file_path('http://$spack/a/b/c.txt') | ||||||
|  |     assert(lfp is None) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_url_join_local_paths(): | ||||||
|  |     # Resolve local link against page URL | ||||||
|  |  | ||||||
|  |     # wrong: | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             's3://bucket/index.html', | ||||||
|  |             '../other-bucket/document.txt') | ||||||
|  |         == | ||||||
|  |         's3://bucket/other-bucket/document.txt') | ||||||
|  |  | ||||||
|  |     # correct - need to specify resolve_href=True: | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             's3://bucket/index.html', | ||||||
|  |             '../other-bucket/document.txt', | ||||||
|  |             resolve_href=True) | ||||||
|  |         == | ||||||
|  |         's3://other-bucket/document.txt') | ||||||
|  |  | ||||||
|  |     # same as above: make sure several components are joined together correctly | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             # with resolve_href=True, first arg is the base url; can not be | ||||||
|  |             # broken up | ||||||
|  |             's3://bucket/index.html', | ||||||
|  |  | ||||||
|  |             # with resolve_href=True, remaining arguments are the components of | ||||||
|  |             # the local href that needs to be resolved | ||||||
|  |             '..', 'other-bucket', 'document.txt', | ||||||
|  |             resolve_href=True) | ||||||
|  |         == | ||||||
|  |         's3://other-bucket/document.txt') | ||||||
|  |  | ||||||
|  |     # Append local path components to prefix URL | ||||||
|  |  | ||||||
|  |     # wrong: | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             'https://mirror.spack.io/build_cache', | ||||||
|  |             'my-package', | ||||||
|  |             resolve_href=True) | ||||||
|  |         == | ||||||
|  |         'https://mirror.spack.io/my-package') | ||||||
|  |  | ||||||
|  |     # correct - Need to specify resolve_href=False: | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             'https://mirror.spack.io/build_cache', | ||||||
|  |             'my-package', | ||||||
|  |             resolve_href=False) | ||||||
|  |         == | ||||||
|  |         'https://mirror.spack.io/build_cache/my-package') | ||||||
|  |  | ||||||
|  |     # same as above; make sure resolve_href=False is default | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             'https://mirror.spack.io/build_cache', | ||||||
|  |             'my-package') | ||||||
|  |         == | ||||||
|  |         'https://mirror.spack.io/build_cache/my-package') | ||||||
|  |  | ||||||
|  |     # same as above: make sure several components are joined together correctly | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             # with resolve_href=False, first arg is just a prefix. No | ||||||
|  |             # resolution is done.  So, there should be no difference between | ||||||
|  |             # join('/a/b/c', 'd/e'), | ||||||
|  |             # join('/a/b', 'c', 'd/e'), | ||||||
|  |             # join('/a', 'b/c', 'd', 'e'), etc. | ||||||
|  |             'https://mirror.spack.io', | ||||||
|  |             'build_cache', | ||||||
|  |             'my-package') | ||||||
|  |         == | ||||||
|  |         'https://mirror.spack.io/build_cache/my-package') | ||||||
|  |  | ||||||
|  |     # file:// URL path components are *NOT* canonicalized | ||||||
|  |     spack_root = os.path.abspath(os.environ['SPACK_ROOT']) | ||||||
|  |  | ||||||
|  |     join_result = url_util.join('/a/b/c', '$spack') | ||||||
|  |     assert(join_result == 'file:///a/b/c/$spack')  # not canonicalized | ||||||
|  |     format_result = url_util.format(join_result) | ||||||
|  |     # canoncalize by hand | ||||||
|  |     expected = url_util.format(os.path.abspath(os.path.join( | ||||||
|  |         '/', 'a', 'b', 'c', '.' + spack_root))) | ||||||
|  |     assert(format_result == expected) | ||||||
|  |  | ||||||
|  |     # see test_url_join_absolute_paths() for more on absolute path components | ||||||
|  |     join_result = url_util.join('/a/b/c', '/$spack') | ||||||
|  |     assert(join_result == 'file:///$spack')  # not canonicalized | ||||||
|  |     format_result = url_util.format(join_result) | ||||||
|  |     expected = url_util.format(spack_root) | ||||||
|  |     assert(format_result == expected) | ||||||
|  |  | ||||||
|  |     # For s3:// URLs, the "netloc" (bucket) is considered part of the path. | ||||||
|  |     # Make sure join() can cross bucket boundaries in this case. | ||||||
|  |     args = ['s3://bucket/a/b', 'new-bucket', 'c'] | ||||||
|  |     assert(url_util.join(*args) == 's3://bucket/a/b/new-bucket/c') | ||||||
|  |  | ||||||
|  |     args.insert(1, '..') | ||||||
|  |     assert(url_util.join(*args) == 's3://bucket/a/new-bucket/c') | ||||||
|  |  | ||||||
|  |     args.insert(1, '..') | ||||||
|  |     assert(url_util.join(*args) == 's3://bucket/new-bucket/c') | ||||||
|  |  | ||||||
|  |     # new-bucket is now the "netloc" (bucket name) | ||||||
|  |     args.insert(1, '..') | ||||||
|  |     assert(url_util.join(*args) == 's3://new-bucket/c') | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_url_join_absolute_paths(): | ||||||
|  |     # Handling absolute path components is a little tricky.  To this end, we | ||||||
|  |     # distinguish "absolute path components", from the more-familiar concept of | ||||||
|  |     # "absolute paths" as they are understood for local filesystem paths. | ||||||
|  |     # | ||||||
|  |     # - All absolute paths are absolute path components.  Joining a URL with | ||||||
|  |     #   these components has the effect of completely replacing the path of the | ||||||
|  |     #   URL with the absolute path.  These components do not specify a URL | ||||||
|  |     #   scheme, so the scheme of the URL procuced when joining them depend on | ||||||
|  |     #   those provided by components that came before it (file:// assumed if no | ||||||
|  |     #   such scheme is provided). | ||||||
|  |  | ||||||
|  |     # For eaxmple: | ||||||
|  |     p = '/path/to/resource' | ||||||
|  |     # ...is an absolute path | ||||||
|  |  | ||||||
|  |     # http:// URL | ||||||
|  |     assert( | ||||||
|  |         url_util.join('http://example.com/a/b/c', p) | ||||||
|  |         == 'http://example.com/path/to/resource') | ||||||
|  |  | ||||||
|  |     # s3:// URL | ||||||
|  |     # also notice how the netloc is treated as part of the path for s3:// URLs | ||||||
|  |     assert( | ||||||
|  |         url_util.join('s3://example.com/a/b/c', p) | ||||||
|  |         == 's3://path/to/resource') | ||||||
|  |  | ||||||
|  |     # - URL components that specify a scheme are always absolute path | ||||||
|  |     #   components.  Joining a base URL with these components effectively | ||||||
|  |     #   discards the base URL and "resets" the joining logic starting at the | ||||||
|  |     #   component in question and using it as the new base URL. | ||||||
|  |  | ||||||
|  |     # For eaxmple: | ||||||
|  |     p = 'http://example.com/path/to' | ||||||
|  |     # ...is an http:// URL | ||||||
|  |  | ||||||
|  |     join_result = url_util.join(p, 'resource') | ||||||
|  |     assert(join_result == 'http://example.com/path/to/resource') | ||||||
|  |  | ||||||
|  |     # works as if everything before the http:// URL was left out | ||||||
|  |     assert( | ||||||
|  |         url_util.join( | ||||||
|  |             'literally', 'does', 'not', 'matter', | ||||||
|  |             p, 'resource') | ||||||
|  |         == join_result) | ||||||
|  |  | ||||||
|  |     # It's important to keep in mind that this logic applies even if the | ||||||
|  |     # component's path is not an absolute path! | ||||||
|  |  | ||||||
|  |     # For eaxmple: | ||||||
|  |     p = './d' | ||||||
|  |     # ...is *NOT* an absolute path | ||||||
|  |     # ...is also *NOT* an absolute path component | ||||||
|  |  | ||||||
|  |     u = 'file://./d' | ||||||
|  |     # ...is a URL | ||||||
|  |     #     The path of this URL is *NOT* an absolute path | ||||||
|  |     #     HOWEVER, the URL, itself, *is* an absolute path component | ||||||
|  |  | ||||||
|  |     # (We just need... | ||||||
|  |     cwd = os.getcwd() | ||||||
|  |     # ...to work out what resource it points to) | ||||||
|  |  | ||||||
|  |     # So, even though parse() assumes "file://" URL, the scheme is still | ||||||
|  |     # significant in URL path components passed to join(), even if the base | ||||||
|  |     # is a file:// URL. | ||||||
|  |  | ||||||
|  |     path_join_result = 'file:///a/b/c/d' | ||||||
|  |     assert(url_util.join('/a/b/c', p) == path_join_result) | ||||||
|  |     assert(url_util.join('file:///a/b/c', p) == path_join_result) | ||||||
|  |  | ||||||
|  |     url_join_result = 'file://{CWD}/d'.format(CWD=cwd) | ||||||
|  |     assert(url_util.join('/a/b/c', u) == url_join_result) | ||||||
|  |     assert(url_util.join('file:///a/b/c', u) == url_join_result) | ||||||
|  |  | ||||||
|  |     # Finally, resolve_href should have no effect for how absolute path | ||||||
|  |     # components are handled because local hrefs can not be absolute path | ||||||
|  |     # components. | ||||||
|  |     args = ['s3://does', 'not', 'matter', | ||||||
|  |             'http://example.com', | ||||||
|  |             'also', 'does', 'not', 'matter', | ||||||
|  |             '/path'] | ||||||
|  |  | ||||||
|  |     expected = 'http://example.com/path' | ||||||
|  |     assert(url_util.join(*args, resolve_href=True) == expected) | ||||||
|  |     assert(url_util.join(*args, resolve_href=False) == expected) | ||||||
|  |  | ||||||
|  |     # resolve_href only matters for the local path components at the end of the | ||||||
|  |     # argument list. | ||||||
|  |     args[-1] = '/path/to/page' | ||||||
|  |     args.extend(('..', '..', 'resource')) | ||||||
|  |  | ||||||
|  |     assert(url_util.join(*args, resolve_href=True) == | ||||||
|  |            'http://example.com/resource') | ||||||
|  |  | ||||||
|  |     assert(url_util.join(*args, resolve_href=False) == | ||||||
|  |            'http://example.com/path/resource') | ||||||
| @@ -51,7 +51,7 @@ def local_file_path(url): | |||||||
|  |  | ||||||
|  |  | ||||||
| def parse(url, scheme='file'): | def parse(url, scheme='file'): | ||||||
|     """Parse a mirror url. |     """Parse a url. | ||||||
|  |  | ||||||
|     For file:// URLs, the netloc and path components are concatenated and |     For file:// URLs, the netloc and path components are concatenated and | ||||||
|     passed through spack.util.path.canoncalize_path(). |     passed through spack.util.path.canoncalize_path(). | ||||||
| @@ -105,6 +105,9 @@ def join(base_url, path, *extra, **kwargs): | |||||||
|     If resolve_href is False (default), then the URL path components are joined |     If resolve_href is False (default), then the URL path components are joined | ||||||
|     as in os.path.join(). |     as in os.path.join(). | ||||||
|  |  | ||||||
|  |     Note: file:// URL path components are not canonicalized as part of this | ||||||
|  |     operation.  To canonicalize, pass the joined url to format(). | ||||||
|  |  | ||||||
|     Examples: |     Examples: | ||||||
|       base_url = 's3://bucket/index.html' |       base_url = 's3://bucket/index.html' | ||||||
|       body = fetch_body(prefix) |       body = fetch_body(prefix) | ||||||
| @@ -127,7 +130,79 @@ def join(base_url, path, *extra, **kwargs): | |||||||
|       # correct - simply append additional URL path components |       # correct - simply append additional URL path components | ||||||
|       spack.util.url.join(prefix, 'my-package', resolve_href=False) # default |       spack.util.url.join(prefix, 'my-package', resolve_href=False) # default | ||||||
|       'https://mirror.spack.io/build_cache/my-package' |       'https://mirror.spack.io/build_cache/my-package' | ||||||
|  |  | ||||||
|  |       # For canonicalizing file:// URLs, take care to explicitly differentiate | ||||||
|  |       # between absolute and relative join components. | ||||||
|  |  | ||||||
|  |       # '$spack' is not an absolute path component | ||||||
|  |       join_result = spack.util.url.join('/a/b/c', '$spack') ; join_result | ||||||
|  |       'file:///a/b/c/$spack' | ||||||
|  |       spack.util.url.format(join_result) | ||||||
|  |       'file:///a/b/c/opt/spack' | ||||||
|  |  | ||||||
|  |       # '/$spack' *is* an absolute path component | ||||||
|  |       join_result = spack.util.url.join('/a/b/c', '/$spack') ; join_result | ||||||
|  |       'file:///$spack' | ||||||
|  |       spack.util.url.format(join_result) | ||||||
|  |       'file:///opt/spack' | ||||||
|     """ |     """ | ||||||
|  |     paths = [ | ||||||
|  |         (x if isinstance(x, string_types) else x.geturl()) | ||||||
|  |         for x in itertools.chain((base_url, path), extra)] | ||||||
|  |     n = len(paths) | ||||||
|  |     last_abs_component = None | ||||||
|  |     scheme = None | ||||||
|  |     for i in range(n - 1, -1, -1): | ||||||
|  |         obj = urllib_parse.urlparse( | ||||||
|  |             paths[i], scheme=None, allow_fragments=False) | ||||||
|  |  | ||||||
|  |         scheme = obj.scheme | ||||||
|  |  | ||||||
|  |         # in either case the component is absolute | ||||||
|  |         if scheme is not None or obj.path.startswith('/'): | ||||||
|  |             if scheme is None: | ||||||
|  |                 # Without a scheme, we have to go back looking for the | ||||||
|  |                 # next-last component that specifies a scheme. | ||||||
|  |                 for j in range(i - 1, -1, -1): | ||||||
|  |                     obj = urllib_parse.urlparse( | ||||||
|  |                         paths[j], scheme=None, allow_fragments=False) | ||||||
|  |  | ||||||
|  |                     if obj.scheme: | ||||||
|  |                         paths[i] = '{SM}://{NL}{PATH}'.format( | ||||||
|  |                             SM=obj.scheme, | ||||||
|  |                             NL=( | ||||||
|  |                                 (obj.netloc + '/') | ||||||
|  |                                 if obj.scheme != 's3' else ''), | ||||||
|  |                             PATH=paths[i][1:]) | ||||||
|  |                         break | ||||||
|  |  | ||||||
|  |             last_abs_component = i | ||||||
|  |             break | ||||||
|  |  | ||||||
|  |     if last_abs_component is not None: | ||||||
|  |         paths = paths[last_abs_component:] | ||||||
|  |         if len(paths) == 1: | ||||||
|  |             result = urllib_parse.urlparse( | ||||||
|  |                 paths[0], scheme='file', allow_fragments=False) | ||||||
|  |  | ||||||
|  |             # another subtlety: If the last argument to join() is an absolute | ||||||
|  |             # file:// URL component with a relative path, the relative path | ||||||
|  |             # needs to be resolved. | ||||||
|  |             if result.scheme == 'file' and result.netloc: | ||||||
|  |                 result = urllib_parse.ParseResult( | ||||||
|  |                     scheme=result.scheme, | ||||||
|  |                     netloc='', | ||||||
|  |                     path=os.path.abspath(result.netloc + result.path), | ||||||
|  |                     params=result.params, | ||||||
|  |                     query=result.query, | ||||||
|  |                     fragment=None) | ||||||
|  |  | ||||||
|  |             return result.geturl() | ||||||
|  |  | ||||||
|  |     return _join(*paths, **kwargs) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _join(base_url, path, *extra, **kwargs): | ||||||
|     base_url = parse(base_url) |     base_url = parse(base_url) | ||||||
|     resolve_href = kwargs.get('resolve_href', False) |     resolve_href = kwargs.get('resolve_href', False) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -477,7 +477,6 @@ def spider(root, depth=0): | |||||||
|        performance over a sequential fetch. |        performance over a sequential fetch. | ||||||
|  |  | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     root = url_util.parse(root) |     root = url_util.parse(root) | ||||||
|     pages, links = _spider(root, set(), root, 0, depth, False) |     pages, links = _spider(root, set(), root, 0, depth, False) | ||||||
|     return pages, links |     return pages, links | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user