Fix broken sanitize_file_path (#38926)
				
					
				
			The sanitization function is completely bogus as it tries to replace / on unix after ... splitting on it. The way it's implemented is very questionable: the input is a file name, not a path. It doesn't make sense to interpret the input as a path and then make the components valid -- you'll interpret / in a filename as a dir separator. It also fails to deal with path components that contain just unsupported characters (resulting in empty component). The correct way to deal with this is to have a function that takes a potential file name and replaces unsupported characters. I'm not going to fix the other issues on Windows, such as reserved file names, but left a note, and hope that @johnwparent can fix that separately. (Obviously we wouldn't have this problem at all if we just fixed the filename in a safe way instead of trying to derive something from the url; we could use the content digest when available for example)
This commit is contained in:
		| @@ -387,8 +387,7 @@ def expected_archive_files(self): | ||||
|         expanded = True | ||||
|         if isinstance(self.default_fetcher, fs.URLFetchStrategy): | ||||
|             expanded = self.default_fetcher.expand_archive | ||||
|             clean_url = os.path.basename(sup.sanitize_file_path(self.default_fetcher.url)) | ||||
|             fnames.append(clean_url) | ||||
|             fnames.append(url_util.default_download_filename(self.default_fetcher.url)) | ||||
| 
 | ||||
|         if self.mirror_paths: | ||||
|             fnames.extend(os.path.basename(x) for x in self.mirror_paths) | ||||
|   | ||||
| @@ -29,15 +29,13 @@ | ||||
| ] | ||||
| 
 | ||||
| 
 | ||||
| def test_sanitze_file_path(tmpdir): | ||||
|     """Test filtering illegal characters out of potential file paths""" | ||||
|     # *nix illegal files characters are '/' and none others | ||||
|     illegal_file_path = str(tmpdir) + "//" + "abcdefghi.txt" | ||||
| def test_sanitize_filename(): | ||||
|     """Test filtering illegal characters out of potential filenames""" | ||||
|     sanitized = sup.sanitize_filename("""a<b>cd/?e:f"g|h*i.\0txt""") | ||||
|     if sys.platform == "win32": | ||||
|         # Windows has a larger set of illegal characters | ||||
|         illegal_file_path = os.path.join(tmpdir, 'a<b>cd?e:f"g|h*i.txt') | ||||
|     real_path = sup.sanitize_file_path(illegal_file_path) | ||||
|     assert real_path == os.path.join(str(tmpdir), "abcdefghi.txt") | ||||
|         assert sanitized == "a_b_cd__e_f_g_h_i._txt" | ||||
|     else: | ||||
|         assert sanitized == """a<b>cd_?e:f"g|h*i._txt""" | ||||
| 
 | ||||
| 
 | ||||
| # This class pertains to path string padding manipulation specifically | ||||
|   | ||||
| @@ -10,6 +10,7 @@ | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| import spack.util.path | ||||
| import spack.util.url as url_util | ||||
| 
 | ||||
| 
 | ||||
| @@ -278,3 +279,16 @@ def test_git_url_parse(url, parts): | ||||
|             url_util.parse_git_url(url) | ||||
|     else: | ||||
|         assert parts == url_util.parse_git_url(url) | ||||
| 
 | ||||
| 
 | ||||
| def test_default_download_name(): | ||||
|     url = "https://example.com:1234/path/to/file.txt;params?abc=def#file=blob.tar" | ||||
|     filename = url_util.default_download_filename(url) | ||||
|     assert filename == spack.util.path.sanitize_filename(filename) | ||||
| 
 | ||||
| 
 | ||||
| def test_default_download_name_dot_dot(): | ||||
|     """Avoid that downloaded files get names computed as ., .. or any hidden file.""" | ||||
|     assert url_util.default_download_filename("https://example.com/.") == "_" | ||||
|     assert url_util.default_download_filename("https://example.com/..") == "_." | ||||
|     assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef" | ||||
|   | ||||
| @@ -12,6 +12,7 @@ | ||||
| import spack.config | ||||
| import spack.mirror | ||||
| import spack.paths | ||||
| import spack.util.path | ||||
| import spack.util.s3 | ||||
| import spack.util.url as url_util | ||||
| import spack.util.web | ||||
|   | ||||
| @@ -132,40 +132,26 @@ def path_to_os_path(*pths): | ||||
|     return ret_pths | ||||
| 
 | ||||
| 
 | ||||
| def sanitize_file_path(pth): | ||||
| def sanitize_filename(filename: str) -> str: | ||||
|     """ | ||||
|     Formats strings to contain only characters that can | ||||
|     be used to generate legal file paths. | ||||
|     Replaces unsupported characters (for the host) in a filename with underscores. | ||||
| 
 | ||||
|     Criteria for legal files based on | ||||
|     https://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations | ||||
| 
 | ||||
|     Args: | ||||
|         pth: string containing path to be created | ||||
|             on the host filesystem | ||||
|         filename: string containing filename to be created on the host filesystem | ||||
| 
 | ||||
|     Return: | ||||
|         sanitized string that can legally be made into a path | ||||
|         filename that can be created on the host filesystem | ||||
|     """ | ||||
|     # on unix, splitting path by seperators will remove | ||||
|     # instances of illegal characters on join | ||||
|     pth_cmpnts = pth.split(os.path.sep) | ||||
|     if sys.platform != "win32": | ||||
|         # Only disallow null bytes and directory separators. | ||||
|         return re.sub("[\0/]", "_", filename) | ||||
| 
 | ||||
|     if sys.platform == "win32": | ||||
|         drive_match = r"[a-zA-Z]:" | ||||
|         is_abs = bool(re.match(drive_match, pth_cmpnts[0])) | ||||
|         drive = pth_cmpnts[0] + os.path.sep if is_abs else "" | ||||
|         pth_cmpnts = pth_cmpnts[1:] if drive else pth_cmpnts | ||||
|         illegal_chars = r'[<>?:"|*\\]' | ||||
|     else: | ||||
|         drive = "/" if not pth_cmpnts[0] else "" | ||||
|         illegal_chars = r"[/]" | ||||
| 
 | ||||
|     pth = [] | ||||
|     for cmp in pth_cmpnts: | ||||
|         san_cmp = re.sub(illegal_chars, "", cmp) | ||||
|         pth.append(san_cmp) | ||||
|     return drive + os.path.join(*pth) | ||||
|     # On Windows, things are more involved. | ||||
|     # NOTE: this is incomplete, missing reserved names | ||||
|     return re.sub(r'[\x00-\x1F\x7F"*/:<>?\\|]', "_", filename) | ||||
| 
 | ||||
| 
 | ||||
| def system_path_filter(_func=None, arg_slice=None): | ||||
|   | ||||
| @@ -15,7 +15,7 @@ | ||||
| import urllib.parse | ||||
| import urllib.request | ||||
| 
 | ||||
| from spack.util.path import convert_to_posix_path | ||||
| from spack.util.path import convert_to_posix_path, sanitize_filename | ||||
| 
 | ||||
| 
 | ||||
| def validate_scheme(scheme): | ||||
| @@ -294,3 +294,22 @@ def parse_git_url(url): | ||||
|             raise ValueError("bad port in git url: %s" % url) | ||||
| 
 | ||||
|     return (scheme, user, hostname, port, path) | ||||
| 
 | ||||
| 
 | ||||
| def default_download_filename(url: str) -> str: | ||||
|     """This method computes a default file name for a given URL. | ||||
|     Note that it makes no request, so this is not the same as the | ||||
|     option curl -O, which uses the remote file name from the response | ||||
|     header.""" | ||||
|     parsed_url = urllib.parse.urlparse(url) | ||||
|     # Only use the last path component + params + query + fragment | ||||
|     name = urllib.parse.urlunparse( | ||||
|         parsed_url._replace(scheme="", netloc="", path=posixpath.basename(parsed_url.path)) | ||||
|     ) | ||||
|     valid_name = sanitize_filename(name) | ||||
| 
 | ||||
|     # Don't download to hidden files please | ||||
|     if valid_name[0] == ".": | ||||
|         valid_name = "_" + valid_name[1:] | ||||
| 
 | ||||
|     return valid_name | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Harmen Stoppels
					Harmen Stoppels