Fix broken sanitize_file_path
(#38926)
The sanitization function is completely bogus as it tries to replace / on unix after ... splitting on it. The way it's implemented is very questionable: the input is a file name, not a path. It doesn't make sense to interpret the input as a path and then make the components valid -- you'll interpret / in a filename as a dir separator. It also fails to deal with path components that contain just unsupported characters (resulting in empty component). The correct way to deal with this is to have a function that takes a potential file name and replaces unsupported characters. I'm not going to fix the other issues on Windows, such as reserved file names, but left a note, and hope that @johnwparent can fix that separately. (Obviously we wouldn't have this problem at all if we just fixed the filename in a safe way instead of trying to derive something from the url; we could use the content digest when available for example)
This commit is contained in:
parent
ac3c0a4347
commit
0fff219aa4
@ -387,8 +387,7 @@ def expected_archive_files(self):
|
|||||||
expanded = True
|
expanded = True
|
||||||
if isinstance(self.default_fetcher, fs.URLFetchStrategy):
|
if isinstance(self.default_fetcher, fs.URLFetchStrategy):
|
||||||
expanded = self.default_fetcher.expand_archive
|
expanded = self.default_fetcher.expand_archive
|
||||||
clean_url = os.path.basename(sup.sanitize_file_path(self.default_fetcher.url))
|
fnames.append(url_util.default_download_filename(self.default_fetcher.url))
|
||||||
fnames.append(clean_url)
|
|
||||||
|
|
||||||
if self.mirror_paths:
|
if self.mirror_paths:
|
||||||
fnames.extend(os.path.basename(x) for x in self.mirror_paths)
|
fnames.extend(os.path.basename(x) for x in self.mirror_paths)
|
||||||
|
@ -29,15 +29,13 @@
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_sanitze_file_path(tmpdir):
|
def test_sanitize_filename():
|
||||||
"""Test filtering illegal characters out of potential file paths"""
|
"""Test filtering illegal characters out of potential filenames"""
|
||||||
# *nix illegal files characters are '/' and none others
|
sanitized = sup.sanitize_filename("""a<b>cd/?e:f"g|h*i.\0txt""")
|
||||||
illegal_file_path = str(tmpdir) + "//" + "abcdefghi.txt"
|
|
||||||
if sys.platform == "win32":
|
if sys.platform == "win32":
|
||||||
# Windows has a larger set of illegal characters
|
assert sanitized == "a_b_cd__e_f_g_h_i._txt"
|
||||||
illegal_file_path = os.path.join(tmpdir, 'a<b>cd?e:f"g|h*i.txt')
|
else:
|
||||||
real_path = sup.sanitize_file_path(illegal_file_path)
|
assert sanitized == """a<b>cd_?e:f"g|h*i._txt"""
|
||||||
assert real_path == os.path.join(str(tmpdir), "abcdefghi.txt")
|
|
||||||
|
|
||||||
|
|
||||||
# This class pertains to path string padding manipulation specifically
|
# This class pertains to path string padding manipulation specifically
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
import spack.util.path
|
||||||
import spack.util.url as url_util
|
import spack.util.url as url_util
|
||||||
|
|
||||||
|
|
||||||
@ -278,3 +279,16 @@ def test_git_url_parse(url, parts):
|
|||||||
url_util.parse_git_url(url)
|
url_util.parse_git_url(url)
|
||||||
else:
|
else:
|
||||||
assert parts == url_util.parse_git_url(url)
|
assert parts == url_util.parse_git_url(url)
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_download_name():
|
||||||
|
url = "https://example.com:1234/path/to/file.txt;params?abc=def#file=blob.tar"
|
||||||
|
filename = url_util.default_download_filename(url)
|
||||||
|
assert filename == spack.util.path.sanitize_filename(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_download_name_dot_dot():
|
||||||
|
"""Avoid that downloaded files get names computed as ., .. or any hidden file."""
|
||||||
|
assert url_util.default_download_filename("https://example.com/.") == "_"
|
||||||
|
assert url_util.default_download_filename("https://example.com/..") == "_."
|
||||||
|
assert url_util.default_download_filename("https://example.com/.abcdef") == "_abcdef"
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
import spack.config
|
import spack.config
|
||||||
import spack.mirror
|
import spack.mirror
|
||||||
import spack.paths
|
import spack.paths
|
||||||
|
import spack.util.path
|
||||||
import spack.util.s3
|
import spack.util.s3
|
||||||
import spack.util.url as url_util
|
import spack.util.url as url_util
|
||||||
import spack.util.web
|
import spack.util.web
|
||||||
|
@ -132,40 +132,26 @@ def path_to_os_path(*pths):
|
|||||||
return ret_pths
|
return ret_pths
|
||||||
|
|
||||||
|
|
||||||
def sanitize_file_path(pth):
|
def sanitize_filename(filename: str) -> str:
|
||||||
"""
|
"""
|
||||||
Formats strings to contain only characters that can
|
Replaces unsupported characters (for the host) in a filename with underscores.
|
||||||
be used to generate legal file paths.
|
|
||||||
|
|
||||||
Criteria for legal files based on
|
Criteria for legal files based on
|
||||||
https://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations
|
https://en.wikipedia.org/wiki/Filename#Comparison_of_filename_limitations
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pth: string containing path to be created
|
filename: string containing filename to be created on the host filesystem
|
||||||
on the host filesystem
|
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
sanitized string that can legally be made into a path
|
filename that can be created on the host filesystem
|
||||||
"""
|
"""
|
||||||
# on unix, splitting path by seperators will remove
|
if sys.platform != "win32":
|
||||||
# instances of illegal characters on join
|
# Only disallow null bytes and directory separators.
|
||||||
pth_cmpnts = pth.split(os.path.sep)
|
return re.sub("[\0/]", "_", filename)
|
||||||
|
|
||||||
if sys.platform == "win32":
|
# On Windows, things are more involved.
|
||||||
drive_match = r"[a-zA-Z]:"
|
# NOTE: this is incomplete, missing reserved names
|
||||||
is_abs = bool(re.match(drive_match, pth_cmpnts[0]))
|
return re.sub(r'[\x00-\x1F\x7F"*/:<>?\\|]', "_", filename)
|
||||||
drive = pth_cmpnts[0] + os.path.sep if is_abs else ""
|
|
||||||
pth_cmpnts = pth_cmpnts[1:] if drive else pth_cmpnts
|
|
||||||
illegal_chars = r'[<>?:"|*\\]'
|
|
||||||
else:
|
|
||||||
drive = "/" if not pth_cmpnts[0] else ""
|
|
||||||
illegal_chars = r"[/]"
|
|
||||||
|
|
||||||
pth = []
|
|
||||||
for cmp in pth_cmpnts:
|
|
||||||
san_cmp = re.sub(illegal_chars, "", cmp)
|
|
||||||
pth.append(san_cmp)
|
|
||||||
return drive + os.path.join(*pth)
|
|
||||||
|
|
||||||
|
|
||||||
def system_path_filter(_func=None, arg_slice=None):
|
def system_path_filter(_func=None, arg_slice=None):
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
|
||||||
from spack.util.path import convert_to_posix_path
|
from spack.util.path import convert_to_posix_path, sanitize_filename
|
||||||
|
|
||||||
|
|
||||||
def validate_scheme(scheme):
|
def validate_scheme(scheme):
|
||||||
@ -294,3 +294,22 @@ def parse_git_url(url):
|
|||||||
raise ValueError("bad port in git url: %s" % url)
|
raise ValueError("bad port in git url: %s" % url)
|
||||||
|
|
||||||
return (scheme, user, hostname, port, path)
|
return (scheme, user, hostname, port, path)
|
||||||
|
|
||||||
|
|
||||||
|
def default_download_filename(url: str) -> str:
|
||||||
|
"""This method computes a default file name for a given URL.
|
||||||
|
Note that it makes no request, so this is not the same as the
|
||||||
|
option curl -O, which uses the remote file name from the response
|
||||||
|
header."""
|
||||||
|
parsed_url = urllib.parse.urlparse(url)
|
||||||
|
# Only use the last path component + params + query + fragment
|
||||||
|
name = urllib.parse.urlunparse(
|
||||||
|
parsed_url._replace(scheme="", netloc="", path=posixpath.basename(parsed_url.path))
|
||||||
|
)
|
||||||
|
valid_name = sanitize_filename(name)
|
||||||
|
|
||||||
|
# Don't download to hidden files please
|
||||||
|
if valid_name[0] == ".":
|
||||||
|
valid_name = "_" + valid_name[1:]
|
||||||
|
|
||||||
|
return valid_name
|
||||||
|
Loading…
Reference in New Issue
Block a user