Reduce the number of stat calls in "spack verify" (#37251)
Spack comes to a crawl post-install of nvhpc, which is partly thanks to this post install hook which has a lot of redundancy, and isn't correct. 1. There's no need to store "type" because that _is_ "mode". 2. There are more file types than "symlink", "dir", "file". 3. Don't checksum device type things 4. Don't run 3 stat calls (exists, stat, isdir/islink), but one lstat call 5. Don't read entire files into memory I also don't know why `spack.crypto` wasn't used for checksumming, but I guess it's too late for that now. Finally md5 would've been the faster algorithm, which would've been fine given that a non cryptographicall checksum was used anyways.
This commit is contained in:
parent
fc201a2b75
commit
c85877566f
@ -6,6 +6,7 @@
|
||||
"""Tests for the `spack.verify` module"""
|
||||
import os
|
||||
import shutil
|
||||
import stat
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
@ -30,22 +31,12 @@ def test_link_manifest_entry(tmpdir):
|
||||
os.symlink(file, link)
|
||||
|
||||
data = spack.verify.create_manifest_entry(link)
|
||||
assert data["type"] == "link"
|
||||
assert data["dest"] == file
|
||||
assert all(x in data for x in ("mode", "owner", "group"))
|
||||
|
||||
results = spack.verify.check_entry(link, data)
|
||||
assert not results.has_errors()
|
||||
|
||||
data["type"] = "garbage"
|
||||
|
||||
results = spack.verify.check_entry(link, data)
|
||||
assert results.has_errors()
|
||||
assert link in results.errors
|
||||
assert results.errors[link] == ["type"]
|
||||
|
||||
data["type"] = "link"
|
||||
|
||||
file2 = str(tmpdir.join("file2"))
|
||||
open(file2, "a").close()
|
||||
os.remove(link)
|
||||
@ -64,18 +55,18 @@ def test_dir_manifest_entry(tmpdir):
|
||||
fs.mkdirp(dirent)
|
||||
|
||||
data = spack.verify.create_manifest_entry(dirent)
|
||||
assert data["type"] == "dir"
|
||||
assert stat.S_ISDIR(data["mode"])
|
||||
assert all(x in data for x in ("mode", "owner", "group"))
|
||||
|
||||
results = spack.verify.check_entry(dirent, data)
|
||||
assert not results.has_errors()
|
||||
|
||||
data["type"] = "garbage"
|
||||
data["mode"] = "garbage"
|
||||
|
||||
results = spack.verify.check_entry(dirent, data)
|
||||
assert results.has_errors()
|
||||
assert dirent in results.errors
|
||||
assert results.errors[dirent] == ["type"]
|
||||
assert results.errors[dirent] == ["mode"]
|
||||
|
||||
|
||||
def test_file_manifest_entry(tmpdir):
|
||||
@ -89,25 +80,25 @@ def test_file_manifest_entry(tmpdir):
|
||||
f.write(orig_str)
|
||||
|
||||
data = spack.verify.create_manifest_entry(file)
|
||||
assert data["type"] == "file"
|
||||
assert stat.S_ISREG(data["mode"])
|
||||
assert data["size"] == len(orig_str)
|
||||
assert all(x in data for x in ("mode", "owner", "group"))
|
||||
assert all(x in data for x in ("owner", "group"))
|
||||
|
||||
results = spack.verify.check_entry(file, data)
|
||||
assert not results.has_errors()
|
||||
|
||||
data["type"] = "garbage"
|
||||
data["mode"] = 0x99999
|
||||
|
||||
results = spack.verify.check_entry(file, data)
|
||||
assert results.has_errors()
|
||||
assert file in results.errors
|
||||
assert results.errors[file] == ["type"]
|
||||
|
||||
data["type"] = "file"
|
||||
assert results.errors[file] == ["mode"]
|
||||
|
||||
with open(file, "w") as f:
|
||||
f.write(new_str)
|
||||
|
||||
data["mode"] = os.stat(file).st_mode
|
||||
|
||||
results = spack.verify.check_entry(file, data)
|
||||
|
||||
expected = ["size", "hash"]
|
||||
|
@ -5,6 +5,8 @@
|
||||
import base64
|
||||
import hashlib
|
||||
import os
|
||||
import stat
|
||||
from typing import Any, Dict
|
||||
|
||||
import llnl.util.tty as tty
|
||||
|
||||
@ -14,35 +16,33 @@
|
||||
import spack.util.spack_json as sjson
|
||||
|
||||
|
||||
def compute_hash(path):
|
||||
with open(path, "rb") as f:
|
||||
sha1 = hashlib.sha1(f.read()).digest()
|
||||
b32 = base64.b32encode(sha1)
|
||||
return b32.decode()
|
||||
def compute_hash(path: str, block_size: int = 1048576) -> str:
|
||||
# why is this not using spack.util.crypto.checksum...
|
||||
hasher = hashlib.sha1()
|
||||
with open(path, "rb") as file:
|
||||
while True:
|
||||
data = file.read(block_size)
|
||||
if not data:
|
||||
break
|
||||
hasher.update(data)
|
||||
return base64.b32encode(hasher.digest()).decode()
|
||||
|
||||
|
||||
def create_manifest_entry(path):
|
||||
data = {}
|
||||
def create_manifest_entry(path: str) -> Dict[str, Any]:
|
||||
try:
|
||||
s = os.lstat(path)
|
||||
except OSError:
|
||||
return {}
|
||||
|
||||
if os.path.exists(path):
|
||||
stat = os.stat(path)
|
||||
data: Dict[str, Any] = {"mode": s.st_mode, "owner": s.st_uid, "group": s.st_gid}
|
||||
|
||||
data["mode"] = stat.st_mode
|
||||
data["owner"] = stat.st_uid
|
||||
data["group"] = stat.st_gid
|
||||
if stat.S_ISLNK(s.st_mode):
|
||||
data["dest"] = os.readlink(path)
|
||||
|
||||
if os.path.islink(path):
|
||||
data["type"] = "link"
|
||||
data["dest"] = os.readlink(path)
|
||||
|
||||
elif os.path.isdir(path):
|
||||
data["type"] = "dir"
|
||||
|
||||
else:
|
||||
data["type"] = "file"
|
||||
data["hash"] = compute_hash(path)
|
||||
data["time"] = stat.st_mtime
|
||||
data["size"] = stat.st_size
|
||||
elif stat.S_ISREG(s.st_mode):
|
||||
data["hash"] = compute_hash(path)
|
||||
data["time"] = s.st_mtime
|
||||
data["size"] = s.st_size
|
||||
|
||||
return data
|
||||
|
||||
@ -75,38 +75,28 @@ def check_entry(path, data):
|
||||
res.add_error(path, "added")
|
||||
return res
|
||||
|
||||
stat = os.stat(path)
|
||||
s = os.lstat(path)
|
||||
|
||||
# Check for all entries
|
||||
if stat.st_mode != data["mode"]:
|
||||
res.add_error(path, "mode")
|
||||
if stat.st_uid != data["owner"]:
|
||||
if s.st_uid != data["owner"]:
|
||||
res.add_error(path, "owner")
|
||||
if stat.st_gid != data["group"]:
|
||||
if s.st_gid != data["group"]:
|
||||
res.add_error(path, "group")
|
||||
|
||||
# Check for symlink targets and listed as symlink
|
||||
if os.path.islink(path):
|
||||
if data["type"] != "link":
|
||||
res.add_error(path, "type")
|
||||
if os.readlink(path) != data.get("dest", ""):
|
||||
res.add_error(path, "link")
|
||||
|
||||
# Check directories are listed as directory
|
||||
elif os.path.isdir(path):
|
||||
if data["type"] != "dir":
|
||||
res.add_error(path, "type")
|
||||
|
||||
else:
|
||||
# In the past, `stat(...).st_mode` was stored
|
||||
# instead of `lstat(...).st_mode`. So, ignore mode errors for symlinks.
|
||||
if not stat.S_ISLNK(s.st_mode) and s.st_mode != data["mode"]:
|
||||
res.add_error(path, "mode")
|
||||
elif stat.S_ISLNK(s.st_mode) and os.readlink(path) != data.get("dest"):
|
||||
res.add_error(path, "link")
|
||||
elif stat.S_ISREG(s.st_mode):
|
||||
# Check file contents against hash and listed as file
|
||||
# Check mtime and size as well
|
||||
if stat.st_size != data["size"]:
|
||||
if s.st_size != data["size"]:
|
||||
res.add_error(path, "size")
|
||||
if stat.st_mtime != data["time"]:
|
||||
if s.st_mtime != data["time"]:
|
||||
res.add_error(path, "mtime")
|
||||
if data["type"] != "file":
|
||||
res.add_error(path, "type")
|
||||
if compute_hash(path) != data.get("hash", ""):
|
||||
if compute_hash(path) != data.get("hash"):
|
||||
res.add_error(path, "hash")
|
||||
|
||||
return res
|
||||
|
Loading…
Reference in New Issue
Block a user