Reduce the number of stat calls in "spack verify" (#37251)

Spack comes to a crawl post-install of nvhpc, which is partly thanks to
this post install hook which has a lot of redundancy, and isn't correct.

1. There's no need to store "type" because that _is_ "mode".
2. There are more file types than "symlink", "dir", "file".
3. Don't checksum device type things
4. Don't run 3 stat calls (exists, stat, isdir/islink), but one lstat
   call
5. Don't read entire files into memory

I also don't know why `spack.crypto` wasn't used for checksumming, but I
guess it's too late for that now. Finally md5 would've been the faster
algorithm, which would've been fine given that a non-cryptographic
checksum was used anyway.
This commit is contained in:
Harmen Stoppels 2023-04-28 15:24:24 +02:00 committed by GitHub
parent fc201a2b75
commit c85877566f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 66 deletions

View File

@ -6,6 +6,7 @@
"""Tests for the `spack.verify` module"""
import os
import shutil
import stat
import sys
import pytest
@ -30,22 +31,12 @@ def test_link_manifest_entry(tmpdir):
os.symlink(file, link)
data = spack.verify.create_manifest_entry(link)
assert data["type"] == "link"
assert data["dest"] == file
assert all(x in data for x in ("mode", "owner", "group"))
results = spack.verify.check_entry(link, data)
assert not results.has_errors()
data["type"] = "garbage"
results = spack.verify.check_entry(link, data)
assert results.has_errors()
assert link in results.errors
assert results.errors[link] == ["type"]
data["type"] = "link"
file2 = str(tmpdir.join("file2"))
open(file2, "a").close()
os.remove(link)
@ -64,18 +55,18 @@ def test_dir_manifest_entry(tmpdir):
fs.mkdirp(dirent)
data = spack.verify.create_manifest_entry(dirent)
assert data["type"] == "dir"
assert stat.S_ISDIR(data["mode"])
assert all(x in data for x in ("mode", "owner", "group"))
results = spack.verify.check_entry(dirent, data)
assert not results.has_errors()
data["type"] = "garbage"
data["mode"] = "garbage"
results = spack.verify.check_entry(dirent, data)
assert results.has_errors()
assert dirent in results.errors
assert results.errors[dirent] == ["type"]
assert results.errors[dirent] == ["mode"]
def test_file_manifest_entry(tmpdir):
@ -89,25 +80,25 @@ def test_file_manifest_entry(tmpdir):
f.write(orig_str)
data = spack.verify.create_manifest_entry(file)
assert data["type"] == "file"
assert stat.S_ISREG(data["mode"])
assert data["size"] == len(orig_str)
assert all(x in data for x in ("mode", "owner", "group"))
assert all(x in data for x in ("owner", "group"))
results = spack.verify.check_entry(file, data)
assert not results.has_errors()
data["type"] = "garbage"
data["mode"] = 0x99999
results = spack.verify.check_entry(file, data)
assert results.has_errors()
assert file in results.errors
assert results.errors[file] == ["type"]
data["type"] = "file"
assert results.errors[file] == ["mode"]
with open(file, "w") as f:
f.write(new_str)
data["mode"] = os.stat(file).st_mode
results = spack.verify.check_entry(file, data)
expected = ["size", "hash"]

View File

@ -5,6 +5,8 @@
import base64
import hashlib
import os
import stat
from typing import Any, Dict
import llnl.util.tty as tty
@ -14,35 +16,33 @@
import spack.util.spack_json as sjson
def compute_hash(path):
with open(path, "rb") as f:
sha1 = hashlib.sha1(f.read()).digest()
b32 = base64.b32encode(sha1)
return b32.decode()
def compute_hash(path: str, block_size: int = 1048576) -> str:
    """Return the base32-encoded sha1 digest of the file at ``path``.

    The file is read in ``block_size``-byte chunks so large files are
    never pulled into memory all at once.
    """
    # why is this not using spack.util.crypto.checksum...
    digest = hashlib.sha1()
    with open(path, "rb") as stream:
        # iter() with a b"" sentinel yields chunks until EOF.
        for chunk in iter(lambda: stream.read(block_size), b""):
            digest.update(chunk)
    return base64.b32encode(digest.digest()).decode()
def create_manifest_entry(path):
data = {}
def create_manifest_entry(path: str) -> Dict[str, Any]:
    """Build a manifest entry for ``path`` with a single lstat call.

    The entry always records mode, owner and group. Symlinks additionally
    record their target; regular files record content hash, mtime and
    size. Returns an empty dict when ``path`` cannot be stat'ed.
    """
    try:
        st = os.lstat(path)
    except OSError:
        return {}

    entry: Dict[str, Any] = {"mode": st.st_mode, "owner": st.st_uid, "group": st.st_gid}

    if stat.S_ISLNK(st.st_mode):
        entry["dest"] = os.readlink(path)
    elif stat.S_ISREG(st.st_mode):
        # only regular files get content-checked; don't hash devices/fifos
        entry["hash"] = compute_hash(path)
        entry["time"] = st.st_mtime
        entry["size"] = st.st_size

    return entry
@ -75,38 +75,28 @@ def check_entry(path, data):
res.add_error(path, "added")
return res
stat = os.stat(path)
s = os.lstat(path)
# Check for all entries
if stat.st_mode != data["mode"]:
res.add_error(path, "mode")
if stat.st_uid != data["owner"]:
if s.st_uid != data["owner"]:
res.add_error(path, "owner")
if stat.st_gid != data["group"]:
if s.st_gid != data["group"]:
res.add_error(path, "group")
# Check for symlink targets and listed as symlink
if os.path.islink(path):
if data["type"] != "link":
res.add_error(path, "type")
if os.readlink(path) != data.get("dest", ""):
res.add_error(path, "link")
# Check directories are listed as directory
elif os.path.isdir(path):
if data["type"] != "dir":
res.add_error(path, "type")
else:
# In the past, `stat(...).st_mode` was stored
# instead of `lstat(...).st_mode`. So, ignore mode errors for symlinks.
if not stat.S_ISLNK(s.st_mode) and s.st_mode != data["mode"]:
res.add_error(path, "mode")
elif stat.S_ISLNK(s.st_mode) and os.readlink(path) != data.get("dest"):
res.add_error(path, "link")
elif stat.S_ISREG(s.st_mode):
# Check file contents against hash and listed as file
# Check mtime and size as well
if stat.st_size != data["size"]:
if s.st_size != data["size"]:
res.add_error(path, "size")
if stat.st_mtime != data["time"]:
if s.st_mtime != data["time"]:
res.add_error(path, "mtime")
if data["type"] != "file":
res.add_error(path, "type")
if compute_hash(path) != data.get("hash", ""):
if compute_hash(path) != data.get("hash"):
res.add_error(path, "hash")
return res