Reduce the number of stat calls in "spack verify" (#37251)
Spack comes to a crawl post-install of nvhpc, which is partly thanks to this post install hook which has a lot of redundancy, and isn't correct. 1. There's no need to store "type" because that _is_ "mode". 2. There are more file types than "symlink", "dir", "file". 3. Don't checksum device type things 4. Don't run 3 stat calls (exists, stat, isdir/islink), but one lstat call 5. Don't read entire files into memory I also don't know why `spack.crypto` wasn't used for checksumming, but I guess it's too late for that now. Finally md5 would've been the faster algorithm, which would've been fine given that a non cryptographicall checksum was used anyways.
This commit is contained in:
		@@ -6,6 +6,7 @@
 | 
				
			|||||||
"""Tests for the `spack.verify` module"""
 | 
					"""Tests for the `spack.verify` module"""
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
 | 
					import stat
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
@@ -30,22 +31,12 @@ def test_link_manifest_entry(tmpdir):
 | 
				
			|||||||
    os.symlink(file, link)
 | 
					    os.symlink(file, link)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data = spack.verify.create_manifest_entry(link)
 | 
					    data = spack.verify.create_manifest_entry(link)
 | 
				
			||||||
    assert data["type"] == "link"
 | 
					 | 
				
			||||||
    assert data["dest"] == file
 | 
					    assert data["dest"] == file
 | 
				
			||||||
    assert all(x in data for x in ("mode", "owner", "group"))
 | 
					    assert all(x in data for x in ("mode", "owner", "group"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(link, data)
 | 
					    results = spack.verify.check_entry(link, data)
 | 
				
			||||||
    assert not results.has_errors()
 | 
					    assert not results.has_errors()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data["type"] = "garbage"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    results = spack.verify.check_entry(link, data)
 | 
					 | 
				
			||||||
    assert results.has_errors()
 | 
					 | 
				
			||||||
    assert link in results.errors
 | 
					 | 
				
			||||||
    assert results.errors[link] == ["type"]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    data["type"] = "link"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    file2 = str(tmpdir.join("file2"))
 | 
					    file2 = str(tmpdir.join("file2"))
 | 
				
			||||||
    open(file2, "a").close()
 | 
					    open(file2, "a").close()
 | 
				
			||||||
    os.remove(link)
 | 
					    os.remove(link)
 | 
				
			||||||
@@ -64,18 +55,18 @@ def test_dir_manifest_entry(tmpdir):
 | 
				
			|||||||
    fs.mkdirp(dirent)
 | 
					    fs.mkdirp(dirent)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data = spack.verify.create_manifest_entry(dirent)
 | 
					    data = spack.verify.create_manifest_entry(dirent)
 | 
				
			||||||
    assert data["type"] == "dir"
 | 
					    assert stat.S_ISDIR(data["mode"])
 | 
				
			||||||
    assert all(x in data for x in ("mode", "owner", "group"))
 | 
					    assert all(x in data for x in ("mode", "owner", "group"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(dirent, data)
 | 
					    results = spack.verify.check_entry(dirent, data)
 | 
				
			||||||
    assert not results.has_errors()
 | 
					    assert not results.has_errors()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data["type"] = "garbage"
 | 
					    data["mode"] = "garbage"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(dirent, data)
 | 
					    results = spack.verify.check_entry(dirent, data)
 | 
				
			||||||
    assert results.has_errors()
 | 
					    assert results.has_errors()
 | 
				
			||||||
    assert dirent in results.errors
 | 
					    assert dirent in results.errors
 | 
				
			||||||
    assert results.errors[dirent] == ["type"]
 | 
					    assert results.errors[dirent] == ["mode"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_file_manifest_entry(tmpdir):
 | 
					def test_file_manifest_entry(tmpdir):
 | 
				
			||||||
@@ -89,25 +80,25 @@ def test_file_manifest_entry(tmpdir):
 | 
				
			|||||||
        f.write(orig_str)
 | 
					        f.write(orig_str)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data = spack.verify.create_manifest_entry(file)
 | 
					    data = spack.verify.create_manifest_entry(file)
 | 
				
			||||||
    assert data["type"] == "file"
 | 
					    assert stat.S_ISREG(data["mode"])
 | 
				
			||||||
    assert data["size"] == len(orig_str)
 | 
					    assert data["size"] == len(orig_str)
 | 
				
			||||||
    assert all(x in data for x in ("mode", "owner", "group"))
 | 
					    assert all(x in data for x in ("owner", "group"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(file, data)
 | 
					    results = spack.verify.check_entry(file, data)
 | 
				
			||||||
    assert not results.has_errors()
 | 
					    assert not results.has_errors()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    data["type"] = "garbage"
 | 
					    data["mode"] = 0x99999
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(file, data)
 | 
					    results = spack.verify.check_entry(file, data)
 | 
				
			||||||
    assert results.has_errors()
 | 
					    assert results.has_errors()
 | 
				
			||||||
    assert file in results.errors
 | 
					    assert file in results.errors
 | 
				
			||||||
    assert results.errors[file] == ["type"]
 | 
					    assert results.errors[file] == ["mode"]
 | 
				
			||||||
 | 
					 | 
				
			||||||
    data["type"] = "file"
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    with open(file, "w") as f:
 | 
					    with open(file, "w") as f:
 | 
				
			||||||
        f.write(new_str)
 | 
					        f.write(new_str)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    data["mode"] = os.stat(file).st_mode
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    results = spack.verify.check_entry(file, data)
 | 
					    results = spack.verify.check_entry(file, data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    expected = ["size", "hash"]
 | 
					    expected = ["size", "hash"]
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -5,6 +5,8 @@
 | 
				
			|||||||
import base64
 | 
					import base64
 | 
				
			||||||
import hashlib
 | 
					import hashlib
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					import stat
 | 
				
			||||||
 | 
					from typing import Any, Dict
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import llnl.util.tty as tty
 | 
					import llnl.util.tty as tty
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -14,35 +16,33 @@
 | 
				
			|||||||
import spack.util.spack_json as sjson
 | 
					import spack.util.spack_json as sjson
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def compute_hash(path):
 | 
					def compute_hash(path: str, block_size: int = 1048576) -> str:
 | 
				
			||||||
    with open(path, "rb") as f:
 | 
					    # why is this not using spack.util.crypto.checksum...
 | 
				
			||||||
        sha1 = hashlib.sha1(f.read()).digest()
 | 
					    hasher = hashlib.sha1()
 | 
				
			||||||
        b32 = base64.b32encode(sha1)
 | 
					    with open(path, "rb") as file:
 | 
				
			||||||
        return b32.decode()
 | 
					        while True:
 | 
				
			||||||
 | 
					            data = file.read(block_size)
 | 
				
			||||||
 | 
					            if not data:
 | 
				
			||||||
 | 
					                break
 | 
				
			||||||
 | 
					            hasher.update(data)
 | 
				
			||||||
 | 
					    return base64.b32encode(hasher.digest()).decode()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def create_manifest_entry(path):
 | 
					def create_manifest_entry(path: str) -> Dict[str, Any]:
 | 
				
			||||||
    data = {}
 | 
					    try:
 | 
				
			||||||
 | 
					        s = os.lstat(path)
 | 
				
			||||||
 | 
					    except OSError:
 | 
				
			||||||
 | 
					        return {}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if os.path.exists(path):
 | 
					    data: Dict[str, Any] = {"mode": s.st_mode, "owner": s.st_uid, "group": s.st_gid}
 | 
				
			||||||
        stat = os.stat(path)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        data["mode"] = stat.st_mode
 | 
					    if stat.S_ISLNK(s.st_mode):
 | 
				
			||||||
        data["owner"] = stat.st_uid
 | 
					        data["dest"] = os.readlink(path)
 | 
				
			||||||
        data["group"] = stat.st_gid
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if os.path.islink(path):
 | 
					    elif stat.S_ISREG(s.st_mode):
 | 
				
			||||||
            data["type"] = "link"
 | 
					        data["hash"] = compute_hash(path)
 | 
				
			||||||
            data["dest"] = os.readlink(path)
 | 
					        data["time"] = s.st_mtime
 | 
				
			||||||
 | 
					        data["size"] = s.st_size
 | 
				
			||||||
        elif os.path.isdir(path):
 | 
					 | 
				
			||||||
            data["type"] = "dir"
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            data["type"] = "file"
 | 
					 | 
				
			||||||
            data["hash"] = compute_hash(path)
 | 
					 | 
				
			||||||
            data["time"] = stat.st_mtime
 | 
					 | 
				
			||||||
            data["size"] = stat.st_size
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return data
 | 
					    return data
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -75,38 +75,28 @@ def check_entry(path, data):
 | 
				
			|||||||
        res.add_error(path, "added")
 | 
					        res.add_error(path, "added")
 | 
				
			||||||
        return res
 | 
					        return res
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    stat = os.stat(path)
 | 
					    s = os.lstat(path)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Check for all entries
 | 
					    # Check for all entries
 | 
				
			||||||
    if stat.st_mode != data["mode"]:
 | 
					    if s.st_uid != data["owner"]:
 | 
				
			||||||
        res.add_error(path, "mode")
 | 
					 | 
				
			||||||
    if stat.st_uid != data["owner"]:
 | 
					 | 
				
			||||||
        res.add_error(path, "owner")
 | 
					        res.add_error(path, "owner")
 | 
				
			||||||
    if stat.st_gid != data["group"]:
 | 
					    if s.st_gid != data["group"]:
 | 
				
			||||||
        res.add_error(path, "group")
 | 
					        res.add_error(path, "group")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Check for symlink targets  and listed as symlink
 | 
					    # In the past, `stat(...).st_mode` was stored
 | 
				
			||||||
    if os.path.islink(path):
 | 
					    # instead of `lstat(...).st_mode`. So, ignore mode errors for symlinks.
 | 
				
			||||||
        if data["type"] != "link":
 | 
					    if not stat.S_ISLNK(s.st_mode) and s.st_mode != data["mode"]:
 | 
				
			||||||
            res.add_error(path, "type")
 | 
					        res.add_error(path, "mode")
 | 
				
			||||||
        if os.readlink(path) != data.get("dest", ""):
 | 
					    elif stat.S_ISLNK(s.st_mode) and os.readlink(path) != data.get("dest"):
 | 
				
			||||||
            res.add_error(path, "link")
 | 
					        res.add_error(path, "link")
 | 
				
			||||||
 | 
					    elif stat.S_ISREG(s.st_mode):
 | 
				
			||||||
    # Check directories are listed as directory
 | 
					 | 
				
			||||||
    elif os.path.isdir(path):
 | 
					 | 
				
			||||||
        if data["type"] != "dir":
 | 
					 | 
				
			||||||
            res.add_error(path, "type")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        # Check file contents against hash and listed as file
 | 
					        # Check file contents against hash and listed as file
 | 
				
			||||||
        # Check mtime and size as well
 | 
					        # Check mtime and size as well
 | 
				
			||||||
        if stat.st_size != data["size"]:
 | 
					        if s.st_size != data["size"]:
 | 
				
			||||||
            res.add_error(path, "size")
 | 
					            res.add_error(path, "size")
 | 
				
			||||||
        if stat.st_mtime != data["time"]:
 | 
					        if s.st_mtime != data["time"]:
 | 
				
			||||||
            res.add_error(path, "mtime")
 | 
					            res.add_error(path, "mtime")
 | 
				
			||||||
        if data["type"] != "file":
 | 
					        if compute_hash(path) != data.get("hash"):
 | 
				
			||||||
            res.add_error(path, "type")
 | 
					 | 
				
			||||||
        if compute_hash(path) != data.get("hash", ""):
 | 
					 | 
				
			||||||
            res.add_error(path, "hash")
 | 
					            res.add_error(path, "hash")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return res
 | 
					    return res
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user