Avoid quadratic complexity in log parser (#26568)
TL;DR: there are matching groups trying to match 1 or more occurrences of
something. We don't use the matching group. Therefore it's sufficient to test
for 1 occurrence. This reduce quadratic complexity to linear time.
---
When parsing logs of an mpich build, I'm getting a 4 minute (!!) wait
with 16 threads for regexes to run:
```
In [1]: %time p.parse("mpich.log")
Wall time: 4min 14s
```
That's really unacceptably slow... 
After some digging, it seems a few regexes tend to have `O(n^2)` scaling
where `n` is the string / log line length. I don't think they *necessarily*
should scale like that, but it seems that way. The common pattern is this
```
([^:]+): error
```
which matches `: error` literally, and then one or more non-colons before that. So
for a log line like this:
```
abcdefghijklmnopqrstuvwxyz: error etc etc
```
Any of these are potential group matches when using `search` in Python:
```
abcdefghijklmnopqrstuvwxyz
 bcdefghijklmnopqrstuvwxyz
  cdefghijklmnopqrstuvwxyz
                         ⋮
                        yz
                         z
```
but clearly the capture group should return the longest match.
My hypothesis is that Python has a very bad implementation of `search`
that somehow considers all of these, even though it can be implemented
in linear time by scanning for `: error` first, and then greedily expanding
the longest possible `[^:]+` match to the left. If Python indeed considers
all possible matches, then with `n` matches of length `1 .. n` you
see the `O(n^2)` slowness (i verified this by replacing + with {1,k}
and doubling `k`, it doubles the execution time indeed).
This PR fixes this by removing the `+`, so effectively changing the 
O(n^2) into a O(n) worst case.
The reason we are fine with dropping `+` is that we don't use the
capture group anywhere, so, we just ensure `:: error` is not a match
but `x: error` is.
After going from O(n^2) to O(n), the 15MB mpich build log is parsed
in `1.288s`, so about 200x faster.
Just to be sure I've also updated `^CMake Error.*:` to `^CMake Error`,
so that it does not match with all the possible `:`'s in the line.
Another option is to use `.*?` there to make it quit scanning as soon as
possible, but what line that starts with `CMake Error` that does not have
a colon is really a false positive...
			
			
This commit is contained in:
		
							
								
								
									
										83
									
								
								lib/spack/external/ctest_log_parser.py
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										83
									
								
								lib/spack/external/ctest_log_parser.py
									
									
									
									
										vendored
									
									
								
							@@ -77,47 +77,7 @@
 | 
			
		||||
from six import StringIO
 | 
			
		||||
from six import string_types
 | 
			
		||||
 | 
			
		||||
class prefilter(object):
 | 
			
		||||
    """Make regular expressions faster with a simple prefiltering predicate.
 | 
			
		||||
 | 
			
		||||
    Some regular expressions seem to be much more costly than others.  In
 | 
			
		||||
    most cases, we can evaluate a simple precondition, e.g.::
 | 
			
		||||
 | 
			
		||||
        lambda x: "error" in x
 | 
			
		||||
 | 
			
		||||
    to avoid evaluating expensive regexes on all lines in a file. This
 | 
			
		||||
    can reduce parse time for large files by orders of magnitude when
 | 
			
		||||
    evaluating lots of expressions.
 | 
			
		||||
 | 
			
		||||
    A ``prefilter`` object is designed to act like a regex,, but
 | 
			
		||||
    ``search`` and ``match`` check the precondition before bothering to
 | 
			
		||||
    evaluate the regular expression.
 | 
			
		||||
 | 
			
		||||
    Note that ``match`` and ``search`` just return ``True`` and ``False``
 | 
			
		||||
    at the moment. Make them return a ``MatchObject`` or ``None`` if it
 | 
			
		||||
    becomes necessary.
 | 
			
		||||
    """
 | 
			
		||||
    def __init__(self, precondition, *patterns):
 | 
			
		||||
        self.patterns = [re.compile(p) for p in patterns]
 | 
			
		||||
        self.pre = precondition
 | 
			
		||||
        self.pattern = "\n                            ".join(
 | 
			
		||||
            ('MERGED:',) + patterns)
 | 
			
		||||
 | 
			
		||||
    def search(self, text):
 | 
			
		||||
        return self.pre(text) and any(p.search(text) for p in self.patterns)
 | 
			
		||||
 | 
			
		||||
    def match(self, text):
 | 
			
		||||
        return self.pre(text) and any(p.match(text) for p in self.patterns)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
_error_matches = [
 | 
			
		||||
    prefilter(
 | 
			
		||||
        lambda x: any(s in x for s in (
 | 
			
		||||
            'Error:', 'error', 'undefined reference', 'multiply defined')),
 | 
			
		||||
        "([^:]+): error[ \\t]*[0-9]+[ \\t]*:",
 | 
			
		||||
        "([^:]+): (Error:|error|undefined reference|multiply defined)",
 | 
			
		||||
        "([^ :]+) ?: (error|fatal error|catastrophic error)",
 | 
			
		||||
        "([^:]+)\\(([^\\)]+)\\) ?: (error|fatal error|catastrophic error)"),
 | 
			
		||||
    "^FAIL: ",
 | 
			
		||||
    "^FATAL: ",
 | 
			
		||||
    "^failed ",
 | 
			
		||||
@@ -127,6 +87,8 @@ def match(self, text):
 | 
			
		||||
    "^[Ss]egmentation [Vv]iolation",
 | 
			
		||||
    "^[Ss]egmentation [Ff]ault",
 | 
			
		||||
    ":.*[Pp]ermission [Dd]enied",
 | 
			
		||||
    "[^ :]:[0-9]+: [^ \\t]",
 | 
			
		||||
    "[^:]: error[ \\t]*[0-9]+[ \\t]*:",
 | 
			
		||||
    "^Error ([0-9]+):",
 | 
			
		||||
    "^Fatal",
 | 
			
		||||
    "^[Ee]rror: ",
 | 
			
		||||
@@ -136,6 +98,9 @@ def match(self, text):
 | 
			
		||||
    "^cc[^C]*CC: ERROR File = ([^,]+), Line = ([0-9]+)",
 | 
			
		||||
    "^ld([^:])*:([ \\t])*ERROR([^:])*:",
 | 
			
		||||
    "^ild:([ \\t])*\\(undefined symbol\\)",
 | 
			
		||||
    "[^ :] : (error|fatal error|catastrophic error)",
 | 
			
		||||
    "[^:]: (Error:|error|undefined reference|multiply defined)",
 | 
			
		||||
    "[^:]\\([^\\)]+\\) ?: (error|fatal error|catastrophic error)",
 | 
			
		||||
    "^fatal error C[0-9]+:",
 | 
			
		||||
    ": syntax error ",
 | 
			
		||||
    "^collect2: ld returned 1 exit status",
 | 
			
		||||
@@ -144,7 +109,7 @@ def match(self, text):
 | 
			
		||||
    "^Unresolved:",
 | 
			
		||||
    "Undefined symbol",
 | 
			
		||||
    "^Undefined[ \\t]+first referenced",
 | 
			
		||||
    "^CMake Error.*:",
 | 
			
		||||
    "^CMake Error",
 | 
			
		||||
    ":[ \\t]cannot find",
 | 
			
		||||
    ":[ \\t]can't find",
 | 
			
		||||
    ": \\*\\*\\* No rule to make target [`'].*\\'.  Stop",
 | 
			
		||||
@@ -180,6 +145,7 @@ def match(self, text):
 | 
			
		||||
    "instantiated from ",
 | 
			
		||||
    "candidates are:",
 | 
			
		||||
    ": warning",
 | 
			
		||||
    ": WARNING",
 | 
			
		||||
    ": \\(Warning\\)",
 | 
			
		||||
    ": note",
 | 
			
		||||
    "    ok",
 | 
			
		||||
@@ -187,38 +153,32 @@ def match(self, text):
 | 
			
		||||
    "makefile:",
 | 
			
		||||
    "Makefile:",
 | 
			
		||||
    ":[ \\t]+Where:",
 | 
			
		||||
    "([^ :]+):([0-9]+): Warning",
 | 
			
		||||
    "[^ :]:[0-9]+: Warning",
 | 
			
		||||
    "------ Build started: .* ------",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
#: Regexes to match file/line numbers in error/warning messages
 | 
			
		||||
_warning_matches = [
 | 
			
		||||
    prefilter(
 | 
			
		||||
        lambda x: 'warning' in x,
 | 
			
		||||
        "([^ :]+):([0-9]+): warning:",
 | 
			
		||||
        "([^:]+): warning ([0-9]+):",
 | 
			
		||||
        "([^:]+): warning[ \\t]*[0-9]+[ \\t]*:",
 | 
			
		||||
        "([^ :]+) : warning",
 | 
			
		||||
        "([^:]+): warning"),
 | 
			
		||||
    prefilter(
 | 
			
		||||
        lambda x: 'note:' in x,
 | 
			
		||||
        "^([^ :]+):([0-9]+): note:"),
 | 
			
		||||
    prefilter(
 | 
			
		||||
        lambda x: any(s in x for s in ('Warning', 'Warnung')),
 | 
			
		||||
        "^(Warning|Warnung) ([0-9]+):",
 | 
			
		||||
        "^(Warning|Warnung)[ :]",
 | 
			
		||||
        "^cxx: Warning:",
 | 
			
		||||
        "([^ :]+):([0-9]+): (Warning|Warnung)",
 | 
			
		||||
        "^CMake Warning.*:"),
 | 
			
		||||
    "file: .* has no symbols",
 | 
			
		||||
    "[^ :]:[0-9]+: warning:",
 | 
			
		||||
    "[^ :]:[0-9]+: note:",
 | 
			
		||||
    "^cc[^C]*CC: WARNING File = ([^,]+), Line = ([0-9]+)",
 | 
			
		||||
    "^ld([^:])*:([ \\t])*WARNING([^:])*:",
 | 
			
		||||
    "[^:]: warning [0-9]+:",
 | 
			
		||||
    "^\"[^\"]+\", line [0-9]+: [Ww](arning|arnung)",
 | 
			
		||||
    "[^:]: warning[ \\t]*[0-9]+[ \\t]*:",
 | 
			
		||||
    "^(Warning|Warnung) ([0-9]+):",
 | 
			
		||||
    "^(Warning|Warnung)[ :]",
 | 
			
		||||
    "WARNING: ",
 | 
			
		||||
    "[^ :] : warning",
 | 
			
		||||
    "[^:]: warning",
 | 
			
		||||
    "\", line [0-9]+\\.[0-9]+: [0-9]+-[0-9]+ \\([WI]\\)",
 | 
			
		||||
    "^cxx: Warning:",
 | 
			
		||||
    "file: .* has no symbols",
 | 
			
		||||
    "[^ :]:[0-9]+: (Warning|Warnung)",
 | 
			
		||||
    "\\([0-9]*\\): remark #[0-9]*",
 | 
			
		||||
    "\".*\", line [0-9]+: remark\\([0-9]*\\):",
 | 
			
		||||
    "cc-[0-9]* CC: REMARK File = .*, Line = [0-9]*",
 | 
			
		||||
    "^CMake Warning",
 | 
			
		||||
    "^\\[WARNING\\]",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
@@ -349,8 +309,7 @@ def _profile_match(matches, exceptions, line, match_times, exc_times):
 | 
			
		||||
 | 
			
		||||
def _parse(lines, offset, profile):
 | 
			
		||||
    def compile(regex_array):
 | 
			
		||||
        return [regex if isinstance(regex, prefilter) else re.compile(regex)
 | 
			
		||||
                for regex in regex_array]
 | 
			
		||||
        return [re.compile(regex) for regex in regex_array]
 | 
			
		||||
 | 
			
		||||
    error_matches      = compile(_error_matches)
 | 
			
		||||
    error_exceptions   = compile(_error_exceptions)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user