Avoid quadratic complexity in log parser (#26568)
TL;DR: there are matching groups trying to match 1 or more occurrences of something. We don't use the matching group. Therefore it's sufficient to test for 1 occurrence. This reduce quadratic complexity to linear time. --- When parsing logs of an mpich build, I'm getting a 4 minute (!!) wait with 16 threads for regexes to run: ``` In [1]: %time p.parse("mpich.log") Wall time: 4min 14s ``` That's really unacceptably slow... After some digging, it seems a few regexes tend to have `O(n^2)` scaling where `n` is the string / log line length. I don't think they *necessarily* should scale like that, but it seems that way. The common pattern is this ``` ([^:]+): error ``` which matches `: error` literally, and then one or more non-colons before that. So for a log line like this: ``` abcdefghijklmnopqrstuvwxyz: error etc etc ``` Any of these are potential group matches when using `search` in Python: ``` abcdefghijklmnopqrstuvwxyz bcdefghijklmnopqrstuvwxyz cdefghijklmnopqrstuvwxyz ⋮ yz z ``` but clearly the capture group should return the longest match. My hypothesis is that Python has a very bad implementation of `search` that somehow considers all of these, even though it can be implemented in linear time by scanning for `: error` first, and then greedily expanding the longest possible `[^:]+` match to the left. If Python indeed considers all possible matches, then with `n` matches of length `1 .. n` you see the `O(n^2)` slowness (i verified this by replacing + with {1,k} and doubling `k`, it doubles the execution time indeed). This PR fixes this by removing the `+`, so effectively changing the O(n^2) into a O(n) worst case. The reason we are fine with dropping `+` is that we don't use the capture group anywhere, so, we just ensure `:: error` is not a match but `x: error` is. After going from O(n^2) to O(n), the 15MB mpich build log is parsed in `1.288s`, so about 200x faster. Just to be sure I've also updated `^CMake Error.*:` to `^CMake Error`, so that it does not match with all the possible `:`'s in the line. Another option is to use `.*?` there to make it quit scanning as soon as possible, but what line that starts with `CMake Error` that does not have a colon is really a false positive...
This commit is contained in:
parent
580a20c243
commit
0c0831861c
83
lib/spack/external/ctest_log_parser.py
vendored
83
lib/spack/external/ctest_log_parser.py
vendored
@ -77,47 +77,7 @@
|
|||||||
from six import StringIO
|
from six import StringIO
|
||||||
from six import string_types
|
from six import string_types
|
||||||
|
|
||||||
class prefilter(object):
|
|
||||||
"""Make regular expressions faster with a simple prefiltering predicate.
|
|
||||||
|
|
||||||
Some regular expressions seem to be much more costly than others. In
|
|
||||||
most cases, we can evaluate a simple precondition, e.g.::
|
|
||||||
|
|
||||||
lambda x: "error" in x
|
|
||||||
|
|
||||||
to avoid evaluating expensive regexes on all lines in a file. This
|
|
||||||
can reduce parse time for large files by orders of magnitude when
|
|
||||||
evaluating lots of expressions.
|
|
||||||
|
|
||||||
A ``prefilter`` object is designed to act like a regex,, but
|
|
||||||
``search`` and ``match`` check the precondition before bothering to
|
|
||||||
evaluate the regular expression.
|
|
||||||
|
|
||||||
Note that ``match`` and ``search`` just return ``True`` and ``False``
|
|
||||||
at the moment. Make them return a ``MatchObject`` or ``None`` if it
|
|
||||||
becomes necessary.
|
|
||||||
"""
|
|
||||||
def __init__(self, precondition, *patterns):
|
|
||||||
self.patterns = [re.compile(p) for p in patterns]
|
|
||||||
self.pre = precondition
|
|
||||||
self.pattern = "\n ".join(
|
|
||||||
('MERGED:',) + patterns)
|
|
||||||
|
|
||||||
def search(self, text):
|
|
||||||
return self.pre(text) and any(p.search(text) for p in self.patterns)
|
|
||||||
|
|
||||||
def match(self, text):
|
|
||||||
return self.pre(text) and any(p.match(text) for p in self.patterns)
|
|
||||||
|
|
||||||
|
|
||||||
_error_matches = [
|
_error_matches = [
|
||||||
prefilter(
|
|
||||||
lambda x: any(s in x for s in (
|
|
||||||
'Error:', 'error', 'undefined reference', 'multiply defined')),
|
|
||||||
"([^:]+): error[ \\t]*[0-9]+[ \\t]*:",
|
|
||||||
"([^:]+): (Error:|error|undefined reference|multiply defined)",
|
|
||||||
"([^ :]+) ?: (error|fatal error|catastrophic error)",
|
|
||||||
"([^:]+)\\(([^\\)]+)\\) ?: (error|fatal error|catastrophic error)"),
|
|
||||||
"^FAIL: ",
|
"^FAIL: ",
|
||||||
"^FATAL: ",
|
"^FATAL: ",
|
||||||
"^failed ",
|
"^failed ",
|
||||||
@ -127,6 +87,8 @@ def match(self, text):
|
|||||||
"^[Ss]egmentation [Vv]iolation",
|
"^[Ss]egmentation [Vv]iolation",
|
||||||
"^[Ss]egmentation [Ff]ault",
|
"^[Ss]egmentation [Ff]ault",
|
||||||
":.*[Pp]ermission [Dd]enied",
|
":.*[Pp]ermission [Dd]enied",
|
||||||
|
"[^ :]:[0-9]+: [^ \\t]",
|
||||||
|
"[^:]: error[ \\t]*[0-9]+[ \\t]*:",
|
||||||
"^Error ([0-9]+):",
|
"^Error ([0-9]+):",
|
||||||
"^Fatal",
|
"^Fatal",
|
||||||
"^[Ee]rror: ",
|
"^[Ee]rror: ",
|
||||||
@ -136,6 +98,9 @@ def match(self, text):
|
|||||||
"^cc[^C]*CC: ERROR File = ([^,]+), Line = ([0-9]+)",
|
"^cc[^C]*CC: ERROR File = ([^,]+), Line = ([0-9]+)",
|
||||||
"^ld([^:])*:([ \\t])*ERROR([^:])*:",
|
"^ld([^:])*:([ \\t])*ERROR([^:])*:",
|
||||||
"^ild:([ \\t])*\\(undefined symbol\\)",
|
"^ild:([ \\t])*\\(undefined symbol\\)",
|
||||||
|
"[^ :] : (error|fatal error|catastrophic error)",
|
||||||
|
"[^:]: (Error:|error|undefined reference|multiply defined)",
|
||||||
|
"[^:]\\([^\\)]+\\) ?: (error|fatal error|catastrophic error)",
|
||||||
"^fatal error C[0-9]+:",
|
"^fatal error C[0-9]+:",
|
||||||
": syntax error ",
|
": syntax error ",
|
||||||
"^collect2: ld returned 1 exit status",
|
"^collect2: ld returned 1 exit status",
|
||||||
@ -144,7 +109,7 @@ def match(self, text):
|
|||||||
"^Unresolved:",
|
"^Unresolved:",
|
||||||
"Undefined symbol",
|
"Undefined symbol",
|
||||||
"^Undefined[ \\t]+first referenced",
|
"^Undefined[ \\t]+first referenced",
|
||||||
"^CMake Error.*:",
|
"^CMake Error",
|
||||||
":[ \\t]cannot find",
|
":[ \\t]cannot find",
|
||||||
":[ \\t]can't find",
|
":[ \\t]can't find",
|
||||||
": \\*\\*\\* No rule to make target [`'].*\\'. Stop",
|
": \\*\\*\\* No rule to make target [`'].*\\'. Stop",
|
||||||
@ -180,6 +145,7 @@ def match(self, text):
|
|||||||
"instantiated from ",
|
"instantiated from ",
|
||||||
"candidates are:",
|
"candidates are:",
|
||||||
": warning",
|
": warning",
|
||||||
|
": WARNING",
|
||||||
": \\(Warning\\)",
|
": \\(Warning\\)",
|
||||||
": note",
|
": note",
|
||||||
" ok",
|
" ok",
|
||||||
@ -187,38 +153,32 @@ def match(self, text):
|
|||||||
"makefile:",
|
"makefile:",
|
||||||
"Makefile:",
|
"Makefile:",
|
||||||
":[ \\t]+Where:",
|
":[ \\t]+Where:",
|
||||||
"([^ :]+):([0-9]+): Warning",
|
"[^ :]:[0-9]+: Warning",
|
||||||
"------ Build started: .* ------",
|
"------ Build started: .* ------",
|
||||||
]
|
]
|
||||||
|
|
||||||
#: Regexes to match file/line numbers in error/warning messages
|
#: Regexes to match file/line numbers in error/warning messages
|
||||||
_warning_matches = [
|
_warning_matches = [
|
||||||
prefilter(
|
"[^ :]:[0-9]+: warning:",
|
||||||
lambda x: 'warning' in x,
|
"[^ :]:[0-9]+: note:",
|
||||||
"([^ :]+):([0-9]+): warning:",
|
|
||||||
"([^:]+): warning ([0-9]+):",
|
|
||||||
"([^:]+): warning[ \\t]*[0-9]+[ \\t]*:",
|
|
||||||
"([^ :]+) : warning",
|
|
||||||
"([^:]+): warning"),
|
|
||||||
prefilter(
|
|
||||||
lambda x: 'note:' in x,
|
|
||||||
"^([^ :]+):([0-9]+): note:"),
|
|
||||||
prefilter(
|
|
||||||
lambda x: any(s in x for s in ('Warning', 'Warnung')),
|
|
||||||
"^(Warning|Warnung) ([0-9]+):",
|
|
||||||
"^(Warning|Warnung)[ :]",
|
|
||||||
"^cxx: Warning:",
|
|
||||||
"([^ :]+):([0-9]+): (Warning|Warnung)",
|
|
||||||
"^CMake Warning.*:"),
|
|
||||||
"file: .* has no symbols",
|
|
||||||
"^cc[^C]*CC: WARNING File = ([^,]+), Line = ([0-9]+)",
|
"^cc[^C]*CC: WARNING File = ([^,]+), Line = ([0-9]+)",
|
||||||
"^ld([^:])*:([ \\t])*WARNING([^:])*:",
|
"^ld([^:])*:([ \\t])*WARNING([^:])*:",
|
||||||
|
"[^:]: warning [0-9]+:",
|
||||||
"^\"[^\"]+\", line [0-9]+: [Ww](arning|arnung)",
|
"^\"[^\"]+\", line [0-9]+: [Ww](arning|arnung)",
|
||||||
|
"[^:]: warning[ \\t]*[0-9]+[ \\t]*:",
|
||||||
|
"^(Warning|Warnung) ([0-9]+):",
|
||||||
|
"^(Warning|Warnung)[ :]",
|
||||||
"WARNING: ",
|
"WARNING: ",
|
||||||
|
"[^ :] : warning",
|
||||||
|
"[^:]: warning",
|
||||||
"\", line [0-9]+\\.[0-9]+: [0-9]+-[0-9]+ \\([WI]\\)",
|
"\", line [0-9]+\\.[0-9]+: [0-9]+-[0-9]+ \\([WI]\\)",
|
||||||
|
"^cxx: Warning:",
|
||||||
|
"file: .* has no symbols",
|
||||||
|
"[^ :]:[0-9]+: (Warning|Warnung)",
|
||||||
"\\([0-9]*\\): remark #[0-9]*",
|
"\\([0-9]*\\): remark #[0-9]*",
|
||||||
"\".*\", line [0-9]+: remark\\([0-9]*\\):",
|
"\".*\", line [0-9]+: remark\\([0-9]*\\):",
|
||||||
"cc-[0-9]* CC: REMARK File = .*, Line = [0-9]*",
|
"cc-[0-9]* CC: REMARK File = .*, Line = [0-9]*",
|
||||||
|
"^CMake Warning",
|
||||||
"^\\[WARNING\\]",
|
"^\\[WARNING\\]",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -349,8 +309,7 @@ def _profile_match(matches, exceptions, line, match_times, exc_times):
|
|||||||
|
|
||||||
def _parse(lines, offset, profile):
|
def _parse(lines, offset, profile):
|
||||||
def compile(regex_array):
|
def compile(regex_array):
|
||||||
return [regex if isinstance(regex, prefilter) else re.compile(regex)
|
return [re.compile(regex) for regex in regex_array]
|
||||||
for regex in regex_array]
|
|
||||||
|
|
||||||
error_matches = compile(_error_matches)
|
error_matches = compile(_error_matches)
|
||||||
error_exceptions = compile(_error_exceptions)
|
error_exceptions = compile(_error_exceptions)
|
||||||
|
Loading…
Reference in New Issue
Block a user