From 0c0831861c57b747d915731ef3ad214346a9dcbe Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Tue, 12 Oct 2021 09:05:11 +0200 Subject: [PATCH] Avoid quadratic complexity in log parser (#26568) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TL;DR: there are matching groups trying to match 1 or more occurrences of something. We don't use the matching group. Therefore it's sufficient to test for 1 occurrence. This reduce quadratic complexity to linear time. --- When parsing logs of an mpich build, I'm getting a 4 minute (!!) wait with 16 threads for regexes to run: ``` In [1]: %time p.parse("mpich.log") Wall time: 4min 14s ``` That's really unacceptably slow... After some digging, it seems a few regexes tend to have `O(n^2)` scaling where `n` is the string / log line length. I don't think they *necessarily* should scale like that, but it seems that way. The common pattern is this ``` ([^:]+): error ``` which matches `: error` literally, and then one or more non-colons before that. So for a log line like this: ``` abcdefghijklmnopqrstuvwxyz: error etc etc ``` Any of these are potential group matches when using `search` in Python: ``` abcdefghijklmnopqrstuvwxyz bcdefghijklmnopqrstuvwxyz cdefghijklmnopqrstuvwxyz ⋮ yz z ``` but clearly the capture group should return the longest match. My hypothesis is that Python has a very bad implementation of `search` that somehow considers all of these, even though it can be implemented in linear time by scanning for `: error` first, and then greedily expanding the longest possible `[^:]+` match to the left. If Python indeed considers all possible matches, then with `n` matches of length `1 .. n` you see the `O(n^2)` slowness (i verified this by replacing + with {1,k} and doubling `k`, it doubles the execution time indeed). This PR fixes this by removing the `+`, so effectively changing the O(n^2) into a O(n) worst case. The reason we are fine with dropping `+` is that we don't use the capture group anywhere, so, we just ensure `:: error` is not a match but `x: error` is. After going from O(n^2) to O(n), the 15MB mpich build log is parsed in `1.288s`, so about 200x faster. Just to be sure I've also updated `^CMake Error.*:` to `^CMake Error`, so that it does not match with all the possible `:`'s in the line. Another option is to use `.*?` there to make it quit scanning as soon as possible, but what line that starts with `CMake Error` that does not have a colon is really a false positive... --- lib/spack/external/ctest_log_parser.py | 83 +++++++------------------- 1 file changed, 21 insertions(+), 62 deletions(-) diff --git a/lib/spack/external/ctest_log_parser.py b/lib/spack/external/ctest_log_parser.py index 4e80a2b6889..61cc6df0296 100644 --- a/lib/spack/external/ctest_log_parser.py +++ b/lib/spack/external/ctest_log_parser.py @@ -77,47 +77,7 @@ from six import StringIO from six import string_types -class prefilter(object): - """Make regular expressions faster with a simple prefiltering predicate. - - Some regular expressions seem to be much more costly than others. In - most cases, we can evaluate a simple precondition, e.g.:: - - lambda x: "error" in x - - to avoid evaluating expensive regexes on all lines in a file. This - can reduce parse time for large files by orders of magnitude when - evaluating lots of expressions. - - A ``prefilter`` object is designed to act like a regex,, but - ``search`` and ``match`` check the precondition before bothering to - evaluate the regular expression. - - Note that ``match`` and ``search`` just return ``True`` and ``False`` - at the moment. Make them return a ``MatchObject`` or ``None`` if it - becomes necessary. - """ - def __init__(self, precondition, *patterns): - self.patterns = [re.compile(p) for p in patterns] - self.pre = precondition - self.pattern = "\n ".join( - ('MERGED:',) + patterns) - - def search(self, text): - return self.pre(text) and any(p.search(text) for p in self.patterns) - - def match(self, text): - return self.pre(text) and any(p.match(text) for p in self.patterns) - - _error_matches = [ - prefilter( - lambda x: any(s in x for s in ( - 'Error:', 'error', 'undefined reference', 'multiply defined')), - "([^:]+): error[ \\t]*[0-9]+[ \\t]*:", - "([^:]+): (Error:|error|undefined reference|multiply defined)", - "([^ :]+) ?: (error|fatal error|catastrophic error)", - "([^:]+)\\(([^\\)]+)\\) ?: (error|fatal error|catastrophic error)"), "^FAIL: ", "^FATAL: ", "^failed ", @@ -127,6 +87,8 @@ def match(self, text): "^[Ss]egmentation [Vv]iolation", "^[Ss]egmentation [Ff]ault", ":.*[Pp]ermission [Dd]enied", + "[^ :]:[0-9]+: [^ \\t]", + "[^:]: error[ \\t]*[0-9]+[ \\t]*:", "^Error ([0-9]+):", "^Fatal", "^[Ee]rror: ", @@ -136,6 +98,9 @@ def match(self, text): "^cc[^C]*CC: ERROR File = ([^,]+), Line = ([0-9]+)", "^ld([^:])*:([ \\t])*ERROR([^:])*:", "^ild:([ \\t])*\\(undefined symbol\\)", + "[^ :] : (error|fatal error|catastrophic error)", + "[^:]: (Error:|error|undefined reference|multiply defined)", + "[^:]\\([^\\)]+\\) ?: (error|fatal error|catastrophic error)", "^fatal error C[0-9]+:", ": syntax error ", "^collect2: ld returned 1 exit status", @@ -144,7 +109,7 @@ def match(self, text): "^Unresolved:", "Undefined symbol", "^Undefined[ \\t]+first referenced", - "^CMake Error.*:", + "^CMake Error", ":[ \\t]cannot find", ":[ \\t]can't find", ": \\*\\*\\* No rule to make target [`'].*\\'. Stop", @@ -180,6 +145,7 @@ def match(self, text): "instantiated from ", "candidates are:", ": warning", + ": WARNING", ": \\(Warning\\)", ": note", " ok", @@ -187,38 +153,32 @@ def match(self, text): "makefile:", "Makefile:", ":[ \\t]+Where:", - "([^ :]+):([0-9]+): Warning", + "[^ :]:[0-9]+: Warning", "------ Build started: .* ------", ] #: Regexes to match file/line numbers in error/warning messages _warning_matches = [ - prefilter( - lambda x: 'warning' in x, - "([^ :]+):([0-9]+): warning:", - "([^:]+): warning ([0-9]+):", - "([^:]+): warning[ \\t]*[0-9]+[ \\t]*:", - "([^ :]+) : warning", - "([^:]+): warning"), - prefilter( - lambda x: 'note:' in x, - "^([^ :]+):([0-9]+): note:"), - prefilter( - lambda x: any(s in x for s in ('Warning', 'Warnung')), - "^(Warning|Warnung) ([0-9]+):", - "^(Warning|Warnung)[ :]", - "^cxx: Warning:", - "([^ :]+):([0-9]+): (Warning|Warnung)", - "^CMake Warning.*:"), - "file: .* has no symbols", + "[^ :]:[0-9]+: warning:", + "[^ :]:[0-9]+: note:", "^cc[^C]*CC: WARNING File = ([^,]+), Line = ([0-9]+)", "^ld([^:])*:([ \\t])*WARNING([^:])*:", + "[^:]: warning [0-9]+:", "^\"[^\"]+\", line [0-9]+: [Ww](arning|arnung)", + "[^:]: warning[ \\t]*[0-9]+[ \\t]*:", + "^(Warning|Warnung) ([0-9]+):", + "^(Warning|Warnung)[ :]", "WARNING: ", + "[^ :] : warning", + "[^:]: warning", "\", line [0-9]+\\.[0-9]+: [0-9]+-[0-9]+ \\([WI]\\)", + "^cxx: Warning:", + "file: .* has no symbols", + "[^ :]:[0-9]+: (Warning|Warnung)", "\\([0-9]*\\): remark #[0-9]*", "\".*\", line [0-9]+: remark\\([0-9]*\\):", "cc-[0-9]* CC: REMARK File = .*, Line = [0-9]*", + "^CMake Warning", "^\\[WARNING\\]", ] @@ -349,8 +309,7 @@ def _profile_match(matches, exceptions, line, match_times, exc_times): def _parse(lines, offset, profile): def compile(regex_array): - return [regex if isinstance(regex, prefilter) else re.compile(regex) - for regex in regex_array] + return [re.compile(regex) for regex in regex_array] error_matches = compile(_error_matches) error_exceptions = compile(_error_exceptions)