feat: cache syntax scanner for speedup

This commit is contained in:
ToruNiina
2025-02-10 20:01:21 +09:00
parent 37359d42e6
commit 405fd8e2ca
2 changed files with 606 additions and 445 deletions

View File

@@ -18,10 +18,10 @@ using char_type = location::char_type;
// avoid redundant representation and out-of-unicode sequence
character_in_range utf8_1byte (const spec&);
sequence utf8_2bytes(const spec&);
sequence utf8_3bytes(const spec&);
sequence utf8_4bytes(const spec&);
character_in_range const& utf8_1byte (const spec&);
sequence const& utf8_2bytes(const spec&);
sequence const& utf8_3bytes(const spec&);
sequence const& utf8_4bytes(const spec&);
class non_ascii final : public scanner_base
{
@@ -79,27 +79,27 @@ class non_ascii final : public scanner_base
// ===========================================================================
// Whitespace
character_either wschar(const spec&);
character_either const& wschar(const spec&);
repeat_at_least ws(const spec& s);
repeat_at_least const& ws(const spec& s);
// ===========================================================================
// Newline
either newline(const spec&);
either const& newline(const spec&);
// ===========================================================================
// Comments
either allowed_comment_char(const spec& s);
either const& allowed_comment_char(const spec& s);
// XXX Note that it does not take newline
sequence comment(const spec& s);
sequence const& comment(const spec& s);
// ===========================================================================
// Boolean
either boolean(const spec&);
either const& boolean(const spec&);
// ===========================================================================
// Integer
@@ -246,62 +246,58 @@ class hexdig final : public scanner_base
character_in_range uppercase_;
};
sequence num_suffix(const spec& s);
sequence const& num_suffix(const spec& s);
sequence dec_int(const spec& s);
sequence hex_int(const spec& s);
sequence oct_int(const spec&);
sequence bin_int(const spec&);
either integer(const spec& s);
sequence const& dec_int(const spec& s);
sequence const& hex_int(const spec& s);
sequence const& oct_int(const spec&);
sequence const& bin_int(const spec&);
either const& integer(const spec& s);
// ===========================================================================
// Floating
sequence zero_prefixable_int(const spec& s);
sequence fractional_part(const spec& s);
sequence exponent_part(const spec& s);
sequence hex_floating(const spec& s);
either floating(const spec& s);
sequence const& zero_prefixable_int(const spec& s);
sequence const& fractional_part(const spec& s);
sequence const& exponent_part(const spec& s);
sequence const& hex_floating(const spec& s);
either const& floating(const spec& s);
// ===========================================================================
// Datetime
sequence local_date(const spec& s);
sequence local_time(const spec& s);
either time_offset(const spec& s);
sequence full_time(const spec& s);
character_either time_delim(const spec&);
sequence local_datetime(const spec& s);
sequence offset_datetime(const spec& s);
sequence const& local_date(const spec& s);
sequence const& local_time(const spec& s);
either const& time_offset(const spec& s);
sequence const& full_time(const spec& s);
character_either const& time_delim(const spec&);
sequence const& local_datetime(const spec& s);
sequence const& offset_datetime(const spec& s);
// ===========================================================================
// String
sequence escaped_x2(const spec& s);
sequence escaped_u4(const spec& s);
sequence escaped_U8(const spec& s);
sequence const& escaped_x2(const spec& s);
sequence const& escaped_u4(const spec& s);
sequence const& escaped_U8(const spec& s);
sequence escaped(const spec& s);
either basic_char(const spec& s);
sequence basic_string(const spec& s);
sequence const& escaped (const spec& s);
either const& basic_char (const spec& s);
sequence const& basic_string(const spec& s);
// ---------------------------------------------------------------------------
// multiline string
sequence escaped_newline(const spec& s);
sequence ml_basic_string(const spec& s);
sequence const& escaped_newline(const spec& s);
sequence const& ml_basic_string(const spec& s);
// ---------------------------------------------------------------------------
// literal string
either literal_char(const spec& s);
sequence literal_string(const spec& s);
sequence ml_literal_string(const spec& s);
either string(const spec& s);
either const& literal_char(const spec& s);
sequence const& literal_string(const spec& s);
sequence const& ml_literal_string(const spec& s);
either const& string(const spec& s);
// ===========================================================================
// Keys
@@ -345,15 +341,11 @@ class non_ascii_key_char final : public scanner_base
};
repeat_at_least unquoted_key(const spec& s);
either quoted_key(const spec& s);
either simple_key(const spec& s);
sequence dot_sep(const spec& s);
sequence dotted_key(const spec& s);
repeat_at_least const& unquoted_key(const spec& s);
either const& quoted_key(const spec& s);
either const& simple_key(const spec& s);
sequence const& dot_sep(const spec& s);
sequence const& dotted_key(const spec& s);
class key final : public scanner_base
{
@@ -403,19 +395,19 @@ class key final : public scanner_base
either simple_;
};
sequence keyval_sep(const spec& s);
sequence const& keyval_sep(const spec& s);
// ===========================================================================
// Table key
sequence std_table(const spec& s);
sequence const& std_table(const spec& s);
sequence array_table(const spec& s);
sequence const& array_table(const spec& s);
// ===========================================================================
// extension: null
literal null_value(const spec&);
literal const& null_value(const spec&);
} // namespace syntax
} // namespace detail

View File

@@ -14,67 +14,116 @@ namespace syntax
using char_type = location::char_type;
// Memoizes the scanners produced by a factory callable, keyed by the `spec`
// they were built for.
//
// `F` is invoked as `func_(const spec&)`; its return type must derive from
// `scanner_base` (checked by the static_assert below).
//
// Entries live in a std::vector and are searched linearly with
// `spec::operator==`.
//
// NOTE: `at()` appends to that vector when it sees a spec for the first time.
// An append may reallocate, which invalidates references returned by earlier
// `at()` calls on the same cache. Use (or copy) the returned scanner before
// calling `at()` again with a different spec.
template<typename F>
struct syntax_cache
{
    using value_type = cxx::return_type_of_t<F, const spec&>;
    static_assert(std::is_base_of<scanner_base, value_type>::value, "");

    // Takes ownership of the factory; the cache starts out empty.
    explicit syntax_cache(F f)
        : func_(std::move(f)), cache_{}
    {}

    // Returns the scanner associated with `s`, building and storing it via
    // the factory on the first request for that spec.
    value_type const& at(const spec& s)
    {
        // cache hit: hand back the first entry whose spec compares equal
        for(const std::pair<spec, value_type>& entry : cache_)
        {
            if(entry.first == s)
            {
                return entry.second;
            }
        }

        // cache miss: build the scanner once and remember it
        cache_.emplace_back(s, func_(s));
        return cache_.back().second;
    }

  private:

    F func_;                                         // scanner factory
    std::vector<std::pair<spec, value_type>> cache_; // (spec, scanner) entries
};
// Deduction helper: wraps a callable (typically a lambda) in a `syntax_cache`
// whose template argument is the callable's type with cv/ref qualifiers
// stripped. The callable is perfectly forwarded into the cache.
template<typename F>
syntax_cache<cxx::remove_cvref_t<F>> make_cache(F&& f)
{
    using cache_type = syntax_cache<cxx::remove_cvref_t<F>>;
    return cache_type(std::forward<F>(f));
}
// ===========================================================================
// UTF-8
// avoid redundant representation and out-of-unicode sequence
TOML11_INLINE character_in_range utf8_1byte(const spec&)
TOML11_INLINE character_in_range const& utf8_1byte(const spec&)
{
return character_in_range(0x00, 0x7F);
static thread_local character_in_range cache(0x00, 0x7F);
return cache;
}
TOML11_INLINE sequence utf8_2bytes(const spec&)
TOML11_INLINE sequence const& utf8_2bytes(const spec&)
{
return sequence(character_in_range(0xC2, 0xDF),
static thread_local sequence cache(
character_in_range(0xC2, 0xDF),
character_in_range(0x80, 0xBF));
return cache;
}
TOML11_INLINE sequence utf8_3bytes(const spec&)
TOML11_INLINE sequence const& utf8_3bytes(const spec&)
{
return sequence(/*1~2 bytes = */either(
static thread_local sequence cache(/*1~2 bytes = */either(
sequence(character (0xE0), character_in_range(0xA0, 0xBF)),
sequence(character_in_range(0xE1, 0xEC), character_in_range(0x80, 0xBF)),
sequence(character (0xED), character_in_range(0x80, 0x9F)),
sequence(character_in_range(0xEE, 0xEF), character_in_range(0x80, 0xBF))
), /*3rd byte = */ character_in_range(0x80, 0xBF));
return cache;
}
TOML11_INLINE sequence utf8_4bytes(const spec&)
TOML11_INLINE sequence const& utf8_4bytes(const spec&)
{
return sequence(/*1~2 bytes = */either(
static thread_local sequence cache(/*1~2 bytes = */either(
sequence(character (0xF0), character_in_range(0x90, 0xBF)),
sequence(character_in_range(0xF1, 0xF3), character_in_range(0x80, 0xBF)),
sequence(character (0xF4), character_in_range(0x80, 0x8F))
), character_in_range(0x80, 0xBF), character_in_range(0x80, 0xBF));
return cache;
}
// ===========================================================================
// Whitespace
TOML11_INLINE character_either wschar(const spec&)
TOML11_INLINE character_either const& wschar(const spec&)
{
return character_either(" \t");
static thread_local character_either cache(" \t");
return cache;
}
TOML11_INLINE repeat_at_least ws(const spec& s)
TOML11_INLINE repeat_at_least const& ws(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s){
return repeat_at_least(0, wschar(s));
});
return cache.at(sp);
}
// ===========================================================================
// Newline
TOML11_INLINE either newline(const spec&)
TOML11_INLINE either const& newline(const spec&)
{
return either(character(char_type('\n')), literal("\r\n"));
static thread_local either cache(character(char_type('\n')), literal("\r\n"));
return cache;
}
// ===========================================================================
// Comments
TOML11_INLINE either allowed_comment_char(const spec& s)
TOML11_INLINE either const& allowed_comment_char(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s){
if(s.v1_1_0_allow_control_characters_in_comments)
{
return either(
@@ -91,21 +140,27 @@ TOML11_INLINE either allowed_comment_char(const spec& s)
non_ascii(s)
);
}
});
return cache.at(sp);
}
// XXX Note that it does not take newline
TOML11_INLINE sequence comment(const spec& s)
TOML11_INLINE sequence const& comment(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s){
return sequence(character(char_type('#')),
repeat_at_least(0, allowed_comment_char(s)));
});
return cache.at(sp);
}
// ===========================================================================
// Boolean
TOML11_INLINE either boolean(const spec&)
TOML11_INLINE either const& boolean(const spec&)
{
return either(literal("true"), literal("false"));
static thread_local either cache(literal("true"), literal("false"));
return cache;
}
// ===========================================================================
@@ -114,8 +169,9 @@ TOML11_INLINE either boolean(const spec&)
// non-digit-graph = ([a-zA-Z]|unicode mb char)
// graph = ([a-zA-Z0-9]|unicode mb char)
// suffix = _ non-digit-graph (graph | _graph)
TOML11_INLINE sequence num_suffix(const spec& s)
TOML11_INLINE sequence const& num_suffix(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto non_digit_graph = [&s]() {
return either(
alpha(s),
@@ -140,10 +196,13 @@ TOML11_INLINE sequence num_suffix(const spec& s)
)
)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence dec_int(const spec& s)
TOML11_INLINE sequence const& dec_int(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto digit19 = []() {
return character_in_range(char_type('1'), char_type('9'));
};
@@ -162,10 +221,13 @@ TOML11_INLINE sequence dec_int(const spec& s)
digit(s)
)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence hex_int(const spec& s)
TOML11_INLINE sequence const& hex_int(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
literal("0x"),
hexdig(s),
@@ -176,10 +238,13 @@ TOML11_INLINE sequence hex_int(const spec& s)
)
)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence oct_int(const spec&)
TOML11_INLINE sequence const& oct_int(const spec& s)
{
static thread_local auto cache = make_cache([](const spec&) {
const auto digit07 = []() {
return character_in_range(char_type('0'), char_type('7'));
};
@@ -193,10 +258,13 @@ TOML11_INLINE sequence oct_int(const spec&)
)
)
);
});
return cache.at(s);
}
TOML11_INLINE sequence bin_int(const spec&)
TOML11_INLINE sequence const& bin_int(const spec& s)
{
static thread_local auto cache = make_cache([](const spec&) {
const auto digit01 = []() {
return character_either("01");
};
@@ -210,24 +278,30 @@ TOML11_INLINE sequence bin_int(const spec&)
)
)
);
});
return cache.at(s);
}
TOML11_INLINE either integer(const spec& s)
TOML11_INLINE either const& integer(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(
hex_int(s),
oct_int(s),
bin_int(s),
dec_int(s)
);
});
return cache.at(sp);
}
// ===========================================================================
// Floating
TOML11_INLINE sequence zero_prefixable_int(const spec& s)
TOML11_INLINE sequence const& zero_prefixable_int(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
digit(s),
repeat_at_least(0,
@@ -237,27 +311,36 @@ TOML11_INLINE sequence zero_prefixable_int(const spec& s)
)
)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence fractional_part(const spec& s)
TOML11_INLINE sequence const& fractional_part(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
character('.'),
zero_prefixable_int(s)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence exponent_part(const spec& s)
TOML11_INLINE sequence const& exponent_part(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
character_either("eE"),
maybe(character_either("+-")),
zero_prefixable_int(s)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence hex_floating(const spec& s)
TOML11_INLINE sequence const& hex_floating(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
// C99 hexfloat (%a)
// [+-]? 0x ( [0-9a-fA-F]*\.[0-9a-fA-F]+ | [0-9a-fA-F]+\.? ) [pP] [+-]? [0-9]+
@@ -285,10 +368,13 @@ TOML11_INLINE sequence hex_floating(const spec& s)
maybe(character_either("+-")),
repeat_at_least(1, character_in_range('0', '9'))
);
});
return cache.at(sp);
}
TOML11_INLINE either floating(const spec& s)
TOML11_INLINE either const& floating(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(
sequence(
dec_int(s),
@@ -302,13 +388,16 @@ TOML11_INLINE either floating(const spec& s)
either(literal("inf"), literal("nan"))
)
);
});
return cache.at(sp);
}
// ===========================================================================
// Datetime
TOML11_INLINE sequence local_date(const spec& s)
TOML11_INLINE sequence const& local_date(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
repeat_exact(4, digit(s)),
character('-'),
@@ -316,9 +405,12 @@ TOML11_INLINE sequence local_date(const spec& s)
character('-'),
repeat_exact(2, digit(s))
);
});
return cache.at(sp);
}
TOML11_INLINE sequence local_time(const spec& s)
TOML11_INLINE sequence const& local_time(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
if(s.v1_1_0_make_seconds_optional)
{
return sequence(
@@ -342,9 +434,12 @@ TOML11_INLINE sequence local_time(const spec& s)
maybe(sequence(character('.'), repeat_at_least(1, digit(s))))
);
}
});
return cache.at(sp);
}
TOML11_INLINE either time_offset(const spec& s)
TOML11_INLINE either const& time_offset(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(
character_either("zZ"),
sequence(character_either("+-"),
@@ -353,42 +448,66 @@ TOML11_INLINE either time_offset(const spec& s)
repeat_exact(2, digit(s))
)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence full_time(const spec& s)
TOML11_INLINE sequence const& full_time(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(local_time(s), time_offset(s));
});
return cache.at(sp);
}
// Scanner for the character placed between the date and the time in
// local_datetime / offset_datetime: one of 'T', 't' or ' '.
//
// The accepted characters do not depend on the spec, so a single
// thread-local scanner is shared by every spec. This avoids a spec
// comparison on each call and a duplicate cache entry per spec, and it
// follows the same pattern as the other spec-independent scanners
// (wschar, newline, boolean). The returned reference stays valid for the
// lifetime of the calling thread.
TOML11_INLINE character_either const& time_delim(const spec&)
{
    static thread_local character_either cache("Tt ");
    return cache;
}
TOML11_INLINE sequence local_datetime(const spec& s)
TOML11_INLINE sequence const& local_datetime(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(local_date(s), time_delim(s), local_time(s));
});
return cache.at(sp);
}
TOML11_INLINE sequence offset_datetime(const spec& s)
TOML11_INLINE sequence const& offset_datetime(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(local_date(s), time_delim(s), full_time(s));
});
return cache.at(sp);
}
// ===========================================================================
// String
TOML11_INLINE sequence escaped_x2(const spec& s)
TOML11_INLINE sequence const& escaped_x2(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(character('x'), repeat_exact(2, hexdig(s)));
});
return cache.at(sp);
}
TOML11_INLINE sequence escaped_u4(const spec& s)
TOML11_INLINE sequence const& escaped_u4(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(character('u'), repeat_exact(4, hexdig(s)));
});
return cache.at(sp);
}
TOML11_INLINE sequence escaped_U8(const spec& s)
TOML11_INLINE sequence const& escaped_U8(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(character('U'), repeat_exact(8, hexdig(s)));
});
return cache.at(sp);
}
TOML11_INLINE sequence escaped(const spec& s)
TOML11_INLINE sequence const& escaped(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto escape_char = [&s] {
if(s.v1_1_0_add_escape_sequence_e)
{
@@ -421,10 +540,13 @@ TOML11_INLINE sequence escaped(const spec& s)
};
return sequence(character('\\'), escape_seq());
});
return cache.at(sp);
}
TOML11_INLINE either basic_char(const spec& s)
TOML11_INLINE either const& basic_char(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto basic_unescaped = [&s]() {
return either(
wschar(s),
@@ -435,30 +557,39 @@ TOML11_INLINE either basic_char(const spec& s)
);
};
return either(basic_unescaped(), escaped(s));
});
return cache.at(sp);
}
TOML11_INLINE sequence basic_string(const spec& s)
TOML11_INLINE sequence const& basic_string(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
character('"'),
repeat_at_least(0, basic_char(s)),
character('"')
);
});
return cache.at(sp);
}
// ---------------------------------------------------------------------------
// multiline string
TOML11_INLINE sequence escaped_newline(const spec& s)
TOML11_INLINE sequence const& escaped_newline(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
character('\\'), ws(s), newline(s),
repeat_at_least(0, either(wschar(s), newline(s)))
);
});
return cache.at(sp);
}
TOML11_INLINE sequence ml_basic_string(const spec& s)
TOML11_INLINE sequence const& ml_basic_string(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto mlb_content = [&s]() {
return either(basic_char(s), newline(s), escaped_newline(s));
};
@@ -481,32 +612,41 @@ TOML11_INLINE sequence ml_basic_string(const spec& s)
literal("\"\"\""),
maybe(mlb_quotes())
);
});
return cache.at(sp);
}
// ---------------------------------------------------------------------------
// literal string
TOML11_INLINE either literal_char(const spec& s)
TOML11_INLINE either const& literal_char(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(
character (0x09),
character_in_range(0x20, 0x26),
character_in_range(0x28, 0x7E),
non_ascii(s)
);
});
return cache.at(sp);
}
TOML11_INLINE sequence literal_string(const spec& s)
TOML11_INLINE sequence const& literal_string(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
character('\''),
repeat_at_least(0, literal_char(s)),
character('\'')
);
});
return cache.at(sp);
}
TOML11_INLINE sequence ml_literal_string(const spec& s)
TOML11_INLINE sequence const& ml_literal_string(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto mll_quotes = []() {
return either(literal("''"), character('\''));
};
@@ -528,16 +668,21 @@ TOML11_INLINE sequence ml_literal_string(const spec& s)
// XXX ''' and mll_quotes are intentionally reordered to avoid
// unexpected match of mll_quotes
);
});
return cache.at(sp);
}
TOML11_INLINE either string(const spec& s)
TOML11_INLINE either const& string(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(
ml_basic_string(s),
ml_literal_string(s),
basic_string(s),
literal_string(s)
);
});
return cache.at(sp);
}
// ===========================================================================
@@ -654,8 +799,9 @@ TOML11_INLINE region non_ascii_key_char::scan(location& loc) const
return region{};
}
TOML11_INLINE repeat_at_least unquoted_key(const spec& s)
TOML11_INLINE repeat_at_least const& unquoted_key(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
const auto keychar = [&s] {
if(s.v1_1_0_allow_non_english_in_bare_keys)
{
@@ -667,57 +813,80 @@ TOML11_INLINE repeat_at_least unquoted_key(const spec& s)
return either(alpha(s), digit(s), character{0x2D}, character{0x5F});
}
};
return repeat_at_least(1, keychar());
});
return cache.at(sp);
}
TOML11_INLINE either quoted_key(const spec& s)
TOML11_INLINE either const& quoted_key(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(basic_string(s), literal_string(s));
});
return cache.at(sp);
}
TOML11_INLINE either simple_key(const spec& s)
TOML11_INLINE either const& simple_key(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return either(unquoted_key(s), quoted_key(s));
});
return cache.at(sp);
}
TOML11_INLINE sequence dot_sep(const spec& s)
TOML11_INLINE sequence const& dot_sep(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(ws(s), character('.'), ws(s));
});
return cache.at(sp);
}
TOML11_INLINE sequence dotted_key(const spec& s)
TOML11_INLINE sequence const& dotted_key(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(
simple_key(s),
repeat_at_least(1, sequence(dot_sep(s), simple_key(s)))
);
});
return cache.at(sp);
}
TOML11_INLINE sequence keyval_sep(const spec& s)
TOML11_INLINE sequence const& keyval_sep(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(ws(s), character('='), ws(s));
});
return cache.at(sp);
}
// ===========================================================================
// Table key
TOML11_INLINE sequence std_table(const spec& s)
TOML11_INLINE sequence const& std_table(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(character('['), ws(s), key(s), ws(s), character(']'));
});
return cache.at(sp);
}
TOML11_INLINE sequence array_table(const spec& s)
TOML11_INLINE sequence const& array_table(const spec& sp)
{
static thread_local auto cache = make_cache([](const spec& s) {
return sequence(literal("[["), ws(s), key(s), ws(s), literal("]]"));
});
return cache.at(sp);
}
// ===========================================================================
// extension: null
TOML11_INLINE literal null_value(const spec&)
TOML11_INLINE literal const& null_value(const spec&)
{
return literal("null");
static thread_local literal cache("null");
return cache;
}
} // namespace syntax