mirror of
https://github.com/ToruNiina/toml11.git
synced 2025-09-17 17:58:09 +08:00
fix: handle edge-cases with quotes in ml-string
See comments in the code for detail.
This commit is contained in:
@@ -154,12 +154,53 @@ using lex_basic_string = sequence<lex_quotation_mark,
|
|||||||
repeat<lex_basic_char, unlimited>,
|
repeat<lex_basic_char, unlimited>,
|
||||||
lex_quotation_mark>;
|
lex_quotation_mark>;
|
||||||
|
|
||||||
|
// After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
|
||||||
|
// are allowed to be used.
|
||||||
|
// After this, the following strings are *explicitly* allowed.
|
||||||
|
// - One or two `"`s in a multi-line basic string are allowed wherever they appear.
|
||||||
|
// - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
|
||||||
|
// - One or two `"`s can appear just before or after the delimiter.
|
||||||
|
// ```toml
|
||||||
|
// str4 = """Here are two quotation marks: "". Simple enough."""
|
||||||
|
// str5 = """Here are three quotation marks: ""\"."""
|
||||||
|
// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
|
||||||
|
// str7 = """"This," she said, "is just a pointless statement.""""
|
||||||
|
// ```
|
||||||
|
// In the current implementation (v3.3.0), it is difficult to parse `str7` in
|
||||||
|
// the above example. It is difficult to recognize `"` at the end of string body
|
||||||
|
// correctly. It will be misunderstood as a `"""` delimiter and an additional,
|
||||||
|
// invalid `"`. Like this:
|
||||||
|
// ```console
|
||||||
|
// what(): [error] toml::parse_table: invalid line format
|
||||||
|
// --> hoge.toml
|
||||||
|
// |
|
||||||
|
// 13 | str7 = """"This," she said, "is just a pointless statement.""""
|
||||||
|
// | ^- expected newline, but got '"'.
|
||||||
|
// ```
|
||||||
|
// As a quick workaround for this problem, `lex_ml_basic_string_delim` was
|
||||||
|
// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
|
||||||
|
// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
|
||||||
|
// In parse_ml_basic_string() function, the trailing `"`s will be attached to
|
||||||
|
// the string body.
|
||||||
|
//
|
||||||
|
// Note: This feature is a "clarification". Therefore this change is considered
|
||||||
|
// as a spec that has been defined since the time when the multi-line
|
||||||
|
// basic string was introduced. Although it is a post-v0.5.0 change,
|
||||||
|
// this change will be activated regardless of the flag,
|
||||||
|
// `TOML11_USE_UNRELEASED_TOML_FEATURES`.
|
||||||
|
//
|
||||||
using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
|
using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
|
||||||
|
using lex_ml_basic_string_open = lex_ml_basic_string_delim;
|
||||||
|
using lex_ml_basic_string_close = sequence<
|
||||||
|
repeat<lex_quotation_mark, exactly<3>>,
|
||||||
|
maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
|
||||||
|
>;
|
||||||
|
|
||||||
#ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
|
#ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
|
||||||
using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09
|
using lex_ml_basic_unescaped = exclude<either<in_range<0x00, 0x08>, // 0x09
|
||||||
in_range<0x0a, 0x1F>, // is tab
|
in_range<0x0a, 0x1F>, // is tab
|
||||||
character<0x5C>,
|
character<0x5C>, // backslash
|
||||||
character<0x7F>,
|
character<0x7F>, // DEL
|
||||||
lex_ml_basic_string_delim>>;
|
lex_ml_basic_string_delim>>;
|
||||||
#else // TOML v0.5.0
|
#else // TOML v0.5.0
|
||||||
using lex_ml_basic_unescaped = exclude<either<in_range<0x00,0x1F>,
|
using lex_ml_basic_unescaped = exclude<either<in_range<0x00,0x1F>,
|
||||||
@@ -176,9 +217,9 @@ using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
|
|||||||
using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
|
using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
|
||||||
lex_ml_basic_escaped_newline>,
|
lex_ml_basic_escaped_newline>,
|
||||||
unlimited>;
|
unlimited>;
|
||||||
using lex_ml_basic_string = sequence<lex_ml_basic_string_delim,
|
using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
|
||||||
lex_ml_basic_body,
|
lex_ml_basic_body,
|
||||||
lex_ml_basic_string_delim>;
|
lex_ml_basic_string_close>;
|
||||||
|
|
||||||
using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
|
using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
|
||||||
in_range<0x10, 0x19>, character<0x27>>>;
|
in_range<0x10, 0x19>, character<0x27>>>;
|
||||||
@@ -187,7 +228,13 @@ using lex_literal_string = sequence<lex_apostrophe,
|
|||||||
repeat<lex_literal_char, unlimited>,
|
repeat<lex_literal_char, unlimited>,
|
||||||
lex_apostrophe>;
|
lex_apostrophe>;
|
||||||
|
|
||||||
|
// the same reason as above.
|
||||||
using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
|
using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
|
||||||
|
using lex_ml_literal_string_open = lex_ml_literal_string_delim;
|
||||||
|
using lex_ml_literal_string_close = sequence<
|
||||||
|
repeat<lex_apostrophe, exactly<3>>,
|
||||||
|
maybe<lex_apostrophe>, maybe<lex_apostrophe>
|
||||||
|
>;
|
||||||
|
|
||||||
using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
||||||
in_range<0x10, 0x1F>,
|
in_range<0x10, 0x1F>,
|
||||||
@@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
|
|||||||
lex_ml_literal_string_delim>>;
|
lex_ml_literal_string_delim>>;
|
||||||
using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
|
using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
|
||||||
unlimited>;
|
unlimited>;
|
||||||
using lex_ml_literal_string = sequence<lex_ml_literal_string_delim,
|
using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
|
||||||
lex_ml_literal_body,
|
lex_ml_literal_body,
|
||||||
lex_ml_literal_string_delim>;
|
lex_ml_literal_string_close>;
|
||||||
|
|
||||||
using lex_string = either<lex_ml_basic_string, lex_basic_string,
|
using lex_string = either<lex_ml_basic_string, lex_basic_string,
|
||||||
lex_ml_literal_string, lex_literal_string>;
|
lex_ml_literal_string, lex_literal_string>;
|
||||||
|
@@ -375,7 +375,7 @@ parse_ml_basic_string(location<Container>& loc)
|
|||||||
std::string retval;
|
std::string retval;
|
||||||
retval.reserve(token.unwrap().size());
|
retval.reserve(token.unwrap().size());
|
||||||
|
|
||||||
auto delim = lex_ml_basic_string_delim::invoke(inner_loc);
|
auto delim = lex_ml_basic_string_open::invoke(inner_loc);
|
||||||
if(!delim)
|
if(!delim)
|
||||||
{
|
{
|
||||||
throw internal_error(format_underline(
|
throw internal_error(format_underline(
|
||||||
@@ -410,7 +410,26 @@ parse_ml_basic_string(location<Container>& loc)
|
|||||||
{{std::addressof(inner_loc), "not sufficient token"}}),
|
{{std::addressof(inner_loc), "not sufficient token"}}),
|
||||||
source_location(std::addressof(inner_loc)));
|
source_location(std::addressof(inner_loc)));
|
||||||
}
|
}
|
||||||
delim = lex_ml_basic_string_delim::invoke(inner_loc);
|
delim = lex_ml_basic_string_close::invoke(inner_loc);
|
||||||
|
}
|
||||||
|
// `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s
|
||||||
|
// just before the delimiter. Here, we need to attach `"`s at the
|
||||||
|
// end of the string body, if it exists.
|
||||||
|
// For detail, see the definition of `lex_ml_basic_string_close`.
|
||||||
|
assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
|
||||||
|
[](const char c) noexcept {return c == '\"';}));
|
||||||
|
switch(delim.unwrap().size())
|
||||||
|
{
|
||||||
|
case 3: {break;}
|
||||||
|
case 4: {retval += "\""; break;}
|
||||||
|
case 5: {retval += "\"\""; break;}
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
throw internal_error(format_underline(
|
||||||
|
"parse_ml_basic_string: closing delimiter has invalid length",
|
||||||
|
{{std::addressof(inner_loc), "end of this"}}),
|
||||||
|
source_location(std::addressof(inner_loc)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
return ok(std::make_pair(toml::string(retval), token.unwrap()));
|
||||||
}
|
}
|
||||||
@@ -485,7 +504,7 @@ parse_ml_literal_string(location<Container>& loc)
|
|||||||
{
|
{
|
||||||
location<std::string> inner_loc(loc.name(), token.unwrap().str());
|
location<std::string> inner_loc(loc.name(), token.unwrap().str());
|
||||||
|
|
||||||
const auto open = lex_ml_literal_string_delim::invoke(inner_loc);
|
const auto open = lex_ml_literal_string_open::invoke(inner_loc);
|
||||||
if(!open)
|
if(!open)
|
||||||
{
|
{
|
||||||
throw internal_error(format_underline(
|
throw internal_error(format_underline(
|
||||||
@@ -498,7 +517,7 @@ parse_ml_literal_string(location<Container>& loc)
|
|||||||
|
|
||||||
const auto body = lex_ml_literal_body::invoke(inner_loc);
|
const auto body = lex_ml_literal_body::invoke(inner_loc);
|
||||||
|
|
||||||
const auto close = lex_ml_literal_string_delim::invoke(inner_loc);
|
const auto close = lex_ml_literal_string_close::invoke(inner_loc);
|
||||||
if(!close)
|
if(!close)
|
||||||
{
|
{
|
||||||
throw internal_error(format_underline(
|
throw internal_error(format_underline(
|
||||||
@@ -506,8 +525,28 @@ parse_ml_literal_string(location<Container>& loc)
|
|||||||
{{std::addressof(inner_loc), "should be '''"}}),
|
{{std::addressof(inner_loc), "should be '''"}}),
|
||||||
source_location(std::addressof(inner_loc)));
|
source_location(std::addressof(inner_loc)));
|
||||||
}
|
}
|
||||||
return ok(std::make_pair(
|
// `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s
|
||||||
toml::string(body.unwrap().str(), toml::string_t::literal),
|
// just before the delimiter. Here, we need to attach `'`s at the
|
||||||
|
// end of the string body, if it exists.
|
||||||
|
// For detail, see the definition of `lex_ml_basic_string_close`.
|
||||||
|
|
||||||
|
std::string retval = body.unwrap().str();
|
||||||
|
assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
|
||||||
|
[](const char c) noexcept {return c == '\'';}));
|
||||||
|
switch(close.unwrap().size())
|
||||||
|
{
|
||||||
|
case 3: {break;}
|
||||||
|
case 4: {retval += "'"; break;}
|
||||||
|
case 5: {retval += "''"; break;}
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
throw internal_error(format_underline(
|
||||||
|
"parse_ml_literal_string: closing delimiter has invalid length",
|
||||||
|
{{std::addressof(inner_loc), "end of this"}}),
|
||||||
|
source_location(std::addressof(inner_loc)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
|
||||||
token.unwrap()));
|
token.unwrap()));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
Reference in New Issue
Block a user