diff --git a/toml/lexer.hpp b/toml/lexer.hpp index 978dc5f..509217a 100644 --- a/toml/lexer.hpp +++ b/toml/lexer.hpp @@ -154,12 +154,53 @@ using lex_basic_string = sequence, lex_quotation_mark>; +// After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings +// are allowed to be used. +// After this, the following strings are *explicitly* allowed. +// - One or two `"`s in a multi-line basic string are allowed anywhere. +// - Three consecutive `"`s in a multi-line basic string are treated as a delimiter. +// - One or two `"`s can appear just before or after the delimiter. +// ```toml +// str4 = """Here are two quotation marks: "". Simple enough.""" +// str5 = """Here are three quotation marks: ""\".""" +// str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\".""" +// str7 = """"This," she said, "is just a pointless statement."""" +// ``` +// In the current implementation (v3.3.0), it is difficult to parse `str7` in +// the above example. It is difficult to recognize `"` at the end of string body +// correctly. It will be misunderstood as a `"""` delimiter and an additional, +// invalid `"`. Like this: +// ```console +// what(): [error] toml::parse_table: invalid line format +// --> hoge.toml +// | +// 13 | str7 = """"This," she said, "is just a pointless statement."""" +// | ^- expected newline, but got '"'. +// ``` +// As a quick workaround for this problem, `lex_ml_basic_string_delim` was +// split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`. +// `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s. +// In the parse_ml_basic_string() function, the trailing `"`s will be attached to +// the string body. +// +// Note: This feature is a "clarification". Therefore this change is considered +// as a spec that has been defined since the time when the multi-line +// basic string was introduced.
Although it is a post-v0.5.0 change, +// this change will be activated regardless of the flag, +// `TOML11_USE_UNRELEASED_TOML_FEATURES`. +// using lex_ml_basic_string_delim = repeat>; +using lex_ml_basic_string_open = lex_ml_basic_string_delim; +using lex_ml_basic_string_close = sequence< + repeat>, + maybe, maybe + >; + #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES using lex_ml_basic_unescaped = exclude, // 0x09 in_range<0x0a, 0x1F>, // is tab - character<0x5C>, - character<0x7F>, + character<0x5C>, // backslash + character<0x7F>, // DEL lex_ml_basic_string_delim>>; #else // TOML v0.5.0 using lex_ml_basic_unescaped = exclude, @@ -176,9 +217,9 @@ using lex_ml_basic_char = either; using lex_ml_basic_body = repeat, unlimited>; -using lex_ml_basic_string = sequence; + lex_ml_basic_string_close>; using lex_literal_char = exclude, in_range<0x10, 0x19>, character<0x27>>>; @@ -187,7 +228,13 @@ using lex_literal_string = sequence, lex_apostrophe>; +// Split into open/close delimiters for the same reason as the multi-line basic string delimiter. using lex_ml_literal_string_delim = repeat>; +using lex_ml_literal_string_open = lex_ml_literal_string_delim; +using lex_ml_literal_string_close = sequence< + repeat>, + maybe, maybe + >; using lex_ml_literal_char = exclude, in_range<0x10, 0x1F>, @@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude, lex_ml_literal_string_delim>>; using lex_ml_literal_body = repeat, unlimited>; -using lex_ml_literal_string = sequence; + lex_ml_literal_string_close>; using lex_string = either; diff --git a/toml/parser.hpp b/toml/parser.hpp index db36cd1..80fe704 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -375,7 +375,7 @@ parse_ml_basic_string(location& loc) std::string retval; retval.reserve(token.unwrap().size()); - auto delim = lex_ml_basic_string_delim::invoke(inner_loc); + auto delim = lex_ml_basic_string_open::invoke(inner_loc); if(!delim) { throw internal_error(format_underline( @@ -410,7 +410,26 @@ parse_ml_basic_string(location& loc) {{std::addressof(inner_loc), "not sufficient token"}}),
source_location(std::addressof(inner_loc))); } - delim = lex_ml_basic_string_delim::invoke(inner_loc); + delim = lex_ml_basic_string_close::invoke(inner_loc); + } + // `lex_ml_basic_string_close` allows 3 to 5 `"`s to allow 1 or 2 `"`s + // just before the delimiter. Here, we need to attach those `"`s to the + // end of the string body, if they exist. + // For details, see the definition of `lex_ml_basic_string_close`. + assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(), + [](const char c) noexcept {return c == '\"';})); + switch(delim.unwrap().size()) + { + case 3: {break;} + case 4: {retval += "\""; break;} + case 5: {retval += "\"\""; break;} + default: + { + throw internal_error(format_underline( + "parse_ml_basic_string: closing delimiter has invalid length", + {{std::addressof(inner_loc), "end of this"}}), + source_location(std::addressof(inner_loc))); + } } return ok(std::make_pair(toml::string(retval), token.unwrap())); } @@ -485,7 +504,7 @@ parse_ml_literal_string(location& loc) { location inner_loc(loc.name(), token.unwrap().str()); - const auto open = lex_ml_literal_string_delim::invoke(inner_loc); + const auto open = lex_ml_literal_string_open::invoke(inner_loc); if(!open) { throw internal_error(format_underline( @@ -498,7 +517,7 @@ parse_ml_literal_string(location& loc) const auto body = lex_ml_literal_body::invoke(inner_loc); - const auto close = lex_ml_literal_string_delim::invoke(inner_loc); + const auto close = lex_ml_literal_string_close::invoke(inner_loc); if(!close) { throw internal_error(format_underline( @@ -506,9 +525,29 @@ parse_ml_literal_string(location& loc) {{std::addressof(inner_loc), "should be '''"}}), source_location(std::addressof(inner_loc))); } - return ok(std::make_pair( - toml::string(body.unwrap().str(), toml::string_t::literal), - token.unwrap())); + // `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s + // just before the delimiter.
Here, we need to attach those `'`s to the + // end of the string body, if they exist. + // For details, see the definition of `lex_ml_basic_string_close`. + + std::string retval = body.unwrap().str(); + assert(std::all_of(close.unwrap().first(), close.unwrap().last(), + [](const char c) noexcept {return c == '\'';})); + switch(close.unwrap().size()) + { + case 3: {break;} + case 4: {retval += "'"; break;} + case 5: {retval += "''"; break;} + default: + { + throw internal_error(format_underline( + "parse_ml_literal_string: closing delimiter has invalid length", + {{std::addressof(inner_loc), "end of this"}}), + source_location(std::addressof(inner_loc))); + } + } + return ok(std::make_pair(toml::string(retval, toml::string_t::literal), + token.unwrap())); } else {