fix: handle edge-cases with quotes in ml-string

See comments in the code for detail.
2025-12-16 03:08:52 +08:00 · 2020-02-04 22:33:30 +09:00
parent d495df93a6
commit 0582e1535b
2 changed files with 99 additions and 13 deletions
--- a/toml/lexer.hpp
+++ b/toml/lexer.hpp
@@ -154,12 +154,53 @@ using lex_basic_string = sequence<lex_quotation_mark,
                                  repeat<lex_basic_char, unlimited>,
                                  lex_quotation_mark>;
 // After toml post-v0.5.0, it is explicitly clarified how quotes in ml-strings
 // are allowed to be used.
 // After this, the following strings are *explicitly* allowed.
 // - One or two `"`s in a multi-line basic string are allowed wherever they appear.
 // - Three consecutive `"`s in a multi-line basic string are considered a delimiter.
 // - One or two `"`s can appear just before or after the delimiter.
 // ```toml
 // str4 = """Here are two quotation marks: "". Simple enough."""
 // str5 = """Here are three quotation marks: ""\"."""
 // str6 = """Here are fifteen quotation marks: ""\"""\"""\"""\"""\"."""
 // str7 = """"This," she said, "is just a pointless statement.""""
 // ```
 // In the current implementation (v3.3.0), it is difficult to parse `str7` in
 // the above example. It is difficult to recognize `"` at the end of string body
 // correctly. It will be misunderstood as a `"""` delimiter and an additional,
 // invalid `"`. Like this:
 // ```console
 //   what():  [error] toml::parse_table: invalid line format
 //  --> hoge.toml
 //     |
 //  13 | str7 = """"This," she said, "is just a pointless statement.""""
 //     |                                                               ^- expected newline, but got '"'.
 // ```
 // As a quick workaround for this problem, `lex_ml_basic_string_delim` was
 // split into two, `lex_ml_basic_string_open` and `lex_ml_basic_string_close`.
 // `lex_ml_basic_string_open` allows only `"""`. `_close` allows 3-5 `"`s.
 // In parse_ml_basic_string() function, the trailing `"`s will be attached to
 // the string body.
 //
 // Note: This feature is a "clarification". Therefore this change is considered
 //       as a spec that has been defined since the time when the multi-line
 //       basic string was introduced. Although it is a post-v0.5.0 change,
 //       this change will be activated regardless of the flag,
 //       `TOML11_USE_UNRELEASED_TOML_FEATURES`.
 //
 using lex_ml_basic_string_delim = repeat<lex_quotation_mark, exactly<3>>;
 using lex_ml_basic_string_open  = lex_ml_basic_string_delim;
 using lex_ml_basic_string_close = sequence<
        repeat<lex_quotation_mark, exactly<3>>,
        maybe<lex_quotation_mark>, maybe<lex_quotation_mark>
    >;
 #ifdef TOML11_USE_UNRELEASED_TOML_FEATURES
 using lex_ml_basic_unescaped    = exclude<either<in_range<0x00, 0x08>, // 0x09
                                                 in_range<0x0a, 0x1F>, // is tab
-                                                 character<0x5C>,
+                                                 character<0x5C>, // backslash
-                                                 character<0x7F>,
+                                                 character<0x7F>, // DEL
                                                 lex_ml_basic_string_delim>>;
 #else // TOML v0.5.0
 using lex_ml_basic_unescaped    = exclude<either<in_range<0x00,0x1F>,
@@ -176,9 +217,9 @@ using lex_ml_basic_char = either<lex_ml_basic_unescaped, lex_escaped>;
 using lex_ml_basic_body = repeat<either<lex_ml_basic_char, lex_newline,
                                        lex_ml_basic_escaped_newline>,
                                 unlimited>;
-using lex_ml_basic_string = sequence<lex_ml_basic_string_delim,
+using lex_ml_basic_string = sequence<lex_ml_basic_string_open,
                                     lex_ml_basic_body,
-                                     lex_ml_basic_string_delim>;
+                                     lex_ml_basic_string_close>;
 using lex_literal_char = exclude<either<in_range<0x00, 0x08>,
                                        in_range<0x10, 0x19>, character<0x27>>>;
@@ -187,7 +228,13 @@ using lex_literal_string = sequence<lex_apostrophe,
                                    repeat<lex_literal_char, unlimited>,
                                    lex_apostrophe>;
 // the same reason as above.
 using lex_ml_literal_string_delim = repeat<lex_apostrophe, exactly<3>>;
 using lex_ml_literal_string_open  = lex_ml_literal_string_delim;
 using lex_ml_literal_string_close = sequence<
        repeat<lex_apostrophe, exactly<3>>,
        maybe<lex_apostrophe>, maybe<lex_apostrophe>
    >;
 using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
                                           in_range<0x10, 0x1F>,
@@ -195,9 +242,9 @@ using lex_ml_literal_char = exclude<either<in_range<0x00, 0x08>,
                                           lex_ml_literal_string_delim>>;
 using lex_ml_literal_body = repeat<either<lex_ml_literal_char, lex_newline>,
                                   unlimited>;
-using lex_ml_literal_string = sequence<lex_ml_literal_string_delim,
+using lex_ml_literal_string = sequence<lex_ml_literal_string_open,
                                       lex_ml_literal_body,
-                                       lex_ml_literal_string_delim>;
+                                       lex_ml_literal_string_close>;
 using lex_string = either<lex_ml_basic_string,   lex_basic_string,
                          lex_ml_literal_string, lex_literal_string>;
--- a/toml/parser.hpp
+++ b/toml/parser.hpp
@@ -375,7 +375,7 @@ parse_ml_basic_string(location<Container>& loc)
        std::string retval;
        retval.reserve(token.unwrap().size());
-        auto delim = lex_ml_basic_string_delim::invoke(inner_loc);
+        auto delim = lex_ml_basic_string_open::invoke(inner_loc);
        if(!delim)
        {
            throw internal_error(format_underline(
@@ -410,7 +410,26 @@ parse_ml_basic_string(location<Container>& loc)
                    {{std::addressof(inner_loc), "not sufficient token"}}),
                    source_location(std::addressof(inner_loc)));
            }
-            delim = lex_ml_basic_string_delim::invoke(inner_loc);
+            delim = lex_ml_basic_string_close::invoke(inner_loc);
        }
        // `lex_ml_basic_string_close` accepts 3 to 5 `"`s so that 1 or 2 `"`s
        // may appear just before the delimiter. Here, we need to append those
        // extra `"`s to the end of the string body, if there are any.
        // For detail, see the definition of `lex_ml_basic_string_close`.
        assert(std::all_of(delim.unwrap().first(), delim.unwrap().last(),
                           [](const char c) noexcept {return c == '\"';}));
        switch(delim.unwrap().size())
        {
            case 3: {break;}
            case 4: {retval += "\"";  break;}
            case 5: {retval += "\"\""; break;}
            default:
            {
                throw internal_error(format_underline(
                    "parse_ml_basic_string: closing delimiter has invalid length",
                    {{std::addressof(inner_loc), "end of this"}}),
                    source_location(std::addressof(inner_loc)));
            }
        }
        return ok(std::make_pair(toml::string(retval), token.unwrap()));
    }
@@ -485,7 +504,7 @@ parse_ml_literal_string(location<Container>& loc)
    {
        location<std::string> inner_loc(loc.name(), token.unwrap().str());
-        const auto open = lex_ml_literal_string_delim::invoke(inner_loc);
+        const auto open = lex_ml_literal_string_open::invoke(inner_loc);
        if(!open)
        {
            throw internal_error(format_underline(
@@ -498,7 +517,7 @@ parse_ml_literal_string(location<Container>& loc)
        const auto body = lex_ml_literal_body::invoke(inner_loc);
-        const auto close = lex_ml_literal_string_delim::invoke(inner_loc);
+        const auto close = lex_ml_literal_string_close::invoke(inner_loc);
        if(!close)
        {
            throw internal_error(format_underline(
@@ -506,8 +525,28 @@ parse_ml_literal_string(location<Container>& loc)
                {{std::addressof(inner_loc), "should be '''"}}),
                source_location(std::addressof(inner_loc)));
        }
-        return ok(std::make_pair(
+        // `lex_ml_literal_string_close` allows 3 to 5 `'`s to allow 1 or 2 `'`s
-                  toml::string(body.unwrap().str(), toml::string_t::literal),
+        // at just before the delimiter. Here, we need to attach `'`s at the
        // end of the string body, if it exists.
        // For detail, see the definition of `lex_ml_basic_string_close`.
        std::string retval = body.unwrap().str();
        assert(std::all_of(close.unwrap().first(), close.unwrap().last(),
                           [](const char c) noexcept {return c == '\'';}));
        switch(close.unwrap().size())
        {
            case 3: {break;}
            case 4: {retval += "'";  break;}
            case 5: {retval += "''"; break;}
            default:
            {
                throw internal_error(format_underline(
                    "parse_ml_literal_string: closing delimiter has invalid length",
                    {{std::addressof(inner_loc), "end of this"}}),
                    source_location(std::addressof(inner_loc)));
            }
        }
        return ok(std::make_pair(toml::string(retval, toml::string_t::literal),
                                 token.unwrap()));
    }
    else