fix: diagnose invalid UTF-8 codepoints

2025-12-16 03:08:52 +08:00 · 2019-03-02 01:51:27 +09:00
parent 536b23dc84
commit 7f870d5861
1 changed files with 18 additions and 10 deletions
--- a/toml/parser.hpp
+++ b/toml/parser.hpp
@@ -226,8 +226,9 @@ parse_floating(location<Container>& loc)
                           "the next token is not a float"));
 }

-template<typename Container>
-std::string read_utf8_codepoint(const region<Container>& reg)
+template<typename Container, typename Container2>
+std::string read_utf8_codepoint(const region<Container>& reg,
+              /* for err msg */ const location<Container2>& loc)
 {
    const auto str = reg.str().substr(1);
    std::uint_least32_t codepoint;
@@ -247,20 +248,27 @@ std::string read_utf8_codepoint(const region<Container>& reg)
    }
    else if(codepoint < 0x10000) // U+0800...U+FFFF
    {
+        if(0xD800 <= codepoint && codepoint <= 0xDFFF)
+        {
+            throw syntax_error(format_underline("[error] "
+                "toml::read_utf8_codepoint: codepoints in the range "
+                "[0xD800, 0xDFFF] are not valid UTF-8.",
+                loc, "not a valid UTF-8 codepoint"));
+        }
+        assert(codepoint < 0xD800 || 0xDFFF < codepoint);
        // 1110yyyy 10yxxxxx 10xxxxxx
        character += static_cast<unsigned char>(0xE0| codepoint >> 12);
        character += static_cast<unsigned char>(0x80|(codepoint >> 6 & 0x3F));
        character += static_cast<unsigned char>(0x80|(codepoint      & 0x3F));
    }
-    else if(codepoint < 0x200000) // U+10000 ... U+1FFFFF
+    else if(codepoint < 0x200000) // U+010000 ... U+1FFFFF
    {
        if(0x10FFFF < codepoint) // out of Unicode region
        {
-            std::cerr << format_underline(concat_to_string("[warning] "
-                    "input codepoint (", str, ") is too large to decode as "
-                    "a unicode character. The result may not be able to render "
-                    "to your screen."), reg, "should be in [0x00..0x10FFFF]")
-                      << std::endl;
+            throw syntax_error(format_underline("[error] "
+                "toml::read_utf8_codepoint: input codepoint is too large to "
+                "decode as a unicode character.", loc,
+                "should be in [0x00..0x10FFFF]"));
        }
        // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
        character += static_cast<unsigned char>(0xF0| codepoint >> 18);
@@ -300,7 +308,7 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
        {
            if(const auto token = lex_escape_unicode_short::invoke(loc))
            {
-                return ok(read_utf8_codepoint(token.unwrap()));
+                return ok(read_utf8_codepoint(token.unwrap(), loc));
            }
            else
            {
@@ -313,7 +321,7 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
        {
            if(const auto token = lex_escape_unicode_long::invoke(loc))
            {
-                return ok(read_utf8_codepoint(token.unwrap()));
+                return ok(read_utf8_codepoint(token.unwrap(), loc));
            }
            else
            {