mirror of
https://github.com/ToruNiina/toml11.git
synced 2025-09-17 00:38:08 +08:00
fix: diagnose invalid UTF-8 codepoints
This commit is contained in:
@@ -226,8 +226,9 @@ parse_floating(location<Container>& loc)
|
||||
"the next token is not a float"));
|
||||
}
|
||||
|
||||
template<typename Container>
|
||||
std::string read_utf8_codepoint(const region<Container>& reg)
|
||||
template<typename Container, typename Container2>
|
||||
std::string read_utf8_codepoint(const region<Container>& reg,
|
||||
/* for err msg */ const location<Container2>& loc)
|
||||
{
|
||||
const auto str = reg.str().substr(1);
|
||||
std::uint_least32_t codepoint;
|
||||
@@ -247,20 +248,27 @@ std::string read_utf8_codepoint(const region<Container>& reg)
|
||||
}
|
||||
else if(codepoint < 0x10000) // U+0800...U+FFFF
|
||||
{
|
||||
if(0xD800 <= codepoint && codepoint <= 0xDFFF)
|
||||
{
|
||||
throw syntax_error(format_underline("[error] "
|
||||
"toml::read_utf8_codepoint: codepoints in the range "
|
||||
"[0xD800, 0xDFFF] are not valid UTF-8.",
|
||||
loc, "not a valid UTF-8 codepoint"));
|
||||
}
|
||||
assert(codepoint < 0xD800 || 0xDFFF < codepoint);
|
||||
// 1110yyyy 10yxxxxx 10xxxxxx
|
||||
character += static_cast<unsigned char>(0xE0| codepoint >> 12);
|
||||
character += static_cast<unsigned char>(0x80|(codepoint >> 6 & 0x3F));
|
||||
character += static_cast<unsigned char>(0x80|(codepoint & 0x3F));
|
||||
}
|
||||
else if(codepoint < 0x200000) // U+10000 ... U+1FFFFF
|
||||
else if(codepoint < 0x200000) // U+010000 ... U+1FFFFF
|
||||
{
|
||||
if(0x10FFFF < codepoint) // out of Unicode region
|
||||
{
|
||||
std::cerr << format_underline(concat_to_string("[warning] "
|
||||
"input codepoint (", str, ") is too large to decode as "
|
||||
"a unicode character. The result may not be able to render "
|
||||
"to your screen."), reg, "should be in [0x00..0x10FFFF]")
|
||||
<< std::endl;
|
||||
throw syntax_error(format_underline("[error] "
|
||||
"toml::read_utf8_codepoint: input codepoint is too large to "
|
||||
"decode as a unicode character.", loc,
|
||||
"should be in [0x00..0x10FFFF]"));
|
||||
}
|
||||
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
|
||||
character += static_cast<unsigned char>(0xF0| codepoint >> 18);
|
||||
@@ -300,7 +308,7 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
|
||||
{
|
||||
if(const auto token = lex_escape_unicode_short::invoke(loc))
|
||||
{
|
||||
return ok(read_utf8_codepoint(token.unwrap()));
|
||||
return ok(read_utf8_codepoint(token.unwrap(), loc));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -313,7 +321,7 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
|
||||
{
|
||||
if(const auto token = lex_escape_unicode_long::invoke(loc))
|
||||
{
|
||||
return ok(read_utf8_codepoint(token.unwrap()));
|
||||
return ok(read_utf8_codepoint(token.unwrap(), loc));
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Reference in New Issue
Block a user