feat: consider invalid UTF-8 as syntax_error

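The following codepoints are now rejected with a syntax_error (previously a message was only printed to std::cerr):
- codepoints in the surrogate range [0xD800, 0xDFFF]
- codepoints larger than 0x10FFFF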
This commit is contained in:
ToruNiina
2019-03-15 17:39:31 +09:00
parent 9eb4008d6d
commit 514df99e40

View File

@@ -250,11 +250,11 @@ std::string read_utf8_codepoint(const region<Container>& reg,
{ {
if(0xD800 <= codepoint && codepoint <= 0xDFFF) if(0xD800 <= codepoint && codepoint <= 0xDFFF)
{ {
std::cerr << format_underline("[warning] " throw syntax_error(format_underline("[error] "
"toml::read_utf8_codepoint: codepoints in the range " "toml::read_utf8_codepoint: codepoints in the range "
"[0xD800, 0xDFFF] are not valid UTF-8.", {{ "[0xD800, 0xDFFF] are not valid UTF-8.", {{
std::addressof(loc), "not a valid UTF-8 codepoint" std::addressof(loc), "not a valid UTF-8 codepoint"
}}) << std::endl; }}));
} }
assert(codepoint < 0xD800 || 0xDFFF < codepoint); assert(codepoint < 0xD800 || 0xDFFF < codepoint);
// 1110yyyy 10yxxxxx 10xxxxxx // 1110yyyy 10yxxxxx 10xxxxxx
@@ -266,10 +266,10 @@ std::string read_utf8_codepoint(const region<Container>& reg,
{ {
if(0x10FFFF < codepoint) // out of Unicode region if(0x10FFFF < codepoint) // out of Unicode region
{ {
std::cerr << format_underline("[error] " throw syntax_error(format_underline("[error] "
"toml::read_utf8_codepoint: input codepoint is too large to " "toml::read_utf8_codepoint: input codepoint is too large to "
"decode as a unicode character.", {{std::addressof(loc), "decode as a unicode character.", {{std::addressof(loc),
"should be in [0x00..0x10FFFF]"}}) << std::endl; "should be in [0x00..0x10FFFF]"}}));
} }
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
character += static_cast<unsigned char>(0xF0| codepoint >> 18); character += static_cast<unsigned char>(0xF0| codepoint >> 18);