improve format of error message for utf-8

This commit is contained in:
ToruNiina
2018-12-12 19:01:22 +09:00
parent c33ad31981
commit 879b7d3bff

View File

@@ -226,8 +226,10 @@ parse_floating(location<Container>& loc)
"token is not a float", {"floating point is like: -3.14e+1"})); "token is not a float", {"floating point is like: -3.14e+1"}));
} }
inline std::string read_utf8_codepoint(const std::string& str) template<typename Container>
std::string read_utf8_codepoint(const region<Container>& reg)
{ {
const auto str = reg.str().substr(1);
std::uint_least32_t codepoint; std::uint_least32_t codepoint;
std::istringstream iss(str); std::istringstream iss(str);
iss >> std::hex >> codepoint; iss >> std::hex >> codepoint;
@@ -254,10 +256,11 @@ inline std::string read_utf8_codepoint(const std::string& str)
{ {
if(0x10FFFF < codepoint) // out of Unicode region if(0x10FFFF < codepoint) // out of Unicode region
{ {
std::cerr << "WARNING: input codepoint " << str << " is too large " std::cerr << format_underline(concat_to_string("[warning] "
<< "to decode as a unicode character. It should be in " "input codepoint (", str, ") is too large to decode as "
<< "range [0x00 .. 0x10FFFF]. The result may not be able " "a unicode character. The result may not be able to render "
<< "to be rendered to your screen." << std::endl; "to your screen."), reg, "should be in [0x00..0x10FFFF]")
<< std::endl;
} }
// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
character += static_cast<unsigned char>(0xF0| codepoint >> 18); character += static_cast<unsigned char>(0xF0| codepoint >> 18);
@@ -267,9 +270,9 @@ inline std::string read_utf8_codepoint(const std::string& str)
} }
else // out of UTF-8 region else // out of UTF-8 region
{ {
throw std::range_error("toml::read_utf8_codepoint: input codepoint `" + throw std::range_error(format_underline(concat_to_string("[error] "
str + "` is too large to decode as utf-8. It should be in range" "input codepoint (", str, ") is too large to encode as utf-8."),
" 0x00 ... 0x1FFFFF."); reg, "should be in [0x00..0x1FFFFF]"));
} }
return character; return character;
} }
@@ -278,7 +281,7 @@ template<typename Container>
result<std::string, std::string> parse_escape_sequence(location<Container>& loc) result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
{ {
const auto first = loc.iter(); const auto first = loc.iter();
if(*first != '\\') if(first == loc.end() || *first != '\\')
{ {
return err(format_underline("[error]: " return err(format_underline("[error]: "
"toml::parse_escape_sequence: location does not points \"\\\"", "toml::parse_escape_sequence: location does not points \"\\\"",
@@ -296,10 +299,9 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
case 'r' :{++loc.iter(); return ok(std::string("\r"));} case 'r' :{++loc.iter(); return ok(std::string("\r"));}
case 'u' : case 'u' :
{ {
++loc.iter(); if(const auto token = lex_escape_unicode_short::invoke(loc))
if(const auto token = repeat<lex_hex_dig, exactly<4>>::invoke(loc))
{ {
return ok(read_utf8_codepoint(token.unwrap().str())); return ok(read_utf8_codepoint(token.unwrap()));
} }
else else
{ {
@@ -310,10 +312,9 @@ result<std::string, std::string> parse_escape_sequence(location<Container>& loc)
} }
case 'U': case 'U':
{ {
++loc.iter(); if(const auto token = lex_escape_unicode_long::invoke(loc))
if(const auto token = repeat<lex_hex_dig, exactly<8>>::invoke(loc))
{ {
return ok(read_utf8_codepoint(token.unwrap().str())); return ok(read_utf8_codepoint(token.unwrap()));
} }
else else
{ {
@@ -341,7 +342,6 @@ parse_ml_basic_string(location<Container>& loc)
if(const auto token = lex_ml_basic_string::invoke(loc)) if(const auto token = lex_ml_basic_string::invoke(loc))
{ {
location<std::string> inner_loc(loc.name(), token.unwrap().str()); location<std::string> inner_loc(loc.name(), token.unwrap().str());
std::string retval; std::string retval;
retval.reserve(inner_loc.source()->size()); retval.reserve(inner_loc.source()->size());