From 5d29509d9851cf67ef010d70e0262c96ea30a491 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 12 Dec 2018 18:58:54 +0900 Subject: [PATCH 1/5] remove duplicated default argument for SFINAE --- toml/get.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/toml/get.hpp b/toml/get.hpp index 1820585..c2aece5 100644 --- a/toml/get.hpp +++ b/toml/get.hpp @@ -154,7 +154,7 @@ template, // T is container detail::has_resize_method, // T::resize(N) works detail::negation> // but not toml::array - >::value, std::nullptr_t>::type = nullptr> + >::value, std::nullptr_t>::type> T get(const value& v) { using value_type = typename T::value_type; @@ -173,7 +173,7 @@ template, // T is container detail::negation>, // no T::resize() exists detail::negation> // not toml::array - >::value, std::nullptr_t>::type = nullptr> + >::value, std::nullptr_t>::type> T get(const value& v) { using value_type = typename T::value_type; @@ -195,7 +195,7 @@ T get(const value& v) // std::pair. template::value, std::nullptr_t>::type = nullptr> + detail::is_std_pair::value, std::nullptr_t>::type> T get(const value& v) { using first_type = typename T::first_type; @@ -228,7 +228,7 @@ T get_tuple_impl(const toml::Array& a, index_sequence) } // detail template::value, std::nullptr_t>::type = nullptr> + detail::is_std_tuple::value, std::nullptr_t>::type> T get(const value& v) { const auto& ar = v.cast(); @@ -249,7 +249,7 @@ T get(const value& v) template, // T is map detail::negation> // but not toml::table - >::value, std::nullptr_t>::type = nullptr> + >::value, std::nullptr_t>::type> T get(const toml::value& v) { using key_type = typename T::key_type; From c33ad31981fa43b2f33a812c89234b51b9073c73 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 12 Dec 2018 18:59:20 +0900 Subject: [PATCH 2/5] split lexer for escape sequence for unicode --- toml/lexer.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/toml/lexer.hpp b/toml/lexer.hpp index 
408d087..060195e 100644 --- a/toml/lexer.hpp +++ b/toml/lexer.hpp @@ -117,14 +117,16 @@ using lex_basic_unescaped = exclude, character<0x22>, character<0x5C>, character<0x7F>>>; using lex_escape = character<'\\'>; +using lex_escape_unicode_short = sequence, + repeat>>; +using lex_escape_unicode_long = sequence, + repeat>>; using lex_escape_seq_char = either, character<'\\'>, character<'/'>, character<'b'>, character<'f'>, character<'n'>, character<'r'>, character<'t'>, - sequence, - repeat>>, - sequence, - repeat>> + lex_escape_unicode_short, + lex_escape_unicode_long >; using lex_escaped = sequence; using lex_basic_char = either; From 879b7d3bfffb701c8a3f610187cb03dc411bc6de Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 12 Dec 2018 19:01:22 +0900 Subject: [PATCH 3/5] improve format of error message for utf-8 --- toml/parser.hpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 8cfb884..d630967 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -226,8 +226,10 @@ parse_floating(location& loc) "token is not a float", {"floating point is like: -3.14e+1"})); } -inline std::string read_utf8_codepoint(const std::string& str) +template +std::string read_utf8_codepoint(const region& reg) { + const auto str = reg.str().substr(1); std::uint_least32_t codepoint; std::istringstream iss(str); iss >> std::hex >> codepoint; @@ -254,10 +256,11 @@ inline std::string read_utf8_codepoint(const std::string& str) { if(0x10FFFF < codepoint) // out of Unicode region { - std::cerr << "WARNING: input codepoint " << str << " is too large " - << "to decode as a unicode character. It should be in " - << "range [0x00 .. 0x10FFFF]. The result may not be able " - << "to be rendered to your screen." << std::endl; + std::cerr << format_underline(concat_to_string("[warning] " + "input codepoint (", str, ") is too large to decode as " + "a unicode character. 
The result may not be able to render " + "to your screen."), reg, "should be in [0x00..0x10FFFF]") + << std::endl; } // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx character += static_cast(0xF0| codepoint >> 18); @@ -267,9 +270,9 @@ inline std::string read_utf8_codepoint(const std::string& str) } else // out of UTF-8 region { - throw std::range_error("toml::read_utf8_codepoint: input codepoint `" + - str + "` is too large to decode as utf-8. It should be in range" - " 0x00 ... 0x1FFFFF."); + throw std::range_error(format_underline(concat_to_string("[error] " + "input codepoint (", str, ") is too large to encode as utf-8."), + reg, "should be in [0x00..0x1FFFFF]")); } return character; } @@ -278,7 +281,7 @@ template result parse_escape_sequence(location& loc) { const auto first = loc.iter(); - if(*first != '\\') + if(first == loc.end() || *first != '\\') { return err(format_underline("[error]: " "toml::parse_escape_sequence: location does not points \"\\\"", @@ -296,10 +299,9 @@ result parse_escape_sequence(location& loc) case 'r' :{++loc.iter(); return ok(std::string("\r"));} case 'u' : { - ++loc.iter(); - if(const auto token = repeat>::invoke(loc)) + if(const auto token = lex_escape_unicode_short::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap().str())); + return ok(read_utf8_codepoint(token.unwrap())); } else { @@ -310,10 +312,9 @@ result parse_escape_sequence(location& loc) } case 'U': { - ++loc.iter(); - if(const auto token = repeat>::invoke(loc)) + if(const auto token = lex_escape_unicode_long::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap().str())); + return ok(read_utf8_codepoint(token.unwrap())); } else { @@ -341,7 +342,6 @@ parse_ml_basic_string(location& loc) if(const auto token = lex_ml_basic_string::invoke(loc)) { location inner_loc(loc.name(), token.unwrap().str()); - std::string retval; retval.reserve(inner_loc.source()->size()); From 0f83ee60398d9d27c5f39fe29d43a94b5f011e72 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 12 Dec 
2018 19:12:23 +0900 Subject: [PATCH 4/5] change temporary loc from token to copy of loc A location constructed from the token string does not have correct line number information. To show an informative error message about UTF-8 and escape sequences, parse_(ml_)basic_string requires that information, which can only be obtained from the root location. --- toml/parser.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index d630967..2cd4e85 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -341,9 +341,11 @@ parse_ml_basic_string(location& loc) const auto first = loc.iter(); if(const auto token = lex_ml_basic_string::invoke(loc)) { - location inner_loc(loc.name(), token.unwrap().str()); + auto inner_loc = loc; + inner_loc.iter() = first; + std::string retval; - retval.reserve(inner_loc.source()->size()); + retval.reserve(token.unwrap().size()); auto delim = lex_ml_basic_string_delim::invoke(inner_loc); if(!delim) @@ -396,7 +398,8 @@ parse_basic_string(location& loc) const auto first = loc.iter(); if(const auto token = lex_basic_string::invoke(loc)) { - location inner_loc(loc.name(), token.unwrap().str()); + auto inner_loc = loc; + inner_loc.iter() = first; auto quot = lex_quotation_mark::invoke(inner_loc); if(!quot) @@ -406,7 +409,7 @@ parse_basic_string(location& loc) } std::string retval; - retval.reserve(inner_loc.source()->size()); + retval.reserve(token.unwrap().size()); quot = err("tmp"); while(!quot) From 5aae0b17c88cd305bb3c12e8f5af1b67de0f4bdb Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 12 Dec 2018 19:14:27 +0900 Subject: [PATCH 5/5] change error message; require unicode codepoint Before this, it recommended the range that can be represented by UTF-8, but the range of valid Unicode codepoints is narrower than that. For the error message, it is better to recommend the range of valid Unicode codepoints.
--- toml/parser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 2cd4e85..15954a7 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -272,7 +272,7 @@ std::string read_utf8_codepoint(const region& reg) { throw std::range_error(format_underline(concat_to_string("[error] " "input codepoint (", str, ") is too large to encode as utf-8."), - reg, "should be in [0x00..0x1FFFFF]")); + reg, "should be in [0x00..0x10FFFF]")); } return character; }