From e929d2f00f545b2148c8a83aea1248661edef6f2 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 27 Feb 2019 12:30:57 +0900 Subject: [PATCH 1/8] fix: allow empty input file (to be an empty table) --- toml/parser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index faa7213..d7108d4 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1421,7 +1421,7 @@ result parse_toml_file(location& loc) const auto first = loc.iter(); if(first == loc.end()) { - return err(std::string("toml::detail::parse_toml_file: input is empty")); + return ok(toml::table{}); } table data; From 5a929320191f209725e64966357242f949907415 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:13:32 +0900 Subject: [PATCH 2/8] fix: disallow invalid escape sequence --- toml/lexer.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/toml/lexer.hpp b/toml/lexer.hpp index 4f170c5..67df844 100644 --- a/toml/lexer.hpp +++ b/toml/lexer.hpp @@ -124,9 +124,9 @@ using lex_escape_unicode_short = sequence, using lex_escape_unicode_long = sequence, repeat>>; using lex_escape_seq_char = either, character<'\\'>, - character<'/'>, character<'b'>, - character<'f'>, character<'n'>, - character<'r'>, character<'t'>, + character<'b'>, character<'f'>, + character<'n'>, character<'r'>, + character<'t'>, lex_escape_unicode_short, lex_escape_unicode_long >; From 0c9806e99fa4df8b59cd3c7f90d96cdf6c8a8ad1 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:37:52 +0900 Subject: [PATCH 3/8] fix: diagnose key after [table.key] pattern the following is not a valid toml format. ``` [table] key = "value" ``` this commit enables to diagnose that pattern. --- toml/parser.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/toml/parser.hpp b/toml/parser.hpp index d7108d4..1deba39 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1289,6 +1289,20 @@ parse_table_key(location& loc) throw internal_error(format_underline("[error] " "toml::parse_table_key: no `]`", inner_loc, "should be `]`")); } + + // after [table.key], newline or EOF(empty table) requried. + if(loc.iter() != loc.end()) + { + using lex_newline_after_table_key = + sequence, maybe, lex_newline>; + const auto nl = lex_newline_after_table_key::invoke(loc); + if(!nl) + { + throw syntax_error(format_underline("[error] " + "toml::parse_table_key: newline required after [table.key]", + loc, "expected newline")); + } + } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else @@ -1327,6 +1341,20 @@ parse_array_table_key(location& loc) throw internal_error(format_underline("[error] " "toml::parse_table_key: no `]]`", inner_loc, "should be `]]`")); } + + // after [[table.key]], newline or EOF(empty table) requried. + if(loc.iter() != loc.end()) + { + using lex_newline_after_table_key = + sequence, maybe, lex_newline>; + const auto nl = lex_newline_after_table_key::invoke(loc); + if(!nl) + { + throw syntax_error(format_underline("[error] " + "toml::parse_array_table_key: newline required after " + "[[table.key]]", loc, "expected newline")); + } + } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else From 536b23dc8442853e4a3975fb3acb016073f7940b Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:53:16 +0900 Subject: [PATCH 4/8] fix: allow empty table in the middle of a file --- toml/parser.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 1deba39..57f2211 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1370,7 +1370,7 @@ result parse_ml_table(location& loc) const auto first = loc.iter(); if(first == loc.end()) { - return err(std::string("toml::parse_ml_table: input is empty")); + return ok(toml::table{}); } // XXX at lest one newline is needed. @@ -1453,7 +1453,7 @@ result parse_toml_file(location& loc) } table data; - /* root object is also table, but without [tablename] */ + // root object is also a table, but without [tablename] if(auto tab = parse_ml_table(loc)) { data = std::move(tab.unwrap()); From 7f870d58611bc90aaba41166469f2fdeacc5de37 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sat, 2 Mar 2019 01:51:27 +0900 Subject: [PATCH 5/8] fix: diagnose invalid UTF-8 codepoints --- toml/parser.hpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 57f2211..983d54e 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -226,8 +226,9 @@ parse_floating(location& loc) "the next token is not a float")); } -template -std::string read_utf8_codepoint(const region& reg) +template +std::string read_utf8_codepoint(const region& reg, + /* for err msg */ const location& loc) { const auto str = reg.str().substr(1); std::uint_least32_t codepoint; @@ -247,20 +248,27 @@ std::string read_utf8_codepoint(const region& reg) } else if(codepoint < 0x10000) // U+0800...U+FFFF { + if(0xD800 <= codepoint && codepoint <= 0xDFFF) + { + throw syntax_error(format_underline("[error] " + "toml::read_utf8_codepoint: codepoints in the range " + "[0xD800, 0xDFFF] are not valid UTF-8.", + loc, "not a valid UTF-8 codepoint")); + } + assert(codepoint < 0xD800 || 0xDFFF < codepoint); // 1110yyyy 10yxxxxx 10xxxxxx character += static_cast(0xE0| codepoint >> 12); character += static_cast(0x80|(codepoint >> 6 & 0x3F)); character += static_cast(0x80|(codepoint & 0x3F)); } - else if(codepoint < 0x200000) // U+10000 ... U+1FFFFF + else if(codepoint < 0x200000) // U+010000 ... U+1FFFFF { if(0x10FFFF < codepoint) // out of Unicode region { - std::cerr << format_underline(concat_to_string("[warning] " - "input codepoint (", str, ") is too large to decode as " - "a unicode character. The result may not be able to render " - "to your screen."), reg, "should be in [0x00..0x10FFFF]") - << std::endl; + throw syntax_error(format_underline("[error] " + "toml::read_utf8_codepoint: input codepoint is too large to " + "decode as a unicode character.", loc, + "should be in [0x00..0x10FFFF]")); } // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx character += static_cast(0xF0| codepoint >> 18); @@ -300,7 +308,7 @@ result parse_escape_sequence(location& loc) { if(const auto token = lex_escape_unicode_short::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap())); + return ok(read_utf8_codepoint(token.unwrap(), loc)); } else { @@ -313,7 +321,7 @@ result parse_escape_sequence(location& loc) { if(const auto token = lex_escape_unicode_long::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap())); + return ok(read_utf8_codepoint(token.unwrap(), loc)); } else { From 944b83642a614debdf52196a36f261cfdd9fcdb7 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sat, 2 Mar 2019 17:52:00 +0900 Subject: [PATCH 6/8] feat: make location to inherit region_base To generate error message, it is better to have the same interface. Also, location can be considered as a region having only one character. --- toml/region.hpp | 259 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 178 insertions(+), 81 deletions(-) diff --git a/toml/region.hpp b/toml/region.hpp index 7a0acba..62a28e1 100644 --- a/toml/region.hpp +++ b/toml/region.hpp @@ -28,44 +28,6 @@ inline std::string make_string(std::size_t len, char c) return std::string(len, c); } -// location in a container, normally in a file content. -// shared_ptr points the resource that the iter points. -// it can be used not only for resource handling, but also error message. -template -struct location -{ - static_assert(std::is_same::value,""); - using const_iterator = typename Container::const_iterator; - using source_ptr = std::shared_ptr; - - location(std::string name, Container cont) - : source_(std::make_shared(std::move(cont))), - source_name_(std::move(name)), iter_(source_->cbegin()) - {} - location(const location&) = default; - location(location&&) = default; - location& operator=(const location&) = default; - location& operator=(location&&) = default; - ~location() = default; - - const_iterator& iter() noexcept {return iter_;} - const_iterator iter() const noexcept {return iter_;} - - const_iterator begin() const noexcept {return source_->cbegin();} - const_iterator end() const noexcept {return source_->cend();} - - source_ptr const& source() const& noexcept {return source_;} - source_ptr&& source() && noexcept {return std::move(source_);} - - std::string const& name() const noexcept {return source_name_;} - - private: - - source_ptr source_; - std::string source_name_; - const_iterator iter_; -}; - // region in a container, normally in a file content. // shared_ptr points the resource that the iter points. // combinators returns this. @@ -86,12 +48,89 @@ struct region_base virtual std::string line() const {return std::string("unknown line");} virtual std::string line_num() const {return std::string("?");} - virtual std::size_t before() const noexcept {return 0;} virtual std::size_t size() const noexcept {return 0;} virtual std::size_t after() const noexcept {return 0;} }; +// location in a container, normally in a file content. +// shared_ptr points the resource that the iter points. +// it can be used not only for resource handling, but also error message. +// +// it can be considered as a region that contains only one character. +template +struct location final : public region_base +{ + static_assert(std::is_same::value,""); + using const_iterator = typename Container::const_iterator; + using source_ptr = std::shared_ptr; + + location(std::string name, Container cont) + : source_(std::make_shared(std::move(cont))), + source_name_(std::move(name)), iter_(source_->cbegin()) + {} + location(const location&) = default; + location(location&&) = default; + location& operator=(const location&) = default; + location& operator=(location&&) = default; + ~location() = default; + + bool is_ok() const noexcept override {return static_cast(source_);} + + const_iterator& iter() noexcept {return iter_;} + const_iterator iter() const noexcept {return iter_;} + + const_iterator begin() const noexcept {return source_->cbegin();} + const_iterator end() const noexcept {return source_->cend();} + + std::string str() const override {return make_string(1, *this->iter());} + std::string name() const override {return source_name_;} + + std::string line_num() const override + { + return std::to_string(1+std::count(this->begin(), this->iter(), '\n')); + } + + std::string line() const override + { + return make_string(this->line_begin(), this->line_end()); + } + + const_iterator line_begin() const noexcept + { + using reverse_iterator = std::reverse_iterator; + return std::find(reverse_iterator(this->iter()), + reverse_iterator(this->begin()), '\n').base(); + } + const_iterator line_end() const noexcept + { + return std::find(this->iter(), this->end(), '\n'); + } + + // location is always points a character. so the size is 1. + std::size_t size() const noexcept override + { + return 1u; + } + std::size_t before() const noexcept override + { + return std::distance(this->line_begin(), this->iter()); + } + std::size_t after() const noexcept override + { + return std::distance(this->iter(), this->line_end()); + } + + source_ptr const& source() const& noexcept {return source_;} + source_ptr&& source() && noexcept {return std::move(source_);} + + private: + + source_ptr source_; + std::string source_name_; + const_iterator iter_; +}; + template struct region final : public region_base { @@ -225,7 +264,19 @@ inline std::string format_underline(const std::string& message, retval += make_string(line_number.size() + 1, ' '); retval += " | "; retval += make_string(reg.before(), ' '); - retval += make_string(reg.size(), '~'); + if(reg.size() == 1) + { + // invalid + // ^------ + retval += '^'; + retval += make_string(reg.after(), '-'); + } + else + { + // invalid + // ~~~~~~~ + retval += make_string(reg.size(), '~'); + } retval += ' '; retval += comment_for_underline; if(helps.size() != 0) @@ -270,7 +321,19 @@ inline std::string format_underline(const std::string& message, retval << make_string(line_num_width + 1, ' '); retval << " | "; retval << make_string(reg1.before(), ' '); - retval << make_string(reg1.size(), '~'); + if(reg1.size() == 1) + { + // invalid + // ^------ + retval << '^'; + retval << make_string(reg1.after(), '-'); + } + else + { + // invalid + // ~~~~~~~ + retval << make_string(reg1.size(), '~'); + } retval << ' '; retval << comment_for_underline1 << newline; // --------------------------------------- @@ -287,7 +350,19 @@ inline std::string format_underline(const std::string& message, retval << make_string(line_num_width + 1, ' '); retval << " | "; retval << make_string(reg2.before(), ' '); - retval << make_string(reg2.size(), '~'); + if(reg2.size() == 1) + { + // invalid + // ^------ + retval << '^'; + retval << make_string(reg2.after(), '-'); + } + else + { + // invalid + // ~~~~~~~ + retval << make_string(reg2.size(), '~'); + } retval << ' '; retval << comment_for_underline2; if(helps.size() != 0) @@ -305,62 +380,84 @@ inline std::string format_underline(const std::string& message, return retval.str(); } - // to show a better error message. -template -std::string -format_underline(const std::string& message, const location& loc, - const std::string& comment_for_underline, - std::vector helps = {}) +inline std::string format_underline(const std::string& message, + std::vector> reg_com, + std::vector helps = {}) { + assert(!reg_com.empty()); + #ifdef _WIN32 const auto newline = "\r\n"; #else const char newline = '\n'; #endif - using const_iterator = typename location::const_iterator; - using reverse_iterator = std::reverse_iterator; - const auto line_begin = std::find(reverse_iterator(loc.iter()), - reverse_iterator(loc.begin()), - '\n').base(); - const auto line_end = std::find(loc.iter(), loc.end(), '\n'); - const auto line_number = std::to_string( - 1 + std::count(loc.begin(), loc.iter(), '\n')); + const auto line_num_width = std::max_element(reg_com.begin(), reg_com.end(), + [](std::pair const& lhs, + std::pair const& rhs) + { + return lhs.first->line_num().size() < rhs.first->line_num().size(); + } + )->first->line_num().size(); + + std::ostringstream retval; + retval << message << newline; + + for(std::size_t i=0; iname() == reg_com.at(i).first->name()) + { + retval << " ..." << newline; + } + else + { + retval << " --> " << reg_com.at(i).first->name() << newline; + } + + const region_base* const reg = reg_com.at(i).first; + const std::string& comment = reg_com.at(i).second; + + + retval << ' ' << std::setw(line_num_width) << reg->line_num(); + retval << " | " << reg->line() << newline; + retval << make_string(line_num_width + 1, ' '); + retval << " | " << make_string(reg->before(), ' '); + + if(reg->size() == 1) + { + // invalid + // ^------ + retval << '^'; + retval << make_string(reg->after(), '-'); + } + else + { + // invalid + // ~~~~~~~ + retval << make_string(reg->size(), '~'); + } + + retval << ' '; + retval << comment << newline; + } - std::string retval; - retval += message; - retval += newline; - retval += " --> "; - retval += loc.name(); - retval += newline; - retval += ' '; - retval += line_number; - retval += " | "; - retval += make_string(line_begin, line_end); - retval += newline; - retval += make_string(line_number.size() + 1, ' '); - retval += " | "; - retval += make_string(std::distance(line_begin, loc.iter()),' '); - retval += '^'; - retval += make_string(std::distance(loc.iter(), line_end), '-'); - retval += ' '; - retval += comment_for_underline; if(helps.size() != 0) { - retval += newline; - retval += make_string(line_number.size() + 1, ' '); - retval += " | "; + retval << newline; + retval << make_string(line_num_width + 1, ' '); + retval << " | "; for(const auto help : helps) { - retval += newline; - retval += "Hint: "; - retval += help; + retval << newline; + retval << "Hint: "; + retval << help; } } - return retval; + return retval.str(); } + } // detail } // toml #endif// TOML11_REGION_H From ae793fb6311c613b7e6c8935fe3aa445333df426 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sat, 2 Mar 2019 17:55:51 +0900 Subject: [PATCH 7/8] feat: improve error message for invalid array --- toml/parser.hpp | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 983d54e..111f145 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -876,16 +876,39 @@ parse_array(location& loc) { if(!retval.empty() && retval.front().type() != val.as_ok().type()) { - throw syntax_error(format_underline( - "[error] toml::parse_array: type of elements should be the " - "same each other.", region(loc, first, loc.iter()), - "inhomogeneous types")); + auto array_start_loc = loc; + array_start_loc.iter() = first; + + throw syntax_error(format_underline("[error] toml::parse_array: " + "type of elements should be the same each other.", + std::vector>{ + std::make_pair( + std::addressof(array_start_loc), + std::string("array starts here") + ), + std::make_pair( + std::addressof(get_region(retval.front())), + std::string("value has type ") + + stringize(retval.front().type()) + ), + std::make_pair( + std::addressof(get_region(val.unwrap())), + std::string("value has different type, ") + + stringize(val.unwrap().type()) + ) + })); } retval.push_back(std::move(val.unwrap())); } else { - return err(val.unwrap_err()); + auto array_start_loc = loc; + array_start_loc.iter() = first; + + throw syntax_error(format_underline("[error] toml::parse_array: " + "value having invalid format appeared in an array", + array_start_loc, "array starts here", + loc, "it is not a valid value.")); } using lex_array_separator = sequence, character<','>>; @@ -901,8 +924,12 @@ parse_array(location& loc) } else { + auto array_start_loc = loc; + array_start_loc.iter() = first; + throw syntax_error(format_underline("[error] toml::parse_array:" - " missing array separator `,`", loc, "should be `,`")); + " missing array separator `,` after a value", + array_start_loc, "array starts here", loc, "should be `,`")); } } } @@ -960,6 +987,7 @@ parse_key_value_pair(location& loc) { std::string msg; loc.iter() = after_kvsp; + // check there is something not a comment/whitespace after `=` if(sequence, maybe, lex_newline>::invoke(loc)) { loc.iter() = after_kvsp; @@ -967,10 +995,9 @@ parse_key_value_pair(location& loc) "missing value after key-value separator '='", loc, "expected value, but got nothing"); } - else + else // there is something not a comment/whitespace, so invalid format. { - msg = format_underline("[error] toml::parse_key_value_pair: " - "invalid value format", loc, val.unwrap_err()); + msg = std::move(val.unwrap_err()); } loc.iter() = first; return err(msg); @@ -1114,7 +1141,7 @@ insert_nested_key(table& root, const toml::value& v, "[error] toml::insert_value: value (\"", format_dotted_keys(first, last), "\") already exists."), get_region(tab->at(k)), "value already exists here", - get_region(v), "value inserted twice")); + get_region(v), "value defined twice")); } } tab->insert(std::make_pair(k, v)); From 2accc9d22ca5d66e6ecd2fbeea2ba08d88be2534 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sun, 3 Mar 2019 18:56:45 +0900 Subject: [PATCH 8/8] fix: diagnose, but not throw for unicode error in 2.0.x and 2.1.0, README says "it shows warning" for invalid unicode codepoints. So far, this library just show an error message in stderr for this case. It is not good to change the behavior fatal in the next minor release, 2.1.1, that includes patches and improved error msgs. I will make it throw syntax_error after 2.2.0 for invalid unicode codepoints. For now, I will keep it to be "warning". --- toml/parser.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 111f145..378be6e 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -250,10 +250,10 @@ std::string read_utf8_codepoint(const region& reg, { if(0xD800 <= codepoint && codepoint <= 0xDFFF) { - throw syntax_error(format_underline("[error] " + std::cerr << format_underline("[warning] " "toml::read_utf8_codepoint: codepoints in the range " "[0xD800, 0xDFFF] are not valid UTF-8.", - loc, "not a valid UTF-8 codepoint")); + loc, "not a valid UTF-8 codepoint") << std::endl; } assert(codepoint < 0xD800 || 0xDFFF < codepoint); // 1110yyyy 10yxxxxx 10xxxxxx @@ -265,10 +265,10 @@ std::string read_utf8_codepoint(const region& reg, { if(0x10FFFF < codepoint) // out of Unicode region { - throw syntax_error(format_underline("[error] " + std::cerr << format_underline("[error] " "toml::read_utf8_codepoint: input codepoint is too large to " "decode as a unicode character.", loc, - "should be in [0x00..0x10FFFF]")); + "should be in [0x00..0x10FFFF]") << std::endl; } // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx character += static_cast(0xF0| codepoint >> 18);