From b3917aaadf02dce193c58ebd4bcde05a73a42eb4 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Tue, 16 Apr 2019 20:54:29 +0900 Subject: [PATCH 1/8] refactor: use snprintf to show char in hex instead of std::ostringstream. --- toml/combinator.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/toml/combinator.hpp b/toml/combinator.hpp index ac8dd14..52aaaae 100644 --- a/toml/combinator.hpp +++ b/toml/combinator.hpp @@ -9,7 +9,10 @@ #include #include #include +#include #include +#include +#include #include // they scans characters and returns region if it matches to the condition. @@ -38,10 +41,12 @@ inline std::string show_char(const char c) } else { - std::ostringstream oss; - oss << "0x" << std::hex << std::setfill('0') << std::setw(2) - << static_cast(c); - return oss.str(); + std::array buf; + buf.fill('\0'); + const auto r = std::snprintf( + buf.data(), buf.size(), "0x%02x", static_cast(c) & 0xFF); + assert(r == buf.size() - 1); + return std::string(buf.data()); } } From 91966a6917b9c2be32e5cb580874021a713a3d97 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Tue, 16 Apr 2019 21:09:59 +0900 Subject: [PATCH 2/8] perf: do not use concat_string if it is not needed At the earlier stage of the development, I thought that it is useful if lexer-combinators generate error messages, because by doing this, parser would not need to generate an error message. But now it turned out that to show an appropriate error message, parser need to generate according to the context. And almost all the messages from lexer are discarded. So I added another parameter to lexer-combinator to suppress error message generation. In the future, we may want to remove messages completely from lexers, but currently I will keep it. Removing those unused message generation makes the parsing process faster. --- toml/combinator.hpp | 91 ++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/toml/combinator.hpp b/toml/combinator.hpp index 52aaaae..36d367c 100644 --- a/toml/combinator.hpp +++ b/toml/combinator.hpp @@ -56,7 +56,8 @@ struct character static constexpr char target = C; template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); @@ -67,8 +68,12 @@ struct character const char c = *(loc.iter()); if(c != target) { - return err(concat_to_string("expected '", show_char(target), - "' but got '", show_char(c), "'.")); + if(msg) + { + return err(concat_to_string("expected '", show_char(target), + "' but got '", show_char(c), "'.")); + } + return err(""); } loc.advance(); // update location @@ -91,7 +96,8 @@ struct in_range static constexpr char lower = Low; template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); @@ -102,9 +108,13 @@ struct in_range const char c = *(loc.iter()); if(c < lower || upper < c) { - return err(concat_to_string("expected character in range " - "[", show_char(lower), ", ", show_char(upper), "] but got ", - "'", show_char(c), "'.")); + if(msg) + { + return err(concat_to_string("expected character in range " + "[", show_char(lower), ", ", show_char(upper), "] but got ", + "'", show_char(c), "'.")); + } + return err(""); } loc.advance(); @@ -125,7 +135,8 @@ template struct exclude { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); @@ -133,13 +144,16 @@ struct exclude if(loc.iter() == loc.end()) {return err("not sufficient characters");} auto first = loc.iter(); - auto rslt = Combinator::invoke(loc); + auto rslt = Combinator::invoke(loc, msg); if(rslt.is_ok()) { loc.reset(first); - return err(concat_to_string( - "invalid pattern (", Combinator::pattern(), ") appeared ", - rslt.unwrap().str())); + if(msg) + { + return err(concat_to_string("invalid pattern (", + Combinator::pattern(), ") appeared ", rslt.unwrap().str())); + } + return err(""); } loc.reset(std::next(first)); // XXX maybe loc.advance() is okay but... return ok(region(loc, first, loc.iter())); @@ -156,12 +170,13 @@ template struct maybe { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); - const auto rslt = Combinator::invoke(loc); + const auto rslt = Combinator::invoke(loc, msg); if(rslt.is_ok()) { return rslt; @@ -182,34 +197,36 @@ template struct sequence { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); const auto first = loc.iter(); - const auto rslt = Head::invoke(loc); + const auto rslt = Head::invoke(loc, msg); if(rslt.is_err()) { loc.reset(first); return err(rslt.unwrap_err()); } - return sequence::invoke(loc, std::move(rslt.unwrap()), first); + return sequence::invoke(loc, std::move(rslt.unwrap()), first, msg); } // called from the above function only, recursively. template static result, std::string> - invoke(location& loc, region reg, Iterator first) + invoke(location& loc, region reg, Iterator first, + const bool msg = false) { - const auto rslt = Head::invoke(loc); + const auto rslt = Head::invoke(loc, msg); if(rslt.is_err()) { loc.reset(first); return err(rslt.unwrap_err()); } reg += rslt.unwrap(); // concat regions - return sequence::invoke(loc, std::move(reg), first); + return sequence::invoke(loc, std::move(reg), first, msg); } static std::string pattern() @@ -224,9 +241,10 @@ struct sequence // would be called from sequence::invoke only. template static result, std::string> - invoke(location& loc, region reg, Iterator first) + invoke(location& loc, region reg, Iterator first, + const bool msg = false) { - const auto rslt = Head::invoke(loc); + const auto rslt = Head::invoke(loc, msg); if(rslt.is_err()) { loc.reset(first); @@ -245,14 +263,15 @@ template struct either { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); - const auto rslt = Head::invoke(loc); + const auto rslt = Head::invoke(loc, msg); if(rslt.is_ok()) {return rslt;} - return either::invoke(loc); + return either::invoke(loc, msg); } static std::string pattern() @@ -264,11 +283,12 @@ template struct either { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { static_assert(std::is_same::value, "internal error: container::value_type should be `char`."); - return Head::invoke(loc); + return Head::invoke(loc, msg); } static std::string pattern() { @@ -287,13 +307,14 @@ template struct repeat> { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { region retval(loc); const auto first = loc.iter(); for(std::size_t i=0; i struct repeat> { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { region retval(loc); const auto first = loc.iter(); for(std::size_t i=0; i> } while(true) { - auto rslt = T::invoke(loc); + auto rslt = T::invoke(loc, msg); if(rslt.is_err()) { return ok(std::move(retval)); @@ -348,12 +370,13 @@ template struct repeat { template - static result, std::string> invoke(location& loc) + static result, std::string> + invoke(location& loc, const bool msg = false) { region retval(loc); while(true) { - auto rslt = T::invoke(loc); + auto rslt = T::invoke(loc, msg); if(rslt.is_err()) { return ok(std::move(retval)); From 4db486d76d3f5a3e2b10d0035c8950e848966e1a Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Tue, 16 Apr 2019 21:37:12 +0900 Subject: [PATCH 3/8] perf: check integer prefix before trying to parse all the parsers generate error messages and error message generation is not a lightweight task. It concatenates a lot of strings, it formats many values, etc. To avoid useless error-message generation, first check which prefix is used and then parse special integers. Additionally, by checking that, the quality of the error message can be improved (later). --- toml/parser.hpp | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index c1b98f7..cec7968 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -116,10 +116,28 @@ parse_integer(location& loc) const auto first = loc.iter(); if(first != loc.end() && *first == '0') { - if(const auto bin = parse_binary_integer (loc)) {return bin;} - if(const auto oct = parse_octal_integer (loc)) {return oct;} - if(const auto hex = parse_hexadecimal_integer(loc)) {return hex;} - // else, maybe just zero. + const auto second = std::next(first); + if(second == loc.end()) // the token is just zero. + { + return ok(std::make_pair(0, region(loc, first, second))); + } + + if(*second == 'b') {return parse_binary_integer (loc);} // 0b1100 + if(*second == 'o') {return parse_octal_integer (loc);} // 0o775 + if(*second == 'x') {return parse_hexadecimal_integer(loc);} // 0xC0FFEE + + if(std::isdigit(*second)) + { + return err(format_underline("[error] toml::parse_integer: " + "leading zero in an Integer is not allowed.", + {{std::addressof(loc), "leading zero"}})); + } + else if(std::isalpha(*second)) + { + return err(format_underline("[error] toml::parse_integer: " + "unknown integer prefix appeared.", + {{std::addressof(loc), "none of 0x, 0o, 0b"}})); + } } if(const auto token = lex_dec_int::invoke(loc)) From c82e76a1114338a7ff2520eaa0a71d27f201b08a Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Tue, 16 Apr 2019 21:47:24 +0900 Subject: [PATCH 4/8] perf: check string type before parsing it to avoid unncessary error message generation, check the first some characters before parsing it. It makes parsing process faster and is also helpful to generate more accurate error messages. --- toml/parser.hpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index cec7968..6e2f17c 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -539,10 +539,30 @@ template result>, std::string> parse_string(location& loc) { - if(const auto rslt = parse_ml_basic_string(loc)) {return rslt;} - if(const auto rslt = parse_ml_literal_string(loc)) {return rslt;} - if(const auto rslt = parse_basic_string(loc)) {return rslt;} - if(const auto rslt = parse_literal_string(loc)) {return rslt;} + if(loc.iter() != loc.end() && *(loc.iter()) == '"') + { + if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '"' && + loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '"') + { + return parse_ml_basic_string(loc); + } + else + { + return parse_basic_string(loc); + } + } + else if(loc.iter() != loc.end() && *(loc.iter()) == '\'') + { + if(loc.iter() + 1 != loc.end() && *(loc.iter() + 1) == '\'' && + loc.iter() + 2 != loc.end() && *(loc.iter() + 2) == '\'') + { + return parse_ml_literal_string(loc); + } + else + { + return parse_literal_string(loc); + } + } return err(format_underline("[error] toml::parse_string: ", {{std::addressof(loc), "the next token is not a string"}})); } From c5b6ee6f81227f5363aa8d570129113ff7e77f19 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 17 Apr 2019 23:43:42 +0900 Subject: [PATCH 5/8] feat: add yet another constructor to value to make implementation of parse_value easier --- toml/value.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/toml/value.hpp b/toml/value.hpp index 74b2265..ef4bd15 100644 --- a/toml/value.hpp +++ b/toml/value.hpp @@ -572,6 +572,14 @@ class value return *this; } + // for internal use ------------------------------------------------------ + + template::value, std::nullptr_t>::type = nullptr> + value(std::pair> parse_result) + : value(std::move(parse_result.first), std::move(parse_result.second)) + {} + // type checking and casting ============================================ template From 61e69c92514c1e915c58f0357c656464de705f7e Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Thu, 18 Apr 2019 13:56:19 +0900 Subject: [PATCH 6/8] fix: count line number from 1, not 0 --- toml/region.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/toml/region.hpp b/toml/region.hpp index 57c1ab3..1720a7f 100644 --- a/toml/region.hpp +++ b/toml/region.hpp @@ -71,7 +71,7 @@ struct location final : public region_base "container should be randomly accessible"); location(std::string name, Container cont) - : source_(std::make_shared(std::move(cont))), line_number_(0), + : source_(std::make_shared(std::move(cont))), line_number_(1), source_name_(std::move(name)), iter_(source_->cbegin()) {} location(const location&) = default; @@ -88,7 +88,7 @@ struct location final : public region_base const_iterator begin() const noexcept {return source_->cbegin();} const_iterator end() const noexcept {return source_->cend();} - // XXX At first, `location::line_num()` is implemented using `std::count` to + // XXX `location::line_num()` used to be implemented using `std::count` to // count a number of '\n'. But with a long toml file (typically, 10k lines), // it becomes intolerably slow because each time it generates error messages, // it counts '\n' from thousands of characters. To workaround it, I decided @@ -110,8 +110,8 @@ struct location final : public region_base } void reset(const_iterator rollback) noexcept { - // since c++11, std::distance works in both ways and returns a negative - // value if `first` is ahead from `last`. + // since c++11, std::distance works in both ways for random-access + // iterators and returns a negative value if `first > last`. if(0 <= std::distance(rollback, this->iter_)) // rollback < iter { this->line_number_ -= std::count(rollback, this->iter_, '\n'); From 0f48852730af8d61e5c5434fa630a481cbac074f Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Thu, 18 Apr 2019 14:26:27 +0900 Subject: [PATCH 7/8] perf: check value type before parsing to avoid needless error message generation --- toml/parser.hpp | 84 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 24 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 73cd739..f8d6df0 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1412,6 +1412,46 @@ parse_inline_table(location& loc) {{std::addressof(loc), "should be closed"}})); } +template +value_t guess_number_type(const location& l) +{ + location loc = l; + + if(lex_offset_date_time::invoke(loc)) {return value_t::OffsetDatetime;} + loc.reset(l.iter()); + + if(lex_local_date_time::invoke(loc)) {return value_t::LocalDatetime;} + loc.reset(l.iter()); + + if(lex_local_date::invoke(loc)) {return value_t::LocalDate;} + loc.reset(l.iter()); + + if(lex_local_time::invoke(loc)) {return value_t::LocalTime;} + loc.reset(l.iter()); + + if(lex_float::invoke(loc)) {return value_t::Float;} + loc.reset(l.iter()); + + return value_t::Integer; +} + +template +value_t guess_value_type(const location& loc) +{ + switch(*loc.iter()) + { + case '"' : {return value_t::String; } + case '\'': {return value_t::String; } + case 't' : {return value_t::Boolean;} + case 'f' : {return value_t::Boolean;} + case '[' : {return value_t::Array; } + case '{' : {return value_t::Table; } + case 'i' : {return value_t::Float; } // inf. + case 'n' : {return value_t::Float; } // nan. + default : {return guess_number_type(loc);} + } +} + template result parse_value(location& loc) { @@ -1421,31 +1461,27 @@ result parse_value(location& loc) return err(format_underline("[error] toml::parse_value: input is empty", {{std::addressof(loc), ""}})); } - if(auto r = parse_string (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_array (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_inline_table (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_boolean (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_offset_datetime(loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_local_datetime (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_local_date (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_local_time (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_floating (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - if(auto r = parse_integer (loc)) - {return ok(value(std::move(r.unwrap().first), std::move(r.unwrap().second)));} - const auto msg = format_underline("[error] toml::parse_value: " - "unknown token appeared", {{std::addressof(loc), "unknown"}}); - loc.reset(first); - return err(msg); + switch(guess_value_type(loc)) + { + case value_t::Boolean : {return parse_boolean(loc); } + case value_t::Integer : {return parse_integer(loc); } + case value_t::Float : {return parse_floating(loc); } + case value_t::String : {return parse_string(loc); } + case value_t::OffsetDatetime : {return parse_offset_datetime(loc);} + case value_t::LocalDatetime : {return parse_local_datetime(loc); } + case value_t::LocalDate : {return parse_local_date(loc); } + case value_t::LocalTime : {return parse_local_time(loc); } + case value_t::Array : {return parse_array(loc); } + case value_t::Table : {return parse_inline_table(loc); } + default: + { + const auto msg = format_underline("[error] toml::parse_value: " + "unknown token appeared", {{std::addressof(loc), "unknown"}}); + loc.reset(first); + return err(msg); + } + } } template From 637c99d637404ef2a2a9ef92f37f07bb41ee94b9 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Thu, 18 Apr 2019 15:09:58 +0900 Subject: [PATCH 8/8] refactor: generate error message in parser --- toml/parser.hpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index f8d6df0..dc74913 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -326,7 +326,7 @@ result parse_escape_sequence(location& loc) { return err(format_underline("[error] parse_escape_sequence: " "invalid token found in UTF-8 codepoint uXXXX.", - {{std::addressof(loc), token.unwrap_err()}})); + {{std::addressof(loc), "here"}})); } } case 'U': @@ -339,7 +339,7 @@ result parse_escape_sequence(location& loc) { return err(format_underline("[error] parse_escape_sequence: " "invalid token found in UTF-8 codepoint Uxxxxxxxx", - {{std::addressof(loc), token.unwrap_err()}})); + {{std::addressof(loc), "here"}})); } } } @@ -406,7 +406,9 @@ parse_ml_basic_string(location& loc) else { loc.reset(first); - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_ml_basic_string: " + "the next token is not a multiline string", + {{std::addressof(loc), "here"}})); } } @@ -455,7 +457,9 @@ parse_basic_string(location& loc) else { loc.reset(first); // rollback - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_basic_string: " + "the next token is not a string", + {{std::addressof(loc), "here"}})); } } @@ -494,7 +498,9 @@ parse_ml_literal_string(location& loc) else { loc.reset(first); // rollback - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_ml_literal_string: " + "the next token is not a multiline literal string", + {{std::addressof(loc), "here"}})); } } @@ -531,7 +537,9 @@ parse_literal_string(location& loc) else { loc.reset(first); // rollback - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_literal_string: " + "the next token is not a literal string", + {{std::addressof(loc), "here"}})); } } @@ -1537,7 +1545,8 @@ parse_table_key(location& loc) } else { - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_table_key: " + "not a valid table key", {{std::addressof(loc), "here"}})); } } @@ -1545,7 +1554,7 @@ template result, region>, std::string> parse_array_table_key(location& loc) { - if(auto token = lex_array_table::invoke(loc)) + if(auto token = lex_array_table::invoke(loc, true)) { location inner_loc(loc.name(), token.unwrap().str()); @@ -1590,7 +1599,8 @@ parse_array_table_key(location& loc) } else { - return err(token.unwrap_err()); + return err(format_underline("[error] toml::parse_array_table_key: " + "not a valid table key", {{std::addressof(loc), "here"}})); } }