From e929d2f00f545b2148c8a83aea1248661edef6f2 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 27 Feb 2019 12:30:57 +0900 Subject: [PATCH 1/5] fix: allow empty input file (to be an empty table) --- toml/parser.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index faa7213..d7108d4 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1421,7 +1421,7 @@ result parse_toml_file(location& loc) const auto first = loc.iter(); if(first == loc.end()) { - return err(std::string("toml::detail::parse_toml_file: input is empty")); + return ok(toml::table{}); } table data; From 5a929320191f209725e64966357242f949907415 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:13:32 +0900 Subject: [PATCH 2/5] fix: disallow invalid escape sequence --- toml/lexer.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/toml/lexer.hpp b/toml/lexer.hpp index 4f170c5..67df844 100644 --- a/toml/lexer.hpp +++ b/toml/lexer.hpp @@ -124,9 +124,9 @@ using lex_escape_unicode_short = sequence, using lex_escape_unicode_long = sequence, repeat>>; using lex_escape_seq_char = either, character<'\\'>, - character<'/'>, character<'b'>, - character<'f'>, character<'n'>, - character<'r'>, character<'t'>, + character<'b'>, character<'f'>, + character<'n'>, character<'r'>, + character<'t'>, lex_escape_unicode_short, lex_escape_unicode_long >; From 0c9806e99fa4df8b59cd3c7f90d96cdf6c8a8ad1 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:37:52 +0900 Subject: [PATCH 3/5] fix: diagnose key after [table.key] pattern The following, where a key-value pair follows the table header on the same line, is not valid TOML. ``` [table] key = "value" ``` This commit diagnoses that pattern. 
--- toml/parser.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/toml/parser.hpp b/toml/parser.hpp index d7108d4..1deba39 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1289,6 +1289,20 @@ parse_table_key(location& loc) throw internal_error(format_underline("[error] " "toml::parse_table_key: no `]`", inner_loc, "should be `]`")); } + + // after [table.key], newline or EOF(empty table) required. + if(loc.iter() != loc.end()) + { + using lex_newline_after_table_key = + sequence, maybe, lex_newline>; + const auto nl = lex_newline_after_table_key::invoke(loc); + if(!nl) + { + throw syntax_error(format_underline("[error] " + "toml::parse_table_key: newline required after [table.key]", + loc, "expected newline")); + } + } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else @@ -1327,6 +1341,20 @@ parse_array_table_key(location& loc) throw internal_error(format_underline("[error] " "toml::parse_table_key: no `]]`", inner_loc, "should be `]]`")); } + + // after [[table.key]], newline or EOF(empty table) required. 
+ if(loc.iter() != loc.end()) + { + using lex_newline_after_table_key = + sequence, maybe, lex_newline>; + const auto nl = lex_newline_after_table_key::invoke(loc); + if(!nl) + { + throw syntax_error(format_underline("[error] " + "toml::parse_array_table_key: newline required after " + "[[table.key]]", loc, "expected newline")); + } + } return ok(std::make_pair(keys.unwrap().first, token.unwrap())); } else From 536b23dc8442853e4a3975fb3acb016073f7940b Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Fri, 1 Mar 2019 22:53:16 +0900 Subject: [PATCH 4/5] fix: allow empty table in the middle of a file --- toml/parser.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 1deba39..57f2211 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -1370,7 +1370,7 @@ result parse_ml_table(location& loc) const auto first = loc.iter(); if(first == loc.end()) { - return err(std::string("toml::parse_ml_table: input is empty")); + return ok(toml::table{}); } // XXX at lest one newline is needed. 
@@ -1453,7 +1453,7 @@ result parse_toml_file(location& loc) } table data; - /* root object is also table, but without [tablename] */ + // root object is also a table, but without [tablename] if(auto tab = parse_ml_table(loc)) { data = std::move(tab.unwrap()); From 7f870d58611bc90aaba41166469f2fdeacc5de37 Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Sat, 2 Mar 2019 01:51:27 +0900 Subject: [PATCH 5/5] fix: diagnose invalid UTF-8 codepoints --- toml/parser.hpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/toml/parser.hpp b/toml/parser.hpp index 57f2211..983d54e 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -226,8 +226,9 @@ parse_floating(location& loc) "the next token is not a float")); } -template -std::string read_utf8_codepoint(const region& reg) +template +std::string read_utf8_codepoint(const region& reg, + /* for err msg */ const location& loc) { const auto str = reg.str().substr(1); std::uint_least32_t codepoint; @@ -247,20 +248,27 @@ std::string read_utf8_codepoint(const region& reg) } else if(codepoint < 0x10000) // U+0800...U+FFFF { + if(0xD800 <= codepoint && codepoint <= 0xDFFF) + { + throw syntax_error(format_underline("[error] " + "toml::read_utf8_codepoint: codepoints in the range " + "[0xD800, 0xDFFF] are not valid UTF-8.", + loc, "not a valid UTF-8 codepoint")); + } + assert(codepoint < 0xD800 || 0xDFFF < codepoint); // 1110yyyy 10yxxxxx 10xxxxxx character += static_cast(0xE0| codepoint >> 12); character += static_cast(0x80|(codepoint >> 6 & 0x3F)); character += static_cast(0x80|(codepoint & 0x3F)); } - else if(codepoint < 0x200000) // U+10000 ... U+1FFFFF + else if(codepoint < 0x200000) // U+010000 ... U+1FFFFF { if(0x10FFFF < codepoint) // out of Unicode region { - std::cerr << format_underline(concat_to_string("[warning] " - "input codepoint (", str, ") is too large to decode as " - "a unicode character. 
The result may not be able to render " - "to your screen."), reg, "should be in [0x00..0x10FFFF]") - << std::endl; + throw syntax_error(format_underline("[error] " + "toml::read_utf8_codepoint: input codepoint is too large to " + "decode as a unicode character.", loc, + "should be in [0x00..0x10FFFF]")); } // 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx character += static_cast(0xF0| codepoint >> 18); @@ -300,7 +308,7 @@ result parse_escape_sequence(location& loc) { if(const auto token = lex_escape_unicode_short::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap())); + return ok(read_utf8_codepoint(token.unwrap(), loc)); } else { @@ -313,7 +321,7 @@ result parse_escape_sequence(location& loc) { if(const auto token = lex_escape_unicode_long::invoke(loc)) { - return ok(read_utf8_codepoint(token.unwrap())); + return ok(read_utf8_codepoint(token.unwrap(), loc)); } else {