From 0aa377386054b34ba1c0a36b935deff234fdb9be Mon Sep 17 00:00:00 2001 From: ToruNiina Date: Wed, 30 Jun 2021 00:58:50 +0900 Subject: [PATCH] feat: add bare minimum utf8 seq validity check --- toml/lexer.hpp | 36 ++++++++++++++++++----- toml/parser.hpp | 77 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 99 insertions(+), 14 deletions(-) diff --git a/toml/lexer.hpp b/toml/lexer.hpp index c4dcc95..046cdf7 100644 --- a/toml/lexer.hpp +++ b/toml/lexer.hpp @@ -225,13 +225,6 @@ using lex_string = either; // =========================================================================== - -using lex_comment_start_symbol = character<'#'>; -using lex_non_eol = exclude, /*0x09 == tab is allowed*/ - in_range<0x0A, 0x1F>, character<0x7F>>>; -using lex_comment = sequence>; - using lex_dot_sep = sequence, character<'.'>, maybe>; using lex_unquoted_key = repeat, lex_array_table_close>; +using lex_utf8_1byte = in_range<0x00, 0x7F>; +using lex_utf8_2byte = sequence< + in_range(0xC2), static_cast(0xDF)>, + in_range(0x80), static_cast(0xBF)> + >; +using lex_utf8_3byte = sequence(0xE0)>, in_range(0xA0), static_cast(0xBF)>>, + sequence(0xE1), static_cast(0xEC)>, in_range(0x80), static_cast(0xBF)>>, + sequence(0xED)>, in_range(0x80), static_cast(0x9F)>>, + sequence(0xEE), static_cast(0xEF)>, in_range(0x80), static_cast(0xBF)>> + >, in_range(0x80), static_cast(0xBF)>>; +using lex_utf8_4byte = sequence(0xF0)>, in_range(0x90), static_cast(0xBF)>>, + sequence(0xF1), static_cast(0xF3)>, in_range(0x80), static_cast(0xBF)>>, + sequence(0xF4)>, in_range(0x80), static_cast(0x8F)>> + >, in_range(0x80), static_cast(0xBF)>, + in_range(0x80), static_cast(0xBF)>>; +using lex_utf8_code = either< + lex_utf8_1byte, + lex_utf8_2byte, + lex_utf8_3byte, + lex_utf8_4byte + >; + +using lex_comment_start_symbol = character<'#'>; +using lex_non_eol_ascii = either, in_range<0x20, 0x7E>>; +using lex_comment = sequence, unlimited>>; + } // detail } // toml #endif // TOML_LEXER_HPP diff --git 
a/toml/parser.hpp b/toml/parser.hpp index 2eadc36..47c7911 100644 --- a/toml/parser.hpp +++ b/toml/parser.hpp @@ -364,6 +364,17 @@ inline result parse_escape_sequence(location& loc) return err(msg); } +inline result check_utf8_validity(const std::string& reg) +{ + location loc("tmp", reg); + const auto u8 = repeat::invoke(loc); + if(!u8 || loc.iter() != loc.end()) + { + return err(std::distance(loc.begin(), loc.iter())); + } + return ok(none_t{}); +} + inline result, std::string> parse_ml_basic_string(location& loc) { @@ -432,7 +443,20 @@ parse_ml_basic_string(location& loc) source_location(inner_loc)); } } - return ok(std::make_pair(toml::string(retval), token.unwrap())); + + if(const auto u8 = check_utf8_validity(token.unwrap().str())) + { + return ok(std::make_pair(toml::string(retval), token.unwrap())); + } + else + { + inner_loc.reset(first); + inner_loc.advance(u8.as_err()); + throw syntax_error(format_underline( + "parse_ml_basic_string: invalid utf8 sequence found", + {{source_location(inner_loc), "here"}}), + source_location(inner_loc)); + } } else { @@ -484,7 +508,20 @@ parse_basic_string(location& loc) } quot = lex_quotation_mark::invoke(inner_loc); } - return ok(std::make_pair(toml::string(retval), token.unwrap())); + + if(const auto u8 = check_utf8_validity(token.unwrap().str())) + { + return ok(std::make_pair(toml::string(retval), token.unwrap())); + } + else + { + inner_loc.reset(first); + inner_loc.advance(u8.as_err()); + throw syntax_error(format_underline( + "parse_basic_string: invalid utf8 sequence found", + {{source_location(inner_loc), "here"}}), + source_location(inner_loc)); + } } else { @@ -545,8 +582,21 @@ parse_ml_literal_string(location& loc) source_location(inner_loc)); } } - return ok(std::make_pair(toml::string(retval, toml::string_t::literal), - token.unwrap())); + + if(const auto u8 = check_utf8_validity(token.unwrap().str())) + { + return ok(std::make_pair(toml::string(retval, toml::string_t::literal), + token.unwrap())); + } +
else + { + inner_loc.reset(first); + inner_loc.advance(u8.as_err()); + throw syntax_error(format_underline( + "parse_ml_literal_string: invalid utf8 sequence found", + {{source_location(inner_loc), "here"}}), + source_location(inner_loc)); + } } else { @@ -584,9 +634,22 @@ parse_literal_string(location& loc) {{source_location(inner_loc), "should be '"}}), source_location(inner_loc)); } - return ok(std::make_pair( - toml::string(body.unwrap().str(), toml::string_t::literal), - token.unwrap())); + + if(const auto u8 = check_utf8_validity(token.unwrap().str())) + { + return ok(std::make_pair( + toml::string(body.unwrap().str(), toml::string_t::literal), + token.unwrap())); + } + else + { + inner_loc.reset(first); + inner_loc.advance(u8.as_err()); + throw syntax_error(format_underline( + "parse_literal_string: invalid utf8 sequence found", + {{source_location(inner_loc), "here"}}), + source_location(inner_loc)); + } } else {