From 42820c4161edcabb1e36336d175bb8645f529df3 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Fri, 3 Jan 2020 04:13:47 +0300 Subject: lexer: add alternative boost::regex implementations --- server/lexer/CMakeLists.txt | 3 ++ server/lexer/details/parse.cpp | 118 ++++++++++++++++++++++++++++++++--------- server/lexer/details/parse.hpp | 8 +++ 3 files changed, 104 insertions(+), 25 deletions(-) (limited to 'server/lexer') diff --git a/server/lexer/CMakeLists.txt b/server/lexer/CMakeLists.txt index b62e47a..b5b8b63 100644 --- a/server/lexer/CMakeLists.txt +++ b/server/lexer/CMakeLists.txt @@ -1,2 +1,5 @@ +find_package(Boost REQUIRED COMPONENTS regex) + add_library(lexer details/parse.cpp lexer.cpp token.cpp token_type.cpp) target_link_libraries(lexer PUBLIC common) +target_link_libraries(lexer PRIVATE Boost::regex) diff --git a/server/lexer/details/parse.cpp b/server/lexer/details/parse.cpp index 464dfb2..01ef11c 100644 --- a/server/lexer/details/parse.cpp +++ b/server/lexer/details/parse.cpp @@ -6,9 +6,12 @@ #include "../error.hpp" #include "../token_type.hpp" +#include + #include #include +#include #include #include #include @@ -17,45 +20,77 @@ namespace math::server::lexer::details { namespace { -std::string_view match_number(const std::string_view& input) { - static constexpr std::regex::flag_type flags = - std::regex_constants::ECMAScript | - std::regex_constants::icase; - // This is a hacky attempt to describe a C-like grammar for floating-point - // numbers using a regex (the tests seem to pass though). - // A proper NFA would be better, I guess. - static const std::regex number_regex{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX", flags}; - +std::cmatch std_regex_search(const std::string_view& input, const std::regex& regex) { std::cmatch match; { const auto begin = input.data(); const auto end = begin + input.length(); - if (!std::regex_search(begin, end, match, number_regex)) { - return {}; - } + std::regex_search(begin, end, match, regex); } + return match; +} + +boost::cmatch boost_regex_search(const std::string_view& input, const boost::regex& regex) { + boost::cmatch match; { - // If we have the numeric part of a number followed by 'e' and no digits, - // 1) that 'e' definitely belongs to this number token, - // 2) the user forgot to type in the required digits. - const auto& exponent = match[1]; - const auto& abs_power = match[2]; - if (exponent.matched && abs_power.matched && abs_power.length() == 0) { - throw LexerError{"exponent has no digits: " + match[0].str()}; - } + const auto begin = input.data(); + const auto end = begin + input.length(); + boost::regex_search(begin, end, match, regex); + } + return match; +} + +// CMatch is either std::cmatch or boost::cmatch. +template +void check_exponent(const CMatch& match) { + // If we have the numeric part of a number followed by 'e' and no digits, + // 1) that 'e' definitely belongs to this number token, + // 2) the user forgot to type in the required digits. + const auto& exponent = match[1]; + const auto& abs_power = match[2]; + if (exponent.matched && abs_power.matched && abs_power.length() == 0) { + throw LexerError{"exponent has no digits: " + match[0].str()}; } +} + +// This is a hacky attempt to describe a C-like grammar for floating-point +// numbers using a regex (the tests seem to pass though). +// A proper NFA would be better, I guess. +const std::string_view NUMBER_REGEX = R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX"; + +std::string_view std_match_number(const std::string_view& input) { + static constexpr auto flags = + std::regex_constants::ECMAScript | + std::regex_constants::icase; + static const std::regex number_regex{NUMBER_REGEX.data(), NUMBER_REGEX.length(), flags}; + + const auto match = std_regex_search(input, number_regex); + if (match.empty()) { + return {}; + } + check_exponent(match); return {match[0].first, static_cast(match[0].length())}; } -bool starts_with(const std::string_view& a, const std::string_view& b) noexcept { - return a.length() >= b.length() - && a.compare(0, b.length(), b) == 0; +std::string_view boost_match_number(const std::string_view& input) { + static const boost::regex number_regex{NUMBER_REGEX.data(), NUMBER_REGEX.length(), boost::regex::icase}; + + const auto match = boost_regex_search(input, number_regex); + if (match.empty()) { + return {}; + } + check_exponent(match); + return {match[0].first, static_cast(match[0].length())}; } +std::string_view match_number(const std::string_view& input) { + return std_match_number(input); } -std::optional parse_number(const std::string_view& input, std::string_view& token) { - const auto view = match_number(input); +using NumberMatcher = std::function; + +std::optional parse_number(const std::string_view& input, const NumberMatcher& match, std::string_view& token) { + const auto view = match(input); if (!view.data()) { return {}; } @@ -69,6 +104,39 @@ std::optional parse_number(const std::string_view& input, std::string_vi return {}; } +bool starts_with(const std::string_view& a, const std::string_view& b) noexcept { + return a.length() >= b.length() + && a.compare(0, b.length(), b) == 0; +} + +} + +namespace impl { + +std::optional std_parse_number(const std::string_view& input, std::string_view& token) { + return parse_number(input, &std_match_number, token); +} + +std::optional std_parse_number(const std::string_view& input) { + std::string_view token; + return std_parse_number(input, token); +} + +std::optional boost_parse_number(const std::string_view& input, std::string_view& token) { + return parse_number(input, &boost_match_number, token); +} + +std::optional boost_parse_number(const std::string_view& input) { + std::string_view token; + return boost_parse_number(input, token); +} + +} + +std::optional parse_number(const std::string_view& input, std::string_view& token) { + return impl::std_parse_number(input, token); +} + std::optional parse_number(const std::string_view& input) { std::string_view token; return parse_number(input, token); diff --git a/server/lexer/details/parse.hpp b/server/lexer/details/parse.hpp index 72da234..6a8688b 100644 --- a/server/lexer/details/parse.hpp +++ b/server/lexer/details/parse.hpp @@ -11,6 +11,14 @@ #include namespace math::server::lexer::details { +namespace impl { + +std::optional std_parse_number(const std::string_view&, std::string_view&); +std::optional std_parse_number(const std::string_view&); +std::optional boost_parse_number(const std::string_view&, std::string_view&); +std::optional boost_parse_number(const std::string_view&); + +} // Exposed for testing: std::string_view parse_whitespace(const std::string_view&); -- cgit v1.2.3