From 5c11d731cadd06204c8c6d6c1aea2253450204a8 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Mon, 6 Jan 2020 13:40:22 +0300 Subject: lexer: std:: vs boost:: for whitespace parsing Added the corresponding benchmarks too. --- server/lexer/details/parse.cpp | 132 ++++++++++++++++++++++++++--------------- server/lexer/details/parse.hpp | 5 ++ test/benchmarks/lexer.cpp | 33 ++++++++++- 3 files changed, 119 insertions(+), 51 deletions(-) diff --git a/server/lexer/details/parse.cpp b/server/lexer/details/parse.cpp index 79ad4cf..25a77ce 100644 --- a/server/lexer/details/parse.cpp +++ b/server/lexer/details/parse.cpp @@ -20,7 +20,28 @@ namespace math::server::lexer::details { namespace { -class RegexNumberMatcher { +template typename MatchResultsT> +class RegexMatcher { +public: + using MatchResults = MatchResultsT; + + virtual ~RegexMatcher() = default; + + virtual bool match_regex(const std::string_view& input) = 0; + + std::string_view to_view() const { + return {&*m_match[0].first, static_cast(m_match[0].length())}; + // ^ I fucking hate C++. + } + + std::string to_str() const { return m_match[0].str(); } + +protected: + MatchResults m_match; +}; + +template typename MatchResultsT> +class RegexNumberMatcher : public RegexMatcher { public: bool match(const std::string_view& input) { if (!match_regex(input)) { @@ -36,8 +57,6 @@ public: return true; } - virtual std::string_view to_view() const = 0; - protected: // This is a hacky attempt to describe a C-like grammar for floating-point // numbers using a regex (the tests seem to pass though). 
@@ -45,27 +64,18 @@ protected: static constexpr std::string_view NUMBER_REGEX{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX"}; private: - virtual bool match_regex(const std::string_view& input) = 0; + bool matched_e() const { return m_match[1].matched; } - virtual std::string to_str() const = 0; - - virtual bool matched_e() const = 0; - - virtual bool matched_e_power() const = 0; + bool matched_e_power() const { return m_match[2].matched && m_match[2].length() != 0; } }; -class StdNumberMatcher : public RegexNumberMatcher { +class StdNumberMatcher : public RegexNumberMatcher { public: - std::string_view to_view() const override { - return {&*m_match[0].first, static_cast(m_match[0].length())}; - // ^ I fucking hate C++. - } - -private: bool match_regex(const std::string_view& input) override { return std::regex_search(input.cbegin(), input.cend(), m_match, get_regex()); } +private: static const std::regex& get_regex() { static constexpr auto flags = std::regex_constants::ECMAScript | @@ -73,28 +83,15 @@ private: static const std::regex regex{NUMBER_REGEX.data(), NUMBER_REGEX.length(), flags}; return regex; } - - std::string to_str() const override { return m_match[0].str(); } - - bool matched_e() const override { return m_match[1].matched; } - - bool matched_e_power() const override { return m_match[2].matched && m_match[2].length() != 0; } - - std::match_results m_match; }; -class BoostNumberMatcher : public RegexNumberMatcher { +class BoostNumberMatcher : public RegexNumberMatcher { public: - std::string_view to_view() const override { - return {&*m_match[0].first, static_cast(m_match[0].length())}; - // ^ I fucking hate C++. 
- } - -private: bool match_regex(const std::string_view& input) override { return boost::regex_search(input.cbegin(), input.cend(), m_match, get_regex()); } +private: static const boost::regex& get_regex() { static constexpr boost::regex::flag_type flags = boost::regex::ECMAScript | @@ -102,17 +99,10 @@ private: static const boost::regex regex{NUMBER_REGEX.data(), NUMBER_REGEX.length(), flags}; return regex; } - - std::string to_str() const override { return m_match[0].str(); } - - bool matched_e() const override { return m_match[1].matched; } - - bool matched_e_power() const override { return m_match[2].matched && m_match[2].length() != 0; } - - boost::match_results m_match; }; -std::optional parse_number(const std::string_view& input, RegexNumberMatcher&& matcher, std::string_view& token) { +template typename MatchResultsT> +std::optional parse_number(const std::string_view& input, RegexNumberMatcher&& matcher, std::string_view& token) { if (!matcher.match(input)) { return {}; } @@ -127,6 +117,51 @@ std::optional parse_number(const std::string_view& input, RegexNumberMat return {}; } +template typename MatchResultsT> +class RegexWhitespaceMatcher : public RegexMatcher { +protected: + // Matches a run of one or more whitespace characters anchored at the + // start of the input; shared by the std:: and boost:: matchers so that + // both implementations use exactly the same pattern. 
+ static constexpr std::string_view WS_REGEX{R"(^\s+)"}; +}; + +class StdWhitespaceMatcher : public RegexWhitespaceMatcher { +public: + bool match_regex(const std::string_view& input) override { + return std::regex_search(input.cbegin(), input.cend(), m_match, get_regex()); + } + +private: + static const std::regex& get_regex() { + static constexpr auto flags = std::regex_constants::ECMAScript; + static const std::regex regex{WS_REGEX.data(), WS_REGEX.length(), flags}; + return regex; + } +}; + +class BoostWhitespaceMatcher : public RegexWhitespaceMatcher { +public: + bool match_regex(const std::string_view& input) override { + return boost::regex_search(input.cbegin(), input.cend(), m_match, get_regex()); + } + +private: + static const boost::regex& get_regex() { + static constexpr boost::regex::flag_type flags = boost::regex::ECMAScript; + static const boost::regex regex{WS_REGEX.data(), WS_REGEX.length(), flags}; + return regex; + } +}; + +template typename MatchResultsT> +std::string_view parse_whitespace(const std::string_view& input, RegexWhitespaceMatcher&& matcher) { + if (matcher.match_regex(input)) { + return matcher.to_view(); + } + return {}; +} + bool starts_with(const std::string_view& a, const std::string_view& b) noexcept { return a.length() >= b.length() && a.compare(0, b.length(), b) == 0; @@ -154,6 +189,14 @@ std::optional boost_parse_number(const std::string_view& input) { return boost_parse_number(input, token); } +std::string_view std_parse_whitespace(const std::string_view& input) { + return parse_whitespace(input, StdWhitespaceMatcher{}); +} + +std::string_view boost_parse_whitespace(const std::string_view& input) { + return parse_whitespace(input, BoostWhitespaceMatcher{}); +} + } std::optional parse_number(const std::string_view& input, std::string_view& token) { @@ -182,14 +225,7 @@ std::optional parse_const_token(const std::string_view& input) { } std::string_view parse_whitespace(const std::string_view& input) { - static const 
boost::regex ws_regex{R"(^\s+)"}; - - boost::match_results match; - if (boost::regex_search(input.cbegin(), input.cend(), match, ws_regex)) { - return {&*match[0].first, static_cast(match[0].length())}; - // ^ Still fucking hate C++. - } - return {}; + return impl::boost_parse_whitespace(input); } } diff --git a/server/lexer/details/parse.hpp b/server/lexer/details/parse.hpp index 6a8688b..693dd35 100644 --- a/server/lexer/details/parse.hpp +++ b/server/lexer/details/parse.hpp @@ -13,11 +13,16 @@ namespace math::server::lexer::details { namespace impl { +// Exposed for benchmarking: + std::optional std_parse_number(const std::string_view&, std::string_view&); std::optional std_parse_number(const std::string_view&); std::optional boost_parse_number(const std::string_view&, std::string_view&); std::optional boost_parse_number(const std::string_view&); +std::string_view std_parse_whitespace(const std::string_view&); +std::string_view boost_parse_whitespace(const std::string_view&); + } // Exposed for testing: diff --git a/test/benchmarks/lexer.cpp b/test/benchmarks/lexer.cpp index 4279d8b..3eb895c 100644 --- a/test/benchmarks/lexer.cpp +++ b/test/benchmarks/lexer.cpp @@ -2,7 +2,7 @@ #include -class SelectionOfNumbers : public benchmark::Fixture { +class NumberExamples : public benchmark::Fixture { protected: std::vector m_numbers{ "0", @@ -15,7 +15,16 @@ protected: }; }; -BENCHMARK_F(SelectionOfNumbers, ParseStdRegex)(benchmark::State& state) { +class WhitespaceExamples : public benchmark::Fixture { +protected: + std::vector m_whitespace{ + "", + " 1", + " 123", + }; +}; + +BENCHMARK_F(NumberExamples, StdParseNumber)(benchmark::State& state) { using namespace math::server::lexer::details; for (auto _ : state) { for (const auto& src : m_numbers) { @@ -24,7 +33,7 @@ BENCHMARK_F(SelectionOfNumbers, ParseStdRegex)(benchmark::State& state) { } } -BENCHMARK_F(SelectionOfNumbers, ParseBoostRegex)(benchmark::State& state) { +BENCHMARK_F(NumberExamples, 
BoostParseNumber)(benchmark::State& state) { using namespace math::server::lexer::details; for (auto _ : state) { for (const auto& src : m_numbers) { @@ -32,3 +41,21 @@ BENCHMARK_F(SelectionOfNumbers, ParseBoostRegex)(benchmark::State& state) { } } } + +BENCHMARK_F(WhitespaceExamples, StdParseWhitespace)(benchmark::State& state) { + using namespace math::server::lexer::details; + for (auto _ : state) { + for (const auto& src : m_whitespace) { + impl::std_parse_whitespace(src); + } + } +} + +BENCHMARK_F(WhitespaceExamples, BoostParseWhitespace)(benchmark::State& state) { + using namespace math::server::lexer::details; + for (auto _ : state) { + for (const auto& src : m_whitespace) { + impl::boost_parse_whitespace(src); + } + } +} -- cgit v1.2.3