#pragma once

#include "error.hpp"

#include <cmath>
#include <cstddef>
#include <exception>
#include <functional>
#include <limits>
#include <optional>
#include <regex>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

namespace math::server {
namespace lexer {

class Error : public server::Error {
public:
    explicit Error(const std::string& what) : server::Error{"lexer error: " + what} {}
};

class Token {
public:
    enum class Type {
        LEFT_PAREN,
        RIGHT_PAREN,
        PLUS,
        MINUS,
        ASTERISK,
        SLASH,
        NUMBER,
    };

    explicit Token(Type type) : m_type{type}, m_number_value{nan()} {}

    explicit Token(double number_value) : m_type{Type::NUMBER}, m_number_value{number_value} {}

    bool operator==(const Token& other) const {
        // Non-NUMBER tokens store NaN as a placeholder, so two tokens are
        // equal if both values are NaN, or if the values compare equal.
        return m_type == other.m_type &&
               ((is_nan(m_number_value) && is_nan(other.m_number_value)) ||
                m_number_value == other.m_number_value);
    }

    bool operator!=(const Token& other) const { return !(*this == other); }

    Type get_type() const { return m_type; }

    double get_number_value() const {
        if (get_type() != Type::NUMBER) {
            throw Error{"token must be a number to query its value"};
        }
        return m_number_value;
    }

private:
    static constexpr double nan() { return std::numeric_limits<double>::quiet_NaN(); }

    static bool is_nan(double x) { return std::isnan(x); }

    Type m_type;
    double m_number_value;
};

namespace details {

inline std::string_view match_number(const std::string_view& input) {
    static constexpr std::regex::flag_type flags =
        std::regex_constants::ECMAScript | std::regex_constants::icase;
    // This is a hacky attempt to describe a C-like grammar for floating-point
    // numbers using a regex (the tests seem to pass, though).
    // A proper NFA would be better, I guess.
    static const std::regex number_regex{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX", flags};

    std::cmatch match;
    if (!std::regex_search(input.cbegin(), input.cend(), match, number_regex)) {
        return {};
    }
    // If we have the numeric part of a number followed by 'e' and no digits,
    // 1) that 'e' definitely belongs to this number token,
    // 2) the user forgot to type in the required digits.
    const auto& exponent = match[1];
    const auto& abs_power = match[2];
    if (exponent.matched && abs_power.matched && abs_power.length() == 0) {
        throw lexer::Error{"exponent has no digits: " + match[0].str()};
    }
    return {match[0].first, static_cast<std::size_t>(match[0].length())};
}
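// Illustrative behavior of match_number, derived from the pattern above (an
// informal sketch, not an exhaustive specification): "1", "2.", ".5", "1e10"
// and "1.5E-3" all match ('e' is case-insensitive), whereas "1e" and "2.5e+"
// match the numeric part but then throw, because the exponent marker is
// present without the required digits.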
inline std::optional<double> parse_number(const std::string_view& input, std::string_view& token) {
    const auto view = match_number(input);
    if (!view.data()) {
        return {};
    }
    try {
        const auto result = std::stod(std::string{view});
        token = view;
        return result;
    } catch (const std::exception&) {
        throw lexer::Error{"couldn't parse number from: " + std::string{view}};
    }
    return {};
}

inline std::optional<double> parse_number(const std::string_view& input) {
    std::string_view token;
    return parse_number(input, token);
}

inline bool starts_with(const std::string_view& a, const std::string_view& b) noexcept {
    return a.length() >= b.length() && a.compare(0, b.length(), b) == 0;
}

inline std::optional<Token::Type> parse_const_token(const std::string_view& input, std::string_view& token) {
    // FIXME: Potentially error-prone if there's a const token A which is a
    // prefix of const token B: since the iteration order of an unordered_map
    // is unspecified, we could parse token A when it could've been token B.
    // Can be solved by trying the keys in order of decreasing length.
    static const std::unordered_map<std::string_view, Token::Type> const_tokens{
        {"(", Token::Type::LEFT_PAREN},
        {")", Token::Type::RIGHT_PAREN},
        {"+", Token::Type::PLUS},
        {"-", Token::Type::MINUS},
        {"*", Token::Type::ASTERISK},
        {"/", Token::Type::SLASH},
    };

    for (const auto& [str, type] : const_tokens) {
        if (starts_with(input, str)) {
            token = input.substr(0, str.length());
            return type;
        }
    }
    return {};
}

inline std::optional<Token::Type> parse_const_token(const std::string_view& input) {
    std::string_view token;
    return parse_const_token(input, token);
}

inline std::string_view parse_whitespace(const std::string_view& input) {
    static const std::regex ws_regex{R"(\s*)"};

    std::cmatch match;
    if (std::regex_search(input.cbegin(), input.cend(), match, ws_regex)) {
        return {match[0].first, static_cast<std::size_t>(match[0].length())};
    }
    return {};
}

} // namespace details
} // namespace lexer

class Lexer {
public:
    explicit Lexer(const std::string_view& input) : m_input{input} {}

    using TokenProcessor = std::function<bool(const lexer::Token&)>;

    // Feeds tokens to `process` one by one; stops early and returns false if
    // `process` returns false.
    bool for_each_token(const TokenProcessor& process) {
        parse_token();
        for (auto token = peek_token(); token.has_value(); drop_token(), token = peek_token()) {
            if (!process(*token)) {
                return false;
            }
        }
        return true;
    }

    std::vector<lexer::Token> get_tokens() {
        std::vector<lexer::Token> tokens;
        for_each_token([&tokens] (const lexer::Token& token) {
            tokens.emplace_back(token);
            return true;
        });
        return tokens;
    }

    void parse_token() {
        if (m_input.length() == 0) {
            return;
        }
        std::string_view token_view;
        m_token_buffer = parse_token(token_view);
        if (m_token_buffer.has_value()) {
            m_input.remove_prefix(token_view.length());
        }
    }

    bool has_token() const { return peek_token().has_value(); }

    std::optional<lexer::Token> peek_token() const { return m_token_buffer; }

    void drop_token() {
        if (!has_token()) {
            throw lexer::Error{"internal: no tokens to drop"};
        }
        m_token_buffer = {};
        parse_token();
    }

    std::optional<lexer::Token> drop_token_if(lexer::Token::Type type) {
        if (!has_token()) {
            throw lexer::Error{"internal: no tokens to drop"};
        }
        if (m_token_buffer.value().get_type() != type) {
            return {};
        }
        const auto result = m_token_buffer;
        drop_token();
        return result;
    }

private:
    void consume_whitespace() {
        const auto ws = lexer::details::parse_whitespace(m_input);
        m_input.remove_prefix(ws.length());
    }

    std::optional<lexer::Token> parse_token(std::string_view& token_view) {
        consume_whitespace();
        if (m_input.length() == 0) {
            return {};
        }
        if (const auto const_token = lexer::details::parse_const_token(m_input, token_view); const_token.has_value()) {
            return lexer::Token{*const_token};
        }
        if (const auto number = lexer::details::parse_number(m_input, token_view); number.has_value()) {
            return lexer::Token{*number};
        }
        throw lexer::Error{"invalid input at: " + std::string{m_input}};
    }

    std::string_view m_input;
    std::optional<lexer::Token> m_token_buffer;
};

} // namespace math::server
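// Usage sketch (illustrative only; the real call sites live in the parser,
// which is outside this header):
//
//     math::server::Lexer lexer{"(1 + 2.5e-1) * 3"};
//     for (const auto& token : lexer.get_tokens()) {
//         if (token.get_type() == math::server::lexer::Token::Type::NUMBER) {
//             // token.get_number_value() is 1, 0.25 or 3 here
//         }
//     }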