1 files changed, 261 insertions, 0 deletions
diff --git a/server/lexer.hpp b/server/lexer.hpp
new file mode 100644
index 0000000..8afe15c
--- /dev/null
+++ b/server/lexer.hpp
@@ -0,0 +1,261 @@
+#pragma once
+
+#include "error.hpp"
+
+#include <cmath>
+
+#include <exception>
+#include <functional>
+#include <limits>
+#include <optional>
+#include <regex>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+namespace math::server {
+namespace lexer {
+
+class Error : public server::Error {
+public:
+    explicit Error(const std::string &what)
+        : server::Error{"lexer error: " + what}
+    { }
+};
+
+class Token {
+public:
+    enum class Type {
+        LEFT_PAREN,
+        RIGHT_PAREN,
+        PLUS,
+        MINUS,
+        ASTERISK,
+        SLASH,
+        NUMBER,
+    };
+
+    explicit Token(Type type)
+        : m_type{type}, m_number_value{nan()}
+    { }
+
+    explicit Token(double number_value)
+        : m_type{Type::NUMBER}, m_number_value{number_value}
+    { }
+
+    bool operator==(const Token& other) const {
+        return m_type == other.m_type
+            && ((is_nan(m_number_value) && is_nan(other.m_number_value))
+                || m_number_value == other.m_number_value);
+    }
+
+    bool operator!=(const Token& other) const { return !(*this == other); }
+
+    Type get_type() const { return m_type; }
+
+    double get_number_value() const {
+        if (get_type() != Type::NUMBER) {
+            throw Error{"token must be a number to query its value"};
+        }
+        return m_number_value;
+    }
+
+private:
+    static constexpr double nan() { return std::numeric_limits<double>::quiet_NaN(); }
+
+    static bool is_nan(double x) { return std::isnan(x); }
+
+    Type m_type;
+    double m_number_value;
+};
+
+namespace details {
+
+inline std::string_view match_number(const std::string_view& input) {
+    static constexpr std::regex::flag_type flags =
+        std::regex_constants::ECMAScript |
+        std::regex_constants::icase;
+    // This is a hacky attempt to describe a C-like grammar for floating-point
+    // numbers using a regex (the tests seem to pass though).
+    // A proper NFA would be better, I guess.
+    static const std::regex number_regex{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX", flags};
+
+    std::cmatch match;
+    if (!std::regex_search(input.cbegin(), input.cend(), match, number_regex)) {
+        return {};
+    }
+    // If we have the numeric part of a number followed by 'e' and no digits,
+    // 1) that 'e' definitely belongs to this number token,
+    // 2) the user forgot to type in the required digits.
+    const auto& exponent = match[1];
+    const auto& abs_power = match[2];
+    if (exponent.matched && abs_power.matched && abs_power.length() == 0) {
+        throw lexer::Error{"exponent has no digits: " + match[0].str()};
+    }
+    return {match[0].first, match[0].length()};
+}
+
+inline std::optional<double> parse_number(const std::string_view& input, std::string_view& token) {
+    const auto view = match_number(input);
+    if (!view.data()) {
+        return {};
+    }
+    try {
+        const auto result = std::stod(std::string{view});
+        token = view;
+        return result;
+    } catch (const std::exception& e) {
+        throw lexer::Error{"couldn't parse number from: " + std::string{view}};
+    }
+    return {};
+}
+
+inline std::optional<double> parse_number(const std::string_view& input) {
+    std::string_view token;
+    return parse_number(input, token);
+}
+
+inline bool starts_with(const std::string_view& a, const std::string_view& b) noexcept {
+    return a.length() >= b.length()
+        && a.compare(0, b.length(), b) == 0;
+}
+
+inline std::optional<Token::Type> parse_const_token(const std::string_view& input, std::string_view& token) {
+    // FIXME: Potentially error-prone if there's const token A which is a
+    // prefix of token B (if the map is not sorted, we'd parse token A, when it
+    // could've been token B).
+    // Can be solved by sorting the keys accordingly.
+
+    static const std::unordered_map<std::string_view, Token::Type> const_tokens{
+        {"(", Token::Type::LEFT_PAREN},
+        {")", Token::Type::RIGHT_PAREN},
+        {"+", Token::Type::PLUS},
+        {"-", Token::Type::MINUS},
+        {"*", Token::Type::ASTERISK},
+        {"/", Token::Type::SLASH},
+    };
+
+    for (const auto& it : const_tokens) {
+        const auto& str = it.first;
+        const auto& type = it.second;
+
+        if (starts_with(input, str)) {
+            token = input.substr(0, str.length());
+            return type;
+        }
+    }
+
+    return {};
+}
+
+inline std::optional<Token::Type> parse_const_token(const std::string_view& input) {
+    std::string_view token;
+    return parse_const_token(input, token);
+}
+
+inline std::string_view parse_whitespace(const std::string_view& input) {
+    static const std::regex ws_regex{R"(\s*)"};
+
+    std::cmatch match;
+    if (std::regex_search(input.cbegin(), input.cend(), match, ws_regex)) {
+        return {match[0].first, match[0].length()};
+    }
+    return {};
+}
+
+}
+
+}
+
+class Lexer {
+public:
+    explicit Lexer(const std::string_view& input)
+        : m_input{input} {
+    }
+
+    using TokenProcessor = std::function<bool (const lexer::Token&)>;
+
+    bool for_each_token(const TokenProcessor& process) {
+        parse_token();
+        for (auto token = peek_token(); token.has_value(); drop_token(), token = peek_token()) {
+            if (!process(*token)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    std::vector<lexer::Token> get_tokens() {
+        std::vector<lexer::Token> tokens;
+        for_each_token([&tokens] (const lexer::Token& token) {
+            tokens.emplace_back(token);
+            return true;
+        });
+        return tokens;
+    }
+
+    void parse_token() {
+        if (m_input.length() == 0) {
+            return;
+        }
+        std::string_view token_view;
+        m_token_buffer = parse_token(token_view);
+        if (m_token_buffer.has_value()) {
+            m_input.remove_prefix(token_view.length());
+        }
+    }
+
+    bool has_token() const {
+        return peek_token().has_value();
+    }
+
+    std::optional<lexer::Token> peek_token() const {
+        return m_token_buffer;
+    }
+
+    void drop_token() {
+        if (!has_token()) {
+            throw lexer::Error{"internal: no tokens to drop"};
+        }
+        m_token_buffer = {};
+        parse_token();
+    }
+
+    std::optional<lexer::Token> drop_token_if(lexer::Token::Type type) {
+        if (!has_token()) {
+            throw lexer::Error{"internal: no tokens to drop"};
+        }
+        if (m_token_buffer.value().get_type() != type) {
+            return {};
+        }
+        const auto result = m_token_buffer;
+        drop_token();
+        return result;
+    }
+
+private:
+    void consume_whitespace() {
+        const auto ws = lexer::details::parse_whitespace(m_input);
+        m_input.remove_prefix(ws.length());
+    }
+
+    std::optional<lexer::Token> parse_token(std::string_view& token_view) {
+        consume_whitespace();
+        if (m_input.length() == 0) {
+            return {};
+        }
+        if (const auto const_token = lexer::details::parse_const_token(m_input, token_view); const_token.has_value()) {
+            return lexer::Token{*const_token};
+        }
+        if (const auto number = lexer::details::parse_number(m_input, token_view); number.has_value()) {
+            return lexer::Token{*number};
+        }
+        throw lexer::Error{"invalid input at: " + std::string{m_input}};
+    }
+
+    std::string_view m_input;
+    std::optional<lexer::Token> m_token_buffer;
+};
+
+}