diff options
Diffstat (limited to 'server/lexer')
-rw-r--r-- | server/lexer/CMakeLists.txt | 2 | ||||
-rw-r--r-- | server/lexer/error.hpp | 22 | ||||
-rw-r--r-- | server/lexer/input.hpp | 56 | ||||
-rw-r--r-- | server/lexer/lexer.cpp | 250 | ||||
-rw-r--r-- | server/lexer/lexer.hpp | 79 | ||||
-rw-r--r-- | server/lexer/token.cpp | 72 | ||||
-rw-r--r-- | server/lexer/token.hpp | 63 | ||||
-rw-r--r-- | server/lexer/token_type.cpp | 169 | ||||
-rw-r--r-- | server/lexer/token_type.hpp | 44 |
9 files changed, 757 insertions, 0 deletions
diff --git a/server/lexer/CMakeLists.txt b/server/lexer/CMakeLists.txt
new file mode 100644
index 0000000..65523a7
--- /dev/null
+++ b/server/lexer/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(lexer lexer.cpp token.cpp token_type.cpp)
+target_link_libraries(lexer PUBLIC common)
diff --git a/server/lexer/error.hpp b/server/lexer/error.hpp
new file mode 100644
index 0000000..99944c7
--- /dev/null
+++ b/server/lexer/error.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "../common/error.hpp"
+
+#include <string>
+
+namespace math::server {
+
+/// Error raised by the lexer.
+///
+/// The message passed to the base `Error` is always prefixed with
+/// "lexer error: ". The lexer additionally starts a message with "internal: "
+/// when one of its own invariants is violated.
+class LexerError : public Error {
+public:
+    /// @param what Description of the failure (without the prefix).
+    explicit LexerError(const std::string &what)
+        : Error{"lexer error: " + what}
+    { }
+};
+
+}
diff --git a/server/lexer/input.hpp b/server/lexer/input.hpp
new file mode 100644
index 0000000..1104a4b
--- /dev/null
+++ b/server/lexer/input.hpp
@@ -0,0 +1,56 @@
+#pragma once
+
+#include "error.hpp"
+
+#include <cstddef>
+
+#include <string_view>
+
+namespace math::server::lexer {
+
+/// Cursor over the not-yet-consumed part of the lexer's input.
+///
+/// Holds a non-owning view: the underlying character buffer must outlive the
+/// `Input` and every view derived from it.
+class Input {
+public:
+    /// Starts at position 0 with the whole of `input` still unconsumed.
+    explicit Input(const std::string_view& input)
+        : m_pos{0}, m_input{input}
+    { }
+
+    /// The part of the input that hasn't been consumed yet.
+    const std::string_view& get_input() const { return m_input; }
+
+    /// Number of characters consumed so far.
+    std::size_t get_pos() const { return m_pos; }
+
+    /// Number of characters left to consume.
+    std::size_t get_length() const { return m_input.length(); }
+
+    /// True if there's nothing left to consume.
+    bool empty() const { return m_input.empty(); }
+
+    /// Drops the first `len` characters of the remaining input.
+    ///
+    /// @throws LexerError if fewer than `len` characters are left.
+    void consume(std::size_t len) {
+        if (m_input.length() < len) {
+            throw LexerError{"internal: not enough input to consume"};
+        }
+        m_pos += len;
+        m_input.remove_prefix(len);
+    }
+
+    /// Drops as many characters as `sub` is long; the contents of `sub` are
+    /// not compared against the input.
+    void consume(const std::string_view& sub) {
+        consume(sub.length());
+    }
+
+private:
+    std::size_t m_pos;
+    std::string_view m_input;
+};
+
+}
diff --git a/server/lexer/lexer.cpp b/server/lexer/lexer.cpp
new file mode 100644
index 0000000..c7eea6d
--- /dev/null
+++ b/server/lexer/lexer.cpp
@@ -0,0 +1,250 @@
+#include "error.hpp"
+#include "lexer.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <cstddef>
+#include <exception>
+#include <optional>
+#include <regex>
+#include <stdexcept>
+#include <string_view>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace math::server {
+namespace lexer {
+namespace {
+
+// Builds a view over a regex sub-match. std::csub_match::length() is signed,
+// while std::string_view wants a std::size_t, hence the explicit cast.
+std::string_view to_view(const std::csub_match& match) {
+    return std::string_view{match.first, static_cast<std::size_t>(match.length())};
+}
+
+// Matches a number literal at the very start of `input`.
+//
+// Returns a view of the matched characters, or a default-constructed view
+// (data() == nullptr) if `input` doesn't start with a number.
+// Throws LexerError if an exponent marker isn't followed by any digits.
+std::string_view match_number(const std::string_view& input) {
+    static constexpr std::regex::flag_type flags =
+        std::regex_constants::ECMAScript |
+        std::regex_constants::icase;
+    // This is a hacky attempt to describe a C-like grammar for floating-point
+    // numbers using a regex (the tests seem to pass though).
+    // A proper NFA would be better, I guess.
+    static const std::regex number_regex{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX", flags};
+
+    // std::cmatch works on `const char*`; string_view iterators aren't
+    // guaranteed to be raw pointers, so go through data() instead.
+    const char* const first = input.data();
+    const char* const last = first + input.size();
+
+    std::cmatch match;
+    if (!std::regex_search(first, last, match, number_regex)) {
+        return {};
+    }
+    {
+        // If we have the numeric part of a number followed by 'e' and no digits,
+        // 1) that 'e' definitely belongs to this number token,
+        // 2) the user forgot to type in the required digits.
+        const auto& exponent = match[1];
+        const auto& abs_power = match[2];
+        if (exponent.matched && abs_power.matched && abs_power.length() == 0) {
+            throw LexerError{"exponent has no digits: " + match[0].str()};
+        }
+    }
+    return to_view(match[0]);
+}
+
+// Parses a number at the start of `input`.
+//
+// On success, `token` is set to the matched characters and the value is
+// returned; otherwise `token` is left untouched and an empty optional is
+// returned.
+// Throws LexerError if the literal is malformed or doesn't fit in a double.
+std::optional<double> parse_number(const std::string_view& input, std::string_view& token) {
+    const auto view = match_number(input);
+    if (!view.data()) {
+        return {};
+    }
+    try {
+        const auto result = std::stod(std::string{view});
+        token = view;
+        return result;
+    } catch (const std::out_of_range&) {
+        // The literal is well-formed, it just doesn't fit in a double;
+        // that's the user's mistake, not an internal error.
+        throw LexerError{"number is out of range: " + std::string{view}};
+    } catch (const std::exception&) {
+        throw LexerError{"internal: couldn't parse number from: " + std::string{view}};
+    }
+}
+
+// True if `a` begins with `b`.
+bool starts_with(const std::string_view& a, const std::string_view& b) noexcept {
+    return a.length() >= b.length()
+        && a.compare(0, b.length(), b) == 0;
+}
+
+// Finds the constant token (operator, parenthesis) `input` starts with.
+//
+// If several tokens match, the longest one wins, so that the result doesn't
+// depend on the iteration order of the unordered token set.
+// On success, `token` is set to the matched prefix of `input`.
+std::optional<token::Type> parse_const_token(const std::string_view& input, std::string_view& token) {
+    std::optional<token::Type> best_type;
+    std::size_t best_length = 0;
+    for (const auto type : token::const_tokens()) {
+        const auto str = token::type_to_string(type);
+        if (str.length() > best_length && starts_with(input, str)) {
+            best_type = type;
+            best_length = str.length();
+        }
+    }
+    if (best_type.has_value()) {
+        token = input.substr(0, best_length);
+    }
+    return best_type;
+}
+
+}
+
+namespace details {
+
+std::optional<double> parse_number(const std::string_view& input) {
+    std::string_view token;
+    return lexer::parse_number(input, token);
+}
+
+std::optional<token::Type> parse_const_token(const std::string_view& input) {
+    std::string_view token;
+    return lexer::parse_const_token(input, token);
+}
+
+std::string_view parse_whitespace(const std::string_view& input) {
+    static const std::regex ws_regex{R"(^\s+)"};
+
+    // std::cmatch needs `const char*`, so use data() rather than cbegin().
+    const char* const first = input.data();
+    const char* const last = first + input.size();
+
+    std::cmatch match;
+    if (std::regex_search(first, last, match, ws_regex)) {
+        return to_view(match[0]);
+    }
+    return {};
+}
+
+}
+}
+
+Lexer::Lexer(const std::string_view& input)
+    : Lexer{lexer::Input{input}} {
+}
+
+Lexer::Lexer(const lexer::Input& input)
+    : m_input{input} {
+
+    // Fill the one-token look-ahead buffer.
+    consume_token();
+}
+
+bool Lexer::for_each_token(const TokenProcessor& process) {
+    for (auto token = peek_token(); token.has_value(); drop_token(), token = peek_token()) {
+        if (!process(*token)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+std::vector<Lexer::ParsedToken> Lexer::get_tokens() {
+    std::vector<ParsedToken> tokens;
+    for_each_token([&tokens] (const ParsedToken& token) {
+        tokens.emplace_back(token);
+        return true;
+    });
+    return tokens;
+}
+
+void Lexer::drop_token() {
+    if (!has_token()) {
+        throw LexerError{"internal: no tokens to drop"};
+    }
+    m_token_buffer = {};
+    consume_token();
+}
+
+std::optional<Lexer::ParsedToken> Lexer::drop_token_of_type(Type type) {
+    if (!has_token()) {
+        throw LexerError{"internal: no tokens to drop"};
+    }
+    if (m_token_buffer.value().get_type() != type) {
+        return {};
+    }
+    const auto result = m_token_buffer;
+    drop_token();
+    return result;
+}
+
+void Lexer::consume_whitespace() {
+    const auto ws = parse_whitespace();
+    if (!ws.has_value()) {
+        return;
+    }
+    m_input.consume(ws->get_length());
+}
+
+void Lexer::consume_token() {
+    if (m_input.empty()) {
+        return;
+    }
+    consume_whitespace();
+    if (m_input.empty()) {
+        return;
+    }
+    auto token{parse_token()};
+    m_input.consume(token.get_length());
+    m_token_buffer = std::move(token);
+}
+
+std::optional<Lexer::ParsedToken> Lexer::parse_whitespace() const {
+    const auto token_view = lexer::details::parse_whitespace(m_input.get_input());
+    if (token_view.empty()) {
+        return {};
+    }
+    return ParsedToken{Token{Token::Type::WHITESPACE}, m_input.get_pos(), token_view};
+}
+
+std::optional<Lexer::ParsedToken> Lexer::parse_const_token() const {
+    std::string_view token_view;
+    const auto type = lexer::parse_const_token(m_input.get_input(), token_view);
+    if (!type.has_value()) {
+        return {};
+    }
+    return ParsedToken{Token{*type}, m_input.get_pos(), token_view};
+}
+
+std::optional<Lexer::ParsedToken> Lexer::parse_number() const {
+    std::string_view token_view;
+    const auto number = lexer::parse_number(m_input.get_input(), token_view);
+    if (!number.has_value()) {
+        return {};
+    }
+    return ParsedToken{Token{*number}, m_input.get_pos(), token_view};
+}
+
+Lexer::ParsedToken Lexer::parse_token() const {
+    if (const auto const_token = parse_const_token(); const_token.has_value()) {
+        return *const_token;
+    }
+    if (const auto number = parse_number(); number.has_value()) {
+        return *number;
+    }
+    throw LexerError{"invalid input at: " + std::string{m_input.get_input()}};
+}
+
+}
diff --git a/server/lexer/lexer.hpp b/server/lexer/lexer.hpp
new file mode 100644
index 0000000..d08a2df
--- /dev/null
+++ b/server/lexer/lexer.hpp
@@ -0,0 +1,79 @@
+#pragma once
+
+#include "input.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <functional>
+#include <optional>
+#include <string_view>
+#include <vector>
+
+namespace math::server {
+namespace lexer::details {
+
+// Exposed for testing:
+std::string_view parse_whitespace(const std::string_view&);
+std::optional<double> parse_number(const std::string_view&);
+std::optional<token::Type> parse_const_token(const std::string_view&);
+
+}
+
+/// Splits the input into tokens, keeping a one-token look-ahead buffer.
+///
+/// Whitespace between tokens is skipped. The lexer only keeps views into the
+/// input, so the input buffer must outlive the lexer and the tokens it returns.
+class Lexer {
+public:
+    /// Both constructors read the first token right away, so they throw
+    /// LexerError if the input starts with something that isn't a token.
+    explicit Lexer(const std::string_view& input);
+    explicit Lexer(const lexer::Input& input);
+
+    using Token = lexer::Token;
+    using ParsedToken = lexer::ParsedToken;
+    using Type = Token::Type;
+    /// Callback for for_each_token(); return false to stop the iteration.
+    using TokenProcessor = std::function<bool (const ParsedToken&)>;
+
+    /// Feeds the remaining tokens to `process` one by one, dropping each token
+    /// that `process` accepted.
+    /// @return false if `process` returned false, true if the input ran out.
+    bool for_each_token(const TokenProcessor& process);
+
+    /// Consumes and returns all the remaining tokens.
+    std::vector<ParsedToken> get_tokens();
+
+    /// True if there's a buffered token to peek at.
+    bool has_token() const {
+        return peek_token().has_value();
+    }
+
+    /// The buffered token (if any); doesn't consume it.
+    std::optional<ParsedToken> peek_token() const {
+        return m_token_buffer;
+    }
+
+    /// Discards the buffered token and reads the next one.
+    /// @throws LexerError if there's no buffered token.
+    void drop_token();
+    /// Like drop_token(), but only if the buffered token is of type `type`.
+    /// @return the dropped token, or nothing if the type didn't match.
+    /// @throws LexerError if there's no buffered token.
+    std::optional<ParsedToken> drop_token_of_type(Type type);
+
+private:
+    std::optional<ParsedToken> parse_whitespace() const;
+    std::optional<ParsedToken> parse_const_token() const;
+    std::optional<ParsedToken> parse_number() const;
+
+    ParsedToken parse_token() const;
+
+    void consume_whitespace();
+    void consume_token();
+
+    lexer::Input m_input;
+    std::optional<ParsedToken> m_token_buffer;
+};
+
+}
diff --git a/server/lexer/token.cpp b/server/lexer/token.cpp
new file mode 100644
index 0000000..6ffb721
--- /dev/null
+++ b/server/lexer/token.cpp
@@ -0,0 +1,72 @@
+#include "error.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <cmath>
+
+#include <limits>
+#include <ostream>
+#include <variant>
+
+namespace math::server::lexer {
+namespace {
+
+// Equality for token values: unlike operator==, two NaNs compare equal.
+bool numbers_equal(double x, double y) {
+    if (std::isnan(x) && std::isnan(y)) {
+        return true;
+    }
+    return x == y;
+}
+
+}
+
+Token::Token(Type type)
+    : m_type{type} {
+
+    // Types that carry a value must be built from that value instead.
+    if (token::token_has_value(type)) {
+        throw LexerError{"internal: must have a value: " + token::type_to_int_string(type)};
+    }
+}
+
+Token::Token(double value)
+    : m_type{Type::NUMBER}, m_value{value}
+{ }
+
+bool Token::operator==(const Token& other) const {
+    if (m_type != other.m_type) {
+        return false;
+    }
+    if (!token::token_has_value(m_type)) {
+        // Value-less tokens (including whitespace) are fully described by
+        // their type.
+        return true;
+    }
+    if (m_type == Type::NUMBER) {
+        return numbers_equal(as_number(), other.as_number());
+    }
+    throw LexerError{"internal: can't compare tokens of type: " + token::type_to_int_string(m_type)};
+}
+
+double Token::as_number() const {
+    const auto type = get_type();
+    if (type != Type::NUMBER) {
+        throw LexerError{"internal: not a number: " + token::type_to_int_string(type)};
+    }
+    return std::get<double>(m_value);
+}
+
+std::ostream& operator<<(std::ostream& os, const Token& token) {
+    switch (token.m_type) {
+        case token::Type::NUMBER:
+            os << token.as_number();
+            break;
+        default:
+            os << token::type_to_string(token.m_type);
+            break;
+    }
+    return os;
+}
+
+}
diff --git a/server/lexer/token.hpp b/server/lexer/token.hpp
new file mode 100644
index 0000000..6f98383
--- /dev/null
+++ b/server/lexer/token.hpp
@@ -0,0 +1,63 @@
+#pragma once
+
+#include "token_type.hpp"
+
+#include <cstddef>
+
+#include <ostream>
+#include <string_view>
+#include <utility>
+#include <variant>
+
+namespace math::server::lexer {
+
+/// A token: its type plus, for numbers, the numeric value.
+class Token {
+public:
+    using Type = token::Type;
+
+    /// Makes a value-less token.
+    /// @throws LexerError if tokens of this type must carry a value.
+    explicit Token(token::Type type);
+    /// Makes a NUMBER token.
+    explicit Token(double value);
+
+    /// Tokens are equal if their types and values (if any) are; NaN numbers
+    /// are considered equal to each other.
+    bool operator==(const Token& other) const;
+    bool operator!=(const Token& other) const { return !(*this == other); }
+
+    Type get_type() const { return m_type; }
+
+    /// @throws LexerError if this isn't a NUMBER token.
+    double as_number() const;
+
+private:
+    token::Type m_type;
+    std::variant<double> m_value;
+
+    friend std::ostream& operator<<(std::ostream&, const Token&);
+};
+
+/// A token together with where it was found in the input.
+///
+/// Keeps a non-owning view of the matched characters, so the input buffer
+/// must outlive the token.
+class ParsedToken : public Token {
+public:
+    ParsedToken(Token&& token, std::size_t pos, const std::string_view& view)
+        : Token{std::move(token)}, m_pos{pos}, m_view{view} {
+    }
+
+    /// Position of the token in the input, as given to the constructor.
+    std::size_t get_pos() const { return m_pos; }
+
+    /// Number of input characters the token spans.
+    std::size_t get_length() const { return m_view.length(); }
+
+private:
+    std::size_t m_pos;
+    std::string_view m_view;
+};
+
+}
diff --git a/server/lexer/token_type.cpp b/server/lexer/token_type.cpp
new file mode 100644
index 0000000..9a69ba1
--- /dev/null
+++ b/server/lexer/token_type.cpp
@@ -0,0 +1,169 @@
+#include "error.hpp"
+#include "token_type.hpp"
+
+#include <functional>
+#include <map>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace math::server::lexer::token {
+namespace {
+
+using ToStringMap = std::unordered_map<Type, std::string>;
+using FromStringMap = std::map<std::string, Type, std::greater<std::string>>;
+
+// Owns access to the type -> string table and validates it on construction.
+class ToStringConverter {
+public:
+    ToStringConverter() : m_map{to_string_map()} {
+        validate();
+    }
+
+    const ToStringMap& map() const { return m_map; }
+
+private:
+    static const ToStringMap& to_string_map() {
+        static const ToStringMap map{
+            {Type::WHITESPACE, "whitespace"},
+            {Type::PLUS, "+"},
+            {Type::MINUS, "-"},
+            {Type::ASTERISK, "*"},
+            {Type::SLASH, "/"},
+            {Type::LEFT_PAREN, "("},
+            {Type::RIGHT_PAREN, ")"},
+            {Type::NUMBER, "number"},
+        };
+        return map;
+    }
+
+    void validate() const {
+        check_for_duplicates();
+    }
+
+    // Two types sharing a string would make type_from_string() ambiguous.
+    void check_for_duplicates() const {
+        std::unordered_set<std::string> strings;
+        for (const auto& [type, str] : m_map) {
+            const auto [_, inserted] = strings.emplace(str);
+            if (!inserted) {
+                throw std::logic_error{"multiple tokens have the same string representation: " + str};
+            }
+        }
+    }
+
+    const ToStringMap& m_map;
+};
+
+const ToStringMap& to_string_map() {
+    static const ToStringConverter converter;
+    return converter.map();
+}
+
+// Builds the inverse (string -> type) table.
+class FromStringConverter {
+public:
+    FromStringConverter(const ToStringMap& to_string)
+        : m_map{build_map(to_string)} {
+    }
+
+    const FromStringMap& map() const { return m_map; }
+
+private:
+    static FromStringMap build_map(const ToStringMap& to_string) {
+        FromStringMap from_string;
+        for (const auto& [type, str] : to_string) {
+            const auto [_, inserted] = from_string.emplace(str, type);
+            if (!inserted) {
+                throw std::logic_error{"multiple tokens have the same string representation: " + str};
+            }
+        }
+        return from_string;
+    }
+
+    FromStringMap m_map;
+};
+
+const FromStringMap& from_string_map() {
+    static const FromStringConverter converter{to_string_map()};
+    return converter.map();
+}
+
+// Collects the types for which is_const_token() is true.
+class ConstTokens {
+public:
+    ConstTokens() {
+        const auto& map = to_string_map();
+        for (const auto& [type, _] : map) {
+            if (is_const_token(type)) {
+                m_set.emplace(type);
+            }
+        }
+    }
+
+    const TypeSet& set() const { return m_set; }
+
+private:
+    TypeSet m_set;
+};
+
+}
+
+TypeInt type_to_int(Type type) {
+    return static_cast<TypeInt>(type);
+}
+
+std::string type_to_int_string(Type type) {
+    return std::to_string(type_to_int(type));
+}
+
+bool is_const_token(Type type) {
+    switch (type) {
+        case Type::WHITESPACE:
+        case Type::NUMBER:
+            return false;
+        default:
+            return true;
+    }
+}
+
+const TypeSet& const_tokens() {
+    static const ConstTokens tokens;
+    return tokens.set();
+}
+
+bool token_has_value(Type type) {
+    switch (type) {
+        case Type::NUMBER:
+            return true;
+        default:
+            return false;
+    }
+}
+
+std::string type_to_string(Type type) {
+    const auto& map = to_string_map();
+    const auto it = map.find(type);
+    if (it == map.cend()) {
+        throw LexerError{"type_to_string: unsupported token type: " + type_to_int_string(type)};
+    }
+    return it->second;
+}
+
+Type type_from_string(const std::string& src) {
+    const auto& map = from_string_map();
+    const auto it = map.find(src);
+    if (it == map.cend()) {
+        throw LexerError{"type_from_string: unsupported token: " + src};
+    }
+    return it->second;
+}
+
+std::ostream& operator<<(std::ostream& os, const Type& type) {
+    os << type_to_int(type);
+    return os;
+}
+
+}
diff --git a/server/lexer/token_type.hpp b/server/lexer/token_type.hpp
new file mode 100644
index 0000000..9489915
--- /dev/null
+++ b/server/lexer/token_type.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <unordered_set>
+
+namespace math::server::lexer::token {
+
+enum class Type {
+    WHITESPACE,
+    PLUS,
+    MINUS,
+    ASTERISK,
+    SLASH,
+    LEFT_PAREN,
+    RIGHT_PAREN,
+    NUMBER,
+};
+
+using TypeInt = std::underlying_type<Type>::type;
+using TypeSet = std::unordered_set<Type>;
+
+/// The numeric value of `Type`, as an integer and as a decimal string.
+TypeInt type_to_int(Type);
+std::string type_to_int_string(Type);
+
+/// Const tokens are the ones that always look the same in the input
+/// (operators and parentheses, as opposed to whitespace and numbers).
+bool is_const_token(Type);
+const TypeSet& const_tokens();
+
+/// True for the types whose tokens carry a value (only NUMBER for now).
+bool token_has_value(Type);
+
+/// Conversion to/from the string representation of a type.
+/// @throws LexerError if the type/string is not a known one.
+std::string type_to_string(Type);
+Type type_from_string(const std::string&);
+
+/// Prints the numeric value of `type`.
+std::ostream& operator<<(std::ostream&, const Type&);
+
+}