aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/server/lexer
diff options
context:
space:
mode:
author Egor Tensin <Egor.Tensin@gmail.com> 2019-12-07 03:36:21 +0300
committer Egor Tensin <Egor.Tensin@gmail.com> 2019-12-07 03:36:21 +0300
commit 00863566ec4601c65c435b74e575d49546a1c707 (patch)
tree 479a0a6e96aba8191c7a65ea9bee2f4d5e3a4aba /server/lexer
parent add stress_test.py (diff)
download math-server-00863566ec4601c65c435b74e575d49546a1c707.tar.gz
math-server-00863566ec4601c65c435b74e575d49546a1c707.zip
split server into multiple components
In a vague attempt to make header files more readable, split server/ into a number of components. Also, refactor the unit tests to use the "Data-driven test cases" of Boost.Test.
Diffstat (limited to 'server/lexer')
-rw-r--r-- server/lexer/CMakeLists.txt 2
-rw-r--r-- server/lexer/error.hpp 16
-rw-r--r-- server/lexer/input.hpp 42
-rw-r--r-- server/lexer/lexer.cpp 206
-rw-r--r-- server/lexer/lexer.hpp 61
-rw-r--r-- server/lexer/token.cpp 71
-rw-r--r-- server/lexer/token.hpp 49
-rw-r--r-- server/lexer/token_type.cpp 164
-rw-r--r-- server/lexer/token_type.hpp 37
9 files changed, 648 insertions, 0 deletions
diff --git a/server/lexer/CMakeLists.txt b/server/lexer/CMakeLists.txt
new file mode 100644
index 0000000..65523a7
--- /dev/null
+++ b/server/lexer/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Library target for the lexer component, built from its three sources.
+add_library(lexer lexer.cpp token.cpp token_type.cpp)
+# PUBLIC: targets linking against lexer also link against common.
+target_link_libraries(lexer PUBLIC common)
diff --git a/server/lexer/error.hpp b/server/lexer/error.hpp
new file mode 100644
index 0000000..99944c7
--- /dev/null
+++ b/server/lexer/error.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../common/error.hpp"
+
+#include <string>
+
+namespace math::server {
+
+class LexerError : public Error {
+public:
+ explicit LexerError(const std::string &what)
+ : Error{"lexer error: " + what}
+ { }
+};
+
+}
diff --git a/server/lexer/input.hpp b/server/lexer/input.hpp
new file mode 100644
index 0000000..1104a4b
--- /dev/null
+++ b/server/lexer/input.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include "error.hpp"
+
+#include <cstddef>
+
+#include <string_view>
+
+namespace math::server::lexer {
+
+// Cursor over the text being tokenized: a non-owning view of the part that
+// hasn't been consumed yet, plus the number of characters consumed so far.
+class Input {
+public:
+    explicit Input(const std::string_view& input)
+        : m_pos{0}, m_input{input}
+    { }
+
+    // The part of the input that hasn't been consumed yet.
+    const std::string_view& get_input() const { return m_input; }
+
+    // Number of characters consumed so far.
+    std::size_t get_pos() const { return m_pos; }
+
+    // Number of characters left to consume.
+    std::size_t get_length() const { return m_input.size(); }
+
+    bool empty() const { return get_length() == 0; }
+
+    // Moves the cursor forward by len characters.
+    // Throws LexerError if fewer than len characters are left.
+    void consume(std::size_t len) {
+        if (len > m_input.size()) {
+            throw LexerError{"internal: not enough input to consume"};
+        }
+        m_input.remove_prefix(len);
+        m_pos += len;
+    }
+
+    // Moves the cursor forward by the length of sub; only the length of sub
+    // is used, its contents are not compared against the input.
+    void consume(const std::string_view& sub) {
+        consume(sub.size());
+    }
+
+private:
+    std::size_t m_pos;
+    std::string_view m_input;
+};
+
+}
diff --git a/server/lexer/lexer.cpp b/server/lexer/lexer.cpp
new file mode 100644
index 0000000..c7eea6d
--- /dev/null
+++ b/server/lexer/lexer.cpp
@@ -0,0 +1,206 @@
+#include "error.hpp"
+#include "lexer.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <exception>
+#include <optional>
+#include <regex>
+#include <string_view>
+#include <string>
+#include <vector>
+
+namespace math::server {
+namespace lexer {
+namespace {
+
+// Finds a floating-point literal at the very beginning of input.
+//
+// Returns a view of the matched prefix, or a default-constructed view (null
+// data()) if input doesn't start with a number.
+// Throws LexerError if the number is followed by an exponent marker that has
+// no digits after it.
+std::string_view match_number(const std::string_view& input) {
+    static constexpr std::regex::flag_type flags =
+        std::regex_constants::ECMAScript |
+        std::regex_constants::icase;
+    // This is a hacky attempt to describe a C-like grammar for floating-point
+    // numbers using a regex (the tests seem to pass though).
+    // A proper NFA would be better, I guess.
+    static const std::regex number_regex{R"REGEX(^(?:\d+(?:\.\d*)?|\.\d+)(e[+-]?(\d*))?)REGEX", flags};
+
+    // std::cmatch works on const char*, which string_view iterators are not
+    // guaranteed to be, so search the underlying character range directly.
+    const char* const first = input.data();
+    const char* const last = first + input.size();
+
+    std::cmatch match;
+    if (!std::regex_search(first, last, match, number_regex)) {
+        return {};
+    }
+    {
+        // If we have the numeric part of a number followed by 'e' and no digits,
+        // 1) that 'e' definitely belongs to this number token,
+        // 2) the user forgot to type in the required digits.
+        const auto& exponent = match[1];
+        const auto& abs_power = match[2];
+        if (exponent.matched && abs_power.matched && abs_power.length() == 0) {
+            throw LexerError{"exponent has no digits: " + match[0].str()};
+        }
+    }
+    // length() is a signed difference_type; convert it explicitly, because a
+    // narrowing conversion inside a braced initializer is ill-formed.
+    return std::string_view(match[0].first, static_cast<std::string_view::size_type>(match[0].length()));
+}
+
+// Parses a number at the beginning of input.
+//
+// On success, returns the value and points token at the matched prefix.
+// Returns an empty optional, leaving token untouched, if input doesn't start
+// with a number.
+// Throws LexerError if match_number rejects the input or if std::stod fails
+// to convert the matched text.
+std::optional<double> parse_number(const std::string_view& input, std::string_view& token) {
+    const auto view = match_number(input);
+    if (!view.data()) {
+        return {};
+    }
+    try {
+        // std::stod has no string_view overload, hence the temporary string.
+        const auto result = std::stod(std::string{view});
+        token = view;
+        return result;
+    } catch (const std::exception&) {
+        throw LexerError{"internal: couldn't parse number from: " + std::string{view}};
+    }
+}
+
+// Tells whether b is a prefix of a; an empty b is a prefix of anything.
+bool starts_with(const std::string_view& a, const std::string_view& b) noexcept {
+    if (b.size() > a.size()) {
+        return false;
+    }
+    // The position is 0, which is always in range, so substr can't throw.
+    return a.substr(0, b.size()) == b;
+}
+
+// Looks for a const token (one with a fixed string representation) at the
+// beginning of input.
+//
+// The candidates are tried in the iteration order of token::const_tokens(),
+// and the first one that is a prefix of input wins. On success, returns its
+// type and points token at the matched prefix; otherwise returns an empty
+// optional and leaves token untouched.
+std::optional<token::Type> parse_const_token(const std::string_view& input, std::string_view& token) {
+    for (const auto type : token::const_tokens()) {
+        const auto str = token::type_to_string(type);
+        if (starts_with(input, str)) {
+            // substr doesn't depend on string_view iterators being pointers.
+            token = input.substr(0, str.length());
+            return {type};
+        }
+    }
+    return {};
+}
+
+}
+
+namespace details {
+
+// Wrapper around lexer::parse_number for callers that only need the value;
+// the matched text is discarded.
+std::optional<double> parse_number(const std::string_view& input) {
+    std::string_view ignored;
+    const auto number = lexer::parse_number(input, ignored);
+    return number;
+}
+
+// Wrapper around lexer::parse_const_token for callers that only need the
+// token type; the matched text is discarded.
+std::optional<token::Type> parse_const_token(const std::string_view& input) {
+    std::string_view ignored;
+    const auto type = lexer::parse_const_token(input, ignored);
+    return type;
+}
+
+// Returns a view of the run of whitespace at the beginning of input, or an
+// empty view if input doesn't start with whitespace.
+std::string_view parse_whitespace(const std::string_view& input) {
+    static const std::regex ws_regex{R"(^\s+)"};
+
+    // std::cmatch works on const char*, which string_view iterators are not
+    // guaranteed to be, so search the underlying character range directly.
+    const char* const first = input.data();
+    const char* const last = first + input.size();
+
+    std::cmatch match;
+    if (!std::regex_search(first, last, match, ws_regex)) {
+        return {};
+    }
+    // length() is a signed difference_type; convert it explicitly, because a
+    // narrowing conversion inside a braced initializer is ill-formed.
+    return std::string_view(match[0].first, static_cast<std::string_view::size_type>(match[0].length()));
+}
+
+}
+}
+
+// Wraps the text into a lexer::Input and delegates to the constructor below.
+Lexer::Lexer(const std::string_view& input)
+    : Lexer{lexer::Input{input}}
+{ }
+
+// Copies input and immediately reads the first token (if any) into the
+// buffer, so that peek_token() works right after construction.
+Lexer::Lexer(const lexer::Input& input)
+    : m_input{input}
+{
+    consume_token();
+}
+
+// Feeds the buffered token and every token after it to process, in order.
+//
+// Stops and returns false as soon as process returns false; the token it
+// rejected stays in the buffer. Returns true once there are no tokens left.
+bool Lexer::for_each_token(const TokenProcessor& process) {
+    while (const auto token = peek_token()) {
+        if (!process(*token)) {
+            return false;
+        }
+        drop_token();
+    }
+    return true;
+}
+
+// Drains the lexer: returns the buffered token and all the tokens after it.
+std::vector<Lexer::ParsedToken> Lexer::get_tokens() {
+    std::vector<ParsedToken> collected;
+    const auto collect = [&collected] (const ParsedToken& token) {
+        collected.push_back(token);
+        return true; // never stop early
+    };
+    for_each_token(collect);
+    return collected;
+}
+
+// Discards the buffered token and reads the next one (if any) into the buffer.
+// Throws LexerError if there's no buffered token.
+void Lexer::drop_token() {
+    if (!has_token()) {
+        throw LexerError{"internal: no tokens to drop"};
+    }
+    m_token_buffer.reset();
+    consume_token();
+}
+
+// Drops the buffered token only if it's of the given type.
+//
+// Returns a copy of the dropped token, or an empty optional (leaving the
+// buffer as it is) if the buffered token is of a different type.
+// Throws LexerError if there's no buffered token.
+std::optional<Lexer::ParsedToken> Lexer::drop_token_of_type(Type type) {
+    if (!has_token()) {
+        throw LexerError{"internal: no tokens to drop"};
+    }
+    if (m_token_buffer->get_type() != type) {
+        return std::nullopt;
+    }
+    // Copy first: drop_token() overwrites the buffer.
+    const auto dropped = m_token_buffer;
+    drop_token();
+    return dropped;
+}
+
+// Skips the whitespace at the current position, if there's any.
+void Lexer::consume_whitespace() {
+    if (const auto ws = parse_whitespace(); ws.has_value()) {
+        m_input.consume(ws->get_length());
+    }
+}
+
+// Skips leading whitespace, then parses the next token, consumes its text
+// and stores it in the buffer.
+//
+// Does not touch the buffer if the input is empty or only whitespace is left.
+// Propagates whatever parse_token() throws.
+void Lexer::consume_token() {
+    if (m_input.empty()) {
+        return;
+    }
+    consume_whitespace();
+    if (!m_input.empty()) {
+        auto next = parse_token();
+        m_input.consume(next.get_length());
+        m_token_buffer = std::move(next);
+    }
+}
+
+// Describes the whitespace at the current position as a WHITESPACE token,
+// without consuming it. Returns an empty optional if there's no whitespace.
+std::optional<Lexer::ParsedToken> Lexer::parse_whitespace() const {
+    const auto ws = lexer::details::parse_whitespace(m_input.get_input());
+    if (!ws.empty()) {
+        return ParsedToken{Token{Token::Type::WHITESPACE}, m_input.get_pos(), ws};
+    }
+    return {};
+}
+
+// Tries to parse a const token at the current position, without consuming it.
+// Returns an empty optional if no const token starts there.
+std::optional<Lexer::ParsedToken> Lexer::parse_const_token() const {
+    std::string_view matched;
+    const auto type = lexer::parse_const_token(m_input.get_input(), matched);
+    if (type.has_value()) {
+        return ParsedToken{Token{*type}, m_input.get_pos(), matched};
+    }
+    return {};
+}
+
+// Tries to parse a number at the current position, without consuming it.
+// Returns an empty optional if no number starts there.
+std::optional<Lexer::ParsedToken> Lexer::parse_number() const {
+    std::string_view matched;
+    const auto value = lexer::parse_number(m_input.get_input(), matched);
+    if (value.has_value()) {
+        return ParsedToken{Token{*value}, m_input.get_pos(), matched};
+    }
+    return {};
+}
+
+// Parses the token at the current position, without consuming it: const
+// tokens are tried first, numbers second.
+// Throws LexerError if the input matches neither.
+Lexer::ParsedToken Lexer::parse_token() const {
+    auto parsed = parse_const_token();
+    if (!parsed.has_value()) {
+        parsed = parse_number();
+    }
+    if (!parsed.has_value()) {
+        throw LexerError{"invalid input at: " + std::string{m_input.get_input()}};
+    }
+    return *parsed;
+}
+
+}
diff --git a/server/lexer/lexer.hpp b/server/lexer/lexer.hpp
new file mode 100644
index 0000000..d08a2df
--- /dev/null
+++ b/server/lexer/lexer.hpp
@@ -0,0 +1,61 @@
+#pragma once
+
+#include "input.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <functional>
+#include <optional>
+#include <string_view>
+#include <vector>
+
+namespace math::server {
+namespace lexer::details {
+
+// Exposed for testing. Each function looks at the very beginning of its
+// argument only.
+
+// Returns the leading run of whitespace, or an empty view if there's none.
+std::string_view parse_whitespace(const std::string_view&);
+// Returns the value of the leading number, or an empty optional if the
+// argument doesn't start with a number.
+std::optional<double> parse_number(const std::string_view&);
+// Returns the type of the leading const token, or an empty optional if the
+// argument doesn't start with one.
+std::optional<token::Type> parse_const_token(const std::string_view&);
+
+}
+
+// Splits text into tokens, one at a time.
+//
+// The lexer keeps exactly one token of lookahead in a buffer: the first token
+// is read by the constructor, and each drop_token() call reads the next one.
+class Lexer {
+public:
+    using Token = lexer::Token;
+    using ParsedToken = lexer::ParsedToken;
+    using Type = Token::Type;
+    // Callback for for_each_token(); returning false stops the iteration.
+    using TokenProcessor = std::function<bool (const ParsedToken&)>;
+
+    explicit Lexer(const std::string_view& input);
+    explicit Lexer(const lexer::Input& input);
+
+    // Calls process for every remaining token until it returns false.
+    bool for_each_token(const TokenProcessor& process);
+
+    // Returns all the remaining tokens.
+    std::vector<ParsedToken> get_tokens();
+
+    // Whether there's a buffered token.
+    bool has_token() const {
+        return m_token_buffer.has_value();
+    }
+
+    // Returns a copy of the buffered token, if there's one.
+    std::optional<ParsedToken> peek_token() const {
+        return m_token_buffer;
+    }
+
+    // Discards the buffered token and reads the next one.
+    void drop_token();
+    // Same, but only if the buffered token is of the given type.
+    std::optional<ParsedToken> drop_token_of_type(Type type);
+
+private:
+    // Parsers for the current position; none of them consume any input.
+    std::optional<ParsedToken> parse_whitespace() const;
+    std::optional<ParsedToken> parse_const_token() const;
+    std::optional<ParsedToken> parse_number() const;
+
+    ParsedToken parse_token() const;
+
+    void consume_whitespace();
+    void consume_token();
+
+    lexer::Input m_input;
+    std::optional<ParsedToken> m_token_buffer;
+};
+
+}
diff --git a/server/lexer/token.cpp b/server/lexer/token.cpp
new file mode 100644
index 0000000..6ffb721
--- /dev/null
+++ b/server/lexer/token.cpp
@@ -0,0 +1,71 @@
+#include "error.hpp"
+#include "token.hpp"
+#include "token_type.hpp"
+
+#include <cmath>
+
+#include <limits>
+#include <variant>
+
+namespace math::server::lexer {
+namespace {
+
+// Thin wrapper around std::isnan.
+bool is_nan(double x) { return std::isnan(x); }
+
+// Equality for token values: unlike plain ==, two NaNs are considered equal.
+bool numbers_equal(double x, double y) {
+    if (is_nan(x) && is_nan(y)) {
+        return true;
+    }
+    return x == y;
+}
+
+}
+
+// Creates a token that carries no value.
+// Throws LexerError if tokens of this type must carry one (see
+// token::token_has_value).
+Token::Token(Type type)
+    : m_type{type}
+{
+    if (!token::token_has_value(type)) {
+        return;
+    }
+    throw LexerError{"internal: must have a value: " + token::type_to_int_string(type)};
+}
+
+// Creates a NUMBER token holding value.
+Token::Token(double value)
+    : m_type{Type::NUMBER}
+    , m_value{value}
+{ }
+
+// Tokens are equal if they're of the same type and, for numbers, hold equal
+// values (numbers_equal treats two NaNs as equal).
+// Throws LexerError for two tokens of a type that is neither const nor NUMBER.
+bool Token::operator==(const Token& other) const {
+    if (m_type != other.m_type) {
+        return false;
+    }
+    if (token::is_const_token(m_type)) {
+        // Const tokens carry no value, so the type is all there is to compare.
+        return true;
+    }
+    if (m_type != Type::NUMBER) {
+        throw LexerError{"internal: can't compare tokens of type: " + token::type_to_int_string(m_type)};
+    }
+    return numbers_equal(as_number(), other.as_number());
+}
+
+// Returns the value of a NUMBER token.
+// Throws LexerError for tokens of any other type.
+double Token::as_number() const {
+    if (m_type == Type::NUMBER) {
+        return std::get<double>(m_value);
+    }
+    throw LexerError{"internal: not a number: " + token::type_to_int_string(m_type)};
+}
+
+// Prints the value of a NUMBER token, or the string representation of the
+// type for any other token.
+std::ostream& operator<<(std::ostream& os, const Token& token) {
+    if (token.m_type == Token::Type::NUMBER) {
+        return os << token.as_number();
+    }
+    return os << token::type_to_string(token.m_type);
+}
+
+}
diff --git a/server/lexer/token.hpp b/server/lexer/token.hpp
new file mode 100644
index 0000000..6f98383
--- /dev/null
+++ b/server/lexer/token.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "token_type.hpp"
+
+#include <cstddef>
+
+#include <string_view>
+#include <utility>
+#include <variant>
+
+namespace math::server::lexer {
+
+// A single token: its type plus, for NUMBER tokens, the numeric value.
+class Token {
+public:
+    using Type = token::Type;
+
+    // For tokens that carry no value.
+    explicit Token(Type type);
+    // For NUMBER tokens.
+    explicit Token(double value);
+
+    bool operator==(const Token& other) const;
+    bool operator!=(const Token& other) const { return !operator==(other); }
+
+    Type get_type() const { return m_type; }
+
+    // The value of a NUMBER token.
+    double as_number() const;
+
+private:
+    friend std::ostream& operator<<(std::ostream&, const Token&);
+
+    Type m_type;
+    std::variant<double> m_value;
+};
+
+// A token together with where it was found: a position and a non-owning view
+// of the matched text.
+class ParsedToken : public Token {
+public:
+    ParsedToken(Token&& token, std::size_t pos, const std::string_view& view)
+        : Token{std::move(token)}
+        , m_pos{pos}
+        , m_view{view}
+    { }
+
+    // The position passed to the constructor.
+    std::size_t get_pos() const { return m_pos; }
+
+    // Length of the matched text.
+    std::size_t get_length() const { return m_view.size(); }
+
+private:
+    std::size_t m_pos;
+    std::string_view m_view;
+};
+
+}
diff --git a/server/lexer/token_type.cpp b/server/lexer/token_type.cpp
new file mode 100644
index 0000000..9a69ba1
--- /dev/null
+++ b/server/lexer/token_type.cpp
@@ -0,0 +1,164 @@
+#include "error.hpp"
+#include "token_type.hpp"
+
+#include <functional>
+#include <map>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+namespace math::server::lexer::token {
+namespace {
+
+// Token type -> its string representation.
+using ToStringMap = std::unordered_map<Type, std::string>;
+// String representation -> token type; std::greater keeps the keys in
+// descending order.
+using FromStringMap = std::map<std::string, Type, std::greater<std::string>>;
+
+// Owns access to the token type -> string table and validates the table when
+// constructed: no two types may share a string representation.
+class ToStringConverter {
+public:
+    ToStringConverter() : m_map{to_string_map()} {
+        validate();
+    }
+
+    const ToStringMap& map() const { return m_map; }
+
+private:
+    // The table itself; built on first use.
+    static const ToStringMap& to_string_map() {
+        static const ToStringMap map{
+            {Type::WHITESPACE, "whitespace"},
+            {Type::PLUS, "+"},
+            {Type::MINUS, "-"},
+            {Type::ASTERISK, "*"},
+            {Type::SLASH, "/"},
+            {Type::LEFT_PAREN, "("},
+            {Type::RIGHT_PAREN, ")"},
+            {Type::NUMBER, "number"},
+        };
+        return map;
+    }
+
+    void validate() const {
+        check_for_duplicates();
+    }
+
+    // Throws std::logic_error if two types map to the same string.
+    void check_for_duplicates() const {
+        std::unordered_set<std::string> seen;
+        for (const auto& entry : m_map) {
+            const std::string& str = entry.second;
+            if (!seen.insert(str).second) {
+                throw std::logic_error{"multiple tokens have the same string representation: " + str};
+            }
+        }
+    }
+
+    const ToStringMap& m_map;
+};
+
+// The validated token type -> string table; the converter is created (and
+// the table validated) on the first call.
+const ToStringMap& to_string_map() {
+    static const ToStringConverter instance;
+    const ToStringMap& table = instance.map();
+    return table;
+}
+
+// Holds the reverse table (string -> token type), built from the forward one.
+class FromStringConverter {
+public:
+    // explicit: a ToStringMap shouldn't silently convert to a converter.
+    explicit FromStringConverter(const ToStringMap& to_string)
+        : m_map{build_map(to_string)} {
+    }
+
+    const FromStringMap& map() const { return m_map; }
+
+private:
+    // Inverts to_string.
+    // Throws std::logic_error if two types share a string representation.
+    static FromStringMap build_map(const ToStringMap& to_string) {
+        FromStringMap from_string;
+        for (const auto& [type, str] : to_string) {
+            const auto [_, inserted] = from_string.emplace(str, type);
+            if (!inserted) {
+                throw std::logic_error{"multiple tokens have the same string representation: " + str};
+            }
+        }
+        return from_string;
+    }
+
+    FromStringMap m_map;
+};
+
+// The string -> token type table; built from to_string_map() on the first
+// call.
+const FromStringMap& from_string_map() {
+    static const FromStringConverter instance{to_string_map()};
+    const FromStringMap& table = instance.map();
+    return table;
+}
+
+// The set of token types from the to-string table for which is_const_token()
+// holds.
+class ConstTokens {
+public:
+    ConstTokens() {
+        for (const auto& entry : to_string_map()) {
+            const Type type = entry.first;
+            if (is_const_token(type)) {
+                m_set.insert(type);
+            }
+        }
+    }
+
+    const TypeSet& set() const { return m_set; }
+
+private:
+    TypeSet m_set;
+};
+
+}
+
+// Converts a token type to the value of its underlying integer type.
+TypeInt type_to_int(Type type) {
+    const auto as_int = static_cast<TypeInt>(type);
+    return as_int;
+}
+
+std::string type_to_int_string(Type type) {
+ return std::to_string(type_to_int(type));
+}
+
+// Everything except WHITESPACE and NUMBER is a const token.
+bool is_const_token(Type type) {
+    const bool is_special = type == Type::WHITESPACE || type == Type::NUMBER;
+    return !is_special;
+}
+
+// The set of all const token types; computed on the first call.
+const TypeSet& const_tokens() {
+    static const ConstTokens instance;
+    const TypeSet& types = instance.set();
+    return types;
+}
+
+// NUMBER is the only token type that carries a value.
+bool token_has_value(Type type) {
+    return type == Type::NUMBER;
+}
+
+// Returns the string representation of a token type.
+// Throws LexerError if the type is missing from the to-string table.
+std::string type_to_string(Type type) {
+    const auto& table = to_string_map();
+    if (const auto found = table.find(type); found != table.cend()) {
+        return found->second;
+    }
+    throw LexerError{"type_to_string: unsupported token type: " + type_to_int_string(type)};
+}
+
+// Returns the token type whose string representation is exactly src.
+// Throws LexerError if there's no such type.
+Type type_from_string(const std::string& src) {
+    const auto& map = from_string_map();
+    const auto it = map.find(src);
+    if (it == map.cend()) {
+        // src is already a std::string, there's no need to copy it.
+        throw LexerError{"type_from_string: unsupported token: " + src};
+    }
+    return it->second;
+}
+
+// Prints the integer value of a token type (not its string representation).
+std::ostream& operator<<(std::ostream& os, const Type& type) {
+    return os << type_to_int(type);
+}
+
+}
diff --git a/server/lexer/token_type.hpp b/server/lexer/token_type.hpp
new file mode 100644
index 0000000..9489915
--- /dev/null
+++ b/server/lexer/token_type.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <unordered_set>
+
+namespace math::server::lexer::token {
+
+// All the kinds of tokens the lexer knows about.
+enum class Type {
+ // A run of whitespace; not a const token.
+ WHITESPACE,
+ // "+"
+ PLUS,
+ // "-"
+ MINUS,
+ // "*"
+ ASTERISK,
+ // "/"
+ SLASH,
+ // "("
+ LEFT_PAREN,
+ // ")"
+ RIGHT_PAREN,
+ // A numeric literal; the only type that carries a value.
+ NUMBER,
+};
+
+// The integer type underlying Type.
+using TypeInt = std::underlying_type<Type>::type;
+using TypeSet = std::unordered_set<Type>;
+
+// The integer value of a type, as a number and as a decimal string.
+TypeInt type_to_int(Type);
+std::string type_to_int_string(Type);
+
+// Const tokens are all the types except WHITESPACE and NUMBER.
+bool is_const_token(Type);
+const TypeSet& const_tokens();
+
+// Whether tokens of this type carry a value (true for NUMBER only).
+bool token_has_value(Type);
+
+// Conversions between a type and its string representation; both throw
+// LexerError for an unsupported argument.
+std::string type_to_string(Type);
+Type type_from_string(const std::string&);
+
+// Prints the integer value of a type.
+std::ostream& operator<<(std::ostream&, const Type&);
+
+}