From ffc3e3423897d77b5e43af8ed567b544f00cf526 Mon Sep 17 00:00:00 2001 From: Egor Tensin Date: Wed, 6 May 2015 06:00:26 +0300 Subject: initial commit --- src/lexer.py | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nodes.py | 144 +++++++++++++++++++++++++++++++++++++++++ src/parser.py | 196 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/tokens.py | 105 ++++++++++++++++++++++++++++++ 4 files changed, 647 insertions(+) create mode 100644 src/lexer.py create mode 100644 src/nodes.py create mode 100644 src/parser.py create mode 100644 src/tokens.py (limited to 'src') diff --git a/src/lexer.py b/src/lexer.py new file mode 100644 index 0000000..e4c26ce --- /dev/null +++ b/src/lexer.py @@ -0,0 +1,202 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +import re + +from tokens import * + +class LexerError(RuntimeError): + pass + +class Lexer: + def __init__(self, src_file): + self._line_buf = '' + self._tok_buf = [] + self._src_file = src_file + self._ws_re = re.compile(r'^\s+') + self._identifier_re = re.compile(r'^[^\W\d]\w*') + self._const_toks = { + '+': AdditionOpToken, + '-': SubtractionOpToken, + '*': MultiplicationOpToken, + '/': DivisionOpToken, + ':=': AssignmentOpToken, + ';': SemicolonToken, + '(': OpeningParenToken, + ')': ClosingParenToken, + '{': OpeningBraceToken, + '}': ClosingBraceToken, + '&&': AndOpToken, + '||': OrOpToken, + '==': EqualsOpToken, + '!=': NotEqualsOpToken, + } + self._const_toks_sorted = list(self._const_toks.keys()) + self._const_toks_sorted.sort() + self._make_sure_const_tokens_dont_match_identifier_re() + self._keywords = { + 'if': IfToken, + 'print': PrintToken, + 'True': TrueToken, + 'False': FalseToken, + } + self._make_sure_keywords_match_identifier_re() + + def _make_sure_keywords_match_identifier_re(self): + for kw in self._keywords: + if not re.match(self._identifier_re, kw): + raise LexerError("keyword '%s' is not an identifier" % (kw)) + + def _make_sure_const_tokens_dont_match_identifier_re(self): + for t in self._const_toks_sorted: + if re.match(self._identifier_re, t): + raise LexerError("const token '%s' is an identifier" % (t)) + + def _try_require_line_buf(self): + if self._line_buf: + return True + if self._src_file is None: + return False + self._line_buf = self._src_file.readline() + if not self._line_buf: + self._src_file = None + return False + return True + + def _eat_number_after_e_sign(self, acc): + m_after_e_sign = re.match(r'^[\-+]?\d+', self._line_buf) + if m_after_e_sign: + after_e_sign = m_after_e_sign.group(0) + self._line_buf = self._line_buf[len(after_e_sign):] + return FloatingPointNumberToken('%s%s' % (acc, after_e_sign)) + raise LexerError("'%c' unexpected" % (self._line_buf[0])) + + def _eat_number_after_dec_mark(self, acc): + if not self._line_buf: + return FloatingPointNumberToken(acc) + m_digits = re.match(r'^\d+', self._line_buf) + if m_digits: + digits = m_digits.group(0) + self._line_buf = self._line_buf[len(digits):] + return self._eat_number_after_dec_mark('%s%s' % (acc, digits)) + m_e_sign = re.match(r'^[eE]', self._line_buf) + if m_e_sign: + e_sign = m_e_sign.group(0) + self._line_buf = self._line_buf[len(e_sign):] + return self._eat_number_after_e_sign('%s%s' % (acc, e_sign)) + return FloatingPointNumberToken(acc) + + def _eat_number_after_first_digit(self, acc): + if not self._line_buf: + return IntegerNumberToken(acc) + m_digits = re.match(r'^\d+', self._line_buf) + if m_digits: + digits = m_digits.group(0) + self._line_buf = self._line_buf[len(digits):] + return self._eat_number_after_first_digit('%s%s' % (acc, digits)) + m_e_sign = re.match(r'^[eE]', self._line_buf) + if m_e_sign: + e_sign = m_e_sign.group(0) + self._line_buf = self._line_buf[len(e_sign):] + return self._eat_number_after_e_sign('%s%s' % (acc, e_sign)) + m_dec_mark = re.match(r'^\.', self._line_buf) + if m_dec_mark: + dec_mark = m_dec_mark.group(0) + self._line_buf = self._line_buf[len(dec_mark):] + return self._eat_number_after_dec_mark('%s%s' % (acc, dec_mark)) + return IntegerNumberToken(acc) + + def _try_eat_number(self): + m_first_digit = re.match(r'^\d', self._line_buf) + if m_first_digit: + first_digit = m_first_digit.group(0) + self._line_buf = self._line_buf[len(first_digit):] + return self._eat_number_after_first_digit(first_digit) + m_dec_mark = re.match(r'^\.\d', self._line_buf) + if m_dec_mark: + dec_mark = m_dec_mark.group(0) + self._line_buf = self._line_buf[len(dec_mark):] + return self._eat_number_after_dec_mark(dec_mark) + return False + + def _try_eat_ws(self): + while True: + if not self._try_require_line_buf(): + return False + m_ws = re.match(self._ws_re, self._line_buf) + if not m_ws: + return True + self._line_buf = self._line_buf[len(m_ws.group(0)):] + return True + + def _try_eat_identifier_or_keyword(self): + m_identifier = re.match(self._identifier_re, self._line_buf) + if m_identifier: + identifier = m_identifier.group(0) + self._line_buf = self._line_buf[len(identifier):] + if identifier in self._keywords: + return self._keywords[identifier]() + else: + return IdentifierToken(identifier) + return False + + def _try_eat_const_token(self): + for t in self._const_toks_sorted: + if self._line_buf.startswith(t): + self._line_buf = self._line_buf[len(t):] + return self._const_toks[t]() + return False + + def _try_eat_token(self): + if not self._try_eat_ws(): + return False + const_tok = self._try_eat_const_token() + if const_tok: + self._tok_buf.append(const_tok) + return const_tok + identifier_or_keyword = self._try_eat_identifier_or_keyword() + if identifier_or_keyword: + self._tok_buf.append(identifier_or_keyword) + return identifier_or_keyword + number = self._try_eat_number() + if number: + self._tok_buf.append(number) + return number + raise LexerError("'%c' unexpected" % (self._line_buf[0])) + + def _try_require_tok_buf(self, n = 1): + if n < 1: + raise LexerError("unable to require %d tokens" % (n)) + while len(self._tok_buf) < n: + if not self._try_eat_token(): + return False + return True + + def has_next_token(self, n = 1): + return self._try_require_tok_buf(n) + + def preview_next_token(self, n = 0): + if not self.has_next_token(n + 1): + raise LexerError("not enough tokens") + return self._tok_buf[n] + + def drop_next_token(self, n = 1): + if not self.has_next_token(n): + raise LexerError("not enough tokens") + if n == 1: + return self._tok_buf.pop(0) + else: + xs = self._tok_buf[:n] + self._tok_buf = self._tok_buf[n:] + return xs + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('src_path', help='set soure file path') + args = parser.parse_args() + with open(args.src_path, 'r') as src_file: + lexer = Lexer(src_file) + while lexer.has_next_token(): + print(lexer.drop_next_token().__class__.__name__) diff --git a/src/nodes.py b/src/nodes.py new file mode 100644 index 0000000..ea9517d --- /dev/null +++ b/src/nodes.py @@ -0,0 +1,144 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +class ProgramNode: + def __init__(self, stmt_list): + self._stmt_list = stmt_list + + def execute(self): + for stmt in self._stmt_list: + stmt.execute() + +_varmap = { } + +class CompoundStatementNode: + def __init__(self, stmt_list): + self._stmt_list = stmt_list + + def execute(self): + for stmt in self._stmt_list: + stmt.execute() + +class EmptyStatementNode: + def execute(self): + pass + +class AssignmentNode: + def __init__(self, identifier, arithm_expr): + self._identifier = identifier + self._arithm_expr = arithm_expr + + def execute(self): + _varmap[str(self._identifier)] = self._arithm_expr.execute() + return None + +class PrintStatementNode: + def __init__(self, arithm_expr): + self._arithm_expr = arithm_expr + + def execute(self): + print(self._arithm_expr.execute()) + return None + +class IdentifierNode: + def __init__(self, identifier): + self._identifier = identifier + + def execute(self): + return _varmap[str(self._identifier)] + +class AdditionOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() + self._right.execute() + +class SubtractionOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() + self._right.execute() + +class MultiplicationOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() * self._right.execute() + +class DivisionOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() / self._right.execute() + +class IntegerNumberNode: + def __init__(self, n): + self._n = n + + def execute(self): + return int(self._n) + +class FloatingPointNumberNode: + def __init__(self, n): + self._n = n + + def execute(self): + return float(self._n) + +class IfStatementNode: + def __init__(self, cond, body): + self._cond = cond + self._body = body + + def execute(self): + if self._cond.execute(): + return self._body.execute() + +class TrueNode: + def execute(self): + return True + +class FalseNode: + def execute(self): + return False + +class AndOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() and self._right.execute() + +class OrOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() or self._right.execute() + +class EqualsOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() == self._right.execute() + +class NotEqualsOpNode: + def __init__(self, left, right): + self._left = left + self._right = right + + def execute(self): + return self._left.execute() != self._right.execute() diff --git a/src/parser.py b/src/parser.py new file mode 100644 index 0000000..5ee5207 --- /dev/null +++ b/src/parser.py @@ -0,0 +1,196 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +from lexer import * +from nodes import * + +class ParserError(RuntimeError): + pass + +class Parser: + def __init__(self, src_file): + self._lexer = Lexer(src_file) + + def parse(self): + return self._parse_program() + + def _try_parse_token(self, cls): + if not self._lexer.has_next_token(): + return False + t = self._lexer.preview_next_token() + if not isinstance(t, cls): + return False + self._lexer.drop_next_token() + return t + + def _parse_token(self, cls): + if not self._lexer.has_next_token(): + raise ParserError("%s expected" % cls.__name__) + t = self._lexer.preview_next_token() + if not isinstance(t, cls): + raise ParserError("%s expected instead of %s" % ( + cls.__name__, t.__class__.__name__)) + self._lexer.drop_next_token() + return t + + def _parse_program(self): + stmt_list = [] + while self._lexer.has_next_token(): + stmt = self._try_parse_stmt() + if not stmt: + raise ParserError("unexpected token '%s'" % ( + self._lexer.preview_next_token())) + stmt_list.append(stmt) + return ProgramNode(stmt_list) + + def _try_parse_stmt(self): + print_stmt = self._try_parse_print_stmt() + if print_stmt: + return print_stmt + assignment = self._try_parse_assignment() + if assignment: + return assignment + empty_stmt = self._try_parse_empty_stmt() + if empty_stmt: + return empty_stmt + if_stmt = self._try_parse_if_stmt() + if if_stmt: + return if_stmt + block = self._try_parse_block() + if block: + return block + return False + + def _try_parse_empty_stmt(self): + if not self._try_parse_token(SemicolonToken): + return False + return EmptyStatementNode() + + def _try_parse_block(self): + if not self._try_parse_token(OpeningBraceToken): + return False + stmt_list = [] + while True: + stmt = self._try_parse_stmt() + if stmt: + stmt_list.append(stmt) + else: + break + self._parse_token(ClosingBraceToken) + return CompoundStatementNode(stmt_list) + + def _try_parse_if_stmt(self): + if not self._try_parse_token(IfToken): + return False + self._parse_token(OpeningParenToken) + cond = self._parse_logical_expr() + self._parse_token(ClosingParenToken) + stmt = self._try_parse_stmt() + if not stmt: + raise ParserError("unexpected token '%s'" % ( + self._lexer.preview_next_token())) + return IfStatementNode(cond, stmt) + + def _try_parse_print_stmt(self): + if not self._try_parse_token(PrintToken): + return False + arithm_expr = self._parse_arithm_expr() + self._parse_token(SemicolonToken) + return PrintStatementNode(arithm_expr) + + def _try_parse_assignment(self): + if not self._lexer.has_next_token(2): + return False + identifier = self._lexer.preview_next_token(0) + if not isinstance(identifier, IdentifierToken): + return False + op = self._lexer.preview_next_token(1) + if not isinstance(op, AssignmentOpToken): + return False + self._lexer.drop_next_token(2) + arithm_expr = self._parse_arithm_expr() + self._parse_token(SemicolonToken) + return AssignmentNode(identifier, arithm_expr) + + def _parse_logical_expr(self): + left = self._parse_logical_term() + while self._lexer.has_next_token(): + if self._try_parse_token(AndOpToken): + left = AndOpNode(left, self._parse_logical_term()) + elif self._try_parse_token(OrOpToken): + left = OrOpNode(left, self._parse_logical_term()) + else: + return left + return left + + def _parse_logical_term(self): + left = self._parse_logical_factor() + if self._lexer.has_next_token(): + if self._try_parse_token(EqualsOpToken): + left = EqualsOpNode(left, self._parse_logical_factor()) + elif self._try_parse_token(NotEqualsOpToken): + left = NotEqualsOpNode(left, self._parse_logical_factor()) + else: + return left + return left + + def _parse_logical_factor(self): + if self._try_parse_token(TrueToken): + return TrueNode() + elif self._try_parse_token(FalseToken): + return FalseNode() + elif self._try_parse_token(OpeningParenToken): + logical_expr = self._parse_logical_expr() + self._parse_token(ClosingParenToken) + return logical_expr + else: + raise ParserError('expected \'True\', \'False\' or \'(\'') + + def _parse_arithm_expr(self): + left = self._parse_arithm_term() + while self._lexer.has_next_token(): + if self._try_parse_token(AdditionOpToken): + left = AdditionOpNode(left, self._parse_arithm_term()) + elif self._try_parse_token(SubtractionOpToken): + left = SubtractionOpNode(left, self._parse_arithm_term()) + else: + return left + return left + + def _parse_arithm_term(self): + left = self._parse_arithm_factor() + while self._lexer.has_next_token(): + if self._try_parse_token(MultiplicationOpToken): + left = MultiplicationOpNode(left, self._parse_arithm_factor()) + elif self._try_parse_token(DivisionOpToken): + left = DivisionOpNode(left, self._parse_arithm_factor()) + else: + return left + return left + + def _parse_arithm_factor(self): + identifier = self._try_parse_token(IdentifierToken) + if identifier: + return IdentifierNode(identifier) + int_num = self._try_parse_token(IntegerNumberToken) + if int_num: + return IntegerNumberNode(int_num) + float_num = self._try_parse_token(FloatingPointNumberToken) + if float_num: + return FloatingPointNumberNode(float_num) + if self._try_parse_token(OpeningParenToken): + arithm_expr = self._parse_arithm_expr() + self._parse_token(ClosingParenToken) + return arithm_expr + else: + raise ParserError('expected an identifier, a number or \'(\'') + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('src_path', help='set source file path') + args = parser.parse_args() + with open(args.src_path, 'r') as src_file: + parser = Parser(src_file) + parser.parse().execute() diff --git a/src/tokens.py b/src/tokens.py new file mode 100644 index 0000000..3c04668 --- /dev/null +++ b/src/tokens.py @@ -0,0 +1,105 @@ +# Copyright 2015 Egor Tensin +# This file is licensed under the terms of the MIT License. +# See LICENSE.txt for details. + +class Token: + pass + +class AdditionOpToken(Token): + def __str__(self): + return '+' + +class SubtractionOpToken(Token): + def __str__(self): + return '-' + +class MultiplicationOpToken(Token): + def __str__(self): + return '*' + +class DivisionOpToken(Token): + def __str__(self): + return '/' + +class AssignmentOpToken(Token): + def __str__(self): + return ':=' + +class SemicolonToken(Token): + def __str__(self): + return ';' + +class PrintToken(Token): + def __str__(self): + return 'print' + +class OpeningParenToken(Token): + def __str__(self): + return '(' + +class ClosingParenToken(Token): + def __str__(self): + return ')' + +class OpeningBraceToken(Token): + def __str__(self): + return '{' + +class ClosingBraceToken(Token): + def __str__(self): + return '}' + +class IdentifierToken(Token): + def __init__(self, i): + self._i = i + + def __str__(self): + return str(self._i) + +class FloatingPointNumberToken(Token): + def __init__(self, n): + self._n = n + + def __float__(self): + return float(self._n) + + def __str__(self): + return str(self._n) + +class IntegerNumberToken(Token): + def __init__(self, n): + self._n = n + + def __int__(self): + return int(self._n) + + def __str__(self): + return str(self._n) + +class TrueToken(Token): + def __str__(self): + return 'True' + +class FalseToken(Token): + def __str__(self): + return 'False' + +class AndOpToken(Token): + def __str__(self): + return '&&' + +class OrOpToken(Token): + def __str__(self): + return '||' + +class IfToken(Token): + def __str__(self): + return 'if' + +class EqualsOpToken(Token): + def __str__(self): + return '==' + +class NotEqualsOpToken(Token): + def __str__(self): + return '!=' -- cgit v1.2.3