Diffstat (limited to 'src')
 src/lexer.py  | 202
 src/nodes.py  | 144
 src/parser.py | 196
 src/tokens.py | 105
 4 files changed, 647 insertions(+), 0 deletions(-)
diff --git a/src/lexer.py b/src/lexer.py
new file mode 100644
index 0000000..e4c26ce
--- /dev/null
+++ b/src/lexer.py
@@ -0,0 +1,202 @@
+# Copyright 2015 Egor Tensin <Egor.Tensin@gmail.com>
+# This file is licensed under the terms of the MIT License.
+# See LICENSE.txt for details.
+
+import re
+
+from tokens import *
+
+class LexerError(RuntimeError):
+ pass
+
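+# Splits a source file into a stream of tokens.  One line of source text is
+# buffered at a time; fixed-spelling tokens (operators and punctuation) are
+# matched first, then identifiers and keywords, then numeric literals.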
+class Lexer:
+ def __init__(self, src_file):
+ self._line_buf = ''
+ self._tok_buf = []
+ self._src_file = src_file
+ self._ws_re = re.compile(r'^\s+')
+ self._identifier_re = re.compile(r'^[^\W\d]\w*')
+ self._const_toks = {
+ '+': AdditionOpToken,
+ '-': SubtractionOpToken,
+ '*': MultiplicationOpToken,
+ '/': DivisionOpToken,
+ ':=': AssignmentOpToken,
+ ';': SemicolonToken,
+ '(': OpeningParenToken,
+ ')': ClosingParenToken,
+ '{': OpeningBraceToken,
+ '}': ClosingBraceToken,
+ '&&': AndOpToken,
+ '||': OrOpToken,
+ '==': EqualsOpToken,
+ '!=': NotEqualsOpToken,
+ }
+        # Longest tokens first, so that multi-character operators are matched
+        # before any shorter token that happens to be their prefix.
+        self._const_toks_sorted = sorted(self._const_toks, key=len, reverse=True)
+ self._make_sure_const_tokens_dont_match_identifier_re()
+ self._keywords = {
+ 'if': IfToken,
+ 'print': PrintToken,
+ 'True': TrueToken,
+ 'False': FalseToken,
+ }
+ self._make_sure_keywords_match_identifier_re()
+
+ def _make_sure_keywords_match_identifier_re(self):
+ for kw in self._keywords:
+ if not re.match(self._identifier_re, kw):
+ raise LexerError("keyword '%s' is not an identifier" % (kw))
+
+ def _make_sure_const_tokens_dont_match_identifier_re(self):
+ for t in self._const_toks_sorted:
+ if re.match(self._identifier_re, t):
+ raise LexerError("const token '%s' is an identifier" % (t))
+
+ def _try_require_line_buf(self):
+ if self._line_buf:
+ return True
+ if self._src_file is None:
+ return False
+ self._line_buf = self._src_file.readline()
+ if not self._line_buf:
+ self._src_file = None
+ return False
+ return True
+
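+    # The _eat_number_* helpers below consume the remainder of a numeric
+    # literal piece by piece: integer digits, an optional decimal mark with
+    # fractional digits, and an optional exponent ('e'/'E' with optional sign).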
+ def _eat_number_after_e_sign(self, acc):
+ m_after_e_sign = re.match(r'^[\-+]?\d+', self._line_buf)
+ if m_after_e_sign:
+ after_e_sign = m_after_e_sign.group(0)
+ self._line_buf = self._line_buf[len(after_e_sign):]
+ return FloatingPointNumberToken('%s%s' % (acc, after_e_sign))
+        raise LexerError("digits expected after the exponent sign of a numeric literal")
+
+ def _eat_number_after_dec_mark(self, acc):
+ if not self._line_buf:
+ return FloatingPointNumberToken(acc)
+ m_digits = re.match(r'^\d+', self._line_buf)
+ if m_digits:
+ digits = m_digits.group(0)
+ self._line_buf = self._line_buf[len(digits):]
+ return self._eat_number_after_dec_mark('%s%s' % (acc, digits))
+ m_e_sign = re.match(r'^[eE]', self._line_buf)
+ if m_e_sign:
+ e_sign = m_e_sign.group(0)
+ self._line_buf = self._line_buf[len(e_sign):]
+ return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
+ return FloatingPointNumberToken(acc)
+
+ def _eat_number_after_first_digit(self, acc):
+ if not self._line_buf:
+ return IntegerNumberToken(acc)
+ m_digits = re.match(r'^\d+', self._line_buf)
+ if m_digits:
+ digits = m_digits.group(0)
+ self._line_buf = self._line_buf[len(digits):]
+ return self._eat_number_after_first_digit('%s%s' % (acc, digits))
+ m_e_sign = re.match(r'^[eE]', self._line_buf)
+ if m_e_sign:
+ e_sign = m_e_sign.group(0)
+ self._line_buf = self._line_buf[len(e_sign):]
+ return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
+ m_dec_mark = re.match(r'^\.', self._line_buf)
+ if m_dec_mark:
+ dec_mark = m_dec_mark.group(0)
+ self._line_buf = self._line_buf[len(dec_mark):]
+ return self._eat_number_after_dec_mark('%s%s' % (acc, dec_mark))
+ return IntegerNumberToken(acc)
+
+ def _try_eat_number(self):
+ m_first_digit = re.match(r'^\d', self._line_buf)
+ if m_first_digit:
+ first_digit = m_first_digit.group(0)
+ self._line_buf = self._line_buf[len(first_digit):]
+ return self._eat_number_after_first_digit(first_digit)
+ m_dec_mark = re.match(r'^\.\d', self._line_buf)
+ if m_dec_mark:
+ dec_mark = m_dec_mark.group(0)
+ self._line_buf = self._line_buf[len(dec_mark):]
+ return self._eat_number_after_dec_mark(dec_mark)
+ return False
+
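+    # Skips whitespace, pulling in further source lines as needed; returns
+    # False once the end of the source file is reached.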
+ def _try_eat_ws(self):
+ while True:
+ if not self._try_require_line_buf():
+ return False
+ m_ws = re.match(self._ws_re, self._line_buf)
+ if not m_ws:
+ return True
+ self._line_buf = self._line_buf[len(m_ws.group(0)):]
+ return True
+
+ def _try_eat_identifier_or_keyword(self):
+ m_identifier = re.match(self._identifier_re, self._line_buf)
+ if m_identifier:
+ identifier = m_identifier.group(0)
+ self._line_buf = self._line_buf[len(identifier):]
+ if identifier in self._keywords:
+ return self._keywords[identifier]()
+ else:
+ return IdentifierToken(identifier)
+ return False
+
+ def _try_eat_const_token(self):
+ for t in self._const_toks_sorted:
+ if self._line_buf.startswith(t):
+ self._line_buf = self._line_buf[len(t):]
+ return self._const_toks[t]()
+ return False
+
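+    # Reads one more token into the token buffer.  Fixed-spelling tokens are
+    # tried first, then identifiers/keywords, then numbers; anything else is a
+    # lexing error.  Returns False at the end of input.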
+ def _try_eat_token(self):
+ if not self._try_eat_ws():
+ return False
+ const_tok = self._try_eat_const_token()
+ if const_tok:
+ self._tok_buf.append(const_tok)
+ return const_tok
+ identifier_or_keyword = self._try_eat_identifier_or_keyword()
+ if identifier_or_keyword:
+ self._tok_buf.append(identifier_or_keyword)
+ return identifier_or_keyword
+ number = self._try_eat_number()
+ if number:
+ self._tok_buf.append(number)
+ return number
+ raise LexerError("'%c' unexpected" % (self._line_buf[0]))
+
+ def _try_require_tok_buf(self, n = 1):
+ if n < 1:
+ raise LexerError("unable to require %d tokens" % (n))
+ while len(self._tok_buf) < n:
+ if not self._try_eat_token():
+ return False
+ return True
+
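+    # Public interface used by the parser: check for, peek at and consume
+    # buffered tokens.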
+ def has_next_token(self, n = 1):
+ return self._try_require_tok_buf(n)
+
+ def preview_next_token(self, n = 0):
+ if not self.has_next_token(n + 1):
+ raise LexerError("not enough tokens")
+ return self._tok_buf[n]
+
+ def drop_next_token(self, n = 1):
+ if not self.has_next_token(n):
+ raise LexerError("not enough tokens")
+ if n == 1:
+ return self._tok_buf.pop(0)
+ else:
+ xs = self._tok_buf[:n]
+ self._tok_buf = self._tok_buf[n:]
+ return xs
+
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser()
+    parser.add_argument('src_path', help='set source file path')
+ args = parser.parse_args()
+ with open(args.src_path, 'r') as src_file:
+ lexer = Lexer(src_file)
+ while lexer.has_next_token():
+ print(lexer.drop_next_token().__class__.__name__)
diff --git a/src/nodes.py b/src/nodes.py
new file mode 100644
index 0000000..ea9517d
--- /dev/null
+++ b/src/nodes.py
@@ -0,0 +1,144 @@
+# Copyright 2015 Egor Tensin <Egor.Tensin@gmail.com>
+# This file is licensed under the terms of the MIT License.
+# See LICENSE.txt for details.
+
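+# AST node classes.  Every node implements execute(), which evaluates the
+# subtree rooted at that node; ProgramNode is the root node built by the
+# parser.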
+class ProgramNode:
+ def __init__(self, stmt_list):
+ self._stmt_list = stmt_list
+
+ def execute(self):
+ for stmt in self._stmt_list:
+ stmt.execute()
+
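+# Global variable store: assignments write to it, identifier lookups read
+# from it.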
+_varmap = { }
+
+class CompoundStatementNode:
+ def __init__(self, stmt_list):
+ self._stmt_list = stmt_list
+
+ def execute(self):
+ for stmt in self._stmt_list:
+ stmt.execute()
+
+class EmptyStatementNode:
+ def execute(self):
+ pass
+
+class AssignmentNode:
+ def __init__(self, identifier, arithm_expr):
+ self._identifier = identifier
+ self._arithm_expr = arithm_expr
+
+ def execute(self):
+ _varmap[str(self._identifier)] = self._arithm_expr.execute()
+ return None
+
+class PrintStatementNode:
+ def __init__(self, arithm_expr):
+ self._arithm_expr = arithm_expr
+
+ def execute(self):
+ print(self._arithm_expr.execute())
+ return None
+
+class IdentifierNode:
+ def __init__(self, identifier):
+ self._identifier = identifier
+
+ def execute(self):
+ return _varmap[str(self._identifier)]
+
+class AdditionOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() + self._right.execute()
+
+class SubtractionOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+        return self._left.execute() - self._right.execute()
+
+class MultiplicationOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() * self._right.execute()
+
+class DivisionOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() / self._right.execute()
+
+class IntegerNumberNode:
+ def __init__(self, n):
+ self._n = n
+
+ def execute(self):
+ return int(self._n)
+
+class FloatingPointNumberNode:
+ def __init__(self, n):
+ self._n = n
+
+ def execute(self):
+ return float(self._n)
+
+class IfStatementNode:
+ def __init__(self, cond, body):
+ self._cond = cond
+ self._body = body
+
+ def execute(self):
+ if self._cond.execute():
+ return self._body.execute()
+
+class TrueNode:
+ def execute(self):
+ return True
+
+class FalseNode:
+ def execute(self):
+ return False
+
+class AndOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() and self._right.execute()
+
+class OrOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() or self._right.execute()
+
+class EqualsOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() == self._right.execute()
+
+class NotEqualsOpNode:
+ def __init__(self, left, right):
+ self._left = left
+ self._right = right
+
+ def execute(self):
+ return self._left.execute() != self._right.execute()
diff --git a/src/parser.py b/src/parser.py
new file mode 100644
index 0000000..5ee5207
--- /dev/null
+++ b/src/parser.py
@@ -0,0 +1,196 @@
+# Copyright 2015 Egor Tensin <Egor.Tensin@gmail.com>
+# This file is licensed under the terms of the MIT License.
+# See LICENSE.txt for details.
+
+from lexer import *
+from nodes import *
+
+class ParserError(RuntimeError):
+ pass
+
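+# Recursive-descent parser.  The grammar below is reconstructed from the
+# _parse_*/_try_parse_* methods that follow:
+#
+#   program        ::= stmt*
+#   stmt           ::= print_stmt | assignment | if_stmt | block | ';'
+#   print_stmt     ::= 'print' arithm_expr ';'
+#   assignment     ::= identifier ':=' arithm_expr ';'
+#   if_stmt        ::= 'if' '(' logical_expr ')' stmt
+#   block          ::= '{' stmt* '}'
+#   logical_expr   ::= logical_term (('&&' | '||') logical_term)*
+#   logical_term   ::= logical_factor (('==' | '!=') logical_factor)?
+#   logical_factor ::= 'True' | 'False' | '(' logical_expr ')'
+#   arithm_expr    ::= arithm_term (('+' | '-') arithm_term)*
+#   arithm_term    ::= arithm_factor (('*' | '/') arithm_factor)*
+#   arithm_factor  ::= identifier | number | '(' arithm_expr ')'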
+class Parser:
+ def __init__(self, src_file):
+ self._lexer = Lexer(src_file)
+
+ def parse(self):
+ return self._parse_program()
+
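+    # _try_parse_token() consumes and returns the next token if it is an
+    # instance of cls and returns False otherwise; _parse_token() is the
+    # strict variant that raises ParserError instead.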
+ def _try_parse_token(self, cls):
+ if not self._lexer.has_next_token():
+ return False
+ t = self._lexer.preview_next_token()
+ if not isinstance(t, cls):
+ return False
+ self._lexer.drop_next_token()
+ return t
+
+ def _parse_token(self, cls):
+ if not self._lexer.has_next_token():
+ raise ParserError("%s expected" % cls.__name__)
+ t = self._lexer.preview_next_token()
+ if not isinstance(t, cls):
+ raise ParserError("%s expected instead of %s" % (
+ cls.__name__, t.__class__.__name__))
+ self._lexer.drop_next_token()
+ return t
+
+ def _parse_program(self):
+ stmt_list = []
+ while self._lexer.has_next_token():
+ stmt = self._try_parse_stmt()
+ if not stmt:
+ raise ParserError("unexpected token '%s'" % (
+ self._lexer.preview_next_token()))
+ stmt_list.append(stmt)
+ return ProgramNode(stmt_list)
+
+ def _try_parse_stmt(self):
+ print_stmt = self._try_parse_print_stmt()
+ if print_stmt:
+ return print_stmt
+ assignment = self._try_parse_assignment()
+ if assignment:
+ return assignment
+ empty_stmt = self._try_parse_empty_stmt()
+ if empty_stmt:
+ return empty_stmt
+ if_stmt = self._try_parse_if_stmt()
+ if if_stmt:
+ return if_stmt
+ block = self._try_parse_block()
+ if block:
+ return block
+ return False
+
+ def _try_parse_empty_stmt(self):
+ if not self._try_parse_token(SemicolonToken):
+ return False
+ return EmptyStatementNode()
+
+ def _try_parse_block(self):
+ if not self._try_parse_token(OpeningBraceToken):
+ return False
+ stmt_list = []
+ while True:
+ stmt = self._try_parse_stmt()
+ if stmt:
+ stmt_list.append(stmt)
+ else:
+ break
+ self._parse_token(ClosingBraceToken)
+ return CompoundStatementNode(stmt_list)
+
+ def _try_parse_if_stmt(self):
+ if not self._try_parse_token(IfToken):
+ return False
+ self._parse_token(OpeningParenToken)
+ cond = self._parse_logical_expr()
+ self._parse_token(ClosingParenToken)
+ stmt = self._try_parse_stmt()
+ if not stmt:
+ raise ParserError("unexpected token '%s'" % (
+ self._lexer.preview_next_token()))
+ return IfStatementNode(cond, stmt)
+
+ def _try_parse_print_stmt(self):
+ if not self._try_parse_token(PrintToken):
+ return False
+ arithm_expr = self._parse_arithm_expr()
+ self._parse_token(SemicolonToken)
+ return PrintStatementNode(arithm_expr)
+
+ def _try_parse_assignment(self):
+ if not self._lexer.has_next_token(2):
+ return False
+ identifier = self._lexer.preview_next_token(0)
+ if not isinstance(identifier, IdentifierToken):
+ return False
+ op = self._lexer.preview_next_token(1)
+ if not isinstance(op, AssignmentOpToken):
+ return False
+ self._lexer.drop_next_token(2)
+ arithm_expr = self._parse_arithm_expr()
+ self._parse_token(SemicolonToken)
+ return AssignmentNode(identifier, arithm_expr)
+
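+    # Logical expressions: '&&' and '||' are handled at the expression level,
+    # '==' and '!=' bind tighter (term level), and a factor is 'True', 'False'
+    # or a parenthesized logical expression.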
+ def _parse_logical_expr(self):
+ left = self._parse_logical_term()
+ while self._lexer.has_next_token():
+ if self._try_parse_token(AndOpToken):
+ left = AndOpNode(left, self._parse_logical_term())
+ elif self._try_parse_token(OrOpToken):
+ left = OrOpNode(left, self._parse_logical_term())
+ else:
+ return left
+ return left
+
+ def _parse_logical_term(self):
+ left = self._parse_logical_factor()
+ if self._lexer.has_next_token():
+ if self._try_parse_token(EqualsOpToken):
+ left = EqualsOpNode(left, self._parse_logical_factor())
+ elif self._try_parse_token(NotEqualsOpToken):
+ left = NotEqualsOpNode(left, self._parse_logical_factor())
+ else:
+ return left
+ return left
+
+ def _parse_logical_factor(self):
+ if self._try_parse_token(TrueToken):
+ return TrueNode()
+ elif self._try_parse_token(FalseToken):
+ return FalseNode()
+ elif self._try_parse_token(OpeningParenToken):
+ logical_expr = self._parse_logical_expr()
+ self._parse_token(ClosingParenToken)
+ return logical_expr
+ else:
+ raise ParserError('expected \'True\', \'False\' or \'(\'')
+
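+    # Arithmetic expressions: '+' and '-' at the expression level, '*' and '/'
+    # at the term level; a factor is an identifier, a numeric literal or a
+    # parenthesized arithmetic expression.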
+ def _parse_arithm_expr(self):
+ left = self._parse_arithm_term()
+ while self._lexer.has_next_token():
+ if self._try_parse_token(AdditionOpToken):
+ left = AdditionOpNode(left, self._parse_arithm_term())
+ elif self._try_parse_token(SubtractionOpToken):
+ left = SubtractionOpNode(left, self._parse_arithm_term())
+ else:
+ return left
+ return left
+
+ def _parse_arithm_term(self):
+ left = self._parse_arithm_factor()
+ while self._lexer.has_next_token():
+ if self._try_parse_token(MultiplicationOpToken):
+ left = MultiplicationOpNode(left, self._parse_arithm_factor())
+ elif self._try_parse_token(DivisionOpToken):
+ left = DivisionOpNode(left, self._parse_arithm_factor())
+ else:
+ return left
+ return left
+
+ def _parse_arithm_factor(self):
+ identifier = self._try_parse_token(IdentifierToken)
+ if identifier:
+ return IdentifierNode(identifier)
+ int_num = self._try_parse_token(IntegerNumberToken)
+ if int_num:
+ return IntegerNumberNode(int_num)
+ float_num = self._try_parse_token(FloatingPointNumberToken)
+ if float_num:
+ return FloatingPointNumberNode(float_num)
+ if self._try_parse_token(OpeningParenToken):
+ arithm_expr = self._parse_arithm_expr()
+ self._parse_token(ClosingParenToken)
+ return arithm_expr
+ else:
+ raise ParserError('expected an identifier, a number or \'(\'')
+
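+# An illustrative example of a program this parser accepts, per the grammar
+# sketched above:
+#
+#   x := 1 + 2 * 3.5;
+#   print x;
+#   if (True || False) {
+#       print x - 1;
+#   }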
+if __name__ == '__main__':
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument('src_path', help='set source file path')
+ args = parser.parse_args()
+ with open(args.src_path, 'r') as src_file:
+ parser = Parser(src_file)
+ parser.parse().execute()
diff --git a/src/tokens.py b/src/tokens.py
new file mode 100644
index 0000000..3c04668
--- /dev/null
+++ b/src/tokens.py
@@ -0,0 +1,105 @@
+# Copyright 2015 Egor Tensin <Egor.Tensin@gmail.com>
+# This file is licensed under the terms of the MIT License.
+# See LICENSE.txt for details.
+
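+# Token classes produced by the lexer.  Each token renders its source spelling
+# via __str__(); number tokens additionally convert to int or float.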
+class Token:
+ pass
+
+class AdditionOpToken(Token):
+ def __str__(self):
+ return '+'
+
+class SubtractionOpToken(Token):
+ def __str__(self):
+ return '-'
+
+class MultiplicationOpToken(Token):
+ def __str__(self):
+ return '*'
+
+class DivisionOpToken(Token):
+ def __str__(self):
+ return '/'
+
+class AssignmentOpToken(Token):
+ def __str__(self):
+ return ':='
+
+class SemicolonToken(Token):
+ def __str__(self):
+ return ';'
+
+class PrintToken(Token):
+ def __str__(self):
+ return 'print'
+
+class OpeningParenToken(Token):
+ def __str__(self):
+ return '('
+
+class ClosingParenToken(Token):
+ def __str__(self):
+ return ')'
+
+class OpeningBraceToken(Token):
+ def __str__(self):
+ return '{'
+
+class ClosingBraceToken(Token):
+ def __str__(self):
+ return '}'
+
+class IdentifierToken(Token):
+ def __init__(self, i):
+ self._i = i
+
+ def __str__(self):
+ return str(self._i)
+
+class FloatingPointNumberToken(Token):
+ def __init__(self, n):
+ self._n = n
+
+ def __float__(self):
+ return float(self._n)
+
+ def __str__(self):
+ return str(self._n)
+
+class IntegerNumberToken(Token):
+ def __init__(self, n):
+ self._n = n
+
+ def __int__(self):
+ return int(self._n)
+
+ def __str__(self):
+ return str(self._n)
+
+class TrueToken(Token):
+ def __str__(self):
+ return 'True'
+
+class FalseToken(Token):
+ def __str__(self):
+ return 'False'
+
+class AndOpToken(Token):
+ def __str__(self):
+ return '&&'
+
+class OrOpToken(Token):
+ def __str__(self):
+ return '||'
+
+class IfToken(Token):
+ def __str__(self):
+ return 'if'
+
+class EqualsOpToken(Token):
+ def __str__(self):
+ return '=='
+
+class NotEqualsOpToken(Token):
+ def __str__(self):
+ return '!='