Diffstat
-rw-r--r--   src/lexer.py   202
1 files changed, 202 insertions, 0 deletions
diff --git a/src/lexer.py b/src/lexer.py
new file mode 100644
index 0000000..e4c26ce
--- /dev/null
+++ b/src/lexer.py
@@ -0,0 +1,202 @@
+# Copyright 2015 Egor Tensin <Egor.Tensin@gmail.com>
+# This file is licensed under the terms of the MIT License.
+# See LICENSE.txt for details.
+
+import re
+
+from tokens import *
+
+class LexerError(RuntimeError):
+    pass
+
+class Lexer:
+    def __init__(self, src_file):
+        self._line_buf = ''
+        self._tok_buf = []
+        self._src_file = src_file
+        self._ws_re = re.compile(r'^\s+')
+        self._identifier_re = re.compile(r'^[^\W\d]\w*')
+        self._const_toks = {
+            '+': AdditionOpToken,
+            '-': SubtractionOpToken,
+            '*': MultiplicationOpToken,
+            '/': DivisionOpToken,
+            ':=': AssignmentOpToken,
+            ';': SemicolonToken,
+            '(': OpeningParenToken,
+            ')': ClosingParenToken,
+            '{': OpeningBraceToken,
+            '}': ClosingBraceToken,
+            '&&': AndOpToken,
+            '||': OrOpToken,
+            '==': EqualsOpToken,
+            '!=': NotEqualsOpToken,
+        }
+        self._const_toks_sorted = list(self._const_toks.keys())
+        self._const_toks_sorted.sort()
+        self._make_sure_const_tokens_dont_match_identifier_re()
+        self._keywords = {
+            'if': IfToken,
+            'print': PrintToken,
+            'True': TrueToken,
+            'False': FalseToken,
+        }
+        self._make_sure_keywords_match_identifier_re()
+
+    def _make_sure_keywords_match_identifier_re(self):
+        for kw in self._keywords:
+            if not re.match(self._identifier_re, kw):
+                raise LexerError("keyword '%s' is not an identifier" % (kw))
+
+    def _make_sure_const_tokens_dont_match_identifier_re(self):
+        for t in self._const_toks_sorted:
+            if re.match(self._identifier_re, t):
+                raise LexerError("const token '%s' is an identifier" % (t))
+
+    def _try_require_line_buf(self):
+        if self._line_buf:
+            return True
+        if self._src_file is None:
+            return False
+        self._line_buf = self._src_file.readline()
+        if not self._line_buf:
+            self._src_file = None
+            return False
+        return True
+
+    def _eat_number_after_e_sign(self, acc):
+        m_after_e_sign = re.match(r'^[\-+]?\d+', self._line_buf)
+        if m_after_e_sign:
+            after_e_sign = m_after_e_sign.group(0)
+            self._line_buf = self._line_buf[len(after_e_sign):]
+            return FloatingPointNumberToken('%s%s' % (acc, after_e_sign))
+        raise LexerError("'%c' unexpected" % (self._line_buf[0]))
+
+    def _eat_number_after_dec_mark(self, acc):
+        if not self._line_buf:
+            return FloatingPointNumberToken(acc)
+        m_digits = re.match(r'^\d+', self._line_buf)
+        if m_digits:
+            digits = m_digits.group(0)
+            self._line_buf = self._line_buf[len(digits):]
+            return self._eat_number_after_dec_mark('%s%s' % (acc, digits))
+        m_e_sign = re.match(r'^[eE]', self._line_buf)
+        if m_e_sign:
+            e_sign = m_e_sign.group(0)
+            self._line_buf = self._line_buf[len(e_sign):]
+            return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
+        return FloatingPointNumberToken(acc)
+
+    def _eat_number_after_first_digit(self, acc):
+        if not self._line_buf:
+            return IntegerNumberToken(acc)
+        m_digits = re.match(r'^\d+', self._line_buf)
+        if m_digits:
+            digits = m_digits.group(0)
+            self._line_buf = self._line_buf[len(digits):]
+            return self._eat_number_after_first_digit('%s%s' % (acc, digits))
+        m_e_sign = re.match(r'^[eE]', self._line_buf)
+        if m_e_sign:
+            e_sign = m_e_sign.group(0)
+            self._line_buf = self._line_buf[len(e_sign):]
+            return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
+        m_dec_mark = re.match(r'^\.', self._line_buf)
+        if m_dec_mark:
+            dec_mark = m_dec_mark.group(0)
+            self._line_buf = self._line_buf[len(dec_mark):]
+            return self._eat_number_after_dec_mark('%s%s' % (acc, dec_mark))
+        return IntegerNumberToken(acc)
+
+    def _try_eat_number(self):
+        m_first_digit = re.match(r'^\d', self._line_buf)
+        if m_first_digit:
+            first_digit = m_first_digit.group(0)
+            self._line_buf = self._line_buf[len(first_digit):]
+            return self._eat_number_after_first_digit(first_digit)
+        m_dec_mark = re.match(r'^\.\d', self._line_buf)
+        if m_dec_mark:
+            dec_mark = m_dec_mark.group(0)
+            self._line_buf = self._line_buf[len(dec_mark):]
+            return self._eat_number_after_dec_mark(dec_mark)
+        return False
+
+    def _try_eat_ws(self):
+        while True:
+            if not self._try_require_line_buf():
+                return False
+            m_ws = re.match(self._ws_re, self._line_buf)
+            if not m_ws:
+                return True
+            self._line_buf = self._line_buf[len(m_ws.group(0)):]
+        return True
+
+    def _try_eat_identifier_or_keyword(self):
+        m_identifier = re.match(self._identifier_re, self._line_buf)
+        if m_identifier:
+            identifier = m_identifier.group(0)
+            self._line_buf = self._line_buf[len(identifier):]
+            if identifier in self._keywords:
+                return self._keywords[identifier]()
+            else:
+                return IdentifierToken(identifier)
+        return False
+
+    def _try_eat_const_token(self):
+        for t in self._const_toks_sorted:
+            if self._line_buf.startswith(t):
+                self._line_buf = self._line_buf[len(t):]
+                return self._const_toks[t]()
+        return False
+
+    def _try_eat_token(self):
+        if not self._try_eat_ws():
+            return False
+        const_tok = self._try_eat_const_token()
+        if const_tok:
+            self._tok_buf.append(const_tok)
+            return const_tok
+        identifier_or_keyword = self._try_eat_identifier_or_keyword()
+        if identifier_or_keyword:
+            self._tok_buf.append(identifier_or_keyword)
+            return identifier_or_keyword
+        number = self._try_eat_number()
+        if number:
+            self._tok_buf.append(number)
+            return number
+        raise LexerError("'%c' unexpected" % (self._line_buf[0]))
+
+    def _try_require_tok_buf(self, n = 1):
+        if n < 1:
+            raise LexerError("unable to require %d tokens" % (n))
+        while len(self._tok_buf) < n:
+            if not self._try_eat_token():
+                return False
+        return True
+
+    def has_next_token(self, n = 1):
+        return self._try_require_tok_buf(n)
+
+    def preview_next_token(self, n = 0):
+        if not self.has_next_token(n + 1):
+            raise LexerError("not enough tokens")
+        return self._tok_buf[n]
+
+    def drop_next_token(self, n = 1):
+        if not self.has_next_token(n):
+            raise LexerError("not enough tokens")
+        if n == 1:
+            return self._tok_buf.pop(0)
+        else:
+            xs = self._tok_buf[:n]
+            self._tok_buf = self._tok_buf[n:]
+            return xs
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('src_path', help='set source file path')
+    args = parser.parse_args()
+    with open(args.src_path, 'r') as src_file:
+        lexer = Lexer(src_file)
+        while lexer.has_next_token():
+            print(lexer.drop_next_token().__class__.__name__)
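
For reference, a minimal usage sketch (not part of the commit): the Lexer only needs an object with a readline() method, so an io.StringIO wrapping a source snippet works just as well as an open file. The sample source string is made up for illustration, and the import assumes the module is importable as lexer; everything else (the constructor, has_next_token, drop_next_token, and the token classes) comes from the diff above.

    import io

    from lexer import Lexer

    # Hypothetical snippet using the tokens this lexer recognizes.
    src = io.StringIO('x := 1 + 2.5e-1; if (x != 0) { print x; }')

    lexer = Lexer(src)
    while lexer.has_next_token():
        # Each call removes the next token from the buffer and returns it.
        print(lexer.drop_next_token().__class__.__name__)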