# Copyright (c) 2015 Egor Tensin
# This file is part of the "Simple interpreter" project.
# For details, see https://github.com/egor-tensin/simple-interpreter.
# Distributed under the MIT License.

import re

from tokens import *


class LexerError(RuntimeError):
    pass


class Lexer:
    def __init__(self, src_file):
        self._line_buf = ''
        self._tok_buf = []
        self._src_file = src_file
        self._ws_re = re.compile(r'^\s+')
        # An identifier is a "word" character sequence that doesn't start
        # with a digit.
        self._identifier_re = re.compile(r'^[^\W\d]\w*')
        self._const_toks = {
            '+': AdditionOpToken,
            '-': SubtractionOpToken,
            '*': MultiplicationOpToken,
            '/': DivisionOpToken,
            ':=': AssignmentOpToken,
            ';': SemicolonToken,
            '(': OpeningParenToken,
            ')': ClosingParenToken,
            '{': OpeningBraceToken,
            '}': ClosingBraceToken,
            '&&': AndOpToken,
            '||': OrOpToken,
            '==': EqualsOpToken,
            '!=': NotEqualsOpToken,
        }
        # Try longer tokens first, so that a multi-character operator always
        # wins over a token that happens to be its prefix.
        self._const_toks_sorted = sorted(self._const_toks.keys(),
                                         key=len, reverse=True)
        self._make_sure_const_tokens_dont_match_identifier_re()
        self._keywords = {
            'if': IfToken,
            'print': PrintToken,
            'True': TrueToken,
            'False': FalseToken,
        }
        self._make_sure_keywords_match_identifier_re()

    def _make_sure_keywords_match_identifier_re(self):
        for kw in self._keywords:
            if not re.match(self._identifier_re, kw):
                raise LexerError("keyword '%s' is not an identifier" % kw)

    def _make_sure_const_tokens_dont_match_identifier_re(self):
        for t in self._const_toks_sorted:
            if re.match(self._identifier_re, t):
                raise LexerError("const token '%s' is an identifier" % t)

    def _try_require_line_buf(self):
        # Make sure the line buffer is not empty, reading a new line if
        # necessary.  A None source file marks the end of input.
        if self._line_buf:
            return True
        if self._src_file is None:
            return False
        self._line_buf = self._src_file.readline()
        if not self._line_buf:
            self._src_file = None
            return False
        return True

    def _eat_number_after_e_sign(self, acc):
        # After 'e'/'E', an optionally signed sequence of digits must follow.
        m_after_e_sign = re.match(r'^[-+]?\d+', self._line_buf)
        if m_after_e_sign:
            after_e_sign = m_after_e_sign.group(0)
            self._line_buf = self._line_buf[len(after_e_sign):]
            return FloatingPointNumberToken('%s%s' % (acc, after_e_sign))
        if not self._line_buf:
            raise LexerError("unexpected end of input after '%s'" % acc)
        raise LexerError("'%c' unexpected" % self._line_buf[0])

    def _eat_number_after_dec_mark(self, acc):
        # After the decimal mark, more digits and an exponent may follow.
        if not self._line_buf:
            return FloatingPointNumberToken(acc)
        m_digits = re.match(r'^\d+', self._line_buf)
        if m_digits:
            digits = m_digits.group(0)
            self._line_buf = self._line_buf[len(digits):]
            return self._eat_number_after_dec_mark('%s%s' % (acc, digits))
        m_e_sign = re.match(r'^[eE]', self._line_buf)
        if m_e_sign:
            e_sign = m_e_sign.group(0)
            self._line_buf = self._line_buf[len(e_sign):]
            return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
        return FloatingPointNumberToken(acc)

    def _eat_number_after_first_digit(self, acc):
        # After the first digit, more digits, a decimal mark or an exponent
        # may follow.
        if not self._line_buf:
            return IntegerNumberToken(acc)
        m_digits = re.match(r'^\d+', self._line_buf)
        if m_digits:
            digits = m_digits.group(0)
            self._line_buf = self._line_buf[len(digits):]
            return self._eat_number_after_first_digit('%s%s' % (acc, digits))
        m_e_sign = re.match(r'^[eE]', self._line_buf)
        if m_e_sign:
            e_sign = m_e_sign.group(0)
            self._line_buf = self._line_buf[len(e_sign):]
            return self._eat_number_after_e_sign('%s%s' % (acc, e_sign))
        m_dec_mark = re.match(r'^\.', self._line_buf)
        if m_dec_mark:
            dec_mark = m_dec_mark.group(0)
            self._line_buf = self._line_buf[len(dec_mark):]
            return self._eat_number_after_dec_mark('%s%s' % (acc, dec_mark))
        return IntegerNumberToken(acc)

    def _try_eat_number(self):
        m_first_digit = re.match(r'^\d', self._line_buf)
        if m_first_digit:
            first_digit = m_first_digit.group(0)
            self._line_buf = self._line_buf[len(first_digit):]
            return self._eat_number_after_first_digit(first_digit)
        # A number may also start with the decimal mark, but only if a digit
        # follows it (e.g. '.5').
        m_dec_mark = re.match(r'^\.\d', self._line_buf)
        if m_dec_mark:
            dec_mark = m_dec_mark.group(0)
            self._line_buf = self._line_buf[len(dec_mark):]
            return self._eat_number_after_dec_mark(dec_mark)
        return False
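    # A sketch of the number grammar (an illustrative comment only, not used
    # by the code): the _eat_number_* helpers above amount to a hand-rolled
    # state machine recognizing roughly
    #
    #     \d+(\.\d*)?([eE][-+]?\d+)?  |  \.\d+([eE][-+]?\d+)?
    #
    # where an IntegerNumberToken is produced only when neither the decimal
    # mark nor the exponent is present, and a FloatingPointNumberToken
    # otherwise.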
    def _try_eat_ws(self):
        # Skip whitespace (possibly spanning multiple lines); returns False
        # if the end of input is reached, and guarantees a non-empty line
        # buffer on success.
        while True:
            if not self._try_require_line_buf():
                return False
            m_ws = re.match(self._ws_re, self._line_buf)
            if not m_ws:
                return True
            self._line_buf = self._line_buf[len(m_ws.group(0)):]

    def _try_eat_identifier_or_keyword(self):
        m_identifier = re.match(self._identifier_re, self._line_buf)
        if m_identifier:
            identifier = m_identifier.group(0)
            self._line_buf = self._line_buf[len(identifier):]
            if identifier in self._keywords:
                return self._keywords[identifier]()
            else:
                return IdentifierToken(identifier)
        return False

    def _try_eat_const_token(self):
        for t in self._const_toks_sorted:
            if self._line_buf.startswith(t):
                self._line_buf = self._line_buf[len(t):]
                return self._const_toks[t]()
        return False

    def _try_eat_token(self):
        if not self._try_eat_ws():
            return False
        const_tok = self._try_eat_const_token()
        if const_tok:
            self._tok_buf.append(const_tok)
            return const_tok
        identifier_or_keyword = self._try_eat_identifier_or_keyword()
        if identifier_or_keyword:
            self._tok_buf.append(identifier_or_keyword)
            return identifier_or_keyword
        number = self._try_eat_number()
        if number:
            self._tok_buf.append(number)
            return number
        raise LexerError("'%c' unexpected" % self._line_buf[0])

    def _try_require_tok_buf(self, n=1):
        # Make sure at least n tokens are buffered, lexing more as needed.
        if n < 1:
            raise LexerError("unable to require %d tokens" % n)
        while len(self._tok_buf) < n:
            if not self._try_eat_token():
                return False
        return True

    def has_next_token(self, n=1):
        return self._try_require_tok_buf(n)

    def preview_next_token(self, n=0):
        # Peek at the (n + 1)th buffered token without consuming it.
        if not self.has_next_token(n + 1):
            raise LexerError("not enough tokens")
        return self._tok_buf[n]

    def drop_next_token(self, n=1):
        # Consume n tokens: returns a single token for n == 1, a list
        # otherwise.
        if not self.has_next_token(n):
            raise LexerError("not enough tokens")
        if n == 1:
            return self._tok_buf.pop(0)
        else:
            xs = self._tok_buf[:n]
            self._tok_buf = self._tok_buf[n:]
            return xs


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('src_path', help='source file path')
    args = parser.parse_args()
    with open(args.src_path, 'r') as src_file:
        lexer = Lexer(src_file)
        while lexer.has_next_token():
            print(lexer.drop_next_token().__class__.__name__)
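# A minimal usage sketch (illustrative only, not part of the module): the
# lexer accepts any file-like object with readline(), so an io.StringIO is
# enough for quick experiments.
#
#     import io
#     lexer = Lexer(io.StringIO('x := 1 + 2;'))
#     while lexer.has_next_token():
#         print(lexer.drop_next_token().__class__.__name__)
#
# This should print IdentifierToken, AssignmentOpToken, IntegerNumberToken,
# AdditionOpToken, IntegerNumberToken and SemicolonToken, one name per line.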