Build a simple interpreter --Part 2

　　“If you learn only methods, you will be tied to your methods. But if you learn principles, you can devise your own methods.”

　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　　---- Ralph Waldo Emerson

　　On that note, let's dive into interpreters and compilers again. The new version of the code can do these things:

　　　　1. Handle whitespace characters anywhere in the input string

　　　　2. Consume multi-digit integers from the input

　　　　3. Subtract two integers

　　　Here is the source code for our new version of the calculator that can do all of the above.

  1 # Token types
  2 # EOF (end-of-file) token is used to indicate that
  3 # there is no more input left for lexical analysis
  4 INTEGER, PLUS, MINUS, EOF = 'INTEGER', 'PLUS', 'MINUS', 'EOF'
  5 
  6 
  7 class Token(object):
  8     def __init__(self, type, value):
  9         # token type: INTEGER, PLUS, MINUS, or EOF
 10         self.type = type
 11         # token value: non-negative integer value, '+', '-', or None
 12         self.value = value
 13 
 14     def __str__(self):
 15         """String representation of the class instance.
 16 
 17         Examples:
 18             Token(INTEGER, 3)
 19             Token(PLUS '+')
 20         """
 21         return 'Token({type}, {value})'.format(
 22             type=self.type,
 23             value=repr(self.value)
 24         )
 25 
 26     def __repr__(self):
 27         return self.__str__()
 28 
 29 
 30 class Interpreter(object):
 31     def __init__(self, text):
 32         # client string input, e.g. "3 + 5", "12 - 5", etc
 33         self.text = text
 34         # self.pos is an index into self.text
 35         self.pos = 0
 36         # current token instance
 37         self.current_token = None
 38         self.current_char = self.text[self.pos]
 39 
 40     def error(self):
 41         raise Exception('Error parsing input')
 42 
 43     def advance(self):
 44         """Advance the 'pos' pointer and set the 'current_char' variable."""
 45         self.pos += 1
 46         if self.pos > len(self.text) - 1:
 47             self.current_char = None  # Indicates end of input
 48         else:
 49             self.current_char = self.text[self.pos]
 50 
 51     def skip_whitespace(self):
 52         while self.current_char is not None and self.current_char.isspace():
 53             self.advance()
 54 
 55     def integer(self):
 56         """Return a (multidigit) integer consumed from the input."""
 57         result = ''
 58         while self.current_char is not None and self.current_char.isdigit():
 59             result += self.current_char
 60             self.advance()
 61         return int(result)
 62 
 63     def get_next_token(self):
 64         """Lexical analyzer (also known as scanner or tokenizer)
 65 
 66         This method is responsible for breaking a sentence
 67         apart into tokens.
 68         """
 69         while self.current_char is not None:
 70 
 71             if self.current_char.isspace():
 72                 self.skip_whitespace()
 73                 continue
 74 
 75             if self.current_char.isdigit():
 76                 return Token(INTEGER, self.integer())
 77 
 78             if self.current_char == '+':
 79                 self.advance()
 80                 return Token(PLUS, '+')
 81 
 82             if self.current_char == '-':
 83                 self.advance()
 84                 return Token(MINUS, '-')
 85 
 86             self.error()
 87 
 88         return Token(EOF, None)
 89 
 90     def eat(self, token_type):
 91         # compare the current token type with the passed token
 92         # type and if they match then "eat" the current token
 93         # and assign the next token to the self.current_token,
 94         # otherwise raise an exception.
 95         if self.current_token.type == token_type:
 96             self.current_token = self.get_next_token()
 97         else:
 98             self.error()
 99 
100     def expr(self):
101         """Parser / Interpreter
102 
103         expr -> INTEGER PLUS INTEGER
104         expr -> INTEGER MINUS INTEGER
105         """
106         # set current token to the first token taken from the input
107         self.current_token = self.get_next_token()
108 
109         # we expect the current token to be an integer
110         left = self.current_token
111         self.eat(INTEGER)
112 
113         # we expect the current token to be either a '+' or '-'
114         op = self.current_token
115         if op.type == PLUS:
116             self.eat(PLUS)
117         else:
118             self.eat(MINUS)
119 
120         # we expect the current token to be an integer
121         right = self.current_token
122         self.eat(INTEGER)
123         # after the above call the self.current_token is set to
124         # EOF token
125 
126         # at this point either the INTEGER PLUS INTEGER or
127         # the INTEGER MINUS INTEGER sequence of tokens
128         # has been successfully found and the method can just
129         # return the result of adding or subtracting two integers,
130         # thus effectively interpreting client input
131         if op.type == PLUS:
132             result = left.value + right.value
133         else:
134             result = left.value - right.value
135         return result
136 
137 
def main():
    """Read expressions from the 'calc> ' prompt and print each result.

    The loop ends on end-of-input (EOFError); empty lines are skipped.
    """
    # 'raw_input' only exists on Python 2. On Python 3 the lookup raises
    # NameError, so fall back to 'input' and the listing runs on both.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        try:
            text = read_line('calc> ')
        except EOFError:
            break
        if not text:
            continue
        interpreter = Interpreter(text)
        result = interpreter.expr()
        print(result)


# Run the calculator prompt only when executed as a script.
if __name__ == '__main__':
    main()

　　　　The major code changes compared with the version from part 1 are:

　　　　　　1. The get_next_token method was refactored a bit. The logic to increment the pos pointer was factored into a separate method, advance.

　　　　　　2. Two more methods were added: skip_whitespace to ignore whitespace characters and integer to handle multi-digit integers in the input.

　　　　　　3. The expr method was modified to recognize the INTEGER -> MINUS -> INTEGER phrase in addition to the INTEGER -> PLUS -> INTEGER phrase.

　　　　　　　The method also successfully interprets both addition and subtraction after having recognized the corresponding phrase.

　　　　In order for us to round out the discussion of tokens we need to mention lexemes. A lexeme is a sequence of characters that form a token. Here I use the examples from the original blog post directly to explain.

　　　　　　　　　　　　Token Sample lexemes

　　　　　　　　　　INTEGER 342,9,0,1...

　　　　　　　　　　PLUS +

　　　　　　　　　　MINUS -

　　We have already mentioned that the expr method is where the interpretation of an arithmetic expression actually happens. But before you can interpret an expression you first need to recognize what kind of phrase it is, whether it is addition or subtraction, for example. That's what the expr method essentially does: it finds the structure in the stream of tokens that it gets from the get_next_token method and then it interprets the phrase that it has recognized, generating the result of the arithmetic expression.

　　The process of finding the structure in the stream of tokens, or put differently, the process of recognizing a phrase in the stream of tokens, is called parsing. The part of an interpreter or a compiler that performs that job is called a parser.

　　So we know the expr method is the part of our interpreter where both parsing and interpreting happen: the expr method first tries to recognize (parse) the INTEGER -> PLUS -> INTEGER or the INTEGER -> MINUS -> INTEGER phrase in the stream of tokens, and after it has successfully recognized (parsed) one of those phrases, the method interprets it and returns the result of either addition or subtraction of two integers to the caller.

　　And this time we need to rewrite the code to meet the following requirements:

　　　　1. Extend the calculator to handle multiplication and division of two integers

　　　　2. Modify the code to interpret expressions containing an arbitrary number of additions and subtractions, for example "3 + 4 - 5 + 11"

　　The following is the code I rewrote according to the requirements:

  1 INTEGER, PLUS, MINUS, EOF, MULTI , DIV = 'INTEGER', 'PLUS', 'MINUS', 'EOF' , 'MULTI' , 'DIV'
  2 
  3 class Token(object):
  4     def __init__(self, type, value):
  5         # token type: INTEGER,PLUS,MINUS,MULTI,DIV,EOF
  6         self.type = type
  7         # token value: non-negative integer value, '+', '-', '*','/' or None
  8         self.value = value
  9 
 10     def __str__(self):
 11         """String representation of the class instance.
 12 
 13         Examples:
 14             Token(INTEGER, 3)
 15             Token(PLUS '+')
 16         """
 17         return 'Token({type}, {value})'.format(
 18             type=self.type,
 19             value=repr(self.value)
 20         )
 21 
 22     def __repr__(self):
 23         return self.__str__()
 24 
 25 class Interpreter(object):
 26     def __init__(self,text):
 27         self.text = text
 28         self.pos = 0
 29         self.current_token = None
 30         self.current_char = self.text[self.pos]
 31 
 32     def error(self):
 33         raise Exception('Error parsing input')
 34 
 35     def advance(self):
 36         """Advance the 'pos' pointer and
 37         set the 'current_char'variable."""
 38         self.pos += 1
 39         if self.pos > len(self.text) - 1:
 40             self.current_char = None
 41         else:
 42             self.current_char = self.text[self.pos]
 43 
 44     def skip_whitespace(self):
 45         while self.current_char is not None and self.current_char.isspace():
 46             self.advance()
 47 
 48     def integer(self):
 49         result = ''
 50         while self.current_char is not None and self.current_char.isdigit():
 51             result += self.current_char
 52             self.advance()
 53         return int(result)
 54 
 55     def get_next_token(self):
 56         """
 57         This method is responsible for breaking a sentence
 58         apart into tokens.
 59         :return: Token
 60         """
 61         while self.current_char is not None:
 62             if self.current_char.isspace():
 63                 self.skip_whitespace()
 64                 continue
 65             if self.current_char.isdigit():
 66                 return Token(INTEGER,self.integer())
 67             if self.current_char == '+':
 68                 self.advance()
 69                 return Token(PLUS,'+')
 70             if self.current_char == '-':
 71                 self.advance()
 72                 return Token(MINUS,'-')
 73             if self.current_char == '*':
 74                 self.advance()
 75                 return Token(MULTI,'*')
 76             if self.current_char == '/':
 77                 self.advance()
 78                 return Token(DIV,'/')
 79             self.error()
 80         return Token(EOF,None)
 81 
 82     def eat(self, token_type):
 83         if self.current_token.type == token_type:
 84             self.current_token = self.get_next_token()
 85         else:
 86             self.error()
 87 
 88     def expr(self):
 89         self.current_token = self.get_next_token()
 90         result = self.current_token
 91         while self.pos < len(self.text) :
 92             left = result
 93             self.eat(self.current_token.type)
 94             op = self.current_token
 95             if op.type == PLUS:
 96                 self.eat(PLUS)
 97             elif op.type == MINUS:
 98                 self.eat(MINUS)
 99             elif op.type == MULTI:
100                 self.eat(MULTI)
101             else:
102                 self.eat(DIV)
103             right = self.current_token
104             #self.eat(INTEGER)
105             result.type = INTEGER
106             if op.type == PLUS:
107                 result.value = left.value + right.value
108             elif op.type == MINUS:
109                 result.value = left.value - right.value
110             elif op.type == MULTI:
111                 result.value = left.value * right.value
112             else:
113                 result.value = left.value / right.value
114         return result
115 
def main():
    """Read expressions from the 'calc>' prompt and print each result.

    The loop ends on end-of-input (EOFError); empty lines are skipped.
    """
    while True:
        try:
            text = input('calc>')
        except EOFError:
            break
        if not text:
            continue
        interpreter = Interpreter(text)
        # expr() returns a Token; only its value is shown to the user.
        result = interpreter.expr()
        print(result.value)

# Run the calculator prompt only when executed as a script.
if __name__ == '__main__':
    main()

　　This is the result of running on my computer:　

　　I added the variable result to the expr method and created a loop based on the position of pos. We use the variable result to receive the first token and assign result to left every time in the loop. Because the type of current_token is INTEGER every time we loop, we can replace self.eat(self.current_token.type) with self.eat(INTEGER).

　　　That's all for today. I hope there will be new progress every day.

发表于 2021-12-04 23:00 睡键盘的猫阅读(80) 评论(0) 收藏举报

刷新页面返回顶部

Build a simple interpreter --Part 2

公告

导航