Markup添加标记
本文内容来自《python基础教程第二版》上的项目。
Markup要做的就是为纯文本添加一些格式。以一个文本文件作为输入,然后在浏览器中查看输出的结果,或者直接检查新增的标签。
首先,我们要做的是将文本分块,具体是不断地读入行直到遇到空行,再将前面的所有行加在一起算一个块。
util.py 包含两个工具性的函数,第一个只是在文件的末尾加了一个空行,为了能让blocks函数找到最后一个块的结束标志。
def lines(file):
    """Yield every line of *file*, then one extra blank line.

    The trailing '\\n' is a sentinel: it guarantees that blocks() sees a
    blank line after the final block and therefore emits it.
    """
    for text in file:
        yield text
    yield '\n'


def blocks(file):
    """Split *file* into blocks of consecutive non-blank lines.

    Each block is yielded as one string: its lines concatenated, with
    leading and trailing whitespace stripped. Runs of blank lines between
    blocks produce nothing.
    """
    pending = []
    for text in lines(file):
        if not text.strip():
            # Blank line: flush whatever has been collected so far.
            if pending:
                yield ''.join(pending).strip()
                pending = []
            continue
        pending.append(text)
后面的handlers、rules和主程序markup我不太好解释,可能是自己理解上还不到位。
handlers.py用来处理具体的标记添加和文本替换。
class Handler:
    """Dispatch named markup events to optional handler methods.

    start(name) and end(name) look up and call ``start_<name>`` and
    ``end_<name>`` on the instance. sub(name) returns a function suitable
    as the replacement argument of re.sub; it delegates to ``sub_<name>``.
    Methods that do not exist are silently skipped.
    """

    def callback(self, prefix, name, *args):
        """Call the method named ``prefix + name`` with *args*, if present.

        Returns the method's result, or None when no such callable
        attribute exists.
        """
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)
        return None

    def start(self, name):
        """Invoke ``start_<name>`` if the handler defines it."""
        self.callback('start_', name)

    def end(self, name):
        """Invoke ``end_<name>`` if the handler defines it."""
        self.callback('end_', name)

    def sub(self, name):
        """Return a substitution function that delegates to ``sub_<name>``."""
        def substitution(match):
            result = self.callback('sub_', name, match)
            if result is None:
                # No handler for this filter (or it declined): keep the
                # matched text unchanged instead of returning None.
                return match.group(0)
            return result
        return substitution


class HTMLRenderer(Handler):
    """Handler that renders the markup events as HTML on standard output.

    The ``start_*``/``end_*``/``sub_*`` methods are reached through the
    dispatching start(), end() and sub() methods of Handler.
    """

    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    # Misspelled names from the original listing, kept as aliases so that
    # any code calling them directly keeps working.
    end_paragragh = end_paragraph
    start_tile = start_title
    end_tile = end_title

    def sub_emphasis(self, match):
        """Wrap the first captured group in <em> tags."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the first captured group into a hyperlink to itself."""
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Turn the first captured group into a mailto: link."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Print *data* (the text of a block) as-is."""
        print(data)
rules.py 用来识别文本块中的标题,段落,列表等格式。
class Rule:
    """Base class for all rules.

    Subclasses set a ``type`` attribute and provide condition(block). The
    default action wraps the block in start/end events of that type.
    """

    def action(self, block, handler):
        """Emit start(type), feed(block), end(type) on *handler*.

        Returns True to signal that the block has been fully handled.
        """
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """A heading is a single line of at most 70 characters that does not
    end with a colon.
    """
    type = 'heading'

    def condition(self, block):
        """Return True if *block* qualifies as a heading.

        An empty block is never a heading (and no longer raises
        IndexError).
        """
        return (bool(block)
                and '\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """The title is the first block seen by this rule, provided that it
    is a heading.
    """
    type = 'title'
    first = True

    def condition(self, block):
        """Return True only for the first block checked, if it is a heading."""
        if not self.first:
            return False
        # Only the very first block gets a chance, whether or not it matches.
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """A list item is a block that begins with a hyphen. The hyphen is
    removed as part of the formatting.
    """
    type = 'listitem'

    def condition(self, block):
        """Return True if *block* starts with '-'; False for an empty block."""
        return block.startswith('-')

    def action(self, block, handler):
        """Emit the item with its leading hyphen and surrounding whitespace removed."""
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


class ListRule(ListItemRule):
    """Tracks list boundaries: a list starts at the first list item after
    a non-item block and ends at the first non-item block after it.
    """
    type = 'list'
    inside = False

    def condition(self, block):
        """Always True: every block must be inspected to track list state."""
        return True

    def action(self, block, handler):
        """Emit start/end list events on state transitions.

        Always returns False so that later rules still process the block.
        """
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False


class ParagraphRule(Rule):
    """A paragraph is any block not claimed by another rule."""
    type = 'paragraph'

    def condition(self, block):
        """Always True: this rule accepts every block."""
        return True
markup主程序。一个简单的文本分析器。
import sys
import re

from handlers import *
from util import *
from rules import *


class Parser:
    """Read a text file block by block, applying filters and rules while
    driving a handler.
    """

    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Register *rule*; rules are tried in registration order."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a regex filter whose replacement is handler.sub(name)."""
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """Process *file* between 'document' start and end events.

        Every block is passed through all filters, then offered to the
        rules in order; the first rule whose action returns a true value
        stops rule processing for that block.
        """
        renderer = self.handler
        renderer.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, renderer)
            for rule in self.rules:
                if not rule.condition(block):
                    continue
                if rule.action(block, renderer):
                    break
        renderer.end('document')


class BasicTextParser(Parser):
    """A Parser preloaded with the standard rules and inline filters."""

    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: rules are tried in this sequence for each block.
        for rule_class in (ListRule, ListItemRule, TitleRule,
                           HeadingRule, ParagraphRule):
            self.addRule(rule_class())
        inline_filters = (
            (r'\*(.+?)\*', 'emphasis'),
            (r'(http://[\.a-zA-Z/]+)', 'url'),
            (r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
        )
        for pattern, name in inline_filters:
            self.addFilter(pattern, name)


if __name__ == '__main__':
    handler = HTMLRenderer()
    parser = BasicTextParser(handler)
    parser.parse(sys.stdin)