Markup添加标记

本文内容来自《python基础教程第二版》上的项目。

Markup要做的就是为纯文本添加一些格式。以一个文本文件作为输入,然后在浏览器中查看输出的结果,或者直接检查新增的标签。

首先,我们要做的是将文本分块,具体是不断地读入行直到遇到空行,再将前面的所有行加在一起算一个块。

util.py 包含两个工具性的函数:第一个函数lines只是在文件的末尾加了一个空行,为了能让第二个函数blocks找到最后一个块的结束标志。

def lines(file):
	'''
	Iterate over the lines of file, followed by one extra '\\n' line.

	The trailing blank line acts as an end marker for the last block.
	'''
	# Lazily walk the real lines first, then the single sentinel line.
	return (text for source in (file, ['\n']) for text in source)

def blocks(file):
	'''
	Yield the blocks of file: runs of non-blank lines joined together.

	Each block is the concatenation of consecutive non-blank lines with
	leading and trailing whitespace stripped. Blank lines separate blocks
	and are never yielded themselves.
	'''
	pending = []
	for raw in lines(file):
		if raw.strip():
			# Non-blank line: it belongs to the block being collected.
			pending.append(raw)
			continue
		if not pending:
			# Blank line with nothing collected yet: nothing to emit.
			continue
		yield ''.join(pending).strip()
		pending = []

后面的handler、rule和主程序markup我不太好解释,可能是自己理解上还不到位。

handlers.py用来处理具体的标记添加和文本替换。

class Handler:
	'''
	Dispatch calls from the Parser to optional, name-based methods.

	start(name), end(name) and sub(name) are routed to methods called
	'start_' + name, 'end_' + name and 'sub_' + name respectively. A name
	without a matching callable attribute is silently ignored, so
	subclasses only define the methods for the names they care about.
	'''
	def callback(self, prefix, name, *args):
		'''
		Call the method named prefix + name with args, if there is one.

		Returns the method's result, or None when this object has no
		callable attribute of that name.
		'''
		method = getattr(self, prefix + name, None)
		if callable(method):
			return method(*args)
		return None
	def start(self, name):
		'''Invoke the 'start_' + name method, if defined.'''
		self.callback('start_', name)
	def end(self, name):
		'''Invoke the 'end_' + name method, if defined.'''
		self.callback('end_', name)
	def sub(self, name):
		'''
		Return a replacement function suitable for re.sub.

		The returned function passes the match object to the
		'sub_' + name method and returns its result. When no such method
		exists (or it returns None) the matched text is returned
		unchanged.
		'''
		def substitution(match):
			result = self.callback('sub_', name, match)
			if result is None:
				# Nothing handled this match: keep the original text
				# rather than handing None back to re.sub.
				return match.group(0)
			return result
		return substitution

class HTMLRenderer(Handler):
	'''
	A handler that renders HTML by printing tags to standard output.

	The start_*, end_* and sub_* methods are looked up by name through
	Handler's start(), end() and sub() methods, so each method name must
	be exactly 'start_', 'end_' or 'sub_' followed by the name passed in.
	'''
	def start_document(self):
		print('<html><head><title>...</title></head><body>')
	def end_document(self):
		print('</body></html>')
	def start_paragraph(self):
		print('<p>')
	def end_paragraph(self):
		print('</p>')
	def start_heading(self):
		print('<h2>')
	def end_heading(self):
		print('</h2>')
	def start_list(self):
		print('<ul>')
	def end_list(self):
		print('</ul>')
	def start_listitem(self):
		print('<li>')
	def end_listitem(self):
		print('</li>')
	def start_title(self):
		print('<h1>')
	def end_title(self):
		print('</h1>')
	def sub_emphasis(self, match):
		'''Wrap the first captured group in <em> tags.'''
		return '<em>%s</em>' % match.group(1)
	def sub_url(self, match):
		'''Turn the first captured group into a link to itself.'''
		return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
	def sub_mail(self, match):
		'''Turn the first captured group into a mailto: link.'''
		return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
	def feed(self, data):
		'''Print the text of a block as-is.'''
		print(data)

	# The earlier misspelled method names are kept as aliases so that any
	# code calling them directly keeps working.
	end_paragragh = end_paragraph
	start_tile = start_title
	end_tile = end_title

rules.py 用来识别文本块中的标题,段落,列表等格式。

class Rule:
	'''
	Common behaviour shared by all rules.

	Subclasses provide a 'type' attribute naming the kind of block.
	'''
	def action(self, block, handler):
		'''
		Emit block through handler, wrapped in start/end calls.

		Always returns True.
		'''
		kind = self.type
		handler.start(kind)
		handler.feed(block)
		handler.end(kind)
		return True

class HeadingRule(Rule):
	'''
	Rule for headings: a single line of at most 70 characters that
	does not end with a colon.
	'''
	type = 'heading'

	def condition(self, block):
		'''Return True when block looks like a heading.'''
		if '\n' in block:
			# More than one line: not a heading.
			return False
		if len(block) > 70:
			return False
		return block[-1] != ':'

class TitleRule(HeadingRule):
	'''
	Rule for the title: the first block checked by this rule, provided
	that it satisfies the heading condition.
	'''
	type = 'title'
	# True until condition() has been called once on this instance.
	first = True

	def condition(self, block):
		'''Apply the heading test on the first call only; False afterwards.'''
		if self.first:
			self.first = False
			return HeadingRule.condition(self, block)
		return False

class ListItemRule(Rule):
	'''
	Rule for list items: blocks whose first character is a hyphen.

	The leading hyphen is dropped and the remaining text is stripped of
	surrounding whitespace before it is fed to the handler.
	'''
	type = 'listitem'

	def condition(self, block):
		'''Return True when block starts with '-'.'''
		leading = block[0]
		return leading == '-'

	def action(self, block, handler):
		'''Emit the item text (without its hyphen); always returns True.'''
		kind = self.type
		handler.start(kind)
		item_text = block[1:].strip()
		handler.feed(item_text)
		handler.end(kind)
		return True

class ListRule(ListItemRule):
	'''
	Rule that opens and closes a list around consecutive list items.

	The list is started at the first list item seen while outside a list,
	and ended at the first block that is not a list item while inside one.
	'''
	type = 'list'
	# Whether a list has been started and not yet ended.
	inside = False

	def condition(self, block):
		'''Match every block so that action() sees all of them.'''
		return True

	def action(self, block, handler):
		'''
		Start or end the list as needed.

		Always returns False.
		'''
		is_item = ListItemRule.condition(self, block)
		if is_item and not self.inside:
			handler.start(self.type)
			self.inside = True
		elif self.inside and not is_item:
			handler.end(self.type)
			self.inside = False
		return False

class ParagraphRule(Rule):
	'''
	Catch-all rule: its condition accepts every block, so it covers any
	block not already handled by another rule.
	'''
	type = 'paragraph'

	def condition(self, block):
		'''Accept any block.'''
		return True

markup主程序。一个简单的文本分析器。

import sys, re
from handlers import *
from util import *
from rules import *

class Parser:
	'''
	Reads a text file block by block, applying filters and rules and
	driving a handler with the results.
	'''
	def __init__(self, handler):
		self.handler = handler
		self.rules = []
		self.filters = []

	def addRule(self, rule):
		'''Append rule to the list of rules tried for each block.'''
		self.rules.append(rule)

	def addFilter(self, pattern, name):
		'''
		Register a filter that rewrites matches of pattern in a block.

		The replacement for each match comes from handler.sub(name).
		'''
		def apply_filter(block, handler):
			replacement = handler.sub(name)
			return re.sub(pattern, replacement, block)
		self.filters.append(apply_filter)

	def parse(self, file):
		'''
		Process every block of file between a 'document' start and end.

		Each block is passed through all filters in order, then offered
		to the rules in order. Rules whose condition holds have their
		action run, stopping at the first action that returns a true value.
		'''
		handler = self.handler
		handler.start('document')
		for block in blocks(file):
			for apply_filter in self.filters:
				block = apply_filter(block, handler)
			for rule in self.rules:
				if rule.condition(block) and rule.action(block, handler):
					break
		handler.end('document')

class BasicTextParser(Parser):
	'''
	A Parser that registers a fixed set of rules and filters in its
	constructor.
	'''
	def __init__(self, handler):
		Parser.__init__(self, handler)

		# Rules are added in the order in which they are tried.
		rule_classes = (
			ListRule,
			ListItemRule,
			TitleRule,
			HeadingRule,
			ParagraphRule,
		)
		for rule_class in rule_classes:
			self.addRule(rule_class())

		# (pattern, name) pairs for emphasis, URLs and mail addresses.
		filter_specs = (
			(r'\*(.+?)\*', 'emphasis'),
			(r'(http://[\.a-zA-Z/]+)', 'url'),
			(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
		)
		for pattern, name in filter_specs:
			self.addFilter(pattern, name)

if __name__ == '__main__':
	# Parse the text arriving on standard input with an HTML-rendering handler.
	BasicTextParser(HTMLRenderer()).parse(sys.stdin)

  

 

posted @ 2014-09-28 15:21  Seandor  阅读(265)  评论(0编辑  收藏  举报