Best of Ruby Quiz - GEDCOM Parser
[0] 首先,分析书后源代码,整题建立在下面的假设上:Level 标号从0开始,依次递增,且递增跨度为1,输入不存在错误
也正是因为这一假设,LEVEL 值可以不作为节点属性处理:LEVEL 值实际上代表了节点在堆栈中的位置,用堆栈结构可以轻松化解 LEVEL 带来的麻烦
[1] 第一种想法是用REXML,将字符串解析后存入XML结构,最后统一输出,源代码和书上一样
require 'rexml/document'
# Build the whole GEDCOM tree in memory as a REXML document, then write it
# out in one go. Usage: ruby <script> GEDCOM_FILE  (writes output_std.txt)
doc = REXML::Document.new '<gedcom/>'
# stack[0] is the <gedcom> root; stack[i] is the element currently open at
# GEDCOM level i - 1, so the level never has to be stored on a node.
stack = [doc.root]
# String#each was removed in Ruby 1.9; File.foreach yields the file line by
# line (and does not load the whole file into memory first).
File.foreach(ARGV[0]) do |line|
  next if line =~ /^\s*$/
  line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "Invalid GEDCOM"
  level, tag, data = $1.to_i, $2, $3
  # Drop every element at this level or deeper so the parent is on top.
  stack.pop while level + 1 < stack.size
  parent = stack.last
  if tag =~ /^@(\S+)@$/
    # "@ID@ TYPE" line: the element is named after the type, the id becomes
    # an attribute.
    ele = parent.add_element data
    ele.attributes['id'] = tag
  else
    # "TAG data" line: the element is named after the tag, data is its text.
    ele = parent.add_element tag
    ele.text = data
  end
  stack.push ele
end
File.open("output_std.txt", "w") do |file|
  doc.write(file, 0)
end
# Build the whole GEDCOM tree in memory as a REXML document, then write it
# out in one go. Usage: ruby <script> GEDCOM_FILE  (writes output_std.txt)
doc = REXML::Document.new '<gedcom/>'
# stack[0] is the <gedcom> root; stack[i] is the element currently open at
# GEDCOM level i - 1, so the level never has to be stored on a node.
stack = [doc.root]
# String#each was removed in Ruby 1.9; File.foreach yields the file line by
# line (and does not load the whole file into memory first).
File.foreach(ARGV[0]) do |line|
  next if line =~ /^\s*$/
  line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "Invalid GEDCOM"
  level, tag, data = $1.to_i, $2, $3
  # Drop every element at this level or deeper so the parent is on top.
  stack.pop while level + 1 < stack.size
  parent = stack.last
  if tag =~ /^@(\S+)@$/
    # "@ID@ TYPE" line: the element is named after the type, the id becomes
    # an attribute.
    ele = parent.add_element data
    ele.attributes['id'] = tag
  else
    # "TAG data" line: the element is named after the tag, data is its text.
    ele = parent.add_element tag
    ele.text = data
  end
  stack.push ele
end
File.open("output_std.txt", "w") do |file|
  doc.write(file, 0)
end
[2] 当数据规模很大时,不可能先将所有数据存入内存中的 XML 结构,再统一输出,因此改为边解析边输出(So fight and run)
# One GEDCOM line rendered as an XML element. It is split into an opening
# part and a closing part so that output can be streamed while parsing
# instead of building a tree first.
class Node
  # tag_or_id - second field of the GEDCOM line: a tag, or an "@ID@" token.
  # data      - the rest of the line; on an "@ID@" line this is the record
  #             type and becomes the element name.
  def initialize(tag_or_id, data = "")
    # Anchored so that only a token that is entirely "@...@" counts as an
    # id, matching the /^@(\S+)@$/ test used by the REXML version.
    if tag_or_id =~ /\A@\S+@\z/
      @name, @myid, @value = data, tag_or_id, ""
    else
      @name, @value, @myid = tag_or_id, data, ""
    end
  end

  # Returns the opening tag (with an id attribute when there is one),
  # followed by the text value on its own line when it is non-empty.
  def to_s_first
    opening = @myid.empty? ? "<#{@name}>\n" : "<#{@name} id=\"#{@myid}\">\n"
    @value.empty? ? opening : "#{opening}#{@value}\n"
  end

  # Returns the closing tag.
  def to_s_last
    "</#{@name}>\n"
  end
end
# Stream the GEDCOM file given as the first command-line argument to
# output.txt as XML, writing each opening tag when its node is pushed and
# each closing tag when it is popped.
# (Parentheses are required here: `[Node.new "gedcom"]` no longer parses.)
stack = [Node.new("gedcom")]
File.open("output.txt", "w") do |file|
  file.print stack.first.to_s_first
  # String#each was removed in Ruby 1.9; File.foreach reads line by line.
  File.foreach($*[0]) do |line|
    next if line =~ /^\s*$/
    line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "error"
    level, tag_or_id, data = $1.to_i, $2, $3
    # Close every node at this level or deeper before opening the new one.
    file.print stack.pop.to_s_last while level + 1 < stack.size
    node = Node.new(tag_or_id, data)
    file.print node.to_s_first
    stack.push node
  end
  # Close everything still open, innermost first. Closing only the root
  # would leave the last record's elements without closing tags.
  file.print stack.pop.to_s_last until stack.empty?
end
# One GEDCOM line rendered as an XML element. It is split into an opening
# part and a closing part so that output can be streamed while parsing
# instead of building a tree first.
class Node
  # tag_or_id - second field of the GEDCOM line: a tag, or an "@ID@" token.
  # data      - the rest of the line; on an "@ID@" line this is the record
  #             type and becomes the element name.
  def initialize(tag_or_id, data = "")
    # Anchored so that only a token that is entirely "@...@" counts as an
    # id, matching the /^@(\S+)@$/ test used by the REXML version.
    if tag_or_id =~ /\A@\S+@\z/
      @name, @myid, @value = data, tag_or_id, ""
    else
      @name, @value, @myid = tag_or_id, data, ""
    end
  end

  # Returns the opening tag (with an id attribute when there is one),
  # followed by the text value on its own line when it is non-empty.
  def to_s_first
    opening = @myid.empty? ? "<#{@name}>\n" : "<#{@name} id=\"#{@myid}\">\n"
    @value.empty? ? opening : "#{opening}#{@value}\n"
  end

  # Returns the closing tag.
  def to_s_last
    "</#{@name}>\n"
  end
end
# Stream the GEDCOM file given as the first command-line argument to
# output.txt as XML, writing each opening tag when its node is pushed and
# each closing tag when it is popped.
# (Parentheses are required here: `[Node.new "gedcom"]` no longer parses.)
stack = [Node.new("gedcom")]
File.open("output.txt", "w") do |file|
  file.print stack.first.to_s_first
  # String#each was removed in Ruby 1.9; File.foreach reads line by line.
  File.foreach($*[0]) do |line|
    next if line =~ /^\s*$/
    line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "error"
    level, tag_or_id, data = $1.to_i, $2, $3
    # Close every node at this level or deeper before opening the new one.
    file.print stack.pop.to_s_last while level + 1 < stack.size
    node = Node.new(tag_or_id, data)
    file.print node.to_s_first
    stack.push node
  end
  # Close everything still open, innermost first. Closing only the root
  # would leave the last record's elements without closing tags.
  file.print stack.pop.to_s_last until stack.empty?
end
将节点打包成类,第二步解析交给类构造去做,输出也交给类做
在入栈时输出第一部分,在出栈时输出第二部分
自然想到将栈也打包
require 'rexml/text'

# One GEDCOM line rendered as an XML element, split into an opening part and
# a closing part so that output can be streamed while parsing.
class Node
  # tag_or_id - second field of the GEDCOM line: a tag, or an "@ID@" token.
  # data      - the rest of the line; on an "@ID@" line this is the record
  #             type and becomes the element name.
  def initialize(tag_or_id, data = "")
    # Anchored so that only a token that is entirely "@...@" counts as an id.
    if tag_or_id =~ /\A@\S+@\z/
      @name, @myid, @value = data, tag_or_id, ""
    else
      @name, @value, @myid = tag_or_id, data, ""
    end
  end

  # Returns the opening tag (with an id attribute when there is one),
  # followed by the text value on its own line when it is non-empty.
  def to_s_first
    opening = @myid.empty? ? "<#{@name}>\n" : "<#{@name} id='#{@myid}'>\n"
    @value.empty? ? opening : "#{opening}#{@value}\n"
  end

  # Returns the closing tag.
  def to_s_last
    "</#{@name}>\n"
  end
end

# Stack of open Nodes that prints a node's opening part to $stdout when it is
# pushed and its closing part when it is popped.
class Stack < Array
  # Raises RuntimeError ("type error") unless obj is a Node.
  def push(obj)
    raise "type error" unless obj.is_a? Node
    print obj.to_s_first
    super(obj)
  end

  # Returns the popped Node, or nil (printing nothing) when the stack is
  # empty, like Array#pop.
  def pop
    return nil if empty?
    print last.to_s_last
    super
  end
end

# Runs the block with $stdout redirected to +file+. The previous $stdout is
# restored afterwards, even when the block raises.
def file_write_env(file)
  previous = $stdout
  $stdout = file
  yield
ensure
  $stdout = previous
end

# Converts the GEDCOM file at input_path to XML written to output_path.
# Raises RuntimeError ("error") on a line that is not "LEVEL TAG_OR_ID [DATA]".
def gedcom_to_xml(input_path, output_path = "output.txt")
  File.open(output_path, "w") do |file|
    file_write_env(file) do
      stack = Stack.new
      stack.push(Node.new("gedcom"))
      # String#each was removed in Ruby 1.9; File.foreach reads line by line.
      File.foreach(input_path) do |line|
        next if line =~ /^\s*$/
        line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "error"
        # Escape XML special characters in the data so the hand-written
        # output is valid XML, as REXML's own output would be.
        level, tag_or_id, data = $1.to_i, $2, REXML::Text.normalize($3)
        # Close every node at this level or deeper before opening the new one.
        stack.pop while level + 1 < stack.size
        stack.push Node.new(tag_or_id, data)
      end
      # Close everything still open, innermost first. A single pop would
      # leave the enclosing elements and the root without closing tags.
      stack.pop until stack.empty?
    end
  end
end

if ARGV.empty?
  warn "usage: ruby #{$PROGRAM_NAME} GEDCOM_FILE"
else
  gedcom_to_xml(ARGV[0])
end
require 'rexml/text'

# One GEDCOM line rendered as an XML element, split into an opening part and
# a closing part so that output can be streamed while parsing.
class Node
  # tag_or_id - second field of the GEDCOM line: a tag, or an "@ID@" token.
  # data      - the rest of the line; on an "@ID@" line this is the record
  #             type and becomes the element name.
  def initialize(tag_or_id, data = "")
    # Anchored so that only a token that is entirely "@...@" counts as an id.
    if tag_or_id =~ /\A@\S+@\z/
      @name, @myid, @value = data, tag_or_id, ""
    else
      @name, @value, @myid = tag_or_id, data, ""
    end
  end

  # Returns the opening tag (with an id attribute when there is one),
  # followed by the text value on its own line when it is non-empty.
  def to_s_first
    opening = @myid.empty? ? "<#{@name}>\n" : "<#{@name} id='#{@myid}'>\n"
    @value.empty? ? opening : "#{opening}#{@value}\n"
  end

  # Returns the closing tag.
  def to_s_last
    "</#{@name}>\n"
  end
end

# Stack of open Nodes that prints a node's opening part to $stdout when it is
# pushed and its closing part when it is popped.
class Stack < Array
  # Raises RuntimeError ("type error") unless obj is a Node.
  def push(obj)
    raise "type error" unless obj.is_a? Node
    print obj.to_s_first
    super(obj)
  end

  # Returns the popped Node, or nil (printing nothing) when the stack is
  # empty, like Array#pop.
  def pop
    return nil if empty?
    print last.to_s_last
    super
  end
end

# Runs the block with $stdout redirected to +file+. The previous $stdout is
# restored afterwards, even when the block raises.
def file_write_env(file)
  previous = $stdout
  $stdout = file
  yield
ensure
  $stdout = previous
end

# Converts the GEDCOM file at input_path to XML written to output_path.
# Raises RuntimeError ("error") on a line that is not "LEVEL TAG_OR_ID [DATA]".
def gedcom_to_xml(input_path, output_path = "output.txt")
  File.open(output_path, "w") do |file|
    file_write_env(file) do
      stack = Stack.new
      stack.push(Node.new("gedcom"))
      # String#each was removed in Ruby 1.9; File.foreach reads line by line.
      File.foreach(input_path) do |line|
        next if line =~ /^\s*$/
        line =~ /^\s*(\d+)\s+(@\S+@|\S+)\s*(.*?)$/ or raise "error"
        # Escape XML special characters in the data so the hand-written
        # output is valid XML, as REXML's own output would be.
        level, tag_or_id, data = $1.to_i, $2, REXML::Text.normalize($3)
        # Close every node at this level or deeper before opening the new one.
        stack.pop while level + 1 < stack.size
        stack.push Node.new(tag_or_id, data)
      end
      # Close everything still open, innermost first. A single pop would
      # leave the enclosing elements and the root without closing tags.
      stack.pop until stack.empty?
    end
  end
end

if ARGV.empty?
  warn "usage: ruby #{$PROGRAM_NAME} GEDCOM_FILE"
else
  gedcom_to_xml(ARGV[0])
end
第50行(即调用 REXML::Text::normalize 的那一行)对字符串进行转义(escaping),达到和 REXML 输出差不多的效果,完成字符转义
[PS] 有一点不很明白,用REXML构造法得到的输出,和最后的方法得到的输出在空格处理上有一定差异,REXML对于多空格只输出一个,而最后两种方法忠实于输入