繁体字转化为简体字 https://share.weiyun.com/ZsmZl6g5 密码:rc8mva

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Copyright © 2022, 飞麦 <fitmap@qq.com>, All rights reserved.

# frozen_string_literal: true

#
# Converts the traditional Chinese characters in a string to their
# simplified counterparts.
#
# The mapping comes from two parallel tables that are loaded lazily from disk:
# the character at index +i+ of the traditional table maps to the character at
# index +i+ of the simplified table. The traditional table must be sorted in
# ascending order (see Chinese.check) because lookups use binary search.
#
module Chinese
    # Binary-searches the sorted traditional table for a character.
    #
    # * +ele+ - the character to look up
    # * return the zero-based index of +ele+ in the table, or -1 when absent
    #
    def self.binary_search(ele)
        # Find-any mode: the block tells bsearch on which side of the probe +ele+ lies.
        traditional_chars.bsearch_index { |candidate| ele <=> candidate } || -1
    end

    # Converts a traditional string into a simplified one; characters that are
    # not in the traditional table are copied unchanged.
    #
    # Note: several traditional characters can map to a single simplified
    # character, so a reverse per-character conversion is not accurate. A few
    # single traditional characters also correspond to more than one simplified
    # character, so this forward per-character conversion is not always exact.
    #
    # * +str_traditional+ - traditional string (may contain simplified characters)
    # * return the simplified string
    #
    def self.simplify(str_traditional)
        targets = simplified_chars
        converted = str_traditional.each_char.map do |ch|
            index = binary_search(ch)
            index.negative? ? ch : targets[index]
        end
        converted.join
    end

    # Same conversion as Chinese.simplify, kept under its original name for
    # existing callers. It used to build the result with repeated String#+,
    # which allocates a new string per character; it now shares the
    # implementation of Chinese.simplify.
    #
    # * +str_traditional+ - traditional string (may contain simplified characters)
    # * return the simplified string
    #
    def self.simplify2(str_traditional)
        simplify(str_traditional)
    end

    # Verifies that the traditional table is in strictly ascending order,
    # printing one line for every adjacent pair that is out of order.
    #
    # * return true when the whole table is ascending, false otherwise
    #
    def self.check
        okay = true
        traditional_chars.each_cons(2) do |previous, current|
            next if current > previous

            puts "Bad sequence on #{previous} ~ #{current}"
            okay = false
        end
        okay
    end

    # Characters of the ascending traditional table, loaded on first use.
    #
    # Indexing a multibyte String by character position is not constant-time in
    # MRI, so the table is split into an Array once to keep each probe cheap.
    # NOTE(review): the file content is used verbatim, so a trailing newline
    # would count as a table entry - confirm the data files have none.
    def self.traditional_chars
        @traditional ||= File.read('/N/chn/traditional.txt')
        @traditional_chars ||= @traditional.chars.freeze
    end

    # Characters of the simplified table, index-aligned with the traditional
    # table and loaded on first use.
    def self.simplified_chars
        @simplified ||= File.read('/N/chn/simplified.txt')
        @simplified_chars ||= @simplified.chars.freeze
    end

    private_class_method :traditional_chars, :simplified_chars
end

# Command-line entry point: <script> INPUT_FILE OUTPUT_FILE
# Reads INPUT_FILE line by line, converts each line with Chinese.simplify and
# writes the result to OUTPUT_FILE.
if $PROGRAM_NAME == __FILE__
    # Fail with a readable message instead of the TypeError File.open(nil) raises.
    abort "Usage: #{$PROGRAM_NAME} INPUT_FILE OUTPUT_FILE" if ARGV.length < 2

    input_name, output_name = ARGV
    # The input is opened first, so a missing input never creates the output file.
    File.open(input_name, 'r') do |file_input|
        # The output is opened in binary mode ('wb').
        File.open(output_name, 'wb') do |file_output|
            file_input.each_line do |line|
                file_output.print Chinese.simplify(line)
            end
        end
    end
end
posted @ 2024-01-30 09:48  飞麦  阅读(178)  评论(0)  编辑  收藏  举报