# Python：手动借助 Google 翻译来翻译文档（translate HTML documentation by hand with the help of Google Translate）

import os
import re
  3 '''
  4 读取指定的html文件
  5 去掉所有的换行符
  6 正则匹配特定项目：(?<=<div class="block">).+?(?=</div>)
  7     然后替换掉：</code>|<code>|<i>|</i> ==> ""
  8                 <sup> ==> "^"
  9                 </sup> ==> ""
 10 输出取得的项目到文件中
 11 手工翻译并按行形成一个新文本（此时程序未结束运行，等待指令）
 12 直接回车（读取默认路径的文本文件）或者输入文件绝对路径
 13 读取结果文件，先检查行数是否一致。
 14 没问题就逐行查找替换
 15 然后格式化html内容，输出到源文件
 16 '''
 17 
 18 english_str_list = list();
 19 Label_list=list();                      #记录标签，防止因为标签的原因，导致html解析不正常，就是说，源文档提取出来的里面有多少标签，翻译出来的里面，最终写入的也要有那么多标签，至于文档可读性再说，解析都解析不出来，读个锤子
 20 chinese_str_list = list();
 21 html_txt = ""   #保存html文件内容
 22 english_out_file_path = "english.txt"   #保存需要翻译的内容
 23 temp_html_out_file_path = "temp_html.txt"   #保存中间处理过程中的html内容
 24 chinese_out_file_path = "翻译结果.txt"   #保存需要翻译的内容
 25 html_file_path = '' #程序运行后，设置该值，保存修改内容时需要使用
 26 
 27 '''
 28 从指定的文件中读取html内容并提取要翻译的内容
 29 '''
 30 def english_process(file_path:str):
 31     global html_txt,english_str_list
 32     with open(file_path,'r',encoding="utf-8")as fd:
 33         html_txt = fd.readlines()
 34         html_txt="".join(html_txt)
 35     html_txt = html_txt.replace("\n","")
 36     regex_div_block = r'(?<=<div class="block">).+?(?=</div>)'
 37     regex_del = r'</code>|<code>|<i>|</i>'
 38     html_txt = re.sub(regex_del,"",html_txt)
 39     html_txt = html_txt.replace("<sup>","^").replace("</sup>","")
 40     #提取
 41     result = re.findall(regex_div_block,html_txt)
 42     class_number = html_txt.count(r'<div class="block">')
 43     if(class_number != len(result)):
 44         print(r'<div class="block">'+"的数量为"+str(class_number)+"，但是正则匹配的数量为："+str(len(result)))
 45     else:
 46         print(r'<div class="block">'+"的数量为"+str(class_number))
 47     if(len(result)):
 48         for txt in result:
 49             english_str_list.append(txt)
 50     regex_div_deprecation_block = r'(?<=<div class="deprecation-block">).+?(?=</div>)'
 51     result = re.findall(regex_div_deprecation_block,html_txt)
 52     class_number = html_txt.count(r'<div class="deprecation-block">')
 53     if(class_number != len(result)):
 54         print(r'<div class="deprecation-block">'+"的数量为"+str(class_number)+"，但是正则匹配的数量为："+str(len(result)))
 55     else:
 56         print(r'<div class="deprecation-block">'+"的数量为"+str(class_number))
 57     if(len(result)):
 58         for txt in result:
 59             english_str_list.append(txt)
 60     regex_div_notes = r'(?<=<dl class="notes">).+?(?=</dl>)'
 61     result = re.findall(regex_div_notes,html_txt)
 62     class_number = html_txt.count(r'<dl class="notes">')
 63     if(class_number != len(result)):
 64         print('<dl class="notes">'+"的数量为"+str(class_number)+"，但是正则匹配的数量为："+str(len(result)))
 65     else:
 66         print(r'<dl class="notes">'+"的数量为"+str(class_number))
 67     if(len(result)):
 68         for txt in result:
 69             english_str_list.append(txt)
 70     if( not len(english_str_list)):
 71         print("没有匹配到任何要翻译的内容，请检查！")
 72         exit(0);
 73     #替换<>字符
 74     html_txt = html_txt.replace("&lt;","<").replace("&gt;",">")
 75     for index in range(len(english_str_list)):
 76         english_str_list[index] = english_str_list[index].replace("&lt;","<").replace("&gt;",">")
 77     #输出
 78     with open(english_out_file_path,'w',encoding="utf-8")as fd:
 79         for content in english_str_list:
 80             print(content,file=fd)
 81     #记录标签
 82     regex_label = r'<.+?>|\&lt;.+?\&gt;'
 83     for content in english_str_list:
 84         Label_list.append( re.findall(regex_label,content));
 85 '''
 86 翻译结果的标签处理
 87 '''
 88 def label_process(index:int):
 89     global Label_list,chinese_str_list
 90     #替换<>字符，理论上没有
 91     chinese_str_list[index] = chinese_str_list[index].replace("&lt;","<").replace("&gt;",">")
 92     size = len(chinese_str_list[index])
 93     result_str=""
 94     str_index=0
 95     Label_list_index=0
 96     while str_index<size:
 97         temp_str =""
 98         if('<' == chinese_str_list[index][str_index]):
 99             temp_str+=chinese_str_list[index][str_index]
100             str_index = str_index + 1
101             while '>' != chinese_str_list[index][str_index]:
102                 temp_str+=chinese_str_list[index][str_index]
103                 str_index = str_index + 1
104             temp_str+=chinese_str_list[index][str_index]
105             str_index = str_index + 1
106             if(Label_list_index<len(Label_list[index])):
107                 if(Label_list[index][Label_list_index] == temp_str):    #这里抛出越界异常的话，请查看翻译结果文档，和英文文档对应的行比对下，处理翻译后多余的内容后重新运行脚本。
108                     #标签一样
109                     Label_list_index += 1   #下次该比对下一个标签
110                     result_str+=temp_str
111                     temp_str = ""
112                 else:
113                     #标签不一样
114                     '''
115                         若只是有多余空格或者多、少了某个字符怎么办？
116                         若是就是缺失了原标签该怎么办
117                         目前暂时就在当前插入这个标签
118                     '''
119                     result_str+=Label_list[index][Label_list_index]
120                     Label_list_index += 1
121             else:
122                 break;
123         else:
124             result_str += chinese_str_list[index][str_index]
125             str_index+=1
126     chinese_str_list[index] = result_str
127 
def chinese_process(file_path:str):
    """Put the translated lines back where the English passages were.

    The translation file is read into ``chinese_str_list``.  If its line
    count equals the number of extracted passages, each line is cleaned up
    (stray spaces inside tags, a lost ``<`` before ``a href``, a mangled
    closing tag), passed through ``label_process`` and then substituted for
    the first occurrence of its English passage in ``html_txt``.  Passages
    containing ``</a>`` keep the original text after the translation.

    Args:
        file_path: path of the UTF-8 translation file, one line per passage.

    Returns:
        True when the translations were applied, False when the line counts
        differ (a message is printed and ``html_txt`` is left untouched).
    """
    global html_txt, chinese_str_list
    # "utf-8-sig" also reads plain UTF-8 and keeps a byte-order mark, if the
    # editor wrote one, out of the first translated line.
    with open(file_path, 'r', encoding="utf-8-sig") as fd:
        chinese_str_list = fd.readlines()
    if len(chinese_str_list) != len(english_str_list):
        print("待翻译的行数和翻译结果文件中的行数不一致，请检查！")
        return False

    for i, original in enumerate(english_str_list):
        line = chinese_str_list[i]
        line = re.sub(r"<\s+", "<", line)   # "< li>"  -> "<li>"
        line = re.sub(r"\s+>", ">", line)   # "<li >"  -> "<li>"
        line = re.sub(r'([^<])a href', r"\1<a href", line)  # "你a href" lost its "<"
        line = re.sub(r'(?<=</)\s+', r"", line)      # "</ xxx>" -> "</xxx>"
        # Handles results such as
        # <a href="../../../java.base/java/util/package-summary.html#CollectionsFramework">Java 集合框架</的成员一个>。
        # where translated text ended up inside the closing tag.
        line = re.sub(r'<([a-zA-Z]+?)(\s*[a-zA-Z"/=#\?\$\. -]+?>[^</>].+?)</([^a-zA-Z]+)>', r"<\1\2</\1>\3", line)
        chinese_str_list[i] = line
        # Must run after the clean-up above and before the text is written.
        label_process(i)
        if "</a>" in original:
            # Links tend to translate badly, so the original is appended.
            # Its spaces are doubled so that the appended copy is not matched
            # again when a later, identical passage is being replaced.
            spaced_original = original.replace(" ", "  ")
            html_txt = html_txt.replace(original, chinese_str_list[i] + r"<p>原文：<p>" + spaced_original, 1)
        else:
            html_txt = html_txt.replace(original, chinese_str_list[i], 1)
    return True
def other_translate():
    """Append a Chinese gloss to a fixed set of headings and captions.

    Every listed English fragment found in ``html_txt`` is replaced by the
    same fragment followed by its Chinese translation in full-width
    parentheses.  The replacements are applied in the listed order and the
    result is stored back into ``html_txt``.
    """
    global html_txt
    replacements = (
        (r"<h2>Field Summary</h2>", r"<h2>Field Summary（字段摘要）</h2>"),
        (r'<div class="caption"><span>Fields</span></div>', r'<div class="caption"><span>Fields（字段）</span></div>'),
        (r'<div class="table-header col-first">Modifier and Type</div>', r'<div class="table-header col-first">Modifier and Type（修饰符和类型）</div>'),
        (r'<h2>Constructor Summary</h2>', r'<h2>Constructor Summary（构造函数摘要）</h2>'),
        (r'<h2>Method Summary</h2>', r'<h2>Method Summary（函数概要）</h2>'),
        (r'<h2>Field Details</h2>', r'<h2>Field Details（字段细节）</h2>'),
        (r'<h2>Constructor Details</h2>', r'<h2>Constructor Details（构造函数详细信息）</h2>'),
        (r'<h2>Method Details</h2>', r'<h2>Method Details（函数细节）</h2>'),
        (r'class="table-tab">Concrete Methods</button>', r'class="table-tab">Concrete Methods（具体函数）</button>'),
    )
    for english, annotated in replacements:
        html_txt = html_txt.replace(english, annotated)
166 
# Script driver: extract, wait for the manual translation, merge, save.
html_file_path = input("请输入html文件所在位置：\n")
english_process(html_file_path)
# Dump the pre-processed HTML so it can be inspected.
with open(temp_html_out_file_path, 'w', encoding="utf-8") as fd:
    print(html_txt, file=fd)
print("要翻译的内容已经输出到"+os.getcwd()+"  目录中，名称为："+english_out_file_path)
temp_str = input("请输入翻译结果文件所在位置，不输入直接回车则使用默认位置的文件：\n")
if temp_str:
    chinese_out_file_path = temp_str
chinese_process(chinese_out_file_path)
if len(chinese_str_list) != len(english_str_list):
    # chinese_process applied nothing in this case.  html_txt is only the
    # stripped, untranslated document, so writing it out would damage the
    # source file; stop before touching it.
    print("翻译未写入，源html文件保持不变。")
    raise SystemExit(1)
other_translate()
# Write the translated document over the source file.
with open(html_file_path, 'w', encoding="utf-8") as fd:
    print(html_txt, file=fd)
# Troubleshooting: sometimes the rewritten document can no longer be
# scrolled.  That happens when the closing </div> belonging to
# <div class="flex-content"> has ended up too early; find that closing tag
# and move it to just before the </body> tag.