'''
一:需求:读取压缩包,根据客户要求,查看是否有用户要求的文件,如果没有,则放弃这个压缩包,
如果有则解压这个压缩包,打开特定文件,对其中的某些字段进行翻译并重写,然后重新压缩
二:需求分析:
1.编写配置文件,方便用户配置需要翻译的文件及字段
2.编写翻译程序,输入一个压缩包,输出一个翻译后的压缩包
三:具体实现:
1.读取配置文件
{
"transconfig": [
{
"filetype": "a",
"field": "following_ch_name,following_ch_username,following_ch_location"
}
],
"inputdir": "D:\\PyCharm Community Edition 2021.3.3\\gepukeji\\iputdir",
"outputdir": "D:\\PyCharm Community Edition 2021.3.3\\gepukeji\\outputdir",
"workdir": "D:\\PyCharm Community Edition 2021.3.3\\gepukeji\\workdir"
}
'''
'''
2.调用讯飞翻译接口并封装成函数,可以翻译任意语种成为汉语简体
#!/usr/bin/env python
# -*- coding:utf-8 -*-
#
# 机器翻译 WebAPI 接口调用示例
# 运行前:请先填写Appid、APIKey、APISecret
# 运行方法:直接运行 main 即可
# 结果: 控制台输出结果信息
#
# 1.接口文档(必看):https://www.xfyun.cn/doc/nlp/xftrans/API.html
# 2.错误码链接:https://www.xfyun.cn/document/error-code (错误码code为5位数字)
#
import requests
import datetime
import hashlib
import base64
import hmac
import json
class get_result(object):
def __init__(self, host, text):
# 应用ID(到控制台获取)
self.APPID = "2d37e266"
# 接口APISercet(到控制台机器翻译服务页面获取)
self.Secret = "ZTJkMzUzN2Q5NmI0NjE4NmE1M2FmYmZj"
# 接口APIKey(到控制台机器翻译服务页面获取)
self.APIKey = "bb8a3117df956534db3404e1755bbaeb"
# 以下为POST请求
self.Host = host
self.RequestUri = "/v2/ots"
# 设置url
# print(host)
self.url = "https://" + host + self.RequestUri
self.HttpMethod = "POST"
self.Algorithm = "hmac-sha256"
self.HttpProto = "HTTP/1.1"
# 设置当前时间
curTime_utc = datetime.datetime.utcnow()
self.Date = self.httpdate(curTime_utc)
# 设置业务参数
# 语种列表参数值请参照接口文档:https://www.xfyun.cn/doc/nlp/xftrans/API.html
# 英语,俄语,印度语 土耳其,波兰,越南
# self.Text = "Привет"
# self.Text = text
self.Text = str(text).replace(',', '')
self.BusinessArgs = {
"from": "auto",
"to": "cn",
}
def hashlib_256(self, res):
m = hashlib.sha256(bytes(res.encode(encoding='utf-8'))).digest()
result = "SHA-256=" + base64.b64encode(m).decode(encoding='utf-8')
return result
def httpdate(self, dt):
"""
Return a string representation of a date according to RFC 1123
(HTTP/1.1).
The supplied date must be in UTC.
"""
weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
month = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
"Oct", "Nov", "Dec"][dt.month - 1]
return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (weekday, dt.day, month,
dt.year, dt.hour, dt.minute, dt.second)
def generateSignature(self, digest):
signatureStr = "host: " + self.Host + "\n"
signatureStr += "date: " + self.Date + "\n"
signatureStr += self.HttpMethod + " " + self.RequestUri \
+ " " + self.HttpProto + "\n"
signatureStr += "digest: " + digest
signature = hmac.new(bytes(self.Secret.encode(encoding='utf-8')),
bytes(signatureStr.encode(encoding='utf-8')),
digestmod=hashlib.sha256).digest()
result = base64.b64encode(signature)
return result.decode(encoding='utf-8')
def init_header(self, data):
digest = self.hashlib_256(data)
# print(digest)
sign = self.generateSignature(digest)
authHeader = 'api_key="%s", algorithm="%s", ' \
'headers="host date request-line digest", ' \
'signature="%s"' \
% (self.APIKey, self.Algorithm, sign)
# print(authHeader)
headers = {
"Content-Type": "application/json",
"Accept": "application/json",
"Method": "POST",
"Host": self.Host,
"Date": self.Date,
"Digest": digest,
"Authorization": authHeader
}
return headers
def get_body(self):
content = str(base64.b64encode(self.Text.encode('utf-8')), 'utf-8')
postdata = {
"common": {"app_id": self.APPID},
"business": self.BusinessArgs,
"data": {
"text": content,
}
}
body = json.dumps(postdata)
# print(body)
return body
def call_url(self):
if self.APPID == '' or self.APIKey == '' or self.Secret == '':
print('Appid 或APIKey 或APISecret 为空!请打开demo代码,填写相关信息。')
else:
code = 0
body = self.get_body()
headers = self.init_header(body)
# print(self.url)
response = requests.post(self.url, data=body, headers=headers, timeout=8)
status_code = response.status_code
# print(response.content)
if status_code != 200:
# 鉴权失败
print("Http请求失败,状态码:" + str(status_code) + ",错误信息:" + response.text)
print("请根据错误信息检查代码,接口文档:https://www.xfyun.cn/doc/nlp/xftrans/API.html")
else:
# 鉴权成功
respData = json.loads(response.text)
# print(respData)
# 以下仅用于调试
code = str(respData["code"])
if code != '0':
print("请前往https://www.xfyun.cn/document/error-code?code=" + code + "查询解决办法")
return respData
# 其他语种翻译成简体中文("to": "cn")
def autolan_cn(text):
host = "ntrans.xfyun.cn"
# 初始化类
gClass = get_result(host, text)
result = gClass.call_url()
result = result['data']['result']['trans_result']['dst']
return result
if __name__ == '__main__':
text = "洪申翰 Hung,Sun-Han"
a = autolan_cn(text)
print(a)
3.调用函数翻译特定文件中特定字段,并重写文件
import json
# tweet_createtime,tweet_topics需要修改的字段,有就添加,没有就新增
import os
import random
import shutil
import time
from gepukeji.讯飞翻译.WebITS import autolan_cn
f = open('配置.json', 'r').read()
field = json.loads(f)['transconfig'][0]['field']
name = str(field).split(',')
inputdir = json.loads(f)['inputdir']
outputdir = json.loads(f)['outputdir']
workdir = json.loads(f)['workdir']
def save_data(data, path, content_new, content_old, name1, name2):
fileObject = open(path, 'a', encoding="utf-8")
data = json.loads(data)
print(f'********************保存到文件{path}*******************')
for i in range(len(content_new)):
data[name1[i]] = content_old[i]
data[name2[i]] = content_new[i]
jsObj = json.dumps(data, ensure_ascii=False)
fileObject.write(jsObj)
fileObject.write('\n')
print('修改之后的文件:', jsObj)
fileObject.close()
return jsObj
def xunfei_trans(path):
# 读取bcp文件
# 创建一个新文件
path = path.replace(r'\\', '/')
print('path', path)
with open(path, encoding='utf8') as f:
data2 = f.readlines()
# 用户输入的关键字
data_all = []
new_path_all = []
content_new_all = []
content_old_all = []
name1_all = []
name2_all = []
for data in data2:
# 一个新文件
new_path = path.split('.bcp')[0] + '_new.bcp'
# 每行数据,json化
data = json.loads(json.dumps(data))
# print('data', data)
# 用户给的索引关键词
for index1 in range(len(name)):
name1 = name[index1]
content_old = json.loads(data)[name1]
# print(f'原始内容:{content_old}')
# 翻译内容
content_new = autolan_cn(content_old)
# 原字段修改之后的内容
content_old = content_old + '--参考译文:' + content_new
# 新增字段内容
content_new = content_new
# print('源文件', content_old)
# print('翻译之后的文件', content_new)
name2 = name1 + '_lst'
# datas.append(data)
# 删除老文件夹
# shutil.rmtree(path)
# 把数据存到新文件夹中
content_new_all.append(content_new)
content_old_all.append(content_old)
name1_all.append(name1)
name2_all.append(name2)
save_data(data, new_path, content_new_all, content_old_all, name1_all, name2_all)
if __name__ == '__main__':
path = 'D:\PyCharm Community Edition 2021.3.3\gepukeji\iputdir\\a\\a.bcp'
xunfei_trans(path)
4.扫描压缩包,如果有特定文件,则解压压缩包,然后调用翻译函数对压缩包中的特定文件进行翻译并重写
'''
import random
import re
import shutil
import time
import json
from gepukeji.讯飞翻译.讯飞翻译 import xunfei_trans
# Load the user configuration (配置.json in the current working directory).
# The file is read through a context manager so the handle is closed promptly,
# decoded explicitly as UTF-8 instead of the locale default, and parsed once
# instead of once per setting.
with open('配置.json', 'r', encoding='utf-8') as _config_file:
    # ``f`` keeps holding the raw JSON text, as it did before.
    f = _config_file.read()
_config = json.loads(f)
# Only the first "transconfig" entry is honoured.
_transconfig = _config['transconfig'][0]
# Comma-separated list of field names to translate.
field = _transconfig['field']
# Text that a member's file name must contain to be processed.
filetype = _transconfig['filetype']
# The same field names as a list.
name = str(field).split(',')
# Directory scanned for incoming archives.
inputdir = _config['inputdir']
# Directory that receives the re-zipped result.
outputdir = _config['outputdir']
# Scratch directory archives are extracted into.
workdir = _config['workdir']
'''
扫描压缩文件
输出路径
'''
def traversal_zip(root_path):
    """
    Collect every ``.zip`` file located anywhere below *root_path*.

    The tree is walked recursively with ``os.walk``. A file qualifies when its
    name ends with the (case-sensitive) suffix ``.zip``. No artificial delay
    is inserted between files: the scan only touches the local file system.

    :param root_path: directory to scan; ``os.walk`` yields nothing for a
        missing directory, so the result is then an empty list.
    :return: list of ``[dirpath, path_file]`` pairs, where ``dirpath`` is the
        directory containing the archive and ``path_file`` is
        ``os.path.join(dirpath, file_name)``.
    """
    list_zip = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        for file_name in filenames:
            # A plain suffix test avoids the regex (and its invalid '\.' escape).
            if file_name.endswith('.zip'):
                list_zip.append([dirpath, os.path.join(dirpath, file_name)])
    return list_zip
import os
import zipfile
'''
zip_src: 是zip文件的全路径
dst_dir: 是要解压到的目的文件夹
'''
# 把压缩文件解压到解压文件夹
def unzip_file(zip_src, dst_dir):
    """
    Extract every member of the archive *zip_src* into *dst_dir*.

    A progress line is printed for each member. When *zip_src* is not a zip
    file (as judged by ``zipfile.is_zipfile``) a message is printed and
    nothing is extracted.

    :param zip_src: full path of the zip archive.
    :param dst_dir: directory the members are extracted into.
    :return: None
    """
    if not zipfile.is_zipfile(zip_src):
        print('This is not zip')
        return
    # The context manager closes the archive even if an extraction fails.
    with zipfile.ZipFile(zip_src, 'r') as fz:
        # Fetch the member list once instead of once (or twice) per member.
        members = fz.namelist()
        total = len(members)
        for position, member in enumerate(members, start=1):
            print(f'******************解压中{position}/{total}*****************')
            fz.extract(member, dst_dir)
    print('解压完成')
'''
压缩指定文件夹
src_dir:你要压缩的文件夹的路径
返回值 zip_name:压缩后zip文件的路径及名称(由函数生成,即 src_dir + '.zip',不是参数)
'''
def zip_file(src_dir):
    """
    Compress the contents of the directory *src_dir* into ``src_dir + '.zip'``.

    Every regular file found by ``os.walk`` is stored under its path relative
    to *src_dir*; directories that contain no files are not recorded. A
    progress line is printed per file.

    :param src_dir: path of the directory to compress.
    :return: path of the archive that was written (``src_dir + '.zip'``).
    """
    zip_name = src_dir + '.zip'
    zip_abspath = os.path.abspath(zip_name)
    # The context manager finalises and closes the archive even on errors.
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for dirpath, _dirnames, filenames in os.walk(src_dir):
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                # If src_dir ends with a separator the archive lands inside the
                # walked tree; never add the archive to itself.
                if os.path.abspath(full_path) == zip_abspath:
                    continue
                # relpath strips only the leading src_dir; a textual replace()
                # would also remove later occurrences of the same text.
                archive.write(full_path, os.path.relpath(full_path, src_dir))
                print('==压缩中==')
    print('****************压缩成功**************')
    return zip_name
# Script body: scan ``inputdir`` for zip archives. For every archive that
# contains at least one matching .bcp member, extract it into ``workdir``,
# run the translation on each matching file, re-zip ``workdir`` and move the
# resulting archive into ``outputdir``.
time1 = int(time.time())
# Each entry of get_zip is a [directory, archive_path] pair.
get_zip = traversal_zip(inputdir)
print(f'压缩文件名称:{get_zip}')
if get_zip:
    for _zip_dir, zipname in get_zip:
        # Only the member list is needed here, so the archive is closed
        # again immediately.
        with zipfile.ZipFile(zipname) as zf:
            member_names = zf.namelist()
        print(f'文件名:{member_names}')
        # Full member names (e.g. 'a/a.bcp') of the files to translate.
        bcp_names = []
        for member in member_names:
            # Member names use '/' as separator. Taking the last component
            # works for top-level members (no '/') and for any nesting depth;
            # directory entries end in '/' and yield an empty name.
            base_name = member.rsplit('/', 1)[-1]
            if '.bcp' in base_name and filetype in base_name:
                print(f'{base_name}*******是符合要求的文件')
                bcp_names.append(member)
            else:
                print(f'{base_name}*******不符合抓取要求')
        print(bool(bcp_names))
        if bcp_names:
            print(f'压缩文件:{zipname}')
            # Extract into the configured work directory.
            dst_dir = workdir
            unzip_file(zipname, dst_dir)
            print(f'*******************{zipname}解压成功*******************')
            print(f'所有需要处理的文件:{bcp_names}')
            for bcp_name in bcp_names:
                # The member was extracted below dst_dir, so its location is
                # dst_dir plus the member's own path; no guessing from the
                # archive's path is required.
                trans_dir = os.path.join(dst_dir, *bcp_name.split('/'))
                trans_dir = trans_dir.replace(r'\\', '\\')
                print(f'处理的文件:{trans_dir}')
                # NOTE(review): delay kept from the original; presumably it
                # throttles calls to the translation service - confirm.
                time.sleep(random.randint(1, 3))
                # Best effort: a failure on one file must not abort the rest.
                try:
                    xunfei_trans(trans_dir)
                except Exception as e:
                    print(e)
            # Re-compress the work directory; the archive is named
            # dst_dir + '.zip'.
            dst_dir = dst_dir.replace(r'\\', '\\')
            outputdir = outputdir.replace(r'\\', '\\')
            re_zip_name = zip_file(dst_dir)
            # Without an existing directory shutil.move would rename the
            # archive to the path ``outputdir`` itself.
            os.makedirs(outputdir, exist_ok=True)
            # NOTE(review): the archive name depends only on workdir, and
            # workdir is never emptied, so a second matching archive produces
            # the same file name and shutil.move raises because the
            # destination already exists. Decide on per-archive output names.
            shutil.move(re_zip_name, outputdir)
            print(f'{re_zip_name}移动完成')
            print(f'移动号位于{outputdir}')
        else:
            print('没有符合要求文件,跳过')
else:
    print('没有压缩文件')
time2 = int(time.time())
print(f'**************本次解析{len(get_zip)}个压缩包***************')
print(f'***********共用时{time2 - time1}秒****************')