# .doc 2 .docx可用代码 — working code for converting .doc files to .docx
# -*- coding:utf-8 -*-
"""Batch-convert legacy Word ``.doc`` files to ``.docx`` and read their text.

The conversion drives a Word instance through COM (``win32com``), so this
module only works on Windows with Microsoft Word installed.  Reading the
converted files uses ``python-docx``.
"""
# NOTE: the import order of the original script is preserved on purpose.
# ``from pyhanlp import *`` is a star import, so anything imported after it
# is guaranteed not to be shadowed by a name that pyhanlp happens to export.
import docx
from win32com import client as wc
from pyhanlp import *
import time
import eventlet  # provides the cooperative timeout used in doSaveAas
import shutil
import os  # previously only in scope if the star import above leaked it

# One shared Word COM application, created at import time and reused for
# every conversion.
word = wc.Dispatch('Word.Application')


def restart(time_start, time_end, sub_deadline):
    """Re-launch ``file_process.py`` in a shell if a run took too long.

    Args:
        time_start: Start timestamp (same unit as ``time_end``).
        time_end: End timestamp.
        sub_deadline: Maximum allowed ``time_end - time_start``; the script
            is re-launched only when the elapsed time exceeds this value.
    """
    # Script being restarted:
    # E:\pycharm\WorkPlace\Graduation_project\Util\file_process.py
    time_sub = time_end - time_start

    print("NJNNNNNNNNN", time_sub, "NNNNNNNNNNNNNNN")
    if time_sub > sub_deadline:
        # Switch the console to UTF-8, change to drive E:, enter the project
        # directory, then run the script again.
        command = "&&".join([
            "CHCP 65001",
            "E:",
            r"cd E:\pycharm\WorkPlace\Graduation_project\Util",
            "python file_process.py ",
        ])
        # The original printed a counter here that was always 0.
        print("TIME______", 0, "________", time_sub)
        os.system(command)


def doSaveAas(doc_path, docx_path, timeout=10):
    """Convert one ``.doc`` file to ``.docx`` through the shared Word instance.

    Args:
        doc_path: Absolute path of the source ``.doc`` file.
        docx_path: Absolute path the ``.docx`` file is written to.
        timeout: Seconds allowed for the conversion before the eventlet
            timeout gives up silently (defaults to the original 10).

    Returns:
        True if ``SaveAs`` completed, False if the timeout fired first.
        Errors raised by Word propagate to the caller.
    """
    print("MMMMMMMMMMMMMMMMMMMMMMMMMMMMM")
    eventlet.monkey_patch()  # required for eventlet.Timeout to take effect
    converted = False
    # Passing False as the second argument makes the timeout leave the
    # ``with`` block silently instead of raising, hence the explicit flag.
    with eventlet.Timeout(timeout, False):
        doc = word.Documents.Open(doc_path)  # source file
        try:
            # 12 is Word's wdFormatXMLDocument (.docx) save format.
            doc.SaveAs(docx_path, 12, False, "", True, "", False, False, False, False)
            converted = True
        finally:
            # Always release the document, even if SaveAs fails.
            doc.Close()
    print("《《《《《《《《《《《《《《《《《《《《《《《《《《《《《")
    return converted


def Dir_doc2docx(Dir_path):
    """Convert every ``.doc`` file in a directory to ``.docx``.

    Each file's path is made absolute, converted with ``doSaveAas`` to the
    same path plus ``x``, and the original ``.doc`` is deleted afterwards,
    but only when the conversion actually produced the ``.docx`` file.
    A failure on one file is reported and the loop moves on to the next.

    Args:
        Dir_path: Directory to scan (not recursive).
    """
    i = 0
    for file_name in os.listdir(Dir_path):
        print("********************************************************************************************")
        try:
            print("文件名:" + file_name)
            file_path = os.path.join(Dir_path, file_name)
            suffix = os.path.splitext(file_name)[1]
            print("文件后缀:" + suffix)
            if suffix != '.doc':
                continue
            i = i + 1
            abs_file_path = os.path.abspath(file_path)
            print(i, " 绝对路径:" + abs_file_path)
            docx_path = abs_file_path + 'x'
            # Never delete the source unless the converted file really exists;
            # a silent timeout would otherwise lose the document.
            if doSaveAas(abs_file_path, docx_path) and os.path.isfile(docx_path):
                os.remove(file_path)
            else:
                print("conversion did not finish, keeping:", abs_file_path)
        except Exception as exc:
            # Best effort: report the problem and continue with the next file.
            print("failed to convert:", file_name, exc)
            continue


def Get_num_file_end(Dir_path, end):
    """Count the entries of a directory whose extension equals ``end``.

    Args:
        Dir_path: Directory to scan (not recursive).
        end: Extension including the dot, e.g. ``'.docx'`` (case-sensitive).

    Returns:
        Number of matching directory entries.
    """
    i = 0
    for file_name in os.listdir(Dir_path):
        print("********************************************************************************************")
        if os.path.splitext(file_name)[1] == end:
            i = i + 1
    return i


def Get_file_value(Dir_path, file_name):
    """Read the paragraphs of a ``.docx`` file.

    Each paragraph's text is stripped and has ideographic spaces (U+3000),
    non-breaking spaces (U+00A0) and plain spaces removed.

    Args:
        Dir_path: Directory containing the file.
        file_name: Name of the ``.docx`` file.

    Returns:
        A ``(paragraph_id, paragraph_value)`` tuple of two lists of equal
        length: the paragraph indexes and the cleaned paragraph texts.
    """
    file_path = os.path.join(Dir_path, file_name)
    file = docx.Document(file_path)
    paragraph_value = [
        paragraph.text.strip().replace(u'\u3000', u'').replace(u'\xa0', u'').replace(' ', '')
        for paragraph in file.paragraphs
    ]
    paragraph_id = list(range(len(paragraph_value)))
    return paragraph_id, paragraph_value


def remove_file(Dir_path, To_dirpath):
    """Move every ``.docx`` file from one directory to another.

    Despite its name this function moves files, it does not delete them.
    A failure on one file is reported and the loop moves on to the next.

    Args:
        Dir_path: Directory to take the ``.docx`` files from.
        To_dirpath: Directory to move them into.
    """
    i = 0
    for file_name in os.listdir(Dir_path):
        print("********************************************************************************************")
        try:
            print("文件名:" + file_name)
            file_path = os.path.join(Dir_path, file_name)
            suffix = os.path.splitext(file_name)[1]
            print("文件后缀:" + suffix)
            if suffix != '.docx':
                continue
            i = i + 1
            abs_file_path = os.path.abspath(file_path)
            abs_to_file_path = os.path.abspath(os.path.join(To_dirpath, file_name))
            shutil.move(abs_file_path, abs_to_file_path)
            print(i, " 绝对路径:" + abs_file_path)
            print(i, " 目标绝对路径:" + abs_to_file_path)
        except Exception as exc:
            # Best effort: report the problem and continue with the next file.
            print("failed to move:", file_name, exc)
            continue


if __name__ == "__main__":
    print("AAAAAAAAA")
    # Raw string: "\A" and "\j" are not valid escape sequences.
    Dir_doc2docx(r"D:\ATEST\jie")
    # remove_file(r"D:\ATEST\jie", r"D:\ATEST\tojie")
    print("LLLLLLL")
    # NOTE(review): the Word instance is left running; the original had
    # word.Quit() commented out here. Confirm whether that is intentional.
    # word.Quit()