【Python】Docx解析

1、cd D:\ProgramData\Anaconda3

2、pip install python-docx

3、python代码处理

# -*- coding: utf-8 -*-
 


import os
import docx
from win32com import client as wc

# Accumulates the path of every .doc/.docx file that traverse() encounters.
docs = []


def traverse(f):
    """Recursively collect Word document paths under directory *f*.

    Every non-directory entry whose extension is ``.doc`` or ``.docx``
    (compared case-insensitively) is appended to the module-level ``docs``
    list. Sub-directories are descended into recursively. Nothing is
    returned; results are read from ``docs``.
    """
    for entry_name in os.listdir(f):
        candidate = os.path.join(f, entry_name)

        # Directories are walked, never recorded.
        if os.path.isdir(candidate):
            traverse(candidate)
            continue

        suffix = os.path.splitext(candidate)[-1].lower()
        if suffix in (".doc", ".docx"):
            docs.append(candidate)


def parseDoc(f):
    """Print each paragraph of the document *f*, then the paragraph count.

    *f* is passed straight to ``docx.Document``. Each paragraph's text is
    printed between two dashed separator lines, and a summary line with the
    total number of paragraphs is printed at the end.
    """
    separator = "----------------------------------------------------"
    total = 0
    for total, paragraph in enumerate(docx.Document(f).paragraphs, start=1):
        print(separator)
        print(paragraph.text)
        print(separator)
    print('This document has ', total, ' paragraphs')

def doc2docx(full_path):
    """Convert a legacy ``.doc`` file to ``.docx`` using Word via COM.

    The converted file is written next to the source, with ``"x"`` appended
    to the original path (``report.doc`` -> ``report.docx``).

    Nothing is done, and Word is not started, when:
      * *full_path* already has a ``.docx`` extension (appending ``"x"``
        would otherwise produce a useless ``.docxx`` copy), or
      * the target ``.docx`` file already exists.

    Args:
        full_path: Path of the ``.doc`` file. It is made absolute before
            being handed to Word, because Word resolves relative paths
            against its own working directory, not this process's.

    Returns:
        None.
    """
    # Files that are already .docx need no conversion.
    if os.path.splitext(full_path)[-1].lower() == ".docx":
        return

    full_path = os.path.abspath(full_path)
    newpath = full_path + "x"

    if os.path.exists(newpath):
        return

    # Start Word; the finally blocks guarantee the document is closed and
    # the Word process is quit even if opening or saving fails.
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(full_path)
        try:
            # File-format code 16 makes Word save the document as .docx,
            # which is the format python-docx is able to read.
            doc.SaveAs(newpath, 16)
        finally:
            doc.Close()
    finally:
        word.Quit()

            
# Root directory that is scanned for Word documents.
path = 'E:/NLP/Docs/'

traverse(path)

# Only the first collected document is printed and parsed; the rest are
# skipped. For a legacy .doc file, doc2docx(v) can be used to convert it.
for k, v in enumerate(docs):
    if k >= 1:
        continue
    print(k, v)
    parseDoc(v)

 

posted @ 2018-11-06 16:04  咸鱼翻身  阅读(388)  评论(0)  编辑  收藏  举报