python docx通过关键字标注字体以及颜色大小等
主要使用python-docx 与pandas
因为python-docx对表格的解析不够友好且效率低,故需转换一次
代码如下
# coding:utf-8 import os, re import docx from docx.document import Document as dc from docx.oxml.table import CT_Tbl from docx.oxml.text.paragraph import CT_P from docx.table import _Cell, Table from docx.text.paragraph import Paragraph from docx.shared import RGBColor # 设置字体颜色 from docx import Document from docx.shared import Pt # 设置字体 from docx.oxml.ns import qn # 设置中文字体 import pandas as pd FILE_PATH = r"D:\xxxx\xxxx\xxxx\xxxx.docx" obj = docx.Document(FILE_PATH) def iter_block_items(parent): # print('utils.py ----> iter_block_items:', 2) if isinstance(parent, dc): parent_elm = parent.element.body elif isinstance(parent, _Cell): parent_elm = parent._tc else: raise ValueError("[TypeError] Document in insuitable type.") for child in parent_elm.iterchildren(): if isinstance(child, CT_P): yield Paragraph(child, parent) elif isinstance(child, CT_Tbl): yield Table(child, parent) def table2list(table): data = [] for i, row in enumerate(table.rows): row_data = [] for cell in row.cells: row_data.append(cell.text) data.append(row_data) return data #替换的段落关键字 word = '段落关键字' #替换的表格关键字 table_text = '表格关键字' def set_run(run, font_size, bold, color, name): ''' 设置run对象 :param run: :param font_size: 字体大小 :param bold: 是否加粗 :param color: 字体颜色 :param name: 字体名 :return: ''' run.font.size = font_size run.bold = bold run.font.color.rgb = color run.font.name = name # 设置字体必须要下面2步 s = run._element s.rPr.rFonts.set(qn('w:eastAsia'), name) def paragraphs_utils(obj): for p in obj.paragraphs: # 先循环得到单个段落p for r in p.runs: if word not in r.text: # 判断关键字是否存在于段落文本中 continue # print(r.text) # print(r.style.name) font_size = r.font.size bold = r.bold color = r.font.color.rgb name = u'楷体' # 使用关键词切分当前run的文本 rest = r.text.split(word) # 清除当前run的内容 r.text = '' for text in rest[:-1]: # 循环切割出来的列表 ['','xxxxxxx']或者['xxxxx',''] run = p.add_run(text=text) set_run(run, font_size, bold, color, name) run = p.add_run(word) # 重写关键字部分 set_run(run, font_size, bold, color, name) run.font.color.rgb = RGBColor(255, 0, 0) run = p.add_run(rest[-1]) # 在补齐r.text的内容 set_run(run, font_size, bold, color, name) obj.save('标注后的文档.docx') def table_utils(obj): for p in obj.tables: # 先循环得到单个表格p pd_block = pd.DataFrame(table2list(p)) # 使用table2list 将table转成列表,然后转成pandas的DateFrame对象 for rows in range(pd_block.shape[0]): # 循环pd_block(DateFrame对象)的行数 -》shape方法得到元祖 为行数和列数 if rows == 0: continue if table_text != pd_block.iloc[rows, 0]: continue # 判断关键字是否等于当前表的 rows行0列,否则跳过 for cols in range(pd_block.shape[1]): if cols == 0: continue rs = p.cell(rows, cols).paragraphs[0] # 此时rows和cols肯定为关键字所在的那行数据,用document对象获取paragraphs取0 for r in rs.runs: # paragraphs中有个runs 是个列表 font_size = r.font.size bold = r.bold color = r.font.color.rgb name = u'楷体' data = r.text.strip() # 清除当前run的内容 r.text = '' run = rs.add_run(data) # 此时要使用paragraphs的add_run方法重写data数据 set_run(run, font_size, bold, color, name) run.font.color.rgb = RGBColor(255, 0, 0) obj.save('标注后的表格.docx') for block in iter_block_items(obj): if isinstance(block, Paragraph): for r in block.runs: if word not in r.text: continue print(r.text) print(r.style.name) font_size = r.font.size bold = r.bold color = r.font.color.rgb name = u'楷体' # 使用关键词切分当前run的文本 rest = r.text.split(word) # 清除当前run的内容 r.text = '' for text in rest[:-1]: run = block.add_run(text=text) set_run(run, font_size, bold, color, name) run = block.add_run(word) set_run(run, font_size, bold, color, name) run.font.color.rgb = RGBColor(255, 0, 0) run = block.add_run(rest[-1]) set_run(run, font_size, bold, color, name) else: pd_block = pd.DataFrame(table2list(block)) # 使用table2list 将table转成列表,然后转成pandas的DateFrame对象 for rows in range(pd_block.shape[0]): # 循环pd_block(DateFrame对象)的行数 -》shape方法得到元祖 为行数和列数 if rows == 0: continue if table_text != pd_block.iloc[rows, 0]: continue # 判断关键字是否等于当前表的 rows行0列,否则跳过 for cols in range(pd_block.shape[1]): if cols == 0: continue rs = block.cell(rows, cols).paragraphs[0] # 此时rows和cols肯定为关键字所在的那行数据,用document对象获取paragraphs取0 for r in rs.runs: # paragraphs中有个runs 是个列表 font_size = r.font.size bold = r.bold color = r.font.color.rgb name = u'楷体' data = r.text.strip() # 清除当前run的内容 r.text = '' run = rs.add_run(data) # 此时要使用paragraphs的add_run方法重写data数据 set_run(run, font_size, bold, color, name) run.font.color.rgb = RGBColor(255, 0, 0) obj.save('段落与表格标注后的文档.docx')
匹配关键字回写docx替换颜色