python分段读取word文件数据到MySQL数据库和Java读取word数据到MySQL数据库
1、python分段读取word文件数据到MySQL数据库
示例:(注:此示例读取某个文件夹下的所有文件,只处理后缀名为 doc 的文件,并以文件名(不含后缀)匹配数据表中的 file_title,把该 Word 文档的内容更新到对应记录的 file_context 字段。)
"""Read every ``.doc`` file in a folder and store its text in MySQL.

For each file whose extension is ``doc``, the run texts of all paragraphs are
concatenated (each followed by ``</br>``) and written to the ``file_context``
column of the row whose ``file_title`` equals the file name without extension.
"""
import os

import pymysql
from docx import Document

# Directory that contains the Word files to import.
path = "your path"

# Parameterized statement: the driver escapes the values, so quotes inside the
# document text can neither break the statement nor inject SQL.
UPDATE_SQL = (
    "update lawfiles_information_context1 "
    "set file_context = %s where file_title = %s"
)


def read_word_text(file_path):
    """Return the text of all runs in the document, each followed by ``</br>``.

    Runs whose text is exactly a single space are skipped.

    NOTE(review): python-docx parses the OOXML (.docx) format; a legacy binary
    .doc file will raise here and is reported as a read failure by the caller.
    """
    document = Document(file_path)
    return "".join(
        run.text + "</br>"
        for paragraph in document.paragraphs
        for run in paragraph.runs
        if run.text != ' '
    )


def update_file_context(db, file_title, file_context):
    """Write ``file_context`` into the row identified by ``file_title``.

    The change is committed on success and rolled back (then re-raised) on
    failure so the shared connection never stays in a half-finished
    transaction.
    """
    try:
        with db.cursor() as cursor:
            cursor.execute(UPDATE_SQL, (file_context, file_title))
        db.commit()
    except Exception:
        db.rollback()
        raise


def main():
    """Import every ``.doc`` file found directly inside ``path``."""
    try:
        # One connection for the whole run instead of one per file.
        db = pymysql.connect(host='localhost', port=3306, user='root',
                             passwd='yourpassword', db='your数据库',
                             charset='utf8')
    except pymysql.MySQLError as es:
        print("files update failed!!", es)
        return

    try:
        for fileitem in os.listdir(path):
            # splitext copes with names that contain several dots or none.
            file_title, extension = os.path.splitext(fileitem)
            suffix = extension[1:]
            print(file_title, end=">>")
            print(suffix)
            if suffix != 'doc':
                continue

            try:
                strtext = read_word_text(os.path.join(path, fileitem))
            except Exception as efile:
                print("file reading failed", fileitem, efile)
                continue

            try:
                update_file_context(db, file_title, strtext)
            except Exception as es:
                print("files update failed!!", fileitem, es)
    finally:
        db.close()


if __name__ == "__main__":
    main()
2、Java读取word数据到MySQL数据库
示例(注:以下Java代码同上述Python代码功能相同,均为读取某个文件夹下所有word文件内容并进行逐段读取同时存储至数据库,此处仅为更新数据表某个字段的内容,若要进行插入,可自行更改sql语句)
package testJava;

import com.spire.doc.Document;
import com.spire.doc.Section;
import com.spire.doc.documents.Paragraph;
import util.DbHelper;

import java.io.File;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

/**
 * Reads the Word files of one folder and stores their text in MySQL.
 *
 * For every file with the extension "doc", the text of all paragraphs of all
 * sections is joined (each paragraph preceded by a line break) and written to
 * the file_context column of the row whose file_title equals the file name
 * without its extension.
 *
 * @author June
 * @date 2023/4/9 13:42
 * 5341
 * 12099
 */
public class Addcontext {

    /** Folder that contains the Word files to import. */
    private static final String SOURCE_DIR =
            "D:\\GraduationProject\\program\\coding\\paqu\\laws_regulations\\1crawling\\laws_files3";

    /** Upper bound on the number of directory entries handled in one run. */
    private static final int MAX_FILES = 2000;

    /** Only files with this extension are imported. */
    private static final String WORD_EXTENSION = "doc";

    /**
     * Sets file_context for the row identified by file_title in the given table.
     *
     * The table name is always one of the constants used by the public methods
     * of this class, never external input, so concatenating it is safe; the
     * two values are bound as statement parameters.
     *
     * @param table        name of the table to update
     * @param file_context new content for the file_context column
     * @param file_title   value of file_title that selects the row
     * @return true if at least one row was updated, false otherwise or on SQL error
     */
    private static boolean updateFileContext(String table, String file_context, String file_title) {
        String sql = "update " + table + " set file_context = ? where file_title = ?";
        // NOTE(review): the connection is deliberately not closed here because it is
        // unknown whether DbHelper.getConnection() hands out a shared connection.
        // If it opens a new one per call, it must be closed as well - confirm.
        Connection conn = DbHelper.getConnection();
        // try-with-resources closes the statement even when executeUpdate throws.
        try (PreparedStatement pst = conn.prepareStatement(sql)) {
            pst.setString(1, file_context);
            pst.setString(2, file_title);
            return pst.executeUpdate() > 0;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Updates file_context in table lawfiles_information_deal_context.
     *
     * @return true if at least one row was updated
     */
    public static boolean fileupdateInfo(String file_context, String file_title) {
        return updateFileContext("lawfiles_information_deal_context", file_context, file_title);
    }

    /**
     * Updates file_context in table lawfiles_information_deal_context
     * (same statement as {@link #fileupdateInfo(String, String)}).
     *
     * @return true if at least one row was updated
     */
    public static boolean judgenuLL(String file_context, String file_title) {
        return updateFileContext("lawfiles_information_deal_context", file_context, file_title);
    }

    /**
     * Updates file_context in table lawfiles_information_place.
     * Other tables previously targeted by this method:
     * lawfiles_information_decision, lawfiles_information_interpreter.
     *
     * @return true if at least one row was updated
     */
    public static boolean fileupdatepart(String file_context, String file_title) {
        return updateFileContext("lawfiles_information_place", file_context, file_title);
    }

    /**
     * Imports up to MAX_FILES entries of SOURCE_DIR into the database.
     */
    public static void main(String[] args) {
        System.out.println("文件有如下:");
        File file = new File(SOURCE_DIR);
        File[] files = file.listFiles();
        // listFiles() returns null when the path is not a readable directory.
        if (files == null) {
            System.err.println("无法读取目录:" + file.getPath());
            return;
        }
        System.out.println("共有文件数" + files.length);

        // Never index past the end when the folder holds fewer than MAX_FILES entries.
        int limit = Math.min(files.length, MAX_FILES);
        for (int j = 0; j < limit; j++) {
            File file2 = files[j];
            int count = j + 1;
            int hace = files.length - count;
            System.out.println("count_have ==>>" + hace);

            String fileName = file2.getName();
            int dot = fileName.lastIndexOf('.');
            // Check the extension first so that other files are never parsed as Word documents.
            if (dot < 0 || !WORD_EXTENSION.equals(fileName.substring(dot + 1))) {
                continue;
            }
            String title = fileName.substring(0, dot);

            try {
                Document doc = new Document(file2.getPath());
                System.out.println(count + ">>doc名字:" + title);
                System.out.println(count + ">>doc名字:" + fileName);
                System.out.println("doc内容:--------------------------------------------------------------------");

                // Collect the text of all sections and write it once; updating per
                // section would leave only the last section's text in the row.
                StringBuilder context = new StringBuilder();
                boolean readAny = false;
                for (int i = 0; i < doc.getSections().getCount(); i++) {
                    StringBuilder sectionText = new StringBuilder();
                    try {
                        Section section = doc.getSections().get(i);
                        for (int p = 0; p < section.getParagraphs().getCount(); p++) {
                            sectionText.append("\n").append(section.getParagraphs().get(p).getText());
                        }
                        context.append(sectionText);
                        readAny = true;
                    } catch (Exception e) {
                        System.out.println(count + ">>read------false" + title);
                    }
                }

                // Skip the update when nothing could be read, so existing content is not wiped.
                if (readAny && !fileupdatepart(context.toString(), title)) {
                    System.out.println(count + ">>update------false" + title);
                }
            } catch (Exception e) {
                System.out.println(count + ">>file------false");
            }
        }
    }
}
对比而言:
(别问,实践证明:Python 快。)就读取 Word 文件的速度而言,Python 比 Java 更快。