End

PDF 转 TXT 后格式化处理

本文地址


目录

PDF 转 TXT 后格式化处理

/**
 * Re-flows plain text that was extracted from a PDF.
 *
 * <p>The input file is read line by line, footer lines around a fixed marker line are
 * dropped, and the remaining lines are either joined with the following line or given
 * an extra line break, based on their length and their last character. The result is
 * written to a second file. Both files are handled as UTF-8.
 */
public class PdfUtils {
    /**
     * Length threshold (in chars) separating "long" lines from "short" ones.
     * This parameter is crucial to the quality of the result.
     */
    private static final int MIN_CHARS = 38;
    /** When true, every processed line is prefixed with its original length. */
    private static final boolean DEBUG = false;
    /** Marker line; it is dropped together with its neighbouring lines. */
    private static final String IGNORE_CONTENT = "本文档资源来自互联网,仅供个人学习交流,请勿用作商业";
    /** Number of already-kept lines removed before the marker line. */
    private static final int IGNORED_LINES_BEFORE = 3;
    /** Number of input lines skipped after the marker line. */
    private static final int IGNORED_LINES_AFTER = 4;
    /**
     * Characters that mark a real end of line when they terminate a long line.
     * NOTE(review): the last two entries are identical; one of them was possibly meant
     * to be a different closing mark - confirm against the original article.
     */
    private static final char[] TAGS_END_CHARS = { '。', '!', '”', '”' };
    /** In-band suffix marking a line that must be written without a trailing newline. */
    private static final String TAG_NO_LINE = "【不换行】";

    public static void main(String[] args) {
        replaceFileContent("D:\\from.txt", "D:\\to.txt");
    }

    /**
     * Reads {@code from}, re-flows its lines and writes the result to {@code to}.
     *
     * <p>Failures are reported on standard error and are not propagated. If reading
     * or processing fails, the output file is not written.
     *
     * @param from path of the UTF-8 input file
     * @param to   path of the UTF-8 output file
     */
    public static void replaceFileContent(String from, String to) {
        try {
            List<String> contentList = dropIgnoredLines(readLines(from));
            markLineBreaks(contentList);
            writeFile(to, contentList);
        } catch (IOException | RuntimeException e) {
            // Best effort, as before: report the problem and return.
            e.printStackTrace();
        }
    }

    /** Reads every line of the given UTF-8 file into a list. */
    private static List<String> readLines(String file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /**
     * Returns the lines to keep: lines equal to "o" are dropped, and each marker line is
     * dropped together with the kept lines just before it and the input lines just after it.
     */
    private static List<String> dropIgnoredLines(List<String> lines) {
        List<String> kept = new ArrayList<>(lines.size());
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            if (line.equals(IGNORE_CONTENT)) {
                // Never remove more lines than have been kept so far; the unconditional
                // removal used to throw when the marker appeared near the top of the file.
                int removable = Math.min(IGNORED_LINES_BEFORE, kept.size());
                for (int k = 0; k < removable; k++) {
                    kept.remove(kept.size() - 1);
                }
                i += IGNORED_LINES_AFTER;
            } else if (!line.equals("o")) {
                kept.add(line);
            }
        }
        return kept;
    }

    /**
     * Decides, in place, how each line except the last one is terminated: lines that
     * should be joined with their successor get the {@link #TAG_NO_LINE} suffix, lines
     * that need an additional break get an extra "\n".
     */
    private static void markLineBreaks(List<String> contentList) {
        // The last line has no successor to compare with, so it is left as it is.
        for (int i = 0; i < contentList.size() - 1; i++) {
            String currentLine = contentList.get(i);
            if (currentLine.length() >= MIN_CHARS) {
                // A long line that does not end with closing punctuation was most likely
                // wrapped by the PDF layout, so it is joined with the next line.
                char lastChar = currentLine.charAt(currentLine.length() - 1);
                if (!isHasChar(lastChar)) {
                    contentList.set(i, currentLine + TAG_NO_LINE);
                }
            } else {
                String nextLine = contentList.get(i + 1);
                if (currentLine.isEmpty()) {
                    // Collapse consecutive empty lines.
                    if (nextLine.isEmpty()) {
                        contentList.set(i, TAG_NO_LINE);
                    }
                } else if (nextLine.length() >= MIN_CHARS) {
                    // A short line followed by a long one gets an extra line break.
                    contentList.set(i, currentLine + "\n");
                }
            }
            if (DEBUG) {
                contentList.set(i, "【" + currentLine.length() + "】" + contentList.get(i));
            }
        }
    }

    /**
     * Writes the lines to the given file as UTF-8. Lines ending with
     * {@link #TAG_NO_LINE} are written without the marker and without a newline;
     * every other line is followed by "\n".
     */
    private static void writeFile(String file, List<String> contentList) {
        try (PrintWriter writer =
                new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
            for (String line : contentList) {
                if (line.endsWith(TAG_NO_LINE)) {
                    // Strip only the trailing marker; the same text inside the line is content.
                    writer.append(line.substring(0, line.length() - TAG_NO_LINE.length()));
                } else {
                    writer.append(line).append("\n");
                }
            }
            // PrintWriter never throws on write errors; it only records them.
            if (writer.checkError()) {
                System.err.println("Failed to write " + file);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns whether {@code c} is one of the line-ending punctuation characters. */
    private static boolean isHasChar(char c) {
        for (char endChar : TAGS_END_CHARS) {
            if (endChar == c) {
                return true;
            }
        }
        return false;
    }
}

2017-03-09

posted @   白乾涛  阅读(589)  评论(0)  编辑  收藏  举报
编辑推荐:
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 探究高空视频全景AR技术的实现原理
阅读排行:
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!
点击右上角即可分享
微信分享提示