正则表达式捕获并替换

正则表达式捕获并替换

const fs = require('fs');
const path = require('path');
const { spawnSync } = require('child_process');

// Pandoc executable used to build the conversion command (Windows path).
const pandocPath = 'E:\\pandoc.exe';

// Extension of the files to convert and the extension given to the converted files.
const inputExtension = '.docx';
const outputExtension = '.md';

// Input and output directories passed to doTransfer at the bottom of the file.
const i = "xxx/数据标注--图像数据标注";
const o = 'xxx/label';

/**
 * Moves everything Pandoc extracted into `sourceDir` over to `targetDir`
 * and removes the emptied `sourceDir`.
 *
 * Failures are logged instead of thrown so one problematic document does not
 * abort the whole batch.
 *
 * @param {string} sourceDir - Directory Pandoc extracted the media into.
 * @param {string} targetDir - Per-document directory that receives the media.
 */
function moveExtractedMedia(sourceDir, targetDir) {
  try {
    fs.mkdirSync(targetDir, { recursive: true });

    // Pandoc creates no media directory when it extracted nothing (or failed).
    if (!fs.existsSync(sourceDir)) {
      return;
    }

    for (const entry of fs.readdirSync(sourceDir)) {
      fs.renameSync(path.join(sourceDir, entry), path.join(targetDir, entry));
    }
    fs.rmdirSync(sourceDir);
  } catch (err) {
    console.error(`移动媒体文件时发生错误: ${err.message}`);
  }
}

/**
 * Converts every file in `inputDir` whose name ends with `inputExtension`
 * to `outputExtension` using Pandoc, writing the results into `outputDir`.
 *
 * Output files are numbered in directory-listing order (`001.md`, `002.md`, ...).
 * The media Pandoc extracts for document N is moved from `<outputDir>/media`
 * to `<outputDir>/mediaN`, and the generated Markdown is then post-processed
 * by `insertChar`.
 *
 * @param {string} inputDir - Directory containing the source documents.
 * @param {string} outputDir - Directory that receives the converted files;
 *   created (including missing parents) when it does not exist.
 */
function doTransfer(inputDir, outputDir) {
  // `recursive` creates missing parent directories and is a no-op when the
  // directory already exists.
  fs.mkdirSync(outputDir, { recursive: true });

  const files = fs.readdirSync(inputDir).filter((name) => name.endsWith(inputExtension));

  // Pandoc always extracts into "<dir>/media", so the directory has to be
  // renamed per document to keep the images of different documents apart.
  const extractedMediaDir = path.join(outputDir, 'media');

  for (const [position, file] of files.entries()) {
    const index = position + 1;
    const inputFilePath = path.join(inputDir, file);

    const outputFileName = `${index}`.padStart(3, '0') + outputExtension;
    const outputFilePath = path.join(outputDir, outputFileName);

    // Every path is quoted so names containing spaces survive the shell.
    const pandocCommand = [
      `"${pandocPath}"`,
      `"${inputFilePath}"`,
      '-f docx',
      '-t markdown_strict+pipe_tables',
      `-o "${outputFilePath}"`,
      `--extract-media="${outputDir}"`,
    ].join(' ');
    execSync(pandocCommand);

    // Done through fs/path instead of cmd's mkdir/move/rmdir so that
    // forward slashes in outputDir are not parsed as command switches.
    moveExtractedMedia(extractedMediaDir, path.join(outputDir, `media${index}`));

    insertChar(outputFilePath, index);
  }

  console.log("----------done----------");
}

/**
 * Runs `cmd` synchronously through the shell and echoes its output.
 *
 * The command is prefixed with `chcp 65001` (UTF-8 code page) so Chinese
 * text in the command output is displayed correctly instead of garbled.
 * A spawn error is logged and ends the call; otherwise non-empty stdout is
 * logged with console.log and non-empty stderr with console.error.
 *
 * @param {string} cmd - Shell command line to execute.
 */
function execSync(cmd) {
  const { error, stdout, stderr } = spawnSync(`chcp 65001 && ${cmd}`, {
    shell: true,
    encoding: 'utf-8',
  });

  if (error) {
    console.error(`执行命令时发生错误: ${error.message}`);
    return;
  }

  const outputText = stdout.toString();
  const errorText = stderr.toString();

  if (outputText) {
    console.log(`命令输出: ${outputText}`);
  }
  if (errorText) {
    console.error(`命令错误输出: ${errorText}`);
  }
}

/**
 * Post-processes a converted Markdown file in place.
 *
 * Three rewrites are applied, in this order:
 *   1. every line starting with `#` gets one more leading `#`;
 *   2. `<img>` tags whose `src` contains a name such as `image1.png` /
 *      `image1.jpg` / `image1.jpeg` become Markdown images pointing at
 *      `media<i>\<name>`;
 *   3. figure captions of the form `图 <number> ...` are wrapped in
 *      `<div class="center">`.
 *
 * Any error (for example an unreadable file) is logged, not thrown.
 *
 * @param {string} filepath - Markdown file to rewrite.
 * @param {number} i - Suffix of the media directory the images live in.
 */
function insertChar(filepath, i) {
  const headingPattern = /^(#+?)([^#+?])/gm;
  // Only the bare file name (e.g. image1.png) is captured and reused.
  const imgTagPattern = /<img[^>]*src=["'].*?(image\d+\.(?:png|jpe?g)).*?["'][^>]*>/g;
  const captionPattern = /(图 \d+ .*)/g;

  try {
    const rewritten = fs
      .readFileSync(filepath, 'utf8')
      .replace(headingPattern, (heading) => `#${heading}`)
      .replace(imgTagPattern, `![](media${i}\\$1)`)
      .replace(captionPattern, (caption) => `<div class="center">${caption}</div>`);

    fs.writeFileSync(filepath, rewritten, 'utf8');

    console.log(`${filepath} # updated.`);
  } catch (err) {
    console.error(err);
  }
}

// Entry point: convert the configured input directory `i` into the output directory `o`.
doTransfer(i, o);

 

posted @ 2023-09-21 14:47  透明飞起来了  阅读(105)  评论(0)  编辑  收藏  举报