# 文本数据预处理(一) / Text data preprocessing (Part 1)
# Gather every .txt file found under the current directory into ./alltxt,
# delete the copies that are very small or whose name marks them as tender
# notices, then rewrite the remaining copies so that only lines starting with
# a numbering prefix keep their trailing line break.
import os
import shutil

# Folder that receives a flat copy of every .txt file.
TARGET_DIR = "alltxt"

# Copies smaller than this many bytes are deleted from TARGET_DIR.
MIN_SIZE_BYTES = 1024

# A copy whose file *name* contains one of these substrings is deleted
# ("招标" = call for bids, "中标" = winning bid).
EXCLUDED_NAME_KEYWORDS = ("招标", "中标")

# Lines that start with one of these numbering prefixes keep their newline.
# NOTE(review): the original list contained "(一)".."(六)" twice; the duplicate
# entries were dropped here (no effect on matching). One of the two sets may
# have been meant to use full-width parentheses -- confirm and add if so.
LINE_BREAK_PREFIXES = (
    "(一)", "(二)", "(三)", "(四)", "(五)", "(六)",
    "一、", "二、", "三、", "四、", "五、", "六、",
    "1.", "2.", "3.", "4.", "5.", "6.",
    "1、", "2、", "3、", "4、", "5、", "6、",
    "(1)", "(2)", "(3)", "(4)", "(5)", "(6)",
)

# Encodings tried, in order, when reading a text file. The platform default
# is deliberately not relied on: it differs between machines, and decoding
# Chinese text with the wrong codec either raises or silently produces text
# in which none of the prefixes above can match.
CANDIDATE_ENCODINGS = ("utf-8", "gb18030")


def collect_txt_files(src_root=".", dst_dir=TARGET_DIR):
    """Copy every ``.txt`` file found under *src_root* into the flat *dst_dir*.

    *dst_dir* is created when missing and is itself excluded from the walk,
    so earlier copies are not scanned again. When a file with the same name
    is already present in *dst_dir*, the existing copy is kept and the new
    one is skipped.
    """
    os.makedirs(dst_dir, exist_ok=True)
    dst_abs = os.path.abspath(dst_dir)

    for root, dirs, files in os.walk(src_root):
        # Pruning ``dirs`` in place stops os.walk from descending into the
        # destination folder.
        dirs[:] = [
            name
            for name in dirs
            if os.path.abspath(os.path.join(root, name)) != dst_abs
        ]
        for name in files:
            if not name.endswith(".txt"):
                continue
            dst_path = os.path.join(dst_dir, name)
            if not os.path.exists(dst_path):
                shutil.copy(os.path.join(root, name), dst_path)


def remove_small_files(dst_dir=TARGET_DIR, min_size=MIN_SIZE_BYTES):
    """Delete every ``.txt`` file in *dst_dir* smaller than *min_size* bytes."""
    for name in os.listdir(dst_dir):
        if not name.endswith(".txt"):
            continue
        path = os.path.join(dst_dir, name)
        if os.path.getsize(path) < min_size:
            os.remove(path)


def remove_files_named_with(dst_dir=TARGET_DIR, keywords=EXCLUDED_NAME_KEYWORDS):
    """Delete every ``.txt`` file in *dst_dir* whose name contains a keyword."""
    for name in os.listdir(dst_dir):
        if name.endswith(".txt") and any(word in name for word in keywords):
            os.remove(os.path.join(dst_dir, name))


def join_wrapped_lines(lines, prefixes=LINE_BREAK_PREFIXES):
    """Return *lines* with newlines kept only on lines that start with a prefix.

    A line starting with one of *prefixes* ends with exactly one ``"\\n"`` in
    the result (one is added if it had none); every other line has its
    trailing newline removed, so once written out it runs directly into the
    line that follows it.

    NOTE(review): this reproduces the original rule. Its effect is that the
    text *before* a numbered line is glued onto that line; if each numbered
    item was meant to start on a fresh line, the rule needs revisiting.
    """
    prefixes = tuple(prefixes)  # str.startswith accepts a tuple of options
    return [
        line.rstrip("\n") + "\n" if line.startswith(prefixes) else line.rstrip("\n")
        for line in lines
    ]


def read_lines_with_fallback(path, encodings=CANDIDATE_ENCODINGS):
    """Read *path* as text, trying each of *encodings* in order.

    Returns ``(lines, encoding)`` where *encoding* is the first candidate
    that decoded the whole file. Raises ``ValueError`` when *encodings* is
    empty and ``UnicodeDecodeError`` when the last candidate fails too.
    """
    if not encodings:
        raise ValueError("at least one candidate encoding is required")

    *earlier, last = encodings
    for encoding in earlier:
        try:
            with open(path, "r", encoding=encoding) as handle:
                return handle.readlines(), encoding
        except UnicodeDecodeError:
            continue
    # Let a failure of the final candidate propagate to the caller.
    with open(path, "r", encoding=last) as handle:
        return handle.readlines(), last


def reflow_file(path, prefixes=LINE_BREAK_PREFIXES, encodings=CANDIDATE_ENCODINGS):
    """Rewrite *path* in place using :func:`join_wrapped_lines`.

    The file is written back in the same encoding that decoded it.
    """
    lines, encoding = read_lines_with_fallback(path, encodings)
    # Build the new content before reopening for writing, so an error above
    # cannot leave the file truncated.
    rewritten = join_wrapped_lines(lines, prefixes)
    with open(path, "w", encoding=encoding) as handle:
        handle.writelines(rewritten)


def reflow_directory(dst_dir=TARGET_DIR, prefixes=LINE_BREAK_PREFIXES):
    """Apply :func:`reflow_file` to every ``.txt`` file in *dst_dir*."""
    for name in os.listdir(dst_dir):
        if name.endswith(".txt"):
            reflow_file(os.path.join(dst_dir, name), prefixes)


def main():
    """Run the four steps in their original order on the current directory."""
    collect_txt_files()
    remove_small_files()
    remove_files_named_with()
    reflow_directory()


if __name__ == "__main__":
    main()