Merge all files

import os
import shutil

import pandas as pd

def clear_folder(folder_path):
    # Check that the folder exists
    if not os.path.exists(folder_path):
        print(f"Folder {folder_path} does not exist.")
        return

    # Walk through every file and subfolder inside the folder
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        try:
            # If it is a file (or a symlink), delete the file
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            # If it is a folder, delete the folder and its contents
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}: {e}')

    print(f"Folder {folder_path} has been emptied.")

# Path of the folder to empty
folder_path = './folder'

# Call the function to empty the folder
clear_folder(folder_path)
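
# (Optional sketch, not in the original post:) a quick sanity check that the target
# folder really is empty before the conversion step below writes converted files into it.
# Reuses the folder_path defined above.
if os.path.exists(folder_path) and not os.listdir(folder_path):
    print(f"{folder_path} is empty and ready for the converted files.")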

def find_sn_row(df):
    # Return the index of the first row containing "SN" or "条码" (barcode);
    # this row is treated as the header row of the exported report.
    for i, row in df.iterrows():
        if any('SN' in str(cell) for cell in row):
            return i
        if any('条码' in str(cell) for cell in row):
            return i
    return None

def read_file_with_fallback(file_path, file_type):
    # Try UTF-8 first, then fall back to GB18030. The encoding list only affects
    # CSV files; .xls/.xlsx are read through xlrd/openpyxl, which ignore it.
    encodings = ['utf-8', 'gb18030']
    for encoding in encodings:
        try:
            if file_type == 'csv':
                return pd.read_csv(file_path, header=None, encoding=encoding)
            elif file_type == 'xls':
                return pd.read_excel(file_path, engine='xlrd', header=None)
            elif file_type == 'xlsx':
                return pd.read_excel(file_path, engine='openpyxl', header=None)
        except (UnicodeDecodeError, ValueError):
            continue
    raise ValueError(f"Failed to read {file_path} with available encodings")

def convert_to_xlsx(source_directory, target_directory):
    # Create the target directory if it does not exist
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # Walk through every file under the source directory and its subdirectories
    for root, dirs, files in os.walk(source_directory):
        for filename in files:
            file_path = os.path.join(root, filename)

            # Check the file extension and read the file accordingly
            if filename.endswith('.csv'):
                df = read_file_with_fallback(file_path, 'csv')
                new_filename = filename.replace('.csv', '.xlsx')
            elif filename.endswith('.xls'):
                df = read_file_with_fallback(file_path, 'xls')
                new_filename = filename.replace('.xls', '.xlsx')
            elif filename.endswith('.xlsx'):
                df = read_file_with_fallback(file_path, 'xlsx')
                new_filename = filename  # keep the original file name
            else:
                continue

            # Find the row containing "SN" and promote it to the header row
            sn_row_index = find_sn_row(df)
            if sn_row_index is not None:
                new_header = df.iloc[sn_row_index]
                df.columns = new_header
                df = df.drop(index=sn_row_index).reset_index(drop=True)

            # Save as a new xlsx file in the target directory, mirroring the source layout
            relative_path = os.path.relpath(root, source_directory)
            new_file_dir = os.path.join(target_directory, relative_path)
            if not os.path.exists(new_file_dir):
                os.makedirs(new_file_dir)
            new_file_path = os.path.join(new_file_dir, new_filename)
            df.to_excel(new_file_path, index=False)
            print(f"Converted {file_path} to {new_file_path}")

# Usage example
source_directory = './root'    # replace with your source directory
target_directory = './folder'  # replace with your target directory
convert_to_xlsx(source_directory, target_directory)
def get_all_excel_files(directory):
    # Collect every .xlsx file under the directory, recursively
    excel_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.xlsx'):
                excel_files.append(os.path.join(root, file))
    return excel_files

# Read every converted xlsx file in the target folder
root_directory = './folder'
file_paths = get_all_excel_files(root_directory)
print(file_paths)

dataframes = []
for file_path in file_paths:
    df = pd.read_excel(file_path)
    dataframes.append(df)

# Stack all files into a single DataFrame
concatenated_df = pd.concat(dataframes, axis=0, ignore_index=True)
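
# (Optional sketch, not in the original post:) report how many files were merged and the
# combined size, as a quick check before the limit rows are rearranged below.
print(f"Merged {len(dataframes)} files into {concatenated_df.shape[0]} rows and {concatenated_df.shape[1]} columns")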

def find_limits_and_data(df):
    # Row/column positions of the upper- and lower-limit cells
    up_row, low_row = None, None
    up_col, low_col = None, None

    # Scan the DataFrame from the bottom-right towards the top-left for cells
    # containing "up"/"上限" (upper limit) or "low"/"下限" (lower limit)
    for row in range(df.shape[0] - 1, -1, -1):
        for col in range(df.shape[1] - 1, -1, -1):
            cell_value = str(df.iat[row, col]).strip().lower()
            if up_row is None and ("up" in cell_value or "上限" in cell_value or "usp" in cell_value):
                up_row, up_col = row, col
            if low_row is None and ("low" in cell_value or "下限" in cell_value or "lsp" in cell_value):
                low_row, low_col = row, col

        if up_row is not None and low_row is not None:
            break

    # If both limit rows were found, move them to the top of the DataFrame
    if up_row is not None and low_row is not None:
        # Grab the upper- and lower-limit rows
        up_limit_row = df.iloc[up_row]
        low_limit_row = df.iloc[low_row]

        # Drop the original limit rows
        df = df.drop([up_row, low_row])

        # Insert the limit rows at the very top
        df = pd.concat([pd.DataFrame([up_limit_row]), pd.DataFrame([low_limit_row]), df.reset_index(drop=True)], ignore_index=True)

    return df

concatenated_df = find_limits_and_data(concatenated_df)
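
# (Optional sketch, not in the original post:) after find_limits_and_data the first two
# rows should be the upper- and lower-limit rows; print them to confirm before cleaning.
print(concatenated_df.head(2))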

def clean_dataframe(df):
    # Find the columns whose header contains "SN" or "条码" (barcode).
    # str(col) guards against non-string headers (e.g. NaN) left over from the raw data.
    sn_barcode_cols = []

    for col in df.columns:
        if 'sn' in str(col).lower() or '条码' in str(col):
            sn_barcode_cols.append(col)

    # If no such column exists, return the DataFrame unchanged
    if not sn_barcode_cols:
        return df

    # Walk through the DataFrame starting from the third row
    # (the first two rows now hold the upper and lower limits)
    rows_to_drop = []
    for row in range(2, df.shape[0]):
        for col in sn_barcode_cols:
            cell_value = df.at[row, col]

            # A cell is invalid if it is empty or shorter than 4 characters
            cell_invalid = (pd.isna(cell_value) or cell_value == '' or len(str(cell_value)) < 4)

            # Record the row and stop checking the remaining SN/barcode columns
            if cell_invalid:
                rows_to_drop.append(row)
                break

    # Drop the recorded rows
    df = df.drop(rows_to_drop).reset_index(drop=True)

    return df

concatenated_df1 = clean_dataframe(concatenated_df)
concatenated_df1.to_excel('merged_output.xlsx', index=False, engine='xlsxwriter')
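
# (Optional sketch, not in the original post:) read the merged workbook back and print its
# shape and columns as an end-to-end check of the whole pipeline.
check_df = pd.read_excel('merged_output.xlsx')
print(check_df.shape)
print(list(check_df.columns))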
