分类模型用到的excel转csv脚本
一、应用pandas
# coding=utf-8 import pandas as pd from pandas import DataFrame, Series import re def rule(item_str): """ 对内容的规则 :param item_str: str :return: """ item_str = re.sub('\s', "", item_str) item_str = item_str.replace(",", ",").replace("(", "(").replace(")", ")").replace('"', "“") return item_str def get_csv(file_name, csv_filename, col_li): """ 获取分类模型需要的csv :param file_name: 原始excel文件 :param csv_filename: 转换后的csv文件 :param col_li: [] 【text, label】对应的列名 :return: """ df = pd.read_excel(file_name)[col_li].dropna() # df["item_info"] = [for i in range()] # df["工作内容"] = df["工作内容"] # df["item_info"] = ["" for i in range(len(df[col_li[1]]))] # 处理为需写入csv的数据 csv_key = ['text', 'label', 'item_info'] data = df.values.tolist() data_list = [] for i in data: text_content = rule(str(i[0])) label_content = rule(str(i[1])) if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]: continue # 风险等级为所队,错误数据 # if label_content == "所队": # continue item_info = "" data_list.append([text_content, label_content, item_info]) df2 = DataFrame(data=data_list, columns=csv_key) df2.to_csv(csv_filename, index=False, encoding="utf-8") return "ok" if __name__ == '__main__': excel_path = r"../files_excl/作业风险数据.xlsx" # excel_path = r"../files_excl/4-5月全部计划V2.xlsx" # csv_path = r"../files_csv/feng_xian.csv" # text_label_li = ["工作内容", "风险等级"] csv_path = r"../files_csv/guan_kong.csv" text_label_li = ["工作内容", "管控层级"] res = get_csv(excel_path, csv_path, text_label_li) print(res)
二、应用xlrd和csv
# coding=utf-8 import codecs import xlrd import csv import re def rule(item_str): """ 对内容的规则 :param item_str: str :return: """ item_str = re.sub('\s', "", item_str) item_str = item_str.replace(",", ",").replace("(", "(").replace(")", ")").replace('"', "“") return item_str def xlsx_to_csv(file_path, csv_filename, col_li): """ 用于分类模型, excel转csv :param file_path: 文件路径 :param csv_filename: 转换后的csv文件名 :param col_li: 需写到csv列名【text,label】 :return: """ workbook = xlrd.open_workbook(file_path) table = workbook.sheet_by_index(0) row_num = table.nrows # sheet行数 # col_num = table.ncols # sheet列数 head_li = table.row_values(0, start_colx=0, end_colx=None) # 返回由该行中所有单元格的数据组成的列表 text_index = head_li.index(col_li[0]) label_index = head_li.index(col_li[1]) # print(text_index, label_index) # print(head_li) data_list = [["text", "label", "item_info"]] for row in range(1, row_num): item_li = table.row_values(row, start_colx=0, end_colx=None) text_content = rule(str(item_li[text_index])) label_content = rule(str(item_li[label_index])) if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]: continue # 风险等级为所队,错误数据 # if label_content == "所队": # continue item_info = "" data_list.append([text_content, label_content, item_info]) # 写入csv with codecs.open(csv_filename, "w", encoding="utf-8") as f: csv_writer = csv.writer(f) for row_content in data_list: csv_writer.writerow(row_content) return "ok" if __name__ == '__main__': excel_path = r"../files_excl/作业风险数据.xlsx" # excel_path = r"../files_excl/4-5月全部计划V2.xlsx" # csv_path = r"../files_csv/feng_xian.csv" # text, label # text_label_li = ["工作内容", "风险等级"] csv_path = r"../files_csv/guan_kong.csv" text_label_li = ["工作内容", "管控层级"] res = xlsx_to_csv(excel_path, csv_path, text_label_li) print(res)