分类模型用到的excel转csv脚本

一、应用pandas

# coding=utf-8
import pandas as pd
from pandas import DataFrame, Series
import re


def rule(item_str):
    """
    对内容的规则
    :param item_str: str
    :return:
    """
    item_str = re.sub('\s', "", item_str)
    item_str = item_str.replace(",", "").replace("(", "").replace(")", "").replace('"', "")
    return item_str


def get_csv(file_name, csv_filename, col_li):
    """
    获取分类模型需要的csv
    :param file_name: 原始excel文件
    :param csv_filename: 转换后的csv文件
    :param col_li: [] 【text, label】对应的列名
    :return:
    """
    df = pd.read_excel(file_name)[col_li].dropna()
    # df["item_info"] = [for i in range()]
    # df["工作内容"] = df["工作内容"]
    # df["item_info"] = ["" for i in range(len(df[col_li[1]]))]

    # 处理为需写入csv的数据
    csv_key = ['text', 'label', 'item_info']
    data = df.values.tolist()
    data_list = []
    for i in data:
        text_content = rule(str(i[0]))
        label_content = rule(str(i[1]))
        if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]:
            continue
        # 风险等级为所队,错误数据
        # if label_content == "所队":
        #     continue
        item_info = ""
        data_list.append([text_content, label_content, item_info])

    df2 = DataFrame(data=data_list, columns=csv_key)
    df2.to_csv(csv_filename, index=False, encoding="utf-8")
    return "ok"


if __name__ == '__main__':
    excel_path = r"../files_excl/作业风险数据.xlsx"
    # excel_path = r"../files_excl/4-5月全部计划V2.xlsx"
    # csv_path = r"../files_csv/feng_xian.csv"
    # text_label_li = ["工作内容", "风险等级"]

    csv_path = r"../files_csv/guan_kong.csv"
    text_label_li = ["工作内容", "管控层级"]

    res = get_csv(excel_path, csv_path, text_label_li)
    print(res)

二、应用xlrd和csv

# coding=utf-8
import codecs
import xlrd
import csv
import re


def rule(item_str):
    """
    对内容的规则
    :param item_str: str
    :return:
    """
    item_str = re.sub('\s', "", item_str)
    item_str = item_str.replace(",", "").replace("(", "").replace(")", "").replace('"', "")
    return item_str


def xlsx_to_csv(file_path, csv_filename, col_li):
    """
    用于分类模型, excel转csv
    :param file_path: 文件路径
    :param csv_filename: 转换后的csv文件名
    :param col_li: 需写到csv列名【text,label】
    :return:
    """
    workbook = xlrd.open_workbook(file_path)
    table = workbook.sheet_by_index(0)

    row_num = table.nrows  # sheet行数
    # col_num = table.ncols  # sheet列数
    head_li = table.row_values(0, start_colx=0, end_colx=None)  # 返回由该行中所有单元格的数据组成的列表
    text_index = head_li.index(col_li[0])
    label_index = head_li.index(col_li[1])
    # print(text_index, label_index)
    # print(head_li)

    data_list = [["text", "label", "item_info"]]
    for row in range(1, row_num):
        item_li = table.row_values(row, start_colx=0, end_colx=None)
        text_content = rule(str(item_li[text_index]))
        label_content = rule(str(item_li[label_index]))

        if text_content in ["", "nan", "None"] or label_content in ["", "nan", "None"]:
            continue
        # 风险等级为所队,错误数据
        # if label_content == "所队":
        #     continue

        item_info = ""
        data_list.append([text_content, label_content, item_info])
    # 写入csv
    with codecs.open(csv_filename, "w", encoding="utf-8") as f:
        csv_writer = csv.writer(f)
        for row_content in data_list:
            csv_writer.writerow(row_content)
    return "ok"


if __name__ == '__main__':
    excel_path = r"../files_excl/作业风险数据.xlsx"
    # excel_path = r"../files_excl/4-5月全部计划V2.xlsx"
    # csv_path = r"../files_csv/feng_xian.csv"
    # text, label
    # text_label_li = ["工作内容", "风险等级"]

    csv_path = r"../files_csv/guan_kong.csv"
    text_label_li = ["工作内容", "管控层级"]

    res = xlsx_to_csv(excel_path, csv_path, text_label_li)
    print(res)

 

posted @ 2020-05-22 18:41  清风_Z  阅读(289)  评论(0编辑  收藏  举报