pandas(二):在pandas中搜索包含关键词的行
一、代码
# -*- coding: UTF-8 -*-
# Helpers for flattening a JSON-lines corpus into CSV, searching it by
# keyword with pandas, and splitting records by annotation status.
import json

import pandas as pd


def _iter_json_lines(lines):
    """Yield ``(raw_line, parsed_object)`` for every non-blank line.

    Args:
        lines: An iterable of text lines (typically an open file), each
            holding one JSON document.

    Yields:
        Tuples of the line with its trailing newline removed and the object
        produced by ``json.loads`` for that line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for line in lines:
        raw = line.rstrip("\n")
        # Blank lines (e.g. a trailing empty line) are not JSON documents;
        # skip them instead of letting json.loads fail on them.
        if raw.strip():
            yield raw, json.loads(raw)


def get_all_text() -> None:
    """Flatten every record of primary.json into a tab-separated CSV.

    Each input line is a JSON object with a ``file_name`` and a ``datas``
    mapping. One CSV row (``names``, ``roles``, ``texts``) is written per
    key/value pair of ``datas``.
    """
    file_path = "../datas/format/primary.json"
    file_out = "../datas/format/all_text.csv"
    names = []
    roles = []
    texts = []
    with open(file_path, "r", encoding="utf8") as f:
        for _, record in _iter_json_lines(f):
            file_name = record["file_name"]
            for role, text in record["datas"].items():
                names.append(file_name)
                roles.append(role)
                texts.append(text)
    frame = pd.DataFrame({'names': names, 'roles': roles, "texts": texts})
    frame.to_csv(file_out, index=False, sep='\t')


def search_text(key: str) -> None:
    """Write the rows of all_text.csv whose ``texts`` contain *key*.

    The result is saved to ``../datas/classes/<key>.csv`` (tab-separated).

    Args:
        key: The keyword to look for. It is matched as a literal substring,
            so characters such as ``.``, ``+`` or ``(`` have no special
            meaning.
    """
    file_out = "../datas/classes/" + key + ".csv"
    file_path = "../datas/format/all_text.csv"
    # Read ``texts`` as strings so the ``.str`` accessor is always available,
    # even when every value in the column looks numeric.
    data = pd.read_csv(file_path, sep="\t", dtype={"texts": str})
    # regex=False: the keyword is plain text, not a pattern.
    # na=False: rows with an empty text cannot match and must not put NaN
    # into the boolean mask (boolean indexing rejects NaN).
    mask = data["texts"].str.contains(key, regex=False, na=False)
    data[mask].to_csv(file_out, index=False, sep='\t')


def data_annotate() -> str:
    """Extract the records in which any text contains the character "婚".

    Matching records are written to label.json, one JSON object per line,
    with an empty ``label`` field to be filled in during annotation.

    Returns:
        The string ``"success"``.
    """
    file_in = "../datas/format/primary.json"
    file_out = "../datas/annotate/label.json"
    # Open the input first so a missing input file does not truncate an
    # existing output file.
    with open(file_in, "r", encoding="utf8") as f, \
            open(file_out, "w", encoding="utf8") as fo:
        for _, record in _iter_json_lines(f):
            if not any("婚" in text for text in record["datas"].values()):
                continue
            item = {
                "name": record["file_name"],
                "label": "",
                "datas": record["datas"],
            }
            fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"


def annotate() -> str:
    """Split label.json into labeled and unlabeled records.

    A record whose ``label`` value is truthy goes to labeled.json; every
    other record goes to unlabel.json.

    Returns:
        The string ``"success"``.
    """
    file_in = "../datas/annotate/label.json"
    file_labeled = "../datas/annotate/labeled.json"
    file_unlabeled = "../datas/annotate/unlabel.json"
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for _, record in _iter_json_lines(f_in):
            target = f_labeled if record["label"] else f_unlabeled
            target.write(json.dumps(record, ensure_ascii=False) + "\n")
    return "success"


def label_to_csv() -> None:
    """Convert labeled.json into a tab-separated CSV.

    Each row holds the record's ``label``, the values of its ``datas``
    mapping joined with ``|``, and the original JSON line.
    """
    file_path = "../datas/annotate/labeled.json"
    file_out = "../datas/annotate/labeled.csv"
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for raw, record in _iter_json_lines(f):
            labels.append(record["label"])
            datas.append("|".join(record["datas"].values()))
            # Keep the untouched source line alongside the flattened text.
            data_dict.append(raw)
    frame = pd.DataFrame({'labels': labels, 'datas': datas, "data_dict": data_dict})
    frame.to_csv(file_out, index=False, sep='\t')


def get_work() -> None:
    """Extract the rows whose text mentions "工作"."""
    search_text("工作")


if __name__ == '__main__':
    label_to_csv()