pandas(二):在pandas中搜索包含关键词的行
一、代码
# -*- coding: UTF-8 -*-
import json

import pandas as pd


def _iter_nonblank_lines(path):
    """Yield every non-blank line of the UTF-8 text file at *path*.

    Lines are yielded unmodified (including the trailing newline) so callers
    can still store the raw text. Blank lines are skipped because
    ``json.loads`` would raise on them.
    """
    with open(path, "r", encoding="utf8") as f:
        for line in f:
            if line.strip():
                yield line


def get_all_text(file_path="../datas/format/primary.json",
                 file_out="../datas/format/all_text.csv"):
    """Flatten a JSON-lines file into a tab-separated CSV of all texts.

    Each input line is a JSON object with a ``file_name`` and a ``datas``
    mapping. One CSV row (``names``, ``roles``, ``texts``) is written per
    entry of ``datas``.

    Args:
        file_path: JSON-lines input file.
        file_out: Tab-separated CSV file to write.
    """
    rows = []
    for data_line in _iter_nonblank_lines(file_path):
        json_data = json.loads(data_line)
        file_name = json_data["file_name"]
        for role, text in json_data["datas"].items():
            rows.append((file_name, role, text))

    dataframe = pd.DataFrame(rows, columns=["names", "roles", "texts"])
    dataframe.to_csv(file_out, index=False, sep='\t')


def search_text(key, file_path="../datas/format/all_text.csv", file_out=None,
                regex=True):
    """Write the rows of the text CSV whose ``texts`` column contains *key*.

    Args:
        key: Keyword (or regular expression when *regex* is true) to search.
        file_path: Tab-separated CSV with a ``texts`` column.
        file_out: Output CSV path; defaults to
            ``../datas/classes/<key>.csv``.
        regex: Whether *key* is interpreted as a regular expression. Pass
            ``False`` to match keywords containing regex metacharacters
            literally.
    """
    if file_out is None:
        file_out = "../datas/classes/" + key + ".csv"

    data = pd.read_csv(file_path, sep="\t")
    # Empty texts are read back as NaN and numeric-looking texts as numbers;
    # normalise to strings so the mask is purely boolean and indexing with it
    # cannot fail.
    texts = data["texts"].fillna("").astype(str)
    matches = data[texts.str.contains(key, regex=regex)]
    matches.to_csv(file_out, index=False, sep='\t')


def data_annotate(file_in="../datas/format/primary.json",
                  file_out="../datas/annotate/label.json", keyword="婚"):
    """Extract the records in which any text contains *keyword*.

    Every matching record is written as one JSON line with the keys
    ``name``, ``label`` (left empty, to be filled in by an annotator) and
    ``datas``.

    Args:
        file_in: JSON-lines input file.
        file_out: JSON-lines output file.
        keyword: Substring looked for in the values of ``datas``.

    Returns:
        The string ``"success"``.
    """
    with open(file_out, "w", encoding="utf8") as fo:
        for line in _iter_nonblank_lines(file_in):
            json_data = json.loads(line)
            datas = json_data["datas"]
            if any(keyword in text for text in datas.values()):
                item = {
                    "name": json_data["file_name"],
                    "label": "",
                    "datas": datas,
                }
                fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"


def annotate(file_in="../datas/annotate/label.json",
             file_labeled="../datas/annotate/labeled.json",
             file_unlabeled="../datas/annotate/unlabel.json"):
    """Split annotation records into labeled and unlabeled files.

    A record counts as labeled when its ``label`` value is truthy.

    Args:
        file_in: JSON-lines file of annotation records.
        file_labeled: Output file for records with a label.
        file_unlabeled: Output file for records without a label.

    Returns:
        The string ``"success"``.
    """
    with open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in _iter_nonblank_lines(file_in):
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"


def label_to_csv(file_path="../datas/annotate/labeled.json",
                 file_out="../datas/annotate/labeled.csv"):
    """Convert labeled JSON-lines records into a tab-separated CSV.

    The CSV has three columns: ``labels`` (the record's label), ``datas``
    (all values of ``datas`` joined with ``|``) and ``data_dict`` (the raw
    JSON line without its newline).

    Args:
        file_path: JSON-lines file of labeled records.
        file_out: Tab-separated CSV file to write.
    """
    labels = []
    datas = []
    data_dict = []
    for data_line in _iter_nonblank_lines(file_path):
        json_data = json.loads(data_line)
        labels.append(json_data["label"])
        datas.append("|".join(json_data["datas"].values()))
        data_dict.append(data_line.replace("\n", ""))

    dataframe = pd.DataFrame(
        {'labels': labels, 'datas': datas, "data_dict": data_dict}
    )
    dataframe.to_csv(file_out, index=False, sep='\t')


def get_work():
    """Extract the rows that mention work ("工作")."""
    search_text("工作")


if __name__ == '__main__':
    label_to_csv()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· 没有Manus邀请码?试试免邀请码的MGX或者开源的OpenManus吧