python为火车头写插件
火车头采集器官方现在已支持用 Python 编写插件。最开始我按照官方文档安装了 Python 3.8.8,调用插件时总是报错;咨询客服后得知是版本太高。卸载后重新安装了 Python 3.6,再次测试,问题完美解决。
贴一个写好的插件
# -*- coding: utf-8 -*-
# @Author: kaka
# @Date: 2022-03-03 09:45:11
# @Last Modified by: kaka
# @Last Modified time: 2022-03-17 11:08:08
# @Email: zhckaka@sina.com
"""Command-line plugin that flattens a nested comment JSON label.

The script expects exactly four arguments after the script name::

    script.py <cookie> <url> <page_type> <labels>

* ``cookie`` and ``url`` are URL-quoted strings.
* ``page_type`` is one of ``List``, ``Content``, ``Pages`` or ``Save``.
* ``labels`` is either URL-quoted JSON text starting with ``{"`` or the
  path of a file that contains such text.

For ``Save`` the ``content_comments`` label is parsed as JSON and rewritten
as a single string: top-level comments are separated by ``&&&``, the first
reply to a comment is attached with ``$$$`` and further replies with
``@@@``.  For every other page type a debug summary is prepended to the
``Html`` label.  The resulting label dict is printed to stdout as JSON.
"""
import os
import sys
import re
import datetime
# from w3lib import html
import html
from scrapy.selector import Selector
import importlib
from urllib.parse import urljoin
from urllib import parse
import json
import requests
import emoji


def _comment_text(model):
    """Return the text of one comment model, or ``None`` if it has no body.

    Text fragments are read from ``media.richtextContent.document[*].c[*].t``
    and joined with single spaces across *all* paragraphs, so multi-paragraph
    comments are kept whole.  Nodes without a string ``"t"`` value are
    skipped instead of crashing ``str.join``.
    """
    media = model.get("media") or {}
    rich_text = media.get("richtextContent") or {}
    paragraphs = rich_text.get("document") or []
    if not paragraphs:
        return None

    fragments = []
    for paragraph in paragraphs:
        for node in paragraph.get("c") or []:
            text = node.get("t")
            if isinstance(text, str):
                fragments.append(text)
    return " ".join(fragments)


def _flatten_comments(models):
    """Collapse a ``{comment_id: model}`` mapping into one delimited string.

    Output shape: ``top$$$reply1@@@reply2&&&top2...``.  Only replies whose
    ``parentId`` refers to a top-level comment that has text are attached;
    deeper replies are skipped.
    """
    # First pass: remember every top-level comment (no parentId) by its id.
    threads = {}
    for comment_id, model in models.items():
        if model.get("parentId", ""):
            continue
        text = _comment_text(model)
        if text is not None:
            threads[comment_id] = text

    # Second pass: append each reply to its parent.  Track which parents
    # already have a reply explicitly rather than searching the text for
    # "$$$", which misfires when the comment itself contains that string.
    parents_with_reply = set()
    for model in models.values():
        parent_id = model.get("parentId", "")
        if not parent_id or parent_id not in threads:
            continue
        text = _comment_text(model)
        if text is None:
            continue
        separator = "@@@" if parent_id in parents_with_reply else "$$$"
        threads[parent_id] = "{0}{1}{2}".format(
            threads[parent_id], separator, text)
        parents_with_reply.add(parent_id)

    return "&&&".join(threads.values())


def _load_labels(serialized):
    """Parse the label payload, reading it from a file when it is a path.

    Anything that does not start with ``{"`` is treated as a file path whose
    content is URL-quoted JSON.
    """
    if serialized[0:2] != '''{"''':
        with open(serialized) as label_file:
            serialized = parse.unquote(label_file.read())
    return json.loads(serialized)


def main():
    """Entry point: read argv, transform the labels, print them as JSON."""
    if len(sys.argv) != 5:
        print(len(sys.argv))
        print("命令行参数长度不为5")
        sys.exit()

    label_cookie = parse.unquote(sys.argv[1])
    label_url = parse.unquote(sys.argv[2])
    # PageType is List, Content or Pages for list page, content page and
    # multi-page HTTP request handling; Save means content processing.
    page_type = sys.argv[3]
    labels = _load_labels(parse.unquote(sys.argv[4]))

    # ---- user-written code starts here ----
    if page_type == "Save":
        if labels['content_comments']:
            # The label holds the raw comment JSON as text.
            payload = json.loads(str(labels['content_comments']))
            models = payload["features"]["comments"]["models"]
            if models:
                # Emoji are converted to their textual form by demojize.
                labels['content_comments'] = emoji.demojize(
                    _flatten_comments(models))
            else:
                # NOTE(review): the original source had its indentation
                # stripped; this ``else`` is assumed to pair with the
                # models check (not the label check) -- please confirm.
                labels['content_comments'] = "errors"
    else:
        labels['Html'] = '当前页面的网址为:' + label_url + "\r\n页面类型为:" + page_type + \
            "\r\nCookies数据为:" + label_cookie + "\r\n接收到的数据是:" + labels['Html']
    # ---- user-written code ends here ----

    print(json.dumps(labels))


if __name__ == "__main__":
    main()
这种方式清洗数据真是太舒服了,特此记录,python的路是越来越广了