爬取天眼查工商信息
import requests import re import json import time from xlrd import open_workbook from xlutils.copy import copy class TianYanChaSpider(): def __init__(self,keyword): self.headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36" } self.keyword = keyword self.xlsx_file = r'business_datas.xlsx' def timeStamp(self,timeNum): if timeNum and timeNum > 0: try: timeStamp = float(timeNum / 1000) timeArray = time.localtime(timeStamp) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) return otherStyleTime except: return '' else: return '' def start_url(self): url = 'https://www.tianyancha.com/search?key=' + self.keyword response = requests.get(url=url, headers=self.headers) detail_id = re.findall(r'https://www.tianyancha.com/company/(.*?)"', response.text)[0] #默认取第一个 detail_url = 'https://www.tianyancha.com/company/' + detail_id return detail_url def detail_context(self): url = self.start_url() detail_response = requests.get(url=url,headers=self.headers) json_data = re.findall(r'type="application/json">(.*?)</script></body></html>',detail_response.text)[0] data= json.loads(json_data) queries = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']['data'] name = queries.get('name','') # 公司名称 creditCode = queries.get('creditCode','') # 统一社会信用代码 companyOrgType = queries.get('companyOrgType','') # 企业类型 taxAddress = queries.get('taxAddress','') # 注册地址 legalPersonName = queries.get('legalPersonName','')# 法定代表人 regCapital = queries.get('regCapital','') # 注册资本 fromTime = self.timeStamp(queries.get('fromTime','')) # 营业期限开始时间 toTime = self.timeStamp(queries.get('toTime','')) # 营业期限截止时间 if toTime and fromTime: # 营业期限截止时间 timeRange = fromTime + ' 至 ' + toTime elif not fromTime and not toTime: timeRange = '' else: if fromTime: timeRange = fromTime + ' 至 ' + '无固定期限' else: timeRange = '' estiblishTime = self.timeStamp(queries.get('estiblishTime','')) # 成立时间 businessScope = queries.get('businessScope','') # 经营范围 self.save_data(name,creditCode,companyOrgType,taxAddress,legalPersonName,regCapital,timeRange,estiblishTime,businessScope) def save_data(self,name,creditCode,companyOrgType,taxAddress,legalPersonName,regCapital,timeRange,estiblishTime,businessScope): # 获取源表内容 r_xls = open_workbook(self.xlsx_file) # 读取excel文件 row = r_xls.sheets()[0].nrows # 获取已有的行数 excel = copy(r_xls) # 将xlrd的对象转化为xlwt的对象 table = excel.get_sheet(0) # 获取要操作的sheet,默认是第一个表 # 对excel表追加一行内容 table.write(row, 0, name) # 括号内分别为行数、列数、内容 table.write(row, 1, creditCode) table.write(row, 2, companyOrgType) table.write(row, 3, taxAddress) table.write(row, 4, legalPersonName) table.write(row, 5, regCapital) table.write(row, 6, timeRange) table.write(row, 7, estiblishTime) table.write(row, 8, businessScope) # 保存并覆盖文件 excel.save(self.xlsx_file) print('{}---保存成功'.format(name)) if __name__ == '__main__': with open('gs_lists_good.txt','r',encoding='gbk')as f: data = f.readlines() for i in data: keyword = i.strip() TianYanChaSpider(keyword).detail_context()
注:本项目仅用于学习研究使用,请勿将本项目的任何内容用于商业或非法目的,否则后果自负。
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 25岁的心里话
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现