Scraping a site's dishonest-debtor ("laolai") records with Python coroutines

The script below pages through the site's province listings with aiohttp, spawns one coroutine per record (throttled by a semaphore), and writes each detail record to MySQL from a done-callback.
import asyncio
import json
import re
import time
from functools import partial

import aiohttp
import pymysql

headers = {
    'Cookie': 'auth_token=your_token_here',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
}


def save_data(cursor, addr_dic, obj):
    # Done-callback: runs in the event loop thread, so the blocking
    # pymysql call briefly stalls the loop between requests.
    try:
        data = obj.result()['data']
        name = data['iname']
        addr = addr_dic.get(name, '')
        idcard = data['cardnum']
        # Masked ID number: 10 digits, 4 masked chars, 3 digits, check digit.
        assert re.match(r'\d{10}[\d*]{4}\d{3}[\dxX]', idcard)
        birth = idcard[6:10]  # birth year is digits 7-10 of the ID number
        assert birth.isdigit()
        birth += '年'
        sex = data.get('sex')
        if not sex:
            # The second-to-last ID digit is odd for men, even for women.
            n = int(idcard[-2])
            sex = '男' if n % 2 == 1 else '女'
        tm = time.localtime(data.get('regdate', 0) / 1000)  # ms timestamp
        createtime = f'{tm.tm_year}-{tm.tm_mon}-{tm.tm_mday}'
        # Parameterized query: pymysql handles quoting and escaping.
        cursor.execute(
            "insert into tianyancha(name, birth, sex, idcard, court, createtime,"
            " caseno, base, duty, status, detail, addr)"
            " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            (name, birth, sex, idcard, data['courtname'], createtime,
             data['casecode'], data['gistunit'], data['duty'],
             data['performance'], data['disrupttypename'], addr)
        )
    except Exception as e:
        print('insert failed:', e.args)


async def parse_case_data(sem, session, cid):
    # Fetch the detail record for one case id.
    async with sem:  # cap the number of concurrent requests
        async with session.get(
                'https://shixin.tianyancha.com/shixin/getDishonestinfoDetailWeb.json'
                f'?bussinessId={cid}') as rsp:
            return await rsp.json()


async def parse_province(sem, session, cursor, url):
    page = 1
    while True:  # page through the listing
        page_url = f'{url}/p{page}'
        async with session.get(page_url) as rsp:
            try:
                txt = await rsp.text()
                # Parse name -> address pairs out of the listing snippets.
                addr_dic = {}
                pps = [i.strip() for i in re.findall(
                    r'dishonest_base_info_detail">(.*?)</', txt, re.S)]
                for itm in pps:
                    try:
                        name, _, _, addr = itm.split(',')
                        assert addr.endswith('人。')
                        addr = addr[:-2]  # drop exactly the trailing '人。'
                        addr_dic[name] = addr
                    except Exception:
                        pass
                # Extract the 32-char hex id of every record on the page.
                cid_lis = re.findall(r'data-id="([\da-z]{32})"', txt)
                tasks = []
                for cid in cid_lis:
                    # One coroutine per record ...
                    task = asyncio.create_task(parse_case_data(sem, session, cid))
                    # ... whose result the callback writes to MySQL.
                    task.add_done_callback(partial(save_data, cursor, addr_dic))
                    tasks.append(task)
                if tasks:  # asyncio.wait raises on an empty task set
                    await asyncio.wait(tasks)
                print(f'page {page} done')
                # The "next page" arrow disappears on the last page.
                if 'tic-icon-arrow-right' not in txt:
                    break
                page += 1
            except Exception as e:
                print(f'failed at page {page}:', e)
                break


async def main():
    province = '广东'
    # url.json maps province names to their listing URLs.
    with open('url.json', 'r', encoding='utf-8') as f:
        url_data = json.load(f)
    # The loop supports crawling every province at once; only Guangdong here.
    url_lis = [url_data.get(province)]
    sem = asyncio.Semaphore(4)
    conn = pymysql.connect(host='localhost', port=3306, user='user',
                           password='password', charset='utf8',
                           database='db', autocommit=True)
    cursor = conn.cursor()
    async with aiohttp.ClientSession(headers=headers) as session:
        for url in url_lis:
            await parse_province(sem, session, cursor, url)
    cursor.close()
    conn.close()


if __name__ == '__main__':
    asyncio.run(main())
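The script assumes two pieces of local setup it never shows: a url.json file mapping province names to listing URLs, and an existing tianyancha table. A minimal setup sketch follows; the URL value is a placeholder and the column types are my assumptions, not the original schema.

import json
import pymysql

# Hypothetical url.json contents: province name -> listing URL.
# The URL below is a placeholder; substitute the site's real
# province listing URL.
with open('url.json', 'w', encoding='utf-8') as f:
    json.dump({'广东': 'https://shixin.tianyancha.com/search/example'},
              f, ensure_ascii=False)

# One-off creation of the table save_data() inserts into. Types and
# lengths are guesses; widen them if real records overflow.
conn = pymysql.connect(host='localhost', port=3306, user='user',
                       password='password', charset='utf8', database='db')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists tianyancha (
            id int auto_increment primary key,
            name varchar(64), birth varchar(16), sex varchar(8),
            idcard varchar(32), court varchar(128), createtime varchar(32),
            caseno varchar(64), base varchar(255), duty text,
            status varchar(64), detail varchar(255), addr varchar(128)
        ) default charset=utf8
    """)
conn.commit()
conn.close()

Run this once before the crawler; after that the main script can be restarted freely, since it only ever appends rows.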