Python爬虫(八)
源码:
"""Crawl Python job postings from hr.tencent.com and store them in MySQL.

The script walks the paginated search-result list, follows every
``position_detail`` link it finds, extracts the posting's fields with
regular expressions, and inserts one row per posting into ``tencentzp``.
"""
import random
import re
import time

import requests

from my_mysql import MysqlConnect

BASE_URL = 'https://hr.tencent.com/'
# ``{}`` is filled with the result offset of the list page to fetch.
LIST_URL = BASE_URL + 'position.php?lid=&tid=&keywords=python&start={}'
# Seconds to wait for a response; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Matched against the raw response bytes because get_urls() returns bytes.
_DETAIL_LINK_RE = re.compile(rb'href="(position_detail.*?)">')

# Matched against the decoded detail page.
_TITLE_RE = re.compile(r'id="sharetitle">(.*?)</td>')
_ADDRESS_RE = re.compile(r'工作地点:</span>(.*?)</td>')
_TYPES_RE = re.compile(r'职位类别:</span>(.*?)</td>')
_COUNTS_RE = re.compile(r'招聘人数:</span>(.*?)</td>')
_DUTY_RE = re.compile(r'工作职责.*?<ul class="squareli">(.*?)</ul>')
_REQUIRES_RE = re.compile(r'工作要求.*?<ul class="squareli">(.*?)</ul>')
_LIST_ITEM_RE = re.compile(r'<li>(.*?)</li>')


# Get the links to the job-detail pages
def get_urls(page, headers):
    """Return the relative detail-page links found on one list page.

    Args:
        page: Result offset substituted into the ``start`` query parameter.
        headers: HTTP headers sent with the request.

    Returns:
        A list of ``bytes`` objects, one per ``href="position_detail..."``
        link in the response body (empty if the page contains none).

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    # The original URL had the literal text "start=page" and no placeholder,
    # so .format(page) was a no-op and every call fetched the same page.
    url = LIST_URL.format(page)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return _DETAIL_LINK_RE.findall(response.content)


def _first_group(pattern, html, field):
    """Return group 1 of the first *pattern* match in *html*.

    Raises ValueError naming *field* when the pattern does not match, instead
    of the AttributeError that ``None.group(1)`` would produce.
    """
    match = pattern.search(html)
    if match is None:
        raise ValueError('field {!r} not found in page'.format(field))
    return match.group(1)


def _list_items(pattern, html, field):
    """Return the ``<li>`` texts inside the list matched by *pattern*, one per line."""
    return '\n'.join(_LIST_ITEM_RE.findall(_first_group(pattern, html, field)))


# Get the details of one job posting
def get_info(url, headers):
    """Fetch one job-detail page and extract its fields.

    Args:
        url: Absolute URL of the detail page.
        headers: HTTP headers sent with the request.

    Returns:
        A tuple ``(title, address, types, counts, duty, requires)`` of
        strings; ``duty`` and ``requires`` hold their list items joined
        with newlines.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        ValueError: If any expected field is missing from the page.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode once at the I/O boundary and work on str from here on.
    html = response.content.decode('utf-8', errors='replace')

    title = _first_group(_TITLE_RE, html, 'title')
    address = _first_group(_ADDRESS_RE, html, 'address')
    types = _first_group(_TYPES_RE, html, 'types')
    counts = _first_group(_COUNTS_RE, html, 'counts')
    duty = _list_items(_DUTY_RE, html, 'duty')
    requires = _list_items(_REQUIRES_RE, html, 'requires')
    return title, address, types, counts, duty, requires


def main():
    """Crawl list pages at offsets 0, 10, ..., 190 and insert every posting."""
    # NOTE(review): database credentials are hard-coded; consider moving them
    # to configuration or environment variables.
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    sql = "insert into tencentzp(title,address,types,counts,duty,requires) values(%s,%s,%s,%s,%s,%s)"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    for page in range(0, 200, 10):
        try:
            url_list_bytes = get_urls(page, headers)
        except requests.RequestException as exc:
            print('skipping list page start={}: {}'.format(page, exc))
            continue
        for url in url_list_bytes:
            url = BASE_URL + url.decode('utf-8')
            try:
                info = get_info(url, headers)
            except (requests.RequestException, ValueError) as exc:
                # One broken posting should not abort the whole crawl.
                print('skipping {}: {}'.format(url, exc))
            else:
                print(info)
                mc.exec_data(sql, info)
            # Random pause of up to 5 seconds between detail requests.
            time.sleep(random.random() * 5)


if __name__ == '__main__':
    main()
【推荐】还在用 ECharts 开发大屏?试试这款永久免费的开源 BI 工具!
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET 原生驾驭 AI 新基建实战系列:向量数据库的应用与畅想
· 从问题排查到源码分析:ActiveMQ消费端频繁日志刷屏的秘密
· 一次Java后端服务间歇性响应慢的问题排查记录
· dotnet 源代码生成器分析器入门
· ASP.NET Core 模型验证消息的本地化新姿势
· ThreeJs-16智慧城市项目(重磅以及未来发展ai)
· .NET 原生驾驭 AI 新基建实战系列(一):向量数据库的应用与畅想
· Ai满嘴顺口溜,想考研?浪费我几个小时
· Browser-use 详细介绍&使用文档
· 软件产品开发中常见的10个问题及处理方法