简单爬虫爬取51job职位
# -*- coding: utf-8 -*-
"""Crawl PHP job listings from search.51job.com and store them in MySQL.

Ten pages of search results are fetched; for every listing row the linked
detail page is fetched as well, and one record per listing is inserted into
the ``jobs`` table of the ``laravel`` database.
"""
from urllib import request
from bs4 import BeautifulSoup
from urllib import parse
import pymysql
from sqlalchemy import *
from sqlalchemy.orm import *

# Headers sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
)
ORIGIN = "http://search.51job.com"

# Request bodies are encoded and responses decoded with this codec.
PAGE_ENCODING = "gbk"

# Search-result URL; ``{page}`` is the 1-based page number.
# The original text contained "°reefrom" here: the "&deg" prefix of
# "&degreefrom" had been decoded as an HTML entity, corrupting the query.
SEARCH_URL_TEMPLATE = (
    "http://search.51job.com/list/000000,000000,0000,00,9,99,C,2,{page}.html"
    "?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0"
    "&confirmdate=9&fromType=102&dibiaoid=0&address=&line=&specialarea=00"
    "&from=&welfare="
)

# Form fields POSTed with each search-result request.
SEARCH_FORM = [
    ("fromJs", "1"),
    ("jobarea", "040000"),
    ("keyword", "php"),
    ("keywordtype", "2"),
    ("lang", "c"),
    ("stype", "2"),
    ("postchannel", "0000"),
    ("fromType", "1"),
    ("confirmdate", "9"),
]

# Form fields POSTed with each job-detail request.
DETAIL_FORM = [
    ("s", "01"),
    ("t", "0"),
]


def _post_soup(url, form_fields, echo_body=False):
    """POST ``form_fields`` to ``url`` and return the parsed response.

    Args:
        url: Address to request.
        form_fields: Sequence of ``(name, value)`` pairs to url-encode and
            send as the request body.
        echo_body: When true, print the decoded response text.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    req = request.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    req.add_header("Origin", ORIGIN)
    post_data = parse.urlencode(form_fields)
    print(post_data)
    # The context manager closes the HTTP response once it has been read.
    with request.urlopen(req, data=post_data.encode(PAGE_ENCODING)) as response:
        html = response.read().decode(PAGE_ENCODING)
    soup = BeautifulSoup(html, "html.parser")
    if echo_body:
        print(html)
    return soup


def _span_string(row, css_class):
    """Return ``.string`` of the first ``<span class=css_class>`` in ``row``.

    Returns ``None`` when the span is absent, so a missing cell is passed to
    the insert as ``None`` instead of raising ``AttributeError``.
    """
    span = row.find("span", class_=css_class)
    return span.string if span is not None else None


def getYao(url):
    """Fetch a job-detail page and return the text of its ``job_msg`` div.

    Args:
        url: Address of the job-detail page.

    Returns:
        The text content of the first ``<div class="job_msg">``, or an empty
        string when the page has no such element.
    """
    job_msg = _post_soup(url, DETAIL_FORM).find("div", class_="job_msg")
    if job_msg is None:
        return ""
    return job_msg.get_text()


# NOTE(review): database credentials are hard-coded in the connection URL.
# MetaData(engine), autoload=True and insert().execute() are the legacy
# "bound metadata" API, so this needs a SQLAlchemy release that still has it.
engine = create_engine(
    "mysql://root:root@localhost:3306/laravel?charset=utf8", echo=True
)
metadata = MetaData(engine)
users_table = Table("jobs", metadata, autoload=True)

# The insert construct does not depend on the page, so build it once.
sql_moban = users_table.insert()
# Never populated; kept because it is printed at the end of the run.
info_set = set()

for page in range(1, 11):
    sp = _post_soup(
        SEARCH_URL_TEMPLATE.format(page=page), SEARCH_FORM, echo_body=True
    )
    table = sp.find("div", class_="dw_table")
    if table is None:
        # No result table on this page; nothing to store.
        continue
    # The first "el" div is skipped (presumably the table header row).
    for row in table.find_all("div", class_="el")[1:]:
        link = row.find("a")
        if link is None or not link.get("href"):
            continue
        # Fetch the detail page exactly once per listing.
        sql_moban.execute(
            zhiwei=link.get_text().strip(),
            company=_span_string(row, "t2"),
            address=_span_string(row, "t3"),
            slary=_span_string(row, "t4"),  # keyword spelled as in the original
            riqi=_span_string(row, "t5"),
            yaoqiu=getYao(link.get("href")),
        )

print("下载完成")
print(info_set)
将爬到的数据写到数据库中，PHP 工资真不高。
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 智能桌面机器人:用.NET IoT库控制舵机并多方法播放表情
· Linux glibc自带哈希表的用例及性能测试
· 深入理解 Mybatis 分库分表执行原理
· 如何打造一个高并发系统?
· .NET Core GC压缩(compact_phase)底层原理浅谈
· 手把手教你在本地部署DeepSeek R1,搭建web-ui ,建议收藏!
· 新年开篇:在本地部署DeepSeek大模型实现联网增强的AI应用
· Janus Pro:DeepSeek 开源革新,多模态 AI 的未来
· 互联网不景气了那就玩玩嵌入式吧,用纯.NET开发并制作一个智能桌面机器人(三):用.NET IoT库
· 【非技术】说说2024年我都干了些啥