Python 爬虫:获取 5i5j 房屋信息并存储到数据库
from lxml import etree
from selenium import webdriver
import pymysql

# Site root; the hrefs scraped from the index pages are site-relative.
BASE_URL = 'https://bj.5i5j.com'

# NOTE(review): credentials are hard-coded, as in the original script.
# Move them to configuration before running anywhere but a local sandbox.
DB_CONFIG = dict(host='localhost', user='root', password='1234', database='5i5j')

INSERT_SQL = "INSERT INTO `5i5j_info` (`title`,`num`,`type`, `zone`,`need`) VALUES (%s,%s,%s,%s, %s)"


def Geturl(fullurl):
    """Return the absolute detail-page URL of every rental listing on an index page.

    Loads ``fullurl`` in the module-level ``browser`` and reads the listing
    links from the rendered page source.

    Args:
        fullurl: URL of one index (search-result) page.

    Returns:
        A list of absolute listing URLs; empty when the XPath matches nothing.
    """
    browser.get(fullurl)
    index_tree = etree.HTML(browser.page_source)
    hrefs = index_tree.xpath('/html/body/div[4]/div[1]/div[2]/ul/li/div/h3/a/@href')
    return [BASE_URL + href for href in hrefs]


def _first_text(tree, xpath):
    """Return the first match of ``xpath`` as a plain ``str``, or None if no match."""
    matches = tree.xpath(xpath)
    return str(matches[0]) if matches else None


def _scrape_listing(zp_url):
    """Load one listing page and return ``(title, num, type, zone, need)``.

    Returns None when any of the expected elements is absent, so the caller
    can skip the page instead of failing on an empty XPath result.
    """
    browser.get(zp_url)
    tree = etree.HTML(browser.page_source)

    summary = '/html/body/div[3]/div[2]/div[2]/div[1]'
    title = _first_text(tree, '//html/body/div[3]/div[1]/div[1]/h1/text()')
    price = _first_text(tree, summary + '/div[1]/div/p[1]/text()')    # price
    layout = _first_text(tree, summary + '/div[2]/div/p[1]/text()')   # layout
    area = _first_text(tree, summary + '/div[3]/div/p[1]/text()')     # floor area
    # The house-info entry is split across a <span> and an <a>.
    need_label = _first_text(tree, '/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/span/text()')
    need_value = _first_text(tree, '/html/body/div[3]/div[2]/div[2]/div[2]/ul/li[1]/a/text()')

    if None in (title, price, layout, area, need_label, need_value):
        return None
    return (title, price + '元/月', layout, area + '平米', need_label + need_value)


def Getinfo(zp_url_list):
    """Scrape every listing in ``zp_url_list`` and insert it into ``5i5j_info``.

    A single database connection is shared by all listings of the call and
    is always closed on exit. Each row is committed as soon as it is
    inserted, so earlier rows survive a later failure. Pages that lack an
    expected field are reported and skipped.

    Args:
        zp_url_list: iterable of absolute listing URLs (as returned by Geturl).
    """
    if not zp_url_list:
        # Nothing to store: do not open a connection at all.
        return

    connection = pymysql.connect(**DB_CONFIG)
    try:
        for zp_url in zp_url_list:
            record = _scrape_listing(zp_url)
            if record is None:
                print('skipped (unexpected page layout):', zp_url)
                continue
            with connection.cursor() as cursor:
                cursor.execute(INSERT_SQL, record)
            connection.commit()
            print(*record)
    finally:
        connection.close()


if __name__ == '__main__':
    # Ask for the page count first so bad input cannot leave a browser running.
    pags = int(input('需要几页?'))
    url = 'https://bj.5i5j.com/zufang/huilongguan/n{}/'
    browser = webdriver.Chrome()
    try:
        for i in range(1, pags + 1):
            fullurl = url.format(i)
            zf_url_list = Geturl(fullurl)
            print(fullurl)
            Getinfo(zf_url_list)
    finally:
        # quit() ends the whole driver session, not only the current window.
        browser.quit()
- -疏影横斜水清浅,暗香浮动月黄昏。- -
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· .NET Core 中如何实现缓存的预热?
· 从 HTTP 原因短语缺失研究 HTTP/2 和 HTTP/3 的设计差异
· AI与.NET技术实操系列:向量存储与相似性搜索在 .NET 中的实现
· 基于Microsoft.Extensions.AI核心库实现RAG应用
· Linux系列:如何用heaptrack跟踪.NET程序的非托管内存泄露
· TypeScript + Deepseek 打造卜卦网站:技术与玄学的结合
· Manus的开源复刻OpenManus初探
· AI 智能体引爆开源社区「GitHub 热点速览」
· 三行代码完成国际化适配,妙~啊~
· .NET Core 中如何实现缓存的预热?