Python 爬虫之 BeautifulSoup
# -*- coding: UTF-8 -*-
"""Scrape product detail pages from mdl.com with BeautifulSoup.

When run as a script, item links of the form ``/product/item/<6 digits>``
are collected from two locally saved HTML pages, each item page is
downloaded, and one ``&``-separated record per item is appended to a
UTF-8 text file.
"""
import codecs
import re
import sys

import requests
from bs4 import BeautifulSoup

if sys.version_info[0] < 3:
    # Python 2 only: keep the original default-encoding override so implicit
    # str/unicode conversions (e.g. printing a title to a redirected stdout)
    # keep using UTF-8.  Neither call exists or is needed on Python 3.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

BASE_URL = 'http://mdl.com'
OUTPUT_PATH = r'E:\note\mei_infov3.txt'
SAVED_PAGES = (r'E:\note\mei.htm', r'E:\note\mei2.htm')
ITEM_URL_PATTERN = re.compile(r'/product/item/\d{6}')

# CSS selectors for the item detail page; they share a common container prefix.
_CONTAINER = '#main > div.boundary > div > div.container__main'
_TITLE_SELECTOR = _CONTAINER + ' > div.section.section-info.clearfix > h2'
_INTRO_BODY_SELECTOR = (
    _CONTAINER
    + ' > div.section.section-intro.clearfix > div'
    + ' > div.section-intro__item__body.rich-text'
)
_EFFECT_SELECTOR = _INTRO_BODY_SELECTOR + ' > span'


def _fetch_soup(url):
    """Download *url* and return its body, decoded as UTF-8, parsed with lxml."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'lxml')


def _item_urls_from_file(path):
    """Return every item sub-URL found in the saved HTML page at *path*."""
    # Binary mode lets BeautifulSoup detect the document encoding itself, and
    # the ``with`` block closes the handle once the markup has been read.
    with open(path, 'rb') as handle:
        soup = BeautifulSoup(handle, 'lxml')
    return ITEM_URL_PATTERN.findall(str(soup))


def mei_url():
    """Fetch and parse the product listing page at ``BASE_URL + '/product'``.

    Not called by the script entry point, which parses saved copies of the
    listing instead.

    Returns:
        The parsed page as a ``BeautifulSoup`` object.
    """
    return _fetch_soup(BASE_URL + '/product')


def mei_info(sub_url='/product/item/293410', out_path=OUTPUT_PATH):
    """Scrape one item page and append its record to *out_path*.

    The record is ``title&introduce&effect&crowd`` followed by a newline and
    a ``$`` character, written as UTF-8.  The title is also printed.

    Args:
        sub_url: Path of the item page, appended to ``BASE_URL``.
        out_path: File the record is appended to.

    Raises:
        IndexError: If the page lacks an element one of the selectors expects
            (the title, the first or third rich-text body, or a span inside a
            rich-text body).
    """
    soup = _fetch_soup(BASE_URL + sub_url)

    title = soup.select(_TITLE_SELECTOR)[0].get_text()
    # The first and third rich-text bodies are both needed; query them once.
    intro_bodies = soup.select(_INTRO_BODY_SELECTOR)
    introduce = intro_bodies[0].get_text()
    effect = soup.select(_EFFECT_SELECTOR)[0].get_text()
    # NOTE(review): the third body presumably describes the target audience,
    # going by the original variable name -- confirm against the page.
    crowd = intro_bodies[2].get_text()

    print(title)

    # get_text() already returns text, so the values are joined directly
    # instead of being passed through str(), which on Python 2 only worked
    # because of the default-encoding override above.
    record = '&'.join([title, introduce, effect, crowd])
    with codecs.open(out_path, 'a+', 'utf8') as out:
        out.write(record + '\n$')


def main():
    """Collect item links from the saved listing pages and scrape each item."""
    url_list = []
    for path in SAVED_PAGES:
        url_list.extend(_item_urls_from_file(path))
    print(len(url_list))
    for sub_url in url_list:
        mei_info(sub_url)


if __name__ == '__main__':
    main()
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· AI与.NET技术实操系列:基于图像分类模型对图像进行分类
· go语言实现终端里的倒计时
· 如何编写易于单元测试的代码
· 10年+ .NET Coder 心语,封装的思维:从隐藏、稳定开始理解其本质意义
· .NET Core 中如何实现缓存的预热?
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 通过 API 将Deepseek响应流式内容输出到前端