python小白学习记录 爬取斗图啦网站
"""Scraper for doutula.com meme packages (a Python learning exercise).

Walks list pages 1..19, follows every package link found on each page,
and downloads all images of each package into its own numbered folder
under f:/testimages.
"""
import os
import time
from urllib import request

import requests
from lxml import etree

# Single shared header set: a browser User-Agent so the site serves us.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    ),
}

# Root folder all package folders are created under.
BASE_DIR = "f:/testimages"

# Translation table for characters Windows forbids (or that broke paths
# in practice): "." and "?" and "|" are dropped, "<"/">" become brackets.
_ILLEGAL_CHARS = str.maketrans({".": "", "<": "[", ">": "]", "?": "", "|": ""})

# Running counter used to give each package folder a unique numeric prefix.
number = 0


def _sanitize(name):
    """Return *name* with filesystem-illegal characters removed/replaced."""
    return name.translate(_ILLEGAL_CHARS)


def get_page():
    """Iterate over list pages 1..19 and scrape every package on each."""
    for page in range(1, 20):
        getpackagetag("https://www.doutula.com/article/list/?page=%s" % page)


def getpackagetag(url):
    """Fetch one list page and hand every package (title, href) to get_package.

    The site uses two class variants for package anchors, so both are
    accepted by the XPath predicate.
    """
    resp = requests.get(url, headers=HEADERS)
    html = etree.HTML(resp.text)
    anchor = ('//a[(@class="list-group-item random_list tg-article" '
              'or @class="list-group-item random_list") and @href]')
    listhref = html.xpath(anchor + "/@href")
    listtitle = html.xpath(anchor + '/div[@class="random_title"]/text()')
    for title, href in zip(listtitle, listhref):
        print(title, href)
        get_package(title, href)


def get_package(title, href):
    """Download all images of one meme package into its own folder.

    A single unreachable image (the site sometimes links files its
    database has lost, which yields a 403) is logged and skipped instead
    of aborting the whole crawl.
    """
    global number
    time.sleep(0.4)  # be polite: throttle one request per package page
    resp = requests.get(href, headers=HEADERS)
    html = etree.HTML(resp.text)
    imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
    imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
    imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')

    path = os.path.join(BASE_DIR,
                        "%d--%s" % (number, _sanitize(str(imagetitle))))
    # exist_ok so a re-run (or duplicate title) does not crash the crawler.
    os.makedirs(path, exist_ok=True)
    number += 1
    print(path)

    for index, src in enumerate(imagecontentes):
        # splitext keeps the dot in the extension, so join without an
        # extra separator (the old code produced "name--0--.jpg").
        ext = os.path.splitext(src)[1]
        filename = "%s--%d%s" % (_sanitize(str(imagenames[index])), index, ext)
        try:
            request.urlretrieve(src, os.path.join(path, filename))
        except Exception as exc:
            # Dead/missing image on the server (observed as HTTP 403):
            # skip it and keep downloading the rest of the package.
            print("skipped %s: %s" % (src, exc))


if __name__ == "__main__":
    os.makedirs(BASE_DIR, exist_ok=True)
    get_page()
有点问题:在爬取到第30个表情包时,出现403访问错误
后发现由于 此网站第三十个表情包其中一个图片地址没错 ,其网站数据库貌似丢失了此图片,其本身也无法显示
暂行解决办法 跳过第三十个表情包 就可以一直爬取
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 开发者必知的日志记录最佳实践
· SQL Server 2025 AI相关能力初探
· Linux系列:如何用 C#调用 C方法造成内存泄露
· AI与.NET技术实操系列(二):开始使用ML.NET
· 记一次.NET内存居高不下排查解决与启示
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南
· 被坑几百块钱后,我竟然真的恢复了删除的微信聊天记录!
· AI技术革命,工作效率10个最佳AI工具