A Python beginner's learning log: scraping the Doutula meme site (doutula.com)

from lxml import etree
import requests
from urllib import request
import time
import os
number = 0  # running counter used to prefix each package's folder name

def get_page():
    # Walk list pages 1-19; each list page links to a batch of meme packages.
    for x in range(1, 20):
        url = "https://www.doutula.com/article/list/?page=%s" % x
        getpackagetag(url)
def getpackagetag(url):
    # A browser-like User-Agent is enough for the list pages; without it
    # the site tends to reject the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # Each package on a list page is an <a> tag (one of two class variants);
    # pull out its link and the title text inside the random_title div.
    listhref = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/@href')
    listtitle = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/div[@class="random_title"]/text()')
    for index in range(len(listtitle)):
        print(listtitle[index], listhref[index])
        get_package(listtitle[index], listhref[index])
def get_package(title, href):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }
    time.sleep(0.4)  # throttle a little so the site is not hammered
    resp = requests.get(href, headers=headers)
    html = etree.HTML(resp.text)
    # Package title, plus every image's alt text (used as the file name)
    # and src (the download URL).
    imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
    imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
    imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')
    global number

    path = "f:/testimages/" +str(number)+"--"+ str(imagetitle)
    path = path.replace(".","")
    path = path.replace("<", "[")
    path = path.replace(">", "]")
    path = path.replace("?", "")
    path = path.replace("|", "")
    os.makedirs(path)
    number = number + 1
    print(path)
    for index in range(len(imagecontentes)):
        # Keep the original extension (.jpg/.gif etc.) and strip "?" from the
        # alt text so the result is a legal Windows file name.
        suffix = os.path.splitext(imagecontentes[index])[1]
        indexname = str(imagenames[index]).replace("?", "")
        request.urlretrieve(imagecontentes[index], path + "/" + indexname + "--" + str(index) + suffix)

os.makedirs("f:/testimages")
get_page()
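For anyone puzzling over the two long XPath expressions, here is a minimal, self-contained sketch of the same pattern; the HTML snippet below is a made-up stand-in for doutula's real list-page markup:

from lxml import etree

# Hypothetical markup mimicking the structure the real XPath expressions target.
snippet = """
<a class="list-group-item random_list" href="/article/detail/1">
    <div class="random_title">funny pack</div>
</a>
"""
html = etree.HTML(snippet)
hrefs = html.xpath('//a[@class="list-group-item random_list" and @href]/@href')
titles = html.xpath('//a[@class="list-group-item random_list"]/div[@class="random_title"]/text()')
print(list(zip(titles, hrefs)))  # [('funny pack', '/article/detail/1')]

The @href/@alt/@src forms return attribute values and text() returns the element's text, which is why the script gets back plain Python lists of strings it can zip together by index.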

One problem: when the scrape reached the 30th meme package, it hit a 403 access error.

It turned out that the URL of one image in that 30th package is actually correct, but the site's database has apparently lost the image itself, so it cannot be displayed even on the site.

Temporary workaround: skip the 30th package, after which the crawl keeps going without trouble.
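Rather than hard-coding a skip, the download call itself can be made tolerant of a broken image. A minimal sketch of that idea (the safe_download helper name is mine, not part of the script above):

from urllib import request, error

def safe_download(url, filename):
    # Skip a single broken image instead of letting the whole crawl die,
    # e.g. on the 403 raised for the image missing from the site's database.
    try:
        request.urlretrieve(url, filename)
    except (error.HTTPError, error.URLError) as e:
        print("skipped %s: %s" % (url, e))

Calling safe_download(imagecontentes[index], ...) in place of the bare request.urlretrieve line should let the crawl run straight past the 30th package with no manual skipping.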
