A Python beginner's learning log: scraping the Doutula meme site (doutula.com)

from lxml import etree
import requests
from urllib import request
import time
import os
number = 0  # running counter used to prefix each package's folder name

def get_page():
    # Walk list pages 1-19; each list page links to a batch of meme packages.
    for x in range(1, 20):
        url = "https://www.doutula.com/article/list/?page=%s" % x
        getpackagetag(url)
def getpackagetag(url):
    # A browser-like User-Agent is enough for the list pages; without it
    # the site tends to reject the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # Each package on a list page is an <a> tag (one of two class variants);
    # pull out its link and the title text inside the random_title div.
    listhref = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/@href')
    listtitle = html.xpath('//a[(@class="list-group-item random_list tg-article" or @class="list-group-item random_list") and @href]/div[@class="random_title"]/text()')
    for index in range(len(listtitle)):
        print(listtitle[index], listhref[index])
        get_package(listtitle[index], listhref[index])
def get_package(title, href):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36"
    }
    time.sleep(0.4)  # throttle a little so the site is not hammered
    resp = requests.get(href, headers=headers)
    html = etree.HTML(resp.text)
    # Package title, plus every image's alt text (used as the file name)
    # and src (the download URL).
    imagetitle = html.xpath('//div[@class]/h1/a/text()')[0]
    imagenames = html.xpath('//div[@class="artile_des"]//img/@alt')
    imagecontentes = html.xpath('//div[@class="artile_des"]//img/@src')
    global number

    path = "f:/testimages/" +str(number)+"--"+ str(imagetitle)
    path = path.replace(".","")
    path = path.replace("<", "[")
    path = path.replace(">", "]")
    path = path.replace("?", "")
    path = path.replace("|", "")
    os.makedirs(path)
    number = number + 1
    print(path)
    for index in range(len(imagecontentes)):
        # Keep the original extension (.jpg/.gif etc.) and strip "?" from the
        # alt text so the result is a legal Windows file name.
        suffix = os.path.splitext(imagecontentes[index])[1]
        indexname = str(imagenames[index]).replace("?", "")
        request.urlretrieve(imagecontentes[index], path + "/" + indexname + "--" + str(index) + suffix)

os.makedirs("f:/testimages")
get_page()
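For anyone puzzling over the two long XPath expressions, here is a minimal, self-contained sketch of the same pattern; the HTML snippet below is a made-up stand-in for doutula's real list-page markup:

from lxml import etree

# Hypothetical markup mimicking the structure the real XPath expressions target.
snippet = """
<a class="list-group-item random_list" href="/article/detail/1">
    <div class="random_title">funny pack</div>
</a>
"""
html = etree.HTML(snippet)
hrefs = html.xpath('//a[@class="list-group-item random_list" and @href]/@href')
titles = html.xpath('//a[@class="list-group-item random_list"]/div[@class="random_title"]/text()')
print(list(zip(titles, hrefs)))  # [('funny pack', '/article/detail/1')]

The @href/@alt/@src forms return attribute values and text() returns the element's text, which is why the script gets back plain Python lists of strings it can zip together by index.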

One problem: when the scrape reached the 30th meme package, it hit a 403 access error.

It turned out that the URL of one image in that 30th package is actually correct, but the site's database has apparently lost the image itself, so it cannot be displayed even on the site.

Temporary workaround: skip the 30th package, after which the crawl keeps going without trouble.
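Rather than hard-coding a skip, the download call itself can be made tolerant of a broken image. A minimal sketch of that idea (the safe_download helper name is mine, not part of the script above):

from urllib import request, error

def safe_download(url, filename):
    # Skip a single broken image instead of letting the whole crawl die,
    # e.g. on the 403 raised for the image missing from the site's database.
    try:
        request.urlretrieve(url, filename)
    except (error.HTTPError, error.URLError) as e:
        print("skipped %s: %s" % (url, e))

Calling safe_download(imagecontentes[index], ...) in place of the bare request.urlretrieve line should let the crawl run straight past the 30th package with no manual skipping.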
