Python crawler: fetching the next page

This script crawls an image gallery one page at a time: download a page, extract the image URL and title, save the image, then follow the "next" link until there is none.

from time import sleep

import faker
import requests
from lxml import etree

fake = faker.Faker()

base_url = "http://angelimg.spbeen.com"

def get_next_link(url):
    # Parse the current page and return the absolute URL of the
    # "next" link, or False when this is the last page.
    content = downloadHtml(url)
    html = etree.HTML(content)
    next_url = html.xpath("//a[@class='ch next']/@href")
    if next_url:
        return base_url + next_url[0]
    return False

def downloadHtml(url):
    # Fetch a page with a random User-Agent and a Referer header.
    user_agent = fake.user_agent()
    headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
    response = requests.get(url, headers=headers)
    return response.text

def getImgUrl(content):
    # Extract the image URL and the article title from the page.
    html = etree.HTML(content)
    img_url = html.xpath('//*[@id="content"]/a/img/@src')
    title = html.xpath("//div[@class='article']/h2/text()")

    return img_url[0], title[0]

def saveImg(title, img_url):
    # Download the image and write it under txt/ named after the title.
    if img_url is not None and title is not None:
        user_agent = fake.user_agent()
        headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
        response = requests.get(img_url, headers=headers)
        # request_view(response)
        with open("txt/" + str(title) + ".jpg", 'wb') as f:
            f.write(response.content)

def request_view(response):
    # Debug helper: dump the response body to tmp.html with a <base> tag
    # injected so relative links resolve, then open it in a browser.
    import webbrowser
    base_tag = '<head><base href="%s">' % response.url
    content = response.content.replace(b"<head>", base_tag.encode())
    with open('tmp.html', 'wb') as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab('tmp.html')

def crawl_img(url):
    # Download one page, pull out its image, and save it to disk.
    content = downloadHtml(url)
    img_url, title = getImgUrl(content)
    saveImg(title, img_url)

if __name__ == "__main__":
    url = "http://angelimg.spbeen.com/ang/4968/1"

    while url:
        print(url)
        crawl_img(url)
        sleep(1)  # be polite to the server between requests
        url = get_next_link(url)

Another approach is to read the total page count up front and then loop over the page numbers; a sketch follows.
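A minimal sketch of that approach, reusing downloadHtml and crawl_img from above. The XPath for the page counter is an assumption here and would need to be checked against the site's actual pager markup:

def crawl_by_page_count(first_url):
    content = downloadHtml(first_url)
    html = etree.HTML(content)
    # Hypothetical XPath -- inspect the real pager to find the node
    # that actually holds the total page count.
    counter = html.xpath("//div[@class='page']//text()")
    total = int(counter[-1]) if counter else 1
    # Assumes gallery URLs end in /<page>, e.g. .../ang/4968/1.
    base = first_url.rsplit("/", 1)[0]
    for page in range(1, total + 1):
        crawl_img("%s/%d" % (base, page))
        sleep(1)  # same politeness delay as the main loop

This trades the chain of "next" lookups for a single upfront parse, at the cost of trusting the counter to be correct.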
