Crawler basics: scraping images from a certain forum

Thread pool approach

The script walks the forum index pages, collects each thread's title and link, parses every thread page for its image URLs, and downloads the images through a pool of five worker threads.

import requests
from lxml import etree
import time
import os
from multiprocessing.dummy import Pool  # despite the package name, this Pool is thread-based


def get_title_list(Max_page_Num):
    list_title = []
    # range's upper bound is exclusive, so this covers pages 1 .. Max_page_Num-1
    for pageNum in range(1, Max_page_Num):
        url = page_url.format(Num=pageNum)
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            if response.status_code == 200:
                print('<page', pageNum, '> index fetched')
            response.encoding = 'gbk'  # the forum serves GBK-encoded pages
            tree = etree.HTML(response.text)
            if pageNum == 1:
                # page 1 presumably carries pinned/announcement rows at the top; skip the first 9
                title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"][position()>9]')
            else:
                title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"]')

            for tr in title_list:
                # category label, with surrounding whitespace normalized away
                f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
                # thread title
                e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
                # combined they look like: [真心话系列]我爱你中国
                title_name = f_name + e_name
                # thread link is relative, so prepend the server root
                title_src = server + tr.xpath('./td[@class="tal"]//a/@href[1]')[0]
                # note: title_name may contain characters that are illegal in Windows paths
                title_dir = Download_dir + title_name
                dic_title = {
                    'name': title_name,
                    'dir': title_dir,
                    'src': title_src,
                    'pageNum': pageNum
                }
                list_title.append(dic_title)
        except Exception as re_index:
            print(re_index, 'index page request failed, URL:', url)
    return list_title

def get_img_list(dic_title):
    title_name = dic_title['name']
    title_dir = dic_title['dir']
    title_src = dic_title['src']
    pageNum = dic_title['pageNum']
    if not os.path.exists(title_dir):
        os.makedirs(title_dir)
    print('thread parsed: <page', pageNum, '>', title_name, 'URL:', title_src)
    try:
        parse_title_url = requests.get(url=title_src, headers=headers, timeout=10)
        parse_title_url.encoding = 'gbk'
        title_tree = etree.HTML(parse_title_url.text)
        img_list = title_tree.xpath('//div[@class="t t2"][1]//td[@valign="top"]//div[@class="tpc_content do_not_catch"]/img')
        list_img = []  # one metadata dict per image
        for img in img_list:
            # the real image URL is stored in the ess-data attribute (presumably for lazy loading)
            img_src = img.xpath('./@ess-data')[0]
            img_name = img_src.split('/')[-1]
            img_path = title_dir + '/' + img_name
            # bundle the image metadata into a dict
            dic_img = {
                'name': img_name,
                'src': img_src,
                'path': img_path
            }
            list_img.append(dic_img)
        if list_img:
            print(title_name, 'image list parsed')
            return list_img
    except Exception as re_title:
        print(re_title, '<', title_name, '>', 'thread request failed, URL:', title_src)

def download_img(dic_img):
    img_name = dic_img['name']
    img_src = dic_img['src']
    img_path = dic_img['path']

    if not os.path.exists(img_path):
        try:
            print(img_name, 'download started')
            img_response = requests.get(url=img_src, headers=headers, timeout=30).content
            with open(img_path, 'wb') as fp:
                fp.write(img_response)
            print(img_src, 'downloaded', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        except Exception as re_img:
            print(re_img, img_src, 'request failed')
    else:
        print(img_name, 'already exists, skipping', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

if __name__ == "__main__":
    page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&page={Num}'
    # page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&search=digest&page={Num}'
    server = 'https://cl.fs55.xyz/'
    Max_page_Num = 3
    Download_dir = 'K:/Download/CL_Images/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    title_list = get_title_list(Max_page_Num)
    for dic_title in title_list:
        img_list = get_img_list(dic_title)
        if not img_list:
            print('empty image list, skipping this thread')
            continue
        # thread pool: 5 workers download this thread's images concurrently
        pool = Pool(5)
        pool.map(download_img, img_list)
        pool.close()
        pool.join()
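
multiprocessing.dummy.Pool works fine here, but the same thread-pool pattern is also available through the standard-library concurrent.futures module, whose context manager takes care of close() and join() automatically. A minimal sketch of the equivalent download step, reusing the download_img function and an img_list as produced by the script above:

from concurrent.futures import ThreadPoolExecutor

# Drop-in replacement for the Pool(5) block: map schedules one
# download per image, and leaving the with-block waits for all
# of them to finish.
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_img, img_list)

Since download_img catches its own exceptions, the lazy iterator that executor.map returns can safely be ignored here.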

Coroutine approach

Index and thread parsing are identical to the thread-pool version; only the download step changes, replacing the worker threads with asyncio tasks driven by aiohttp.

import requests
from lxml import etree
import time
import os
import asyncio
import aiohttp


def get_title_list(Max_page_Num):
    list_title = []
    # range's upper bound is exclusive, so this covers pages 1 .. Max_page_Num-1
    for pageNum in range(1, Max_page_Num):
        url = page_url.format(Num=pageNum)
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            if response.status_code == 200:
                print('<page', pageNum, '> index fetched')
            response.encoding = 'gbk'  # the forum serves GBK-encoded pages
            tree = etree.HTML(response.text)
            if pageNum == 1:
                # page 1 presumably carries pinned/announcement rows at the top; skip the first 9
                title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"][position()>9]')
            else:
                title_list = tree.xpath('//div[@id="main"]//table[@id="ajaxtable"]//tr[@class="tr3 t_one tac"]')

            for tr in title_list:
                # category label, with surrounding whitespace normalized away
                f_name = tr.xpath('normalize-space(./td[@class="tal"]/text())')
                # thread title
                e_name = tr.xpath('./td[@class="tal"]/h3//text()')[0]
                # combined they look like: [真心话系列]我爱你中国
                title_name = f_name + e_name
                # thread link is relative, so prepend the server root
                title_src = server + tr.xpath('./td[@class="tal"]//a/@href[1]')[0]
                # note: title_name may contain characters that are illegal in Windows paths
                title_dir = Download_dir + title_name
                dic_title = {
                    'name': title_name,
                    'dir': title_dir,
                    'src': title_src,
                    'pageNum': pageNum
                }
                list_title.append(dic_title)
            time.sleep(2)  # be polite: pause between index pages
        except Exception as re_index:
            print(re_index, 'index page request failed, URL:', url)
    # always return the list (even empty), so the caller can iterate safely
    return list_title

def get_img_list(dic_title):
    title_name = dic_title['name']
    title_dir = dic_title['dir']
    title_src = dic_title['src']
    pageNum = dic_title['pageNum']
    if not os.path.exists(title_dir):
        os.makedirs(title_dir)
    print('thread info: <page', pageNum, '>', title_name, 'URL:', title_src)
    try:
        parse_title_url = requests.get(url=title_src, headers=headers, timeout=10)
        parse_title_url.encoding = 'gbk'
        title_tree = etree.HTML(parse_title_url.text)
        img_list = title_tree.xpath('//div[@class="t t2"][1]//td[@valign="top"]//div[@class="tpc_content do_not_catch"]/img')
        list_img = []  # one metadata dict per image
        for img in img_list:
            # the real image URL is stored in the ess-data attribute (presumably for lazy loading)
            img_src = img.xpath('./@ess-data')[0]
            img_name = img_src.split('/')[-1]
            img_path = title_dir + '/' + img_name
            # bundle the image metadata into a dict
            dic_img = {
                'name': img_name,
                'src': img_src,
                'path': img_path
            }
            list_img.append(dic_img)
        if list_img:
            print(title_name, 'image list parsed')
            return list_img
    except Exception as re_title:
        print(re_title, '<', title_name, '>', 'thread request failed, URL:', title_src)

async def download_img(dic_img):
    img_name = dic_img['name']
    img_src = dic_img['src']
    img_path = dic_img['path']

    if not os.path.exists(img_path):
        try:
            print(img_name, 'download started')
            # session.get returns an async context manager; no extra await is needed
            async with aiohttp.ClientSession() as session:
                async with session.get(url=img_src, headers=headers) as response:
                    img_response = await response.read()
            with open(img_path, 'wb') as fp:
                fp.write(img_response)
            print(img_src, 'downloaded', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        except Exception as re_img:
            print(re_img, img_src, 'request failed')
    else:
        print(img_name, 'already exists, skipping', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

if __name__ == "__main__":
    # page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&page={Num}'
    page_url = 'https://cl.fs55.xyz/thread0806.php?fid=16&search=digest&page={Num}'
    server = 'https://cl.fs55.xyz/'
    Max_page_Num = 50
    Download_dir = 'K:/Download/CL_Images/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    title_list = get_title_list(Max_page_Num)
    for dic_title in title_list:
        img_list = get_img_list(dic_title)
        if not img_list:
            print('empty image list, skipping this thread')
            continue
        # coroutine mode: wrap one download coroutine per image into a task
        tasks = []
        for dic_img in img_list:
            c = download_img(dic_img)
            task = asyncio.ensure_future(c)
            tasks.append(task)

        # grab the event loop
        loop = asyncio.get_event_loop()
        # register the tasks with the loop and run until they all complete;
        # asyncio.wait expects an iterable of tasks
        loop.run_until_complete(asyncio.wait(tasks))
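
Two details of this pattern are worth flagging: opening a fresh ClientSession per image is wasteful, since a session holds a reusable connection pool, and the get_event_loop()/run_until_complete() pair has been superseded by asyncio.run() since Python 3.7. A minimal sketch of the more modern shape, with a semaphore capping concurrency; it assumes the same headers dict and dic_img dictionaries used above:

import asyncio
import aiohttp

async def download_all(img_list, limit=5):
    # one shared session for the whole batch; the semaphore caps
    # the number of simultaneous downloads at `limit`
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession(headers=headers) as session:
        async def fetch(dic_img):
            async with sem:
                async with session.get(dic_img['src']) as response:
                    data = await response.read()
                with open(dic_img['path'], 'wb') as fp:
                    fp.write(data)
        await asyncio.gather(*(fetch(d) for d in img_list))

# asyncio.run creates and tears down the event loop itself:
# asyncio.run(download_all(img_list))

Unlike the fire-and-forget version above, asyncio.gather propagates the first exception by default, so a failed download surfaces instead of being silently dropped.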

posted @ 2020-08-16 20:11 消磨_时间