Python多线程爬虫爬取网页图片

临近期末考试，但是根本不想复习！啊啊啊啊啊啊啊！！！！

于是做了一个爬虫，网址为 https://yande.re，网页图片为动漫美图（图片带点颜色……宅男福利）。

github项目地址为：https://github.com/MyBules/yande_pider

多线程代码分为两个版本：一个是基于多页面多线程，一个是基于单页面多线程

以下是第一种代码：

'''
基于多页面多线程
'''
import os  # 引入文件模块
import re  # 正则表达式
import urllib.request
import threading

def open_url(url):
    """Request ``url`` and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header.

    :param url: address to request.
    :return: the response body as ``bytes`` on success; the integer ``404``
        when the request fails for any reason. The value is a generic
        failure sentinel that callers compare against, not necessarily an
        HTTP 404 status.
    """
    try:
        req = urllib.request.Request(url)
        req.add_header("User-Agent",
                       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
        # The context manager closes the connection as soon as the body has
        # been read instead of leaving it to the garbage collector.
        with urllib.request.urlopen(req) as response:
            return response.read()
    except Exception:
        # Deliberately broad (malformed URL, HTTP error, network failure...),
        # but unlike a bare ``except`` it lets KeyboardInterrupt and
        # SystemExit propagate so the crawler can still be stopped.
        print(url + " 404")
        return 404

def mkdir(path):
    """Create the directory ``path`` unless something already exists there.

    Surrounding whitespace and any trailing backslashes are removed from
    ``path`` before it is used. Missing parent directories are created too.

    :param path: directory path to create.
    :return: ``True`` when the directory was created, ``False`` when the
        path already existed.
    """
    target = path.strip().rstrip("\\")

    # Guard clause: report and bail out when the path is already present.
    if os.path.exists(target):
        print(target + ' 目录已存在')
        return False

    os.makedirs(target)
    print(target + ' 创建成功')
    return True


def Yande1(i, save_root="E:\\Python\\爬虫\\yande\\img"):
    """Download every ``.jpg`` "directlink largeimg" target on listing page ``i``.

    Images are written to ``<save_root>\\page<i>\\<n>.jpg`` where ``n`` is the
    1-based position of the link on the page. A link whose download fails is
    skipped but still consumes its number.

    :param i: page number appended to ``https://yande.re/post?page=``.
    :param save_root: directory under which the ``page<i>`` folder is
        created; defaults to the location the script originally hard-coded.
    :return: ``None``.
    """
    url = 'https://yande.re/post?page=' + str(i)
    folder = save_root + "\\page" + str(i)
    mkdir(folder)

    html = open_url(url)
    if html == 404:
        # open_url returns the int 404 on failure; there is no page source
        # to decode, so skip this page instead of crashing on .decode().
        return
    # NOTE(review): the page is decoded as GBK with undecodable bytes
    # dropped; confirm the site's real encoding (UTF-8 seems more likely).
    html = html.decode('gbk', 'ignore')
    img_adds = re.findall(r'<a class="directlink largeimg" href="([^"]+\.jpg)"', html)
    for index, img_url in enumerate(img_adds, start=1):
        filename = folder + "\\" + str(index) + '.jpg'
        img_html = open_url(img_url)
        if img_html == 404:
            continue
        with open(filename, 'wb') as f:
            f.write(img_html)
        print(img_url + ' 下载完成......')

# Stop flag that get_img checks before processing each page; nothing in the
# visible code ever sets it to a truthy value.
exitflag = 0

class myThread(threading.Thread):
    """Thread that hands its list of pages to ``get_img`` when run.

    :param threadID: identifier stored on the instance as ``threadID``.
    :param name: thread name, also used in the start/exit messages.
    :param list: the pages this thread is responsible for.
    """

    def __init__(self, threadID, name, list):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.list = list

    def run(self):
        """Print a start message, process the pages, print an exit message."""
        print("开始线程：%s" % self.name)
        get_img(self.name, self.list)
        print("退出线程：%s" % self.name)


def get_img(threadname, list):
    """Download each page in ``list`` in order by calling ``Yande1``.

    The module-level ``exitflag`` is checked before every page; once it is
    truthy the remaining pages are skipped.

    :param threadname: name of the calling thread (a string); kept for
        interface compatibility and not otherwise used.
    :param list: iterable of page numbers to download.
    :return: ``None``.
    """
    for page in list:
        if exitflag:
            # threadname is a plain string, so the former
            # ``threadname.exit()`` raised AttributeError; returning ends
            # this worker's loop cleanly.
            return
        Yande1(page)

if __name__ == '__main__':
    # Ask for the inclusive range of listing pages to download.
    pages1 = int(input('请输入你要下载的起始页面数：'))
    pages2 = int(input('请输入你要下载的末尾页面数：'))
    mkdir('img')

    # Deal the pages out to three workers by remainder modulo 3:
    # remainder 1 -> list1, remainder 2 -> list2, remainder 0 -> list3.
    page_range = range(pages1, pages2 + 1)
    list1 = [page for page in page_range if page % 3 == 1]
    list2 = [page for page in page_range if page % 3 == 2]
    list3 = [page for page in page_range if page % 3 == 0]

    threadLock = threading.Lock()

    # Build all three workers first, then start them in order.
    threads = [
        myThread(number, "thread-%d" % number, pages)
        for number, pages in enumerate((list1, list2, list3), start=1)
    ]
    for worker in threads:
        worker.start()

    # Wait for every worker before announcing that the main thread is done.
    for worker in threads:
        worker.join()
    print("退出主线程")

经测试，两种方法速度相差不大。

第二种方法放在github项目地址里了，如果各位游客是为了学习的话，第二种方法的代码还是去看一下较好。

posted @ 2019-01-14 09:41 我星伤悲 阅读(3087) 评论(0) 收藏 举报

刷新页面返回顶部

StarMan

无人知晓

Python多线程爬虫爬取网页图片

公告