# -*- coding: utf-8 -*-
#2019/12/16 10:35
import os
import time
import requests
import traceback
from bs4 import BeautifulSoup
from multiprocessing import Process
import threading
import gevent
from conf import settings as ss
from core.log import Log as log

def save_img(file_name,content):
    '''Save the image bytes to a local file.'''
    with open(file_name, 'wb') as f:
        f.write(content)

def spider(num):
    '''Crawl one list page and download every image on it.'''
    rep_url = r"https://www.autohome.com.cn/all/{}/#liststart".format(num)
    print(rep_url)
    # 1. Send the request, the way a browser would
    response = requests.get(url=rep_url)
    # response.encoding = 'gbk'  # uncomment if the page decodes wrongly
    # 2. Read the response body
    text = response.text
    # 3. Parse the HTML with bs4
    soup = BeautifulSoup(text, 'html.parser')  # html.parser: the parser that turns the text into a tree
    # Narrow the search: this div is the box that wraps all the images
    div_obj = soup.find(name='div', attrs={"class": "article-wrapper"})
    if div_obj is None:  # layout changed or the request was blocked
        return
    # 4. Locate the images: find all li tags inside the box
    li_list = div_obj.find_all(name="li")
    for li in li_list:
        # 5. Extract the image URL
        img = li.find(name='img')
        if img is None:  # not every li contains an image
            continue
        try:
            src = img.get("src")
            img_name = src.rsplit('/', 1)[-1]
            file_name = os.path.join(ss.IMG_PATH, img_name)
            # The scraped links are protocol-relative, so prepend the scheme
            if not src.startswith('http'):
                src = 'http:' + src
            # 6. Request the image itself
            res = requests.get(url=src)
            # 7. Save the image locally
            save_img(file_name, res.content)
        except Exception:
            log.error(traceback.format_exc())


def thread_func(n):
    '''Spawn 50 coroutines, one page per coroutine.'''
    gevent_list = []
    t = 50  # 50 coroutines per thread
    m = t * n        # last page number of this batch
    s = 1 + (m - t)  # first page number of this batch
    for i in range(s, m + 1):
        gevent_list.append(gevent.spawn(spider, i))  # collect the spawned greenlets
    # Run the coroutines; gevent switches tasks when one blocks on IO.
    # Note: requests only yields to gevent if monkey patching is applied
    # (see the sketch after this script).
    gevent.joinall(gevent_list)

def process_func(n):
    '''Spawn 20 threads, each running one batch of coroutines.'''
    t = 20  # 20 threads per process
    m = t * n        # last batch number
    s = 1 + (m - t)  # first batch number
    for num in range(s, m + 1):
        # Use a fresh name here: reassigning m would shadow the range bound above.
        worker = threading.Thread(target=thread_func, args=(num,))
        worker.start()

if __name__ == '__main__':
    start = time.time()
    processes = []
    for i in range(1, 6):  # start 5 processes
        p = Process(target=process_func, args=(i,))
        p.start()
        processes.append(p)
    for p in processes:  # join every process, not just the last, so the timing is accurate
        p.join()
    print(time.time() - start)
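
A caveat on the script above: a plain requests call blocks its whole thread, so the gevent coroutines only overlap if gevent's monkey patching replaces the blocking socket calls before requests is imported. A minimal sketch of the patching step (the fetch helper and the page range here are illustrative, not part of the original script); note that patch_all() also patches threading, so combining it with the thread and process layers above needs care:

# -*- coding: utf-8 -*-
# Apply the patch first, before requests is imported, so its socket IO
# becomes cooperative and yields to other greenlets while waiting.
from gevent import monkey
monkey.patch_all()

import requests
import gevent

def fetch(num):
    # This call now yields to other greenlets during the network wait.
    requests.get("https://www.autohome.com.cn/all/{}/#liststart".format(num))

gevent.joinall([gevent.spawn(fetch, i) for i in range(1, 6)])
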
# -*- coding: utf-8 -*-
#2019/12/19 16:41

import os
import threading
import gevent
from multiprocessing import Process
# Combining processes, threads, and coroutines

def gevent_func(i):
    print('@@ coroutine @@', i)

def thread_func():
    print('--> thread -->', os.getpid(), threading.current_thread().ident)
    gevent_list = []
    for i in range(3):
        gevent_list.append(gevent.spawn(gevent_func, i))  # spawn 3 coroutines and collect them
    gevent.joinall(gevent_list)  # run the coroutines; gevent switches tasks on IO waits


def process_func():
    print('!! child process !!', os.getpid(), threading.current_thread().ident)
    for i in range(3):  # start 3 threads
        threading.Thread(target=thread_func).start()

if __name__ == '__main__':
    print('--> main process', os.getpid(), threading.current_thread().ident)
    for i in range(3):  # start 3 processes
        Process(target=process_func).start()
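
The demo starts its children and returns immediately; non-daemon processes are still waited on at interpreter exit, but an explicit join makes the intent clearer and guarantees all the prints appear before the main process finishes. A variant of the main block (a sketch, reusing the process_func above):

if __name__ == '__main__':
    print('--> main process', os.getpid(), threading.current_thread().ident)
    procs = [Process(target=process_func) for _ in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()  # wait for every child before the main process exits
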

The same crawler using only a thread pool:

# -*- coding: utf-8 -*-
#2019/12/16 10:35
import os
import time
import requests
import traceback
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor  # thread pool
from conf import settings as ss
from core.log import Log as log

def save_img(file_name,content):
    '''Save the image bytes to a local file.'''
    with open(file_name, 'wb') as f:
        f.write(content)

def spider(num):
    '''Crawl one list page and download every image on it.'''
    rep_url = r"https://www.autohome.com.cn/all/{}/#liststart".format(num)
    print(rep_url)
    # 1. Send the request, the way a browser would
    response = requests.get(url=rep_url)
    # response.encoding = 'gbk'  # uncomment if the page decodes wrongly
    # 2. Read the response body
    text = response.text
    # 3. Parse the HTML with bs4
    soup = BeautifulSoup(text, 'html.parser')  # html.parser: the parser that turns the text into a tree
    # Narrow the search: this div is the box that wraps all the images
    div_obj = soup.find(name='div', attrs={"class": "article-wrapper"})
    if div_obj is None:  # layout changed or the request was blocked
        return
    # 4. Locate the images: find all li tags inside the box
    li_list = div_obj.find_all(name="li")
    for li in li_list:
        # 5. Extract the image URL
        img = li.find(name='img')
        if img is None:  # not every li contains an image
            continue
        try:
            src = img.get("src")
            img_name = src.rsplit('/', 1)[-1]
            file_name = os.path.join(ss.IMG_PATH, img_name)
            # The scraped links are protocol-relative, so prepend the scheme
            if not src.startswith('http'):
                src = 'http:' + src
            # 6. Request the image itself
            res = requests.get(url=src)
            # 7. Save the image locally
            save_img(file_name, res.content)
        except Exception:
            log.error(traceback.format_exc())


def thread_pool():
    '''Crawl pages 1-100 with a pool of 10 worker threads.'''
    t = ThreadPoolExecutor(max_workers=10)
    for num in range(1, 101):
        t.submit(spider, num)
    t.shutdown()  # blocks until every submitted task has finished


if __name__ == '__main__':
    start = time.time()
    thread_pool()
    print(time.time() - start)
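
One thing to watch with submit(): any exception spider raises is captured inside the returned Future and silently discarded unless you fetch the result. A variant of thread_pool (a sketch using the standard concurrent.futures API; the per-page error message is illustrative) that surfaces failures:

from concurrent.futures import ThreadPoolExecutor, as_completed

def thread_pool():
    # The context manager shuts the pool down and waits automatically.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(spider, num): num for num in range(1, 101)}
        for future in as_completed(futures):
            try:
                future.result()  # re-raises anything spider raised in the worker
            except Exception:
                log.error('page {} failed'.format(futures[future]))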