#对于io操作来说,多线程和多进程性能差别不大 #1.通过Thread类实例化 import time import threading class GetDetailHtml(threading.Thread): def __init__(self,name): super().__init__(name=name) def run(self): print('get detail html started') time.sleep(2) print("get detail html end") class GetDetailUrl(threading.Thread): def __init__(self,name): super().__init__(name=name) def run(self): print("get detail url started") time.sleep(4) print("get detail url end") if __name__ == '__main__': thread1 = GetDetailHtml('get_detail_html') thread2 = GetDetailUrl('get_detail_url') start_time = time.time() thread1.start() thread2.start() # thread1.join() # thread2.join() #当主线程退出的时候, 子线程kill掉 print ("last time: {}".format(time.time()-start_time))
get detail html started get detail url started last time: 0.0010001659393310547 get detail html end get detail url end 进程已结束,退出代码0
开启 join 后,主线程会阻塞,等待被 join 的子线程执行完毕后才继续往下执行(因此下面的耗时约为 4 秒,而不是接近 0):
get detail html started get detail url started get detail html end get detail url end last time: 4.0012288093566895
上述代码创建了2个“前台”线程,然后控制权就交给了CPU,CPU根据指定算法进行调度,分片执行指令。
更多方法:
- start 线程准备就绪,等待CPU调度
- setName 为线程设置名称
- getName 获取线程名称
- setDaemon 设置为后台线程或前台线程(默认)
如果是后台线程,主线程执行过程中,后台线程也在进行,主线程执行完毕后,后台线程不论成功与否,均停止
如果是前台线程,主线程执行过程中,前台线程也在进行,主线程执行完毕后,等待前台线程也执行完成后,程序停止
- join 阻塞调用线程,逐个等待每个线程执行完毕后再继续往下执行;若对每个线程 start 后立即 join,多线程会退化为串行执行,失去并发意义
- run 线程被cpu调度后自动执行线程对象的run方法
当设置如下:
# thread1.setDaemon(True)
thread2.setDaemon(True)
执行结果:
thread2线程此时会随着其他线程结束而终止执行(其它线程不会等待此线程执行完毕)
get detail html started get detail url started last time: 0.002000093460083008 get detail html end 进程已结束,退出代码0
数据共享:
1:全局变量(不推荐,除非对锁很了解)
# Inter-thread communication via a shared variable (module-level list)
# guarded by a lock.
import time
import threading
from threading import RLock  # bug fix: RLock was used below but never imported

from chapter11 import variables

# Contents of the variables module:
# detail_url_list = []

# Goals:
# 1. Once the producer has queued 10 urls it waits, so detail_url_list
#    never holds more than ten entries.
# 2. Consumers pause while detail_url_list is empty.


def get_detail_html(lock):
    """Consumer: pop a url from the shared list and 'crawl' its detail page.

    :param lock: the shared lock guarding variables.detail_url_list
    """
    detail_url_list = variables.detail_url_list
    while True:
        if len(variables.detail_url_list):
            lock.acquire()
            # Re-check under the lock: another consumer may have drained
            # the list between the unlocked check and acquire().
            if len(detail_url_list):
                url = detail_url_list.pop()
                lock.release()
                print("get detail html started")
                time.sleep(2)  # simulate fetching the page
                print("get detail html end")
            else:
                lock.release()
                time.sleep(1)


def get_detail_url(lock):
    """Producer: push urls onto the shared list, keeping at most 10 pending.

    :param lock: the shared lock guarding variables.detail_url_list
    """
    detail_url_list = variables.detail_url_list
    while True:
        print("get detail url started")
        time.sleep(4)  # simulate fetching the list page
        for i in range(20):
            lock.acquire()
            if len(detail_url_list) >= 10:
                # Backlog full: release the lock and let consumers catch up.
                lock.release()
                time.sleep(1)
            else:
                detail_url_list.append("http://projectsedu.com/{id}".format(id=i))
                lock.release()
        print("get detail url end")


# 1. Thread communication pattern: shared variable + lock.
if __name__ == "__main__":
    lock = RLock()
    thread_detail_url = threading.Thread(target=get_detail_url, args=(lock,))
    # bug fix: the producer thread was never started in the original, so the
    # consumers would spin forever on an empty list.
    thread_detail_url.start()
    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html, args=(lock,))
        html_thread.start()
    start_time = time.time()
    # The workers loop forever; the elapsed time printed here is near zero.
    print("last time: {}".format(time.time() - start_time))
2,Queue
# Inter-thread synchronization via the thread-safe queue.Queue.
from queue import Queue
import time
import threading


def get_detail_html(queue):
    """Consumer: block on the queue for a url, then 'crawl' it."""
    while True:
        time.sleep(2)
        print(queue.get())


def get_detail_url(queue):
    """Producer: periodically push a batch of 18 urls onto the queue."""
    while True:
        time.sleep(4)
        for i in range(18):
            queue.put("http://projectsedu.com/{id}".format(id=i))


# 1. Thread communication pattern: shared Queue (locking handled internally).
if __name__ == "__main__":
    detail_url_queue = Queue(maxsize=20)
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
    consumers = [
        threading.Thread(target=get_detail_html, args=(detail_url_queue,))
        for _ in range(3)
    ]
    for worker in consumers:
        worker.start()
    start_time = time.time()
    # detail_url_queue.task_done()
    # detail_url_queue.join()
    thread_detail_url.start()
    print("last time: {}".format(time.time() - start_time))
线程锁(Lock、RLock)
由于线程之间是进行随机调度,并且每个线程可能只执行n条指令之后就被切换到其他线程,当多个线程同时修改同一条数据时可能会出现脏数据,所以,出现了线程锁 - 同一时刻仅允许一个线程执行操作。
lock:
from threading import Lock, RLock, Condition #可重入的锁 #在同一个线程里面,可以连续调用多次acquire, 一定要注意acquire的次数要和release的次数相等 total = 0 lock = RLock() def add(): #1. dosomething1 #2. io操作 # 1. dosomething3 global lock global total for i in range(1000000): lock.acquire() lock.acquire() total += 1 lock.release() lock.release() def desc(): global total global lock for i in range(1000000): lock.acquire() total -= 1 lock.release() import threading thread1 = threading.Thread(target=add) thread2 = threading.Thread(target=desc) thread1.start() thread2.start() # thread1.join() thread2.join() print(total) #1. 用锁会影响性能 #2. 锁会引起死锁 #死锁的情况 A(a,b) """ A(a、b) acquire (a) acquire (b) B(a、b) acquire (a) acquire (b) """
条件(Condition)
使得线程等待,只有满足某条件时,才释放n个线程
# Condition: park threads until the main thread releases n of them.
from threading import Lock, RLock, Condition  # reentrant lock
import threading


def run(n):
    """Wait on the shared condition, then announce this thread's number."""
    # Equivalent explicit form:
    # con.acquire(); con.wait(); print("run the thread: %s" % n); con.release()
    with con:
        con.wait()  # sleeps until the main thread calls notify()
        print("run the thread: %s" % n)


if __name__ == '__main__':
    con = threading.Condition()
    workers = [threading.Thread(target=run, args=(idx,)) for idx in range(10)]
    for worker in workers:
        worker.start()
    while True:
        inp = input('>>>')
        if inp == 'q':
            break
        # Explicit form: con.acquire(); con.notify(int(inp)); con.release()
        with con:
            # Wake up int(inp) of the waiting threads.
            con.notify(int(inp))
信号量(Semaphore)
互斥锁 同时只允许一个线程更改数据,而Semaphore是同时允许一定数量的线程更改数据 ,比如厕所有3个坑,那最多只允许3个人上厕所,后面的人只能等里面有人出来了才能再进去。
import threading,time


def run(n):
    """Do one unit of 'work' while holding a semaphore slot."""
    semaphore.acquire()
    time.sleep(1)  # simulate work
    print("run the thread: %s" % n)
    semaphore.release()


if __name__ == '__main__':
    num = 0  # kept from the original; unused
    # At most 5 threads may hold the semaphore at once, so at most 5 of the
    # 20 workers run their critical section concurrently.
    semaphore = threading.BoundedSemaphore(5)
    workers = [threading.Thread(target=run, args=(idx,)) for idx in range(20)]
    for worker in workers:
        worker.start()
# Semaphore: a lock that admits a bounded number of holders.
# Typical use: one writer, several concurrent readers — here it caps the
# number of crawler threads alive at once.
import threading
import time


class HtmlSpider(threading.Thread):
    """Thread that 'downloads' one url, then frees its semaphore slot."""

    def __init__(self, url, sem):
        super().__init__()
        self.url = url
        self.sem = sem

    def run(self):
        time.sleep(2)  # simulate the download
        print("got html text success")
        # The slot acquired by the producer is released by the spider when
        # it finishes, letting the producer launch the next one.
        self.sem.release()


class UrlProducer(threading.Thread):
    """Thread that spawns spiders, at most sem's initial count at a time."""

    def __init__(self, sem):
        super().__init__()
        self.sem = sem

    def run(self):
        for idx in range(20):
            self.sem.acquire()  # blocks once 3 spiders are in flight
            spider = HtmlSpider("https://baidu.com/{}".format(idx), self.sem)
            spider.start()


if __name__ == "__main__":
    sem = threading.Semaphore(3)
    url_producer = UrlProducer(sem)
    url_producer.start()
ThreadPoolExecutor(线程池)
from concurrent.futures import ThreadPoolExecutor,as_completed,wait
from concurrent.futures import Future
import time

# Future: the container that will eventually hold a task's return value.
# Why use a thread pool:
# - the main thread can query the state/result of any submitted task
# - the main thread learns as soon as a worker finishes
# - the futures API gives threads and processes an identical interface

def get_html(times):
    # Simulate fetching a page that takes `times` seconds; return `times`.
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
# submit() hands the callable to the pool and returns a Future immediately.
task1 = executor.submit(get_html, (3))
task2 = executor.submit(get_html, (2))

# done() reports (without blocking) whether a task has finished.
print(task1.done())
print(task2.done())
# cancel() only succeeds while the task has not started running yet.
#task2.cancel()
time.sleep(4)
print(task1.done())
print(task2.done())
# result() blocks until the task finishes, then returns its value.
print(task1.result())
print(task2.result())
False
False
get page 2 success
get page 3 success
True
True
3
2进程已结束,退出代码0
as_completed wait
from concurrent.futures import ThreadPoolExecutor,as_completed,wait
from concurrent.futures import Future
import time

# Future: the container that will eventually hold a task's return value.
# Why use a thread pool:
# - the main thread can query the state/result of any submitted task
# - the main thread learns as soon as a worker finishes
# - the futures API gives threads and processes an identical interface

def get_html(times):
    # Simulate fetching a page that takes `times` seconds; return `times`.
    time.sleep(times)
    print("get page {} success".format(times))
    return times

executor = ThreadPoolExecutor(max_workers=2)
urls = [3,2,4]
all_task = [];
for url in urls:
    task = executor.submit(get_html,(url))
    all_task.append(task)

# wait() blocks here until every task in all_task has completed, which is
# why "main" only prints after all three "get page ..." lines.
"""
此处加上wait方法,代码不会往下执行,会一直等待上面task都执行完成后
"""
wait(all_task)
print("main")
# as_completed yields each future as it finishes (completion order, not
# submission order — see the output below: 4, 3, 2).
# NOTE(review): "futrue" is a typo for "future"; kept as-is here.
for futrue in as_completed(all_task):
    data = futrue.result()
    print("get {} page".format(data))
get page 2 success get page 3 success get page 4 success main get 4 page get 3 page get 2 page 进程已结束,退出代码0
本文来自博客园,作者:孙龙-程序员,转载请注明原文链接:https://www.cnblogs.com/sunlong88/articles/9478438.html