Python Multithreaded Web Crawler
Preface
Once multithreading was added, the crawler felt like it was running with the wind at its back.
How It Ran
It crawled more than 90,000 text records in a fairly short time; several thousand records come in within moments.
Key Point
When multiple threads modify the same global variable, the modification must be protected by a lock. The excerpt below is taken from the crawler's nextURL method; a standalone sketch follows it.
# acquire the lock to synchronize threads
threadLock.acquire()
global htmlId
htmlId = htmlId + 1
self.htmlId = htmlId
# release the lock
threadLock.release()
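
As a minimal standalone sketch of why the lock matters (the counter name htmlId and the thread count here are illustrative, not from the crawler): four threads bump a shared counter, and since the read-increment-write sequence is not atomic in CPython, some updates would be lost without the lock.

import threading

htmlId = 0
threadLock = threading.Lock()

def bump(times):
    global htmlId
    for _ in range(times):
        # guard the read-modify-write so no increment is lost
        threadLock.acquire()
        htmlId = htmlId + 1
        threadLock.release()

threads = [threading.Thread(target=bump, args=(100000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(htmlId)  # always 400000 with the lock; typically less without it

The same acquire/increment/release pattern (or the equivalent "with threadLock:" block) is exactly what nextURL in the full code relies on.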
Room for Improvement
If a fetch fails for some reason, this crawler does not record the cause of the failure, which makes it inconvenient to re-crawl the pages that slipped through. Also, because of the target site's anti-crawling measures, the server stopped responding after roughly every 5,000 requests; a wait should be inserted at that point. A sketch of both fixes follows.
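
This sketch is not part of the original code: failed ids are appended to a file for a second pass, and each retry backs off a little longer (the names fetch_with_retry and failed_ids.txt are assumptions for illustration).

import logging
import time

import requests

logging.basicConfig(filename='spider.log', filemode='a', level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s: %(message)s')

def fetch_with_retry(url, html_id, retries=3, pause=5):
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=10).text
        except requests.RequestException:
            time.sleep(pause * (attempt + 1))  # wait longer after each failure
    # record the cause and the id so the missed page can be re-crawled later
    logging.warning('id=%d still failing after %d retries', html_id, retries)
    with open('failed_ids.txt', 'a') as f:
        f.write(str(html_id) + '\n')
    return None

Dropping something like this into getHtml, and pausing all workers once responses start timing out, would cover both gaps noted above.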
Code
# coding:UTF-8
# https://******.com/article/91510.html
from bs4 import BeautifulSoup
import requests
import threading
import time

htmlId = 70570

class Spider(threading.Thread):
    def __init__(self, threadID, name):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        '''
        logging.basicConfig(
            level=logging.DEBUG,  # log level printed to the console
            filename='new.log',
            filemode='a',  # 'w' rewrites the file on every run, overwriting old logs;
                           # 'a' appends, and is the default when omitted
            format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
            # log format
        )
        '''
        self.targetURL = 'https://******.com/article/'
        self.header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        # read the shared counter under the lock so each thread starts from a consistent value
        threadLock.acquire()
        global htmlId
        self.htmlId = htmlId
        threadLock.release()

    def nextURL(self):
        # acquire the lock to synchronize threads
        threadLock.acquire()
        global htmlId
        htmlId = htmlId + 1
        self.htmlId = htmlId
        # release the lock
        threadLock.release()
        url = self.targetURL + str(self.htmlId) + '.html'
        return url

    def getHtml(self, url):
        try:
            html = requests.get(url, headers=self.header, timeout=10).text
            return html
        except requests.RequestException:
            # logging.warning('warning: id= ' + str(self.htmlId) + ' can\'t connect!')
            return url  # parserHtml will fail on this and fall back to the id

    def parserHtml(self, html):
        try:
            bf = BeautifulSoup(html, 'html.parser')
            title = bf.find_all('title')[0].text
            content = bf.find_all('div', class_='img-center')[0].text
            return title, content
        except (IndexError, AttributeError):
            # logging.warning('warning: id= ' + str(self.htmlId) + ' can\'t parse!')
            return str(self.htmlId), str(self.htmlId)

    def saveContent(self, title, content):
        try:
            with open('./fiction/' + title + '.html', 'a') as f:
                content = title + '\r\n' + content
                f.write(content)
            # logging.info('success: id= ' + str(self.htmlId) + ' ' + title)
        except OSError:
            # logging.warning('warning: id= ' + str(self.htmlId) + ' failed to save content!')
            self.relax()

    def finishCondition(self):
        return self.htmlId >= 91510

    def relax(self):
        time.sleep(5)

    def run(self):
        print("Starting thread: " + self.name)
        while not self.finishCondition():
            url = self.nextURL()
            print(url)
            html = self.getHtml(url)
            title, content = self.parserHtml(html)
            self.saveContent(title, content)

if __name__ == "__main__":
    threadLock = threading.Lock()
    threads = []
    num = 51
    # create and start the worker threads
    # (a plain loop replaces the original exec()-built thread1 ... thread51 variables)
    for i in range(1, num + 1):
        t = Spider(i, "Thread-" + str(i))
        t.start()
        threads.append(t)
    # wait for all threads to finish
    for t in threads:
        t.join()
    print("Main thread exiting")
    print(htmlId)

'''
# one-page smoke test
target = 'https://******.com/article/91510.html'
req = requests.get(url=target)
html = req.text
bf = BeautifulSoup(html, 'html.parser')
texts = bf.find_all('div', class_='img-center')
print(texts[0].text)
'''
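
As a closing design note (an alternative, not what this post implements): the standard library's concurrent.futures can replace the hand-rolled Thread subclass, and because each task receives its own id there is no shared htmlId counter to lock. A sketch, with fetch_page standing in for the get/parse/save work:

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch_page(html_id):
    # one page per task; no shared state, so no lock is needed
    url = 'https://******.com/article/' + str(html_id) + '.html'
    try:
        html = requests.get(url, timeout=10).text
        bf = BeautifulSoup(html, 'html.parser')
        return html_id, bf.find_all('title')[0].text
    except (requests.RequestException, IndexError):
        return html_id, None

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=51) as pool:
        # map distributes the ids across 51 worker threads
        for html_id, title in pool.map(fetch_page, range(70571, 91511)):
            print(html_id, title)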