Python Multithreaded Web Crawler
Preface
Once multithreading was added, the crawler felt like it was running with the wind at its back.
How It Ran
It crawled more than 90,000 text records in a fairly short time; several thousand records come in within moments.
Key Point
When multiple threads modify the same global variable, the modification must be protected by a lock. The excerpt below is taken from the crawler's nextURL method; a standalone sketch follows it.
# acquire the lock to synchronize threads
threadLock.acquire()
global htmlId
htmlId = htmlId + 1
self.htmlId = htmlId
# release the lock
threadLock.release()
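
As a minimal standalone sketch of why the lock matters (the counter name htmlId and the thread count here are illustrative, not from the crawler): four threads bump a shared counter, and since the read-increment-write sequence is not atomic in CPython, some updates would be lost without the lock.

import threading

htmlId = 0
threadLock = threading.Lock()

def bump(times):
    global htmlId
    for _ in range(times):
        # guard the read-modify-write so no increment is lost
        threadLock.acquire()
        htmlId = htmlId + 1
        threadLock.release()

threads = [threading.Thread(target=bump, args=(100000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(htmlId)  # always 400000 with the lock; typically less without it

The same acquire/increment/release pattern (or the equivalent "with threadLock:" block) is exactly what nextURL in the full code relies on.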
Room for Improvement
If a fetch fails for some reason, this crawler does not record the cause of the failure, which makes it inconvenient to re-crawl the pages that slipped through. Also, because of the target site's anti-crawling measures, the server stopped responding after roughly every 5,000 requests; a wait should be inserted at that point. A sketch of both fixes follows.
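
This sketch is not part of the original code: failed ids are appended to a file for a second pass, and each retry backs off a little longer (the names fetch_with_retry and failed_ids.txt are assumptions for illustration).

import logging
import time

import requests

logging.basicConfig(filename='spider.log', filemode='a', level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s: %(message)s')

def fetch_with_retry(url, html_id, retries=3, pause=5):
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=10).text
        except requests.RequestException:
            time.sleep(pause * (attempt + 1))  # wait longer after each failure
    # record the cause and the id so the missed page can be re-crawled later
    logging.warning('id=%d still failing after %d retries', html_id, retries)
    with open('failed_ids.txt', 'a') as f:
        f.write(str(html_id) + '\n')
    return None

Dropping something like this into getHtml, and pausing all workers once responses start timing out, would cover both gaps noted above.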
Code
# coding:UTF-8
# https://******.com/article/91510.html
from bs4 import BeautifulSoup
import requests
import threading
import time

htmlId = 70570

class Spider(threading.Thread):
    def __init__(self, threadID, name):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        '''
        logging.basicConfig(
            level=logging.DEBUG,  # log level printed to the console
            filename='new.log',
            filemode='a',  # 'w' rewrites the file on every run, overwriting old logs;
                           # 'a' appends, and is the default when omitted
            format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
            # log format
        )
        '''
        self.targetURL = 'https://******.com/article/'
        self.header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        # read the shared counter under the lock so each thread starts from a consistent value
        threadLock.acquire()
        global htmlId
        self.htmlId = htmlId
        threadLock.release()

    def nextURL(self):
        # acquire the lock to synchronize threads
        threadLock.acquire()
        global htmlId
        htmlId = htmlId + 1
        self.htmlId = htmlId
        # release the lock
        threadLock.release()
        url = self.targetURL + str(self.htmlId) + '.html'
        return url

    def getHtml(self, url):
        try:
            html = requests.get(url, headers=self.header, timeout=10).text
            return html
        except requests.RequestException:
            # logging.warning('warning: id= ' + str(self.htmlId) + ' can\'t connect!')
            return url  # parserHtml will fail on this and fall back to the id

    def parserHtml(self, html):
        try:
            bf = BeautifulSoup(html, 'html.parser')
            title = bf.find_all('title')[0].text
            content = bf.find_all('div', class_='img-center')[0].text
            return title, content
        except (IndexError, AttributeError):
            # logging.warning('warning: id= ' + str(self.htmlId) + ' can\'t parse!')
            return str(self.htmlId), str(self.htmlId)

    def saveContent(self, title, content):
        try:
            with open('./fiction/' + title + '.html', 'a') as f:
                content = title + '\r\n' + content
                f.write(content)
            # logging.info('success: id= ' + str(self.htmlId) + ' ' + title)
        except OSError:
            # logging.warning('warning: id= ' + str(self.htmlId) + ' failed to save content!')
            self.relax()

    def finishCondition(self):
        return self.htmlId >= 91510

    def relax(self):
        time.sleep(5)

    def run(self):
        print("Starting thread: " + self.name)
        while not self.finishCondition():
            url = self.nextURL()
            print(url)
            html = self.getHtml(url)
            title, content = self.parserHtml(html)
            self.saveContent(title, content)

if __name__ == "__main__":
    threadLock = threading.Lock()
    threads = []
    num = 51
    # create and start the worker threads
    # (a plain loop replaces the original exec()-built thread1 ... thread51 variables)
    for i in range(1, num + 1):
        t = Spider(i, "Thread-" + str(i))
        t.start()
        threads.append(t)
    # wait for all threads to finish
    for t in threads:
        t.join()
    print("Main thread exiting")
    print(htmlId)

'''
# one-page smoke test
target = 'https://******.com/article/91510.html'
req = requests.get(url=target)
html = req.text
bf = BeautifulSoup(html, 'html.parser')
texts = bf.find_all('div', class_='img-center')
print(texts[0].text)
'''
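
As a closing design note (an alternative, not what this post implements): the standard library's concurrent.futures can replace the hand-rolled Thread subclass, and because each task receives its own id there is no shared htmlId counter to lock. A sketch, with fetch_page standing in for the get/parse/save work:

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch_page(html_id):
    # one page per task; no shared state, so no lock is needed
    url = 'https://******.com/article/' + str(html_id) + '.html'
    try:
        html = requests.get(url, timeout=10).text
        bf = BeautifulSoup(html, 'html.parser')
        return html_id, bf.find_all('title')[0].text
    except (requests.RequestException, IndexError):
        return html_id, None

if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=51) as pool:
        # map distributes the ids across 51 worker threads
        for html_id, title in pool.map(fetch_page, range(70571, 91511)):
            print(html_id, title)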