Crawling WooYun Drops articles with Python

I sometimes browse Drops, but unfortunately it has no index of all its articles. That led to this idea: crawl every article title and link into one page, so interesting articles are easy to scan and find.

#coding=utf-8
import re
import urllib2
 
class dropsSpider:
    def __init__(self):
        # Category slugs on drops.wooyun.org; the URL-encoded entry is "运维安全" (ops security)
        self.list = ["papers","tips","tools","news","%E8%BF%90%E7%BB%B4%E5%AE%89%E5%85%A8","web","pentesting","wireless","database","binary"]
        # Pulls the last page number out of the pagination span
        self.re_getpage = re.compile(r"<span\sclass='pages'>.*?1.*? (\d+).*?</span>")
        # Captures (permalink, title) pairs from each article entry
        self.re_gettitleandlinks = re.compile(r"<a href=\"(.*?)\" rel=\"bookmark\" title=\"Permanent Link to (.*?)\">")
        self.url = "http://drops.wooyun.org/category/"
        self.filename = "text.html"
     
    def getPages(self,category):
        # Fetch a category's index page and return its page count (as a string)
        self.category = category
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        url = self.url + self.category
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=5)
        res = response.read()
        pages = re.findall(self.re_getpage, res)
        if pages:
            return pages[0]
        else:
            # No pagination span found: the category fits on a single page
            return str(1)
  
    def getTitleAndLinks(self,link):
        # Fetch one listing page and return its (link, title) pairs
        self.link = link
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = { 'User-Agent' : user_agent }
        request = urllib2.Request(self.link, headers=headers)
        response = urllib2.urlopen(request, timeout=5)
        res = response.read()
        titleandlinks = re.findall(self.re_gettitleandlinks, res)
        return titleandlinks
 
    def startSpider(self):
        f = open(self.filename,"w+")
        for i in self.list:
            sum = self.getPages(i)
            for j in range(1,int(sum)+1):
                link = self.url+"category/"+ i + "/" + "page/" + str(j)
                aaa = self.getTitleAndLinks(link)
                for s in aaa:
                    res = '<a href="'+s[0]+'">'+s[1]+'</a>'+'<br>'
                    #res = s[0] + '===>' + s[1]
                    f.write(res)       
         f.close()
 
 
if __name__=='__main__':
    spider = dropsSpider()
    spider.startSpider()

The script is a bit bloated and leaves plenty of room for optimization; it could also be made multithreaded.
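As a rough sketch of that multithreading idea, the version below crawls the categories in parallel with a thread pool (multiprocessing.dummy.Pool from the Python 2 standard library). It reuses the dropsSpider class defined above; the pool size of 4 and the crawl_category helper are illustrative choices, not part of the original script.

# A minimal multithreaded sketch reusing dropsSpider from above.
# Assumes each category can be crawled independently; pool size 4 is arbitrary.
from multiprocessing.dummy import Pool  # thread-backed Pool, suits I/O-bound work

def crawl_category(category):
    spider = dropsSpider()  # one instance per task avoids shared mutable state
    results = []
    pages = int(spider.getPages(category))
    for page in range(1, pages + 1):
        link = spider.url + category + "/page/" + str(page)
        results.extend(spider.getTitleAndLinks(link))
    return results

if __name__ == '__main__':
    categories = dropsSpider().list
    pool = Pool(4)
    all_results = pool.map(crawl_category, categories)
    pool.close()
    pool.join()
    # Write the combined results once, from the main thread
    f = open("text.html", "w")
    for results in all_results:
        for href, title in results:
            f.write('<a href="' + href + '">' + title + '</a>' + '<br>')
    f.close()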

posted @ 2016-02-15 10:54  depycode