Crawling Open-Source Project Documentation with Scrapy
The spider below starts from the Deeplearning4j API index page, saves each crawled page under ./document-deeplearning4j/, and only follows links under https://deeplearning4j.org/api/.

import scrapy
import re
import os


class MySpider(scrapy.Spider):
    name = "deeplearning4j-api"
    start_urls = ["https://deeplearning4j.org/api/latest/index-files/index-1.html"]
    crawledLinks = {}
    # times = 200

    def parse(self, response):
        # Write the raw HTML page to disk, creating directories as needed
        def output2html(htmlcontent, filepath, folder='./document-deeplearning4j/'):
            filepath = folder + filepath
            if not os.path.exists(os.path.split(filepath)[0]):
                os.makedirs(os.path.split(filepath)[0])
            with open(filepath, 'wb') as f:
                f.write(htmlcontent)

        # Normalize a URL into a uniform format: strip the site prefix and
        # drop anything that follows ".html" (anchors, query strings)
        def htmlprocess(url, preStr="https://deeplearning4j.org", endStr=".html"):
            url = url.replace(preStr, '')
            pattern = re.compile(r'\.html.*$')
            url = pattern.sub('.html', url)
            return url

        url = htmlprocess(response.url)
        output2html(response.body, url)

        # Select page elements: collect every link on the page
        links = response.xpath('//a/@href').extract()

        # Only follow absolute links under the API documentation path
        linkPattern = re.compile(r"^https://deeplearning4j\.org/api/")
        for link in links:
            # Turn every link into an absolute link
            link = response.urljoin(link)
            # Keep the domain this time (preStr='') and only strip trailing anchors/queries
            link = htmlprocess(link, preStr='', endStr=".html")
            if linkPattern.match(link) and link not in self.crawledLinks:
                self.crawledLinks[link] = 1
                yield scrapy.Request(link, callback=self.parse)
                item = {}
                item["link"] = link
                yield item
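If you do not want to set up a full Scrapy project, the spider can also be started programmatically with CrawlerProcess. The following is a minimal sketch, not part of the original code: the output file name crawled-links.json, the FEEDS export (available in Scrapy 2.1+), and the DOWNLOAD_DELAY value are all assumptions.

# A minimal way to run the spider outside a full Scrapy project
# (assumes the MySpider class above is defined in this same file).
from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        # FEEDS (Scrapy 2.1+) exports the yielded {"link": ...} items
        # to a JSON file while the crawl runs; the file name is an assumption.
        "FEEDS": {"crawled-links.json": {"format": "json"}},
        # A small delay to be polite to the documentation server (assumed value).
        "DOWNLOAD_DELAY": 0.5,
    })
    process.crawl(MySpider)
    process.start()  # blocks until the crawl finishes

Alternatively, if the spider is saved as a standalone file (for example deeplearning4j_spider.py, a name chosen here only for illustration), running scrapy runspider deeplearning4j_spider.py -o links.json achieves the same thing from the command line.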