Scraping the Sina news directory with Scrapy
This project crawls the heading structure of "http://news.sina.com.cn/guide/" and then recreates the corresponding directory structure locally.
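The end result is a local mirror of the two heading levels, with article text files saved inside the second-level folders. An illustrative layout (the actual folder names are whatever titles the guide page carries at crawl time):

./data/
    新闻/
        国内/
            <article title>.txt
        国际/
    体育/
        NBA/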
scrapy startproject sina        # create the project sina
scrapy genspider sinaSpider sina.com.cn        # generate the spider template sinaSpider.py (the spider itself is named sinaUrl below)
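startproject lays down the standard Scrapy skeleton (newer Scrapy versions also add middlewares.py), and genspider drops a spider template into spiders/; only the two files edited below need to change:

sina/
    scrapy.cfg             # deploy configuration
    sina/
        __init__.py
        items.py           # edited below
        pipelines.py
        settings.py
        spiders/
            __init__.py
            sinaSpider.py  # generated template, edited below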
Edit the file ./sina/sina/spiders/sinaSpider.py:
import os
import codecs

import scrapy
from sina.items import SinaUrlItem


class sinaUrl(scrapy.Spider):
    name = "sinaUrl"
    allowed_domains = ["sina.com.cn"]
    start_urls = [
        "http://news.sina.com.cn/guide/"
    ]

    def parse(self, response):
        # First-level heading URLs and titles on the guide page
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitles = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
        # Second-level heading URLs and titles on the guide page
        subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitles = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # Record the first- and second-level heading information in Items
        for i in range(len(parentUrls)):
            # Create ./data/<first-level title>/ if it does not exist yet
            if not os.path.exists("./data/" + parentTitles[i]):
                os.makedirs("./data/" + parentTitles[i])
            # Match each second-level heading to its first-level parent
            for j in range(len(subUrls)):
                if subUrls[j].startswith(parentUrls[i]):
                    subDir = "./data/" + parentTitles[i] + "/" + subTitles[j]
                    # Create ./data/<first-level>/<second-level>/ if needed
                    if not os.path.exists(subDir):
                        os.makedirs(subDir)
                    # One Item per second-level page, so each request carries the
                    # directory that belongs to it (a single shared Item would be
                    # overwritten on every iteration and every article would end
                    # up in the last directory)
                    item = SinaUrlItem()
                    item['parentDir'] = subDir
                    # Follow the second-level URL for deeper crawling; meta hands
                    # the Item to the next callback
                    yield scrapy.Request(subUrls[j], meta={"subItem": item},
                                         callback=self.parse_suburl)

    def parse_suburl(self, response):
        # Retrieve the Item passed in through meta
        item = response.meta["subItem"]
        # Collect the third-level (article) URLs on this second-level page
        sonUrls = self.getSonUrl(response)
        item['sonUrls'] = sonUrls
        for sonUrl in sonUrls:
            yield scrapy.Request(sonUrl, meta={"sonItem": item},
                                 callback=self.parse_sonurl)

    def parse_sonurl(self, response):
        item = response.meta["sonItem"]
        pageDir = item["parentDir"]
        # Article title
        title = response.xpath("//h1/text()").extract_first()
        # Publication date
        date = response.xpath("//span[@class='date']/text()").extract_first()
        # Body text
        content = "\n".join(response.xpath(
            "//div[@class='article-content-left']//p/text() | "
            "//div[@id='artibody']//p/text()").extract())

        # Skip pages without a usable <h1> title; also strip "/" from the
        # title, since it is illegal in a file name
        if not title:
            return
        title = title.strip().replace("/", "_")

        with codecs.open(pageDir + "/" + title + ".txt", "w",
                         encoding="utf-8") as f:
            f.write(title + "\n")
            if date:
                f.write(date + "\n")
            if content:
                f.write(content)

        yield item

    def getSonUrl(self, response):
        sonUrls = response.xpath("//a/@href").extract()
        # Sina abbreviates the channel segment of a section URL to its first
        # letter in article URLs, e.g. the page http://news.sina.com.cn/china/
        # links to articles under http://news.sina.com.cn/c/...
        pUrl = response.url.split("/")
        pUrl[-2] = pUrl[-2][0]
        pUrl = "/".join(pUrl)
        allUrls = []
        for url in sonUrls:
            if (url.startswith(pUrl) or url.startswith(response.url)) \
                    and url.endswith(".shtml"):
                allUrls.append(url)
        return allUrls
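The channel-abbreviation trick in getSonUrl is easiest to see in isolation. A minimal standalone sketch of the same filter (the sample links are made-up examples of Sina's link pattern, not fetched data):

def abbreviate_channel(url):
    # "http://news.sina.com.cn/china/" -> "http://news.sina.com.cn/c/"
    parts = url.split("/")
    parts[-2] = parts[-2][0]
    return "/".join(parts)

page = "http://news.sina.com.cn/china/"
links = [
    "http://news.sina.com.cn/c/2013-05-01/demo.shtml",   # kept: abbreviated prefix
    "http://news.sina.com.cn/china/demo.shtml",          # kept: full prefix
    "http://sports.sina.com.cn/nba/index.shtml",         # dropped: other channel
    "http://news.sina.com.cn/c/",                        # dropped: not .shtml
]
prefix = abbreviate_channel(page)
articles = [u for u in links
            if (u.startswith(prefix) or u.startswith(page)) and u.endswith(".shtml")]
print(articles)   # only the first two links survive the filter

Run the spider from the project root (the directory containing scrapy.cfg) with scrapy crawl sinaUrl, so that the relative ./data/ path is created there.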
Edit ./sina/sina/items.py:
import scrapy


class SinaUrlItem(scrapy.Item):
    # third-level (article) URLs found on a second-level page
    sonUrls = scrapy.Field()
    # local directory the articles under this heading are saved to
    parentDir = scrapy.Field()
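A scrapy.Item behaves like a dict that only accepts its declared Fields, which is what keeps the data handed between callbacks well-defined. A quick sketch of that behavior (the field values here are hypothetical):

from sina.items import SinaUrlItem

item = SinaUrlItem()
item['parentDir'] = "./data/新闻/国内"                    # declared field: accepted
item['sonUrls'] = ["http://news.sina.com.cn/c/a.shtml"]   # declared field: accepted
print(dict(item))          # {'parentDir': ..., 'sonUrls': [...]}

item['foo'] = 1            # undeclared field: raises KeyError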
As this example shows, the Item class can be used to pass structured data between callbacks when a spider follows links to deeper pages.
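Passing the Item through meta works on any Scrapy version, but since Scrapy 1.7 the same hand-off can also be written with cb_kwargs, which delivers the object as a regular callback argument and keeps meta free for middleware use. A minimal sketch, assuming Scrapy >= 1.7 (the spider name cb_kwargs_example is hypothetical):

import scrapy

class CbKwargsExample(scrapy.Spider):
    name = "cb_kwargs_example"
    start_urls = ["http://news.sina.com.cn/guide/"]

    def parse(self, response):
        for href in response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract():
            # the dict value may be any object, e.g. a SinaUrlItem
            yield scrapy.Request(href, callback=self.parse_channel,
                                 cb_kwargs={"channel_url": href})

    def parse_channel(self, response, channel_url):
        # channel_url arrives as an ordinary keyword argument
        self.logger.info("crawled %s (from channel %s)", response.url, channel_url)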