scrapy爬取新浪新闻目录

  本项目主要爬取"http://news.sina.com.cn/guide/"页面下的标题结构,在本地新建对应的目录结构,并把各二级栏目下的新闻正文保存为文本文件

  scrapy startproject sina    新建项目sina

  scrapy genspider sinaSpider sina.com.cn    生成爬虫文件sinaSpider.py(下文代码中将爬虫的name设为sinaUrl,运行时使用 scrapy crawl sinaUrl)

编写./sina/sina/spiders/sinaSpider.py文件

 1 import scrapy, os,codecs
 2 from sina.items import SinaUrlItem
 3 
 4 class sinaUrl(scrapy.Spider):
 5     name = "sinaUrl"
 6     allowed_domains = ["sina.com.cn"]
 7     start_urls = [
 8         "http://news.sina.com.cn/guide/"
 9     ]
10 
11     def parse(self, response):
12         #获取导航页面下的一级标题地址与内容
13         parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
14         parentTitles = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()
15         # 获取导航页面下的二级标题地址与内容
16         subUrls = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
17         subTitles = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()
18 
19         #将一级标题与二级标题的信息写入Item结构
20         for i in range(len(parentUrls)):
21             item = SinaUrlItem()
22             item_sub_urls = []
23             item_sub_titles = []
24             #判断当前目录下是否存在./data/下的一级标题目录,如果不存在则创建
25             if not os.path.exists("./data/" + parentTitles[i]):
26                 os.makedirs("./data/" + parentTitles[i])
27             # 将二级标题的信息写入Item结构
28             for j in range(len(subUrls)):
29                 if subUrls[j].startswith(parentUrls[i]):
30                     item_sub_urls.append(subUrls[j])
31                     item_sub_titles.append(subTitles[j])
32                     # 判断当前目录下是否存在./data/一级标题下二级标题目录,如果不存在则创建
33                     if not os.path.exists("./data/" + parentTitles[i] + "/" + subTitles[j]):
34                         os.makedirs("./data/" + parentTitles[i] + "/" + subTitles[j])
35                         item['parentDir'] ="./data/" + parentTitles[i] + "/" + subTitles[j]
36             for suburl in item_sub_urls:
37                 #向二级标题网址发送请求,深度爬取,使用meta将记录当前二级标题信息的item传入下一个处理函数
38                 yield scrapy.Request(suburl,meta={"subItem":item},callback=self.parse_suburl)
39 
40     def parse_suburl(self,response):
41         #接收传入的item信息
42         item = response.meta["subItem"]
43         #获取二级标题页面下满足要求的三级标题的Url
44         sonUrls = self.getSonUrl(response)
45         item['sonUrls'] = sonUrls
46         for sonUrl in sonUrls:
47             yield scrapy.Request(sonUrl,meta={"sonItem":item},callback=self.parse_sonurl)
48 
49     def parse_sonurl(self,response):
50         item = response.meta["sonItem"]
51         pageDir = item["parentDir"]
52         #标题
53         title = response.xpath("//h1/text()").extract_first()
54         #日期
55         date = response.xpath("//span[@class='date']/text()").extract_first()
56         #内容
57         content = "\n".join(response.xpath("//div[@class='article-content-left']//p/text() | //div[@id='artibody']//p/text()").extract())
58 
59         f = codecs.open(pageDir+"/"+title+".txt","w",encoding="utf-8")
60         if title:
61             f.write(title+"\n")
62         if date:
63             f.write(date + "\n")
64         if content:
65             f.write(content)
66         # f.write(title + "\n" + date + "\n" + content)
67         f.close()
68 
69         yield item
70 
71     def getSonUrl(self,response):
72         sonUrls = response.xpath("//a/@href").extract()
73         pUrl = response.url
74         pUrl = pUrl.split("/")
75         pUrl[-2] = pUrl[-2][0]
76         pUrl = "/".join(pUrl)
77         allUrls = []
78         for i in sonUrls:
79             if (i.startswith(pUrl) or i.startswith(response.url)) and i.endswith(".shtml"):
80                 allUrls.append(i)
81         return allUrls

编写./sina/sina/items.py文件

1 import scrapy
class SinaUrlItem(scrapy.Item):
    """Data passed between the spider callbacks through ``Request.meta``."""

    # URLs of the third-level pages collected from a second-level page
    sonUrls = scrapy.Field()
    # Local directory under which those third-level pages are saved
    parentDir = scrapy.Field()

  由此例子可以看出,Item类可用于跟进爬虫时结构化传递参数。

 

posted on 2019-08-07 19:15  南华  阅读(375)  评论(0)  编辑  收藏  举报