Scrapy crawler - spider.py
import scrapy
from scrapy.selector import Selector

# Assumes the standard Scrapy project layout; XiaoshuoItem lives in the project's items.py.
from ..items import XiaoshuoItem


class xiaoshuoSpider(scrapy.Spider):
    name = "freenovel"
    headers = {
        'Upgrade-Insecure-Requests': '1',
    }

    def start_requests(self):
        # Completed, free novels
        start_url = ["<starting URL>"]  # placeholder from the source; the real list-page URL goes here
        for url in start_url:
            yield scrapy.Request(url=url, headers=self.headers, callback=self.first_parse)

    def first_parse(self, response):
        # Collect every category name and link from the page's filter list.
        sel = Selector(response)
        category = sel.css('div[class="select-list"] div ul[type="category"] li a::text').extract()
        category_url = sel.css('div[class="select-list"] div ul[type="category"] li a::attr(href)').extract()
        # Start at index 1; the first entry is skipped deliberately (likely the "All" category).
        for i in range(1, len(category_url)):
            item = XiaoshuoItem()
            item['category'] = category[i]
            item['category_url'] = "https:" + category_url[i]
            yield scrapy.Request(url=item['category_url'], meta={"category": item['category']},
                                 callback=self.second_parse, headers=self.headers)

    def second_parse(self, response):
        # Follow every novel on the category page to its catalog tab,
        # passing the category name along in meta.
        sel = Selector(response)
        novel_url = sel.css('div[class="book-mid-info"] h4 a::attr(href)').extract()
        for url in novel_url:
            yield scrapy.Request(url="https:" + url + "#Catalog",
                                 meta={"category": response.meta['category']},
                                 callback=self.article_parse, headers=self.headers)

    def article_parse(self, response):
        # Grab the novel title and the link to its first chapter;
        # detail_parse then walks the chapters via the "next chapter" link.
        sel = Selector(response)
        article_name = sel.xpath('//h1/em/text()').extract_first()
        article_url = sel.css(
            'div[id="j-catalogWrap"] div[class="volume-wrap"] div[class="volume"] ul li a::attr(href)').extract_first()
        article_url = "https:" + article_url
        yield scrapy.Request(url=article_url,
                             meta={'article_name': article_name, "category": response.meta['category']},
                             callback=self.detail_parse, headers=self.headers)

    def detail_parse(self, response):
        # Join the chapter's paragraphs, emit one item per chapter,
        # then queue the next chapter if there is one.
        sel = Selector(response)
        content_list = sel.css(
            'div[id="j_chapterBox"] div[class="text-wrap"] div[class="main-text-wrap"] '
            'div[class="read-content j_readContent"] p::text').extract()
        content_name = sel.css('h3[class="j_chapterName"]::text').extract_first()
        next_page = sel.css('a[id="j_chapterNext"]::attr(href)').extract_first()
        item = XiaoshuoItem()
        item['content'] = "".join(content_list)
        item['content_name'] = content_name
        item['article_name'] = response.meta['article_name']
        item['category'] = response.meta['category']
        yield item
        if next_page is not None:
            yield scrapy.Request(url="https:" + next_page,
                                 meta={'article_name': item['article_name'], "category": item['category']},
                                 callback=self.detail_parse, headers=self.headers)
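The spider populates five item fields. The source does not show items.py, so here is a minimal sketch of what it would have to declare, with field names taken directly from the spider code above:

import scrapy

class XiaoshuoItem(scrapy.Item):
    category = scrapy.Field()      # category name from the filter list
    category_url = scrapy.Field()  # category page link (only used while crawling)
    article_name = scrapy.Field()  # novel title
    content_name = scrapy.Field()  # chapter title
    content = scrapy.Field()       # chapter text, paragraphs joined into one string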
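With the file saved in the project's spiders/ directory, the crawl runs under the name defined above, for example scrapy crawl freenovel -o novels.json to export the scraped chapters (the -o feed-export flag is standard Scrapy, not shown in the source).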