scrapy项目4

# -*- coding: utf-8 -*-
import scrapy
from sun0769.items import Sun0769Item

class Sun07Spider(scrapy.Spider):
    """Crawl the question list on wz.sun0769.com plus each entry's detail page.

    ``parse`` walks the paginated list and schedules one detail request per
    row; ``parse_detail`` completes the item with the post text and images.
    """

    name = 'sun07'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        """Yield a detail-page request per list row, then the next list page.

        Args:
            response: Response for one page of the question list.

        Yields:
            scrapy.Request: one request per row that has a detail link (the
            partially filled item travels in ``meta["item"]``), followed by
            a request for the next list page when one exists.
        """
        # All <tr> rows of the listing table.
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        self.logger.debug("Found %d list rows on %s", len(tr_list), response.url)

        for tr in tr_list:
            href = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            if not href:
                # Rows without a detail link cannot be followed; building a
                # Request from None would raise and abort the whole page,
                # dropping the remaining rows and the next-page request.
                continue

            item = Sun0769Item()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            # Resolve against the page URL so relative links work as well.
            item["href"] = response.urljoin(href)
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()
            yield scrapy.Request(
                item["href"],  # detail page URL
                callback=self.parse_detail,
                meta={"item": item},
            )

        # Example: <a href="http://wz.sun0769.com/index.php/question/questionType?type=4&amp;page=100500">></a>
        # A ">" link with an href means there is a next page; no href means
        # this is the last page.
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse,
            )

    def parse_detail(self, response):
        """Fill in the body text and image URLs from a detail page.

        Args:
            response: Detail-page response; ``response.meta["item"]`` holds
                the item started in ``parse``.

        Yields:
            The item, with ``content`` set to the list of text nodes under
            ``td.txt16_3`` and ``content_img`` set to absolute image URLs.
        """
        item = response.meta["item"]
        item["content"] = response.xpath("//td[@class='txt16_3']//text()").extract()
        img_srcs = response.xpath("//td[@class='txt16_3']//img/@src").extract()
        # The src values are incomplete paths; resolve them against the page
        # URL instead of prepending a fixed host, so already-absolute or
        # non-root-relative paths are not mangled.
        item["content_img"] = [response.urljoin(src) for src in img_srcs]
        yield item

pipelines.py

import re
class Sun0769Pipeline(object):
    """Item pipeline that strips whitespace noise from an item's ``content``."""

    def process_item(self, item, spider):
        """Clean ``item["content"]`` in place, print the item and return it."""
        cleaned = self.process_content(item["content"])
        item["content"] = cleaned
        print(item)
        return item

    def process_content(self, content):
        """Return the text fragments of *content* with all whitespace removed.

        Every fragment has non-breaking spaces, spaces, line breaks and tabs
        deleted; fragments that end up empty are dropped from the result.
        """
        stripped = (
            re.sub(r"\xa0|\s|\r\n|\t", "", fragment) for fragment in content
        )
        # Keep only fragments that still contain text after stripping.
        return [fragment for fragment in stripped if fragment]

settings.py

# Scrapy log threshold: only messages at WARNING level or above are shown.
LOG_LEVEL = "WARNING"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): the value below is a desktop Chrome browser string, not an
# identifier for this crawler.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36'

 项目地址:https://github.com/CH-chen/sun0769

posted @ 2019-01-29 06:16  CHVV  阅读(224)  评论(0)  编辑  收藏  举报