scrapy目录结构与解析方式

scrapy目录结构

myfirstscrapy  # 项目名字
    -myfirstscrapy # 包
    	-__init__.py
    	-spiders # 包 放爬虫,可能会有很多爬虫
        	-__init__.py
    		-cnblogs.py # 爬虫文件--》一个爬虫就是一个文件,可以写多个
    
    	-items.py # 放一个个类---》类似于django 的models--》模型类
    	-middlewares.py # 中间件,下载,爬虫中间件
    	-pipelines.py # 持久化,保存mysql,需要写的位置
    	-settings.py # 配置文件
    -scrapy.cfg     # 上线会用

解析方式

1 response对象有css方法和xpath方法
    -css中写css选择器     response.css('')
    -xpath中写xpath选择   response.xpath('')
2 重点1:
    -xpath取文本内容
        './/a[contains(@class,"link-title")]/text()'
    -xpath取属性
       	'.//a[contains(@class,"link-title")]/@href'
    -css取文本
        'a.link-title::text'
    -css取属性
        'img.image-scale::attr(src)'
3 重点2:
    .extract_first()  取一个
    .extract()        取所有

案例:

# Example: parse a cnblogs.com listing page with CSS selectors.
def parse(self, response):
    """Extract title/summary/author/avatar/date/url for each article.

    :param response: the crawled scrapy Response for the listing page.
    :returns: None -- results are printed (demo code).
    """
    article_list = response.css('article.post-item')
    for article in article_list:
        # Article title text.
        title = article.css('a.post-item-title::text').extract_first()

        # The summary <p> yields several text fragments (whitespace around
        # inline markup).  Keep the first fragment that is non-empty after
        # removing newlines/spaces, falling back to ''.  The original code
        # indexed desc[0]/desc[1] unconditionally, which raised IndexError
        # when fewer than two fragments were extracted.
        desc = article.css('p.post-item-summary::text').extract()
        cleaned = (d.replace('\n', '').replace(' ', '') for d in desc)
        real_desc = next((c for c in cleaned if c), '')

        # Author display name.
        author = article.css('footer.post-item-foot>a>span::text').extract_first()
        # Author avatar image URL.
        image_url = article.css('img.avatar::attr(src)').extract_first()
        # Publication date.
        date = article.css('span.post-meta-item>span::text').extract_first()
        # Article URL.
        url = article.css('a.post-item-title::attr(href)').extract_first()

        print('''
            文章名:%s
            文章摘要:%s
            文章作者:%s
            作者头像:%s
            文章日期:%s
            文章地址:%s
            ''' % (title, real_desc, author, image_url, date, url))

# Example: parse the same listing page with XPath selectors.
def parse(self, response):
    """Extract title/summary/author/avatar/date/url for each article.

    :param response: the crawled scrapy Response for the listing page.
    :returns: None -- results are printed (demo code).
    """
    article_list = response.xpath('//article[@class="post-item"]')
    for article in article_list:
        # Article title text.
        title = article.xpath('.//a[@class="post-item-title"]/text()').extract_first()

        # The summary <p> yields several text fragments (whitespace around
        # inline markup).  Keep the first fragment that is non-empty after
        # removing newlines/spaces, falling back to ''.  The original code
        # indexed desc[0]/desc[1] unconditionally, which raised IndexError
        # when fewer than two fragments were extracted.
        desc = article.xpath('.//p[@class="post-item-summary"]/text()').extract()
        cleaned = (d.replace('\n', '').replace(' ', '') for d in desc)
        real_desc = next((c for c in cleaned if c), '')

        # Author display name.
        author = article.xpath('.//footer[@class="post-item-foot"]/a/span/text()').extract_first()
        # Author avatar image URL.
        image_url = article.xpath('.//img[@class="avatar"]/@src').extract_first()
        # Publication date.
        date = article.xpath('.//span[@class="post-meta-item"]/span/text()').extract_first()
        # Article URL.
        url = article.xpath('.//a[@class="post-item-title"]/@href').extract_first()

        print('''
              文章名:%s
              文章摘要:%s
              文章作者:%s
              作者头像:%s
              文章日期:%s
              文章地址:%s
              ''' % (title, real_desc, author, image_url, date, url))
posted @ 2024-02-23 16:04  wellplayed  阅读(18)  评论(0)  编辑  收藏  举报