scrapy电影天堂练习

movie.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import scrapy
from movieProject.items import MovieprojectItem
 
 
class MovieSpider(scrapy.Spider):
    """Spider for ygdy8.net (Movie Heaven) domestic-movie listings.

    Crawls the listing page, follows each movie's detail link, and yields
    a ``MovieprojectItem`` with the movie name and the poster image URL
    scraped from the detail page.
    """

    name = 'movie'
    allowed_domains = ['www.ygdy8.net']
    start_urls = ['https://www.ygdy8.net/html/gndy/china/index.html']

    def parse(self, response):
        """Parse the listing page: one <a> node per movie row."""
        movie_links = response.xpath('//table//tr[2]/td[2]/b/a[2]')
        for link in movie_links:
            movie_name = link.xpath('./text()').extract_first()
            href = link.xpath('./@href').extract_first()
            # Skip malformed rows instead of crashing on None concatenation.
            if not movie_name or not href:
                continue
            movie_url = 'https://www.ygdy8.net' + href
            # Carry the movie name through meta so second_parse can attach
            # it to the item built from the detail page.
            yield scrapy.Request(
                url=movie_url,
                callback=self.second_parse,
                meta={'movieName': movie_name},
            )

    def second_parse(self, response):
        """Parse the detail page: grab the poster image URL inside div#Zoom."""
        second_url = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()
        movie_name = response.meta['movieName']
        movie = MovieprojectItem(movieName=movie_name, movieUrl=second_url)
        yield movie

  items.py

1
2
3
4
5
6
class MovieprojectItem(scrapy.Item):
    """Container for one scraped movie record."""

    # Movie title text taken from the listing page link.
    movieName = scrapy.Field()
    # Poster image URL extracted from the movie's detail page.
    movieUrl = scrapy.Field()

  pipelines.py

1
2
3
4
5
6
7
8
9
10
class MovieprojectPipeline:
    """Item pipeline that appends each scraped item to ``movie.json``,
    one JSON object per line (JSON Lines)."""

    def open_spider(self, spider):
        # Opened once when the crawl starts; closed in close_spider.
        self.fp = open('movie.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item as JSON and return it unchanged for later pipelines."""
        import json  # local import keeps this module self-contained

        # Bug fix: the original wrote str(item) (a Python repr), which is
        # not valid JSON despite the .json filename. Serialize properly,
        # keeping non-ASCII movie names human-readable.
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

  

posted @   sgj191024  阅读(55)  评论(0编辑  收藏  举报
相关博文:
阅读排行:
· 分享一个免费、快速、无限量使用的满血 DeepSeek R1 模型,支持深度思考和联网搜索!
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· ollama系列01:轻松3步本地部署deepseek,普通电脑可用
· 按钮权限的设计及实现
· 25岁的心里话
点击右上角即可分享
微信分享提示