python爬虫(十五) 豆瓣电影爬虫

from lxml import etree

import  requests
# Step 1: download the target page.
# A desktop-browser User-Agent is sent with the request
# (presumably because the site rejects the default client UA -- verify).
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page below.
response.raise_for_status()
text = response.text

# Step 2: extract the movie data from the downloaded page.

html = etree.HTML(text)
# Take the first <ul class="lists"> in the document; the [0] raises
# IndexError if the page contains no such element.
ul = html.xpath("//ul[@class='lists']")[0]
# Use a path relative to `ul`. An XPath starting with "//" is evaluated from
# the document root even when called on a sub-element, so the previous
# "//li" matched every <li> on the page rather than this list's own items.
lis = ul.xpath("./li")


for li in lis:
    try:
        # Movie metadata is read from data-* attributes on the <li> itself.
        title = li.xpath("@data-title")[0]
        score = li.xpath("@data-score")[0]
        duration = li.xpath("@data-duration")[0]
        region = li.xpath("@data-region")[0]
        director = li.xpath("@data-director")[0]
        actors = li.xpath("@data-actors")[0]
        # Poster URL: the src of the first <img> nested under this item.
        thumbnail = li.xpath(".//img/@src")[0]
    except IndexError:
        # Skip any item missing one of the attributes or a poster image.
        continue

    movie = {
        'title': title,
        'score': score,
        'duration': duration,
        'region': region,
        'director': director,
        'actors': actors,
        'thumbnail': thumbnail,
    }

    print(movie)

 

 点击输出结果中 thumbnail 字段对应的网址,打开的就是该电影的海报。

 

posted on 2020-03-01 21:55  方木Fengl  阅读(542)  评论(0)  编辑  收藏  举报

导航