Python爬虫(十五) 豆瓣电影爬虫
# Scrape the Douban "now playing" page for Changsha and print one dict per
# movie (title, score, duration, region, director, actors, thumbnail URL).
from lxml import etree
import requests

# 1. Fetch the page from the target site.
# A browser-like User-Agent header is sent along with the request.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
url = "https://movie.douban.com/cinema/nowplaying/changsha/"
# The timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
text = response.text

# 2. Extract the data from the fetched page according to fixed rules.
html = etree.HTML(text)
# Only the first <ul class="lists"> on the page is inspected.
ul = html.xpath("//ul[@class='lists']")[0]
# The leading "." keeps the query relative to `ul`. A bare "//li" is an
# absolute path: it is evaluated from the document root and would match every
# <li> on the page, not just those inside this list.
lis = ul.xpath(".//li")

# Movie fields stored on each <li> as data-<name> attributes, in output order.
fields = ('title', 'score', 'duration', 'region', 'director', 'actors')

for li in lis:
    values = [li.get('data-' + name) for name in fields]
    thumbnails = li.xpath(".//img/@src")
    # Skip <li> elements that are not complete movie entries, i.e. that lack
    # one of the data-* attributes or contain no <img> with a src.
    if None in values or not thumbnails:
        continue
    movie = dict(zip(fields, values))
    movie['thumbnail'] = thumbnails[0]
    print(movie)
点击输出结果中 thumbnail 字段对应的网址,打开的就是该电影的海报。