【爬虫】爬取猫眼热门电影预告片
"""Download the movie trailers linked from the Maoyan news page.

The listing page is fetched, every trailer link found on it is followed,
the MP4 address is pulled out of the detail page, and the video is saved
in the current working directory as ``<movie name>.mp4``.
"""
import re
from urllib.parse import urljoin

import requests
from lxml import etree
from requests.exceptions import RequestException

# Browser-like User-Agent sent with every request made by this script.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}

# Seconds to wait for a connection / for data before giving up on a request.
_TIMEOUT = 30

# Base used to resolve relative links when the caller does not supply one.
_BASE_URL = 'https://maoyan.com/'

# Non-greedy on purpose: stop at the first closing quote of the src value.
_MP4_SOURCE_RE = re.compile(r'source src="([^"]+)"')

# Characters that are path separators or are rejected in file names on
# common file systems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns ``None`` when the request fails (including a timeout) or when
    the server answers with any status other than 200.
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate validation.
        # It is kept from the original code; remove it if the certificate
        # chain of the target site validates in your environment.
        response = requests.get(
            url, headers=_HEADERS, verify=False, timeout=_TIMEOUT
        )
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _safe_filename(name):
    """Return *name* with characters unsuitable for a file name replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub('_', name).strip()
    return cleaned or 'untitled'


def _extract_mp4_url(page_html):
    """Return the first ``<source src="...">`` value in *page_html*, or None."""
    match = _MP4_SOURCE_RE.search(page_html)
    return match.group(1) if match else None


def process_movie(html, base_url=_BASE_URL):
    """Download every trailer linked from the listing page *html*.

    Args:
        html: Markup of the listing page. ``None`` or an empty string is
            accepted and results in no downloads.
        base_url: URL against which relative links are resolved. Absolute
            links pass through unchanged.

    Each trailer is written to ``<movie name>.mp4`` in the current working
    directory. A trailer whose detail page or video cannot be fetched, or
    whose detail page contains no ``<source>`` tag, is reported and skipped
    so the remaining trailers are still processed.
    """
    if not html:
        print('No listing page to process; nothing downloaded.')
        return

    dom = etree.HTML(html)
    if dom is None:
        print('Listing page could not be parsed; nothing downloaded.')
        return

    # Read href and title from the same <a> element so the two can never
    # get out of step (two independent XPath result lists could).
    for anchor in dom.xpath('//h4[@class="video-name one-line"]/a[@href]'):
        movie_name = ''.join(anchor.xpath('text()'))
        movie_url = urljoin(base_url, anchor.get('href'))

        try:
            detail = requests.get(movie_url, headers=_HEADERS, timeout=_TIMEOUT)
            detail.raise_for_status()

            movie_mp4_url = _extract_mp4_url(detail.text)
            if movie_mp4_url is None:
                print('No video source found for {}; skipped.'.format(movie_name))
                continue

            print('正在下载{}'.format(movie_name))
            video = requests.get(
                urljoin(movie_url, movie_mp4_url),
                headers=_HEADERS,
                timeout=_TIMEOUT,
            )
            # Without this an HTTP error page would be saved as the video.
            video.raise_for_status()
        except RequestException as exc:
            print('Failed to download {}: {}'.format(movie_name, exc))
            continue

        with open('%s.mp4' % _safe_filename(movie_name), 'wb') as f:
            f.write(video.content)


def main():
    """Fetch the Maoyan news listing and download the trailers it links to."""
    url = 'https://maoyan.com/news?showTab=3'
    html = get_one_page(url)
    if html is None:
        print('Failed to fetch {}'.format(url))
        return
    process_movie(html, base_url=url)


if __name__ == '__main__':
    main()