# 使用 XPath 表达式实现简单的豆瓣电影一周口碑榜数据爬取

# -*- coding: utf-8 -*-
"""
Created on Sat Oct  8 13:09:04 2022

@author: 小徐同学
"""
#使用xpath豆瓣
import requests
from lxml.html import fromstring

# Douban Movies landing page; movie links are later extracted from its HTML.
base_url = "https://movie.douban.com/"

# Browser-like User-Agent header sent with every request this script makes.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.34",
}

# The landing page is fetched once, at import time, and decoded as UTF-8.
request = requests.get(base_url, headers=headers)
request.encoding = "utf-8"

def get_url(html_text):
    """Collect the link target of every anchor found in a table cell.

    Args:
        html_text: HTML source of the page to scan.

    Returns:
        A list with the ``href`` value of each ``<a>`` matched by
        ``//table/tr/td/a``, in document order. The list is empty when the
        page is empty or nothing matches.
    """
    # lxml refuses to parse an empty document, so treat it as "no links".
    if not html_text or not html_text.strip():
        return []
    # Parse the argument (not the module-level response) so the function
    # works on whatever page the caller hands in.
    doc = fromstring(html_text)
    # Selecting the attribute directly skips anchors without an href instead
    # of failing on them, and yields [] when no anchor matches at all.
    return doc.xpath("//table/tr/td/a/@href")
def get_every_text(movie_title_url, timeout=10):
    """Download one movie detail page and return its HTML source.

    Args:
        movie_title_url: URL of the movie detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # Without a timeout, requests can wait forever on a stalled connection
    # and freeze the whole scrape loop.
    response = requests.get(url=movie_title_url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    return response.text
def get_every_detail_content(every_detail):
    """Parse one movie detail page, print the extracted record and return it.

    Args:
        every_detail: HTML source of a movie detail page.

    Returns:
        A one-element list holding the record
        ``[title, director, cast, release_date, runtime, score]``. Each field
        is the list of text nodes matched by its XPath, so a field is an
        empty list when the page lacks that element.
    """
    doc = fromstring(every_detail)

    title = doc.xpath("//*[@id='content']/h1/span[1]/text()")
    director = doc.xpath(".//*[@id='info']/span/span/a[@rel='v:directedBy']/text()")
    actor_and_actress = doc.xpath(".//div[@id='info']/span[@class='actor']/span//a/text()")
    date = doc.xpath(".//div[@id='info']/span[@property='v:initialReleaseDate']/text()")
    # Runtime has its own property; the release-date selector was previously
    # reused here, so the "runtime" field duplicated the release dates.
    time_long = doc.xpath(".//div[@id='info']/span[@property='v:runtime']/text()")
    # Match the rating container by its class name. The former predicate
    # compared @class with contains() of two non-existent child elements,
    # which is true for any div that merely has a class attribute.
    score = doc.xpath(
        ".//div[@id='interest_sectl']/div/div[contains(@class, 'rating_self')]/strong/text()"
    )

    # Same nested shape as before: an outer list wrapping a single record.
    record = [title, director, actor_and_actress, date, time_long, score]
    result = [record]

    print(result)
    return result
    
if __name__ == "__main__":
    # Visit every movie linked from the landing page and print its details.
    for url in get_url(request.text):
        get_every_detail_content(get_every_text(url))
# posted @ 2022-10-09 23:52 Janair 阅读(53) 评论(0) 编辑收藏举报
# 刷新页面返回顶部