爬虫-电影演员

# -*-coding:utf-8-*-
import requests
import re
from bs4 import BeautifulSoup
import json

# Request headers that mimic a logged-in desktop Chrome session so Douban
# serves the real pages instead of an anti-bot challenge.
# NOTE(review): the Cookie value is a captured personal session and will
# expire — requests will start failing once it does; refresh it from the
# browser's dev tools when that happens.
headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6",
        "Connection": "keep-alive",
        "Cookie": 'll="118237"; bid=I01Ods0OrJA; __gads=ID=8c2ee8adc452b1dd-226296c262d30071:T=1653474012:RT=1653474012:S=ALNI_MbuAdJQ8W92lI2c2ppXsJ_P2_Ydfg; __utma=30149280.1346640552.1654946721.1654946721.1654946721.1; __utmc=30149280; __utmz=30149280.1654946721.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1654946721; __utma=223695111.1075054031.1654946726.1654946726.1654946726.1; __utmb=223695111.0.10.1654946726; __utmc=223695111; __utmz=223695111.1654946726.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1654946726%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DryXr-GuTWHjxdd4DH12MpPbsVyyvP7ODlL-Y4K8jE9dnheeNOtr-Dp5otIX841po%26wd%3D%26eqid%3Dece2d0e0001ac7a90000000262a47b9a%22%5D; _pk_ses.100001.4cf6=*; __gpi=UID=000005b82ead9fc6:T=1653474012:RT=1654946726:S=ALNI_MZTl9rO1QOcR3yeHARedOY_xtnAtA; _vwo_uuid_v2=D4B899194D0A310952B78BD259E69F4BB|7e0650f469f7bccd2db729323dbb7556; _pk_id.100001.4cf6=fdfde254c89a301d.1653473993.2.1654946807.1653474011.',
        "Host": "movie.douban.com",
        "Referer": "https://movie.douban.com/explore",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": 'Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
        "sec-ch-ua-mobile": '"?0"',
        "sec-ch-ua-platform": '"macOS"',
    }

def get_detail(url, title):
    """Scrape the celebrities page of one movie and append its cast to a.txt.

    For every actor entry whose role text contains "(饰" (i.e. "plays ..."),
    extract the character name and write one tab-separated line
    "<role_name>\t<title>" to a.txt.

    :param url: absolute URL of the movie's /celebrities page
    :param title: movie title, written alongside each extracted role name
    """
    # Explicit encoding so the output file is UTF-8 regardless of platform
    # default; "a+" keeps appending across calls.
    with open("a.txt", "a+", encoding="utf-8") as f:
        response = requests.get(url, headers=headers, timeout=10)
        text = response.content.decode("utf-8")
        soup = BeautifulSoup(text, "lxml")
        # The second "list-wrapper" div holds the actors section; when the
        # page is blocked or laid out differently it may be missing — skip
        # instead of crashing with IndexError.
        wrappers = soup.find_all("div", attrs={"class": "list-wrapper"})
        if len(wrappers) < 2:
            print("-未找到演员列表: %s" % url)
            return
        li_list = wrappers[1].find_all("li", attrs={"class": "celebrity"})
        for item in li_list:
            role_span = item.find("span", attrs={"class": "role"})
            if role_span is None:
                # Some entries (e.g. crew without a role line) have no span.
                continue
            role = role_span.text
            if "(饰" in role:
                role_name = role.split("(饰")[1].split(")")[0].strip()
                # One record per line; original code joined a trailing "\r\n"
                # element, which left a stray tab before the terminator.
                f.write("%s\t%s\r\n" % (role_name, title))
                print("-解析结果:" + "  ".join([role_name, title]))
        print("-------------------------")


if __name__=="__main__":
    for i in range(100):
        url = "http://movie.douban.com/j/search_subjects?type=movie&tag=华语&sort=recommend&page_limit=100&page_start=" + str(i*100)
        r1 = requests.get(url, headers=headers)
        data = json.loads(r1.text)
        for movie in data["subjects"]:
            title = movie.get("title", "")
            detail_url = movie.get("url", "") + "celebrities"
            print("当前解析电影:%s" % title)
            get_detail(detail_url, title)
        pass

    

 

posted @ 2022-06-11 21:56  你看起来真的很好吃  阅读(30)  评论(0编辑  收藏  举报