# 爬虫-电影演员 (Crawler: movie actors)
# -*-coding:utf-8-*-
"""Douban movie-cast crawler.

Pages through Douban's "explore" JSON API for Chinese-language movies,
then scrapes each movie's celebrities page for the characters the cast
plays, appending tab-separated (role, title) rows to ``a.txt``.
"""
import requests
import re
from bs4 import BeautifulSoup
import json

# Browser-like headers captured from a real session.  NOTE(review): the
# Cookie is a recorded Douban session and will eventually expire — refresh
# it if requests start coming back blocked or empty.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6",
    "Connection": "keep-alive",
    "Cookie": 'll="118237"; bid=I01Ods0OrJA; __gads=ID=8c2ee8adc452b1dd-226296c262d30071:T=1653474012:RT=1653474012:S=ALNI_MbuAdJQ8W92lI2c2ppXsJ_P2_Ydfg; __utma=30149280.1346640552.1654946721.1654946721.1654946721.1; __utmc=30149280; __utmz=30149280.1654946721.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1654946721; __utma=223695111.1075054031.1654946726.1654946726.1654946726.1; __utmb=223695111.0.10.1654946726; __utmc=223695111; __utmz=223695111.1654946726.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1654946726%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DryXr-GuTWHjxdd4DH12MpPbsVyyvP7ODlL-Y4K8jE9dnheeNOtr-Dp5otIX841po%26wd%3D%26eqid%3Dece2d0e0001ac7a90000000262a47b9a%22%5D; _pk_ses.100001.4cf6=*; __gpi=UID=000005b82ead9fc6:T=1653474012:RT=1654946726:S=ALNI_MZTl9rO1QOcR3yeHARedOY_xtnAtA; _vwo_uuid_v2=D4B899194D0A310952B78BD259E69F4BB|7e0650f469f7bccd2db729323dbb7556; _pk_id.100001.4cf6=fdfde254c89a301d.1653473993.2.1654946807.1653474011.',
    "Host": "movie.douban.com",
    "Referer": "https://movie.douban.com/explore",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua": 'Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    "sec-ch-ua-mobile": '"?0"',
    "sec-ch-ua-platform": '"macOS"',
}


def get_detail(url, title):
    """Scrape one movie's celebrities page and record every played role.

    Args:
        url: absolute URL of the movie's ``.../celebrities`` page.
        title: the movie title to pair with each extracted role name.

    Side effects:
        Appends ``role\\ttitle\\r\\n`` lines to ``a.txt`` and prints
        progress.  Parsing failures for a single page are reported and
        skipped instead of aborting the whole crawl.
    """
    # timeout= keeps a single stalled connection from hanging the crawler.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.content.decode("utf-8")
    soup = BeautifulSoup(text, "lxml")

    wrappers = soup.find_all("div", attrs={"class": "list-wrapper"})
    if len(wrappers) < 2:
        # Layout changed or we were served a block/captcha page; the
        # original indexed [1] unconditionally and crashed here.
        print("-解析失败: %s" % url)
        return
    role_div = wrappers[1]

    li_list = role_div.find_all("li", attrs={"class": "celebrity"})
    # Explicit UTF-8: role names are Chinese and the platform default
    # encoding (e.g. GBK or cp1252) could corrupt or reject them.
    with open("a.txt", "a+", encoding="utf-8") as f:
        for item in li_list:
            role_span = item.find("span", attrs={"class": "role"})
            if role_span is None:
                # Some <li> entries (e.g. "查看全部") have no role span;
                # the original raised AttributeError on .text here.
                continue
            role = role_span.text
            if "(饰" in role:
                # e.g. "演员 Actor (饰 角色名)" -> "角色名"
                role_name = role.split("(饰")[1].split(")")[0].strip()
                # BUGFIX: the original tab-joined the line terminator too,
                # leaving a stray trailing tab on every row.
                f.write("%s\t%s\r\n" % (role_name, title))
                print("-解析结果:" + " ".join([role_name, title]))
    print("-------------------------")


if __name__ == "__main__":
    # 100 pages x 100 movies per page; page_start is the result offset.
    for i in range(100):
        url = (
            "http://movie.douban.com/j/search_subjects?type=movie"
            "&tag=华语&sort=recommend&page_limit=100&page_start=" + str(i * 100)
        )
        r1 = requests.get(url, headers=headers, timeout=10)
        data = json.loads(r1.text)
        # .get with a default: when rate-limited, Douban returns a payload
        # without "subjects"; the original raised KeyError and died.
        for movie in data.get("subjects", []):
            title = movie.get("title", "")
            detail_url = movie.get("url", "") + "celebrities"
            print("当前解析电影:%s" % title)
            get_detail(detail_url, title)