电影天堂电影信息爬取
1 """电影天堂爬虫""" 2 3 4 import requests 5 from lxml import etree 6 7 BASE_DOMAIN = 'https://dytt8.net/' 8 HEADERS = { 9 'User-Agent': 'Mozilla/5.0' 10 } 11 12 def get_detail_urls(url): 13 """爬取指定页面下所有子页面的超链接并返回""" 14 15 respose = requests.get(url, headers=HEADERS) 16 # 页面编码有些GBK无法识别的字符ignore 17 text = respose.content.decode('GBK', errors='ignore') 18 # print(text) 19 html = etree.HTML(text) 20 # 获取当前页面下每个电影信息的子页面的超链接 21 detail_urls = html.xpath("//table[@class='tbspan']//a/@href") 22 detail_urls = map(lambda url: BASE_DOMAIN + url, detail_urls) 23 map(lambda url: BASE_DOMAIN + url, detail_urls) 24 # for detail in detail_urls: 25 # print(detail) 26 return detail_urls 27 28 def parse_detail_page(detail_urls): 29 """爬取子页面的电影具体信息""" 30 31 movie = {} # 电影信息存放字典 32 r = requests.get(detail_urls, headers=HEADERS) 33 text = r.content.decode('GBK', errors='ignore') 34 html = etree.HTML(text) 35 title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0] 36 #print(title) 37 movie['title'] = title 38 zoomE = html.xpath("//div[@id='Zoom']")[0] 39 imgs = zoomE.xpath(".//img/@src") 40 cover = imgs[0] 41 movie['cover'] = cover 42 # 不是所有子页面下的电影信息都有电影截屏信息,会超界出错 43 # screenshot = imgs[1] 44 #movie['screenshot'] = screenshot 45 46 def parse_info(info, rule): 47 return info.replace(rule, "").strip() 48 49 infos = zoomE.xpath("//text()") 50 actors = [] 51 for index,info in enumerate(infos): 52 #print(index) 53 #print(info) 54 #print("*"*30) 55 if info.startswith("◎年 代"): 56 info = parse_info(info, "◎年 代") 57 movie['year'] = info 58 #print(info) 59 elif info.startswith("◎产 地"): 60 info = parse_info(info, "◎产 地") 61 movie['country'] = info 62 elif info.startswith("◎类 别"): 63 info = parse_info(info, "◎类 别") 64 movie['category'] = info 65 elif info.startswith("◎豆瓣评分"): 66 info = parse_info(info, "◎豆瓣评分") 67 movie['duoban_rating'] = info 68 elif info.startswith("◎片 长"): 69 info = parse_info(info, "◎片 长") 70 movie['duration'] = info 71 elif info.startswith("◎导 演"): 72 info 
= parse_info(info, "◎导 演") 73 movie['director'] = info 74 elif info.startswith("◎主 演"): 75 # 得到第一行主演信息 76 info = parse_info(info, "◎主 演") 77 actors = [info] 78 # 主演有多行,对后续信息遍历,直到出现"◎"为止 79 for x in range(index+1, len(infos)): 80 actor = infos[x].strip() 81 if actor.startswith("◎"): 82 break 83 # 不为空时加入主演 84 if actor != "": 85 actors.append(actor) 86 movie['actors'] = actors 87 88 movie['download'] = html.xpath("//div[@id='Zoom']//tbody//a/text()") 89 movie['magnet'] = html.xpath("//div[@id='Zoom']//a/@href")[0] 90 91 return movie 92 93 def spider(): 94 base_url = "https://dytt8.net/html/gndy/dyzz/list_23_{}.html" 95 movies = [] # 所有电影信息 96 count = 0 # 爬取的电影数 97 for x in range(1,3): 98 """所有页数""" 99 url = base_url.format(x) 100 detail_urls = get_detail_urls(url) 101 for detail_url in detail_urls: 102 """对每一页进行解析""" 103 #print(detail_url) 104 movies.append(parse_detail_page(detail_url)) 105 count += 1 106 print(movies) 107 print(count) 108 109 110 if __name__ == '__main__': 111 spider()
来自Python网络爬虫视频