python3电影详细信息爬取-------------------电影天堂
# -*- coding: utf-8 -*-
# author:zxy
# Date:2018-9-19
"""Scrape movie detail pages from dytt8.net and append them to a text file."""

import re
import sys
from itertools import takewhile
from urllib.parse import urljoin

import requests
from lxml import etree

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36'
}
BASE_DOMAIN = "http://www.dytt8.net"

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

# (label as it appears on the page, key under which the value is stored).
_FIELD_LABELS = (
    ("◎年 代", "year"),
    ("◎产 地", "country"),
    ("◎类 别", "category"),
    ("◎语 言", "language"),
    ("◎字 幕", "sub_title"),
    ("◎上映日期", "release_time"),
    ("◎豆瓣评分", "douban_score"),
    ("◎片 长", "length"),
    ("◎导 演", "director"),
    ("◎主 演", "actors"),
    ("◎简 介", "profiles"),
)

# Fields whose value continues on the following lines, mapped to the prefix
# of the line that terminates them.
_MULTILINE_STOP = {
    "actors": "◎",
    "profiles": "【下载地址】",
}


def _label_pattern(label):
    """Compile a regex matching *label* at the start of a line.

    Any amount of whitespace (including none, or full-width spaces) is
    accepted between the label's characters, so the match does not depend on
    the exact padding used inside the label.
    """
    chars = "".join(label.split())
    return re.compile(r"\s*".join(re.escape(ch) for ch in chars))


_FIELD_PATTERNS = tuple(
    (_label_pattern(label), key) for label, key in _FIELD_LABELS
)


def _fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The body is decoded as GBK; undecodable bytes are dropped instead of
    aborting the whole page.

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    text = response.content.decode('gbk', errors='ignore')
    return etree.HTML(text)


def _first(items, default=""):
    """Return the first element of *items*, or *default* when it is empty."""
    return items[0] if items else default


def _collect_until(lines, stop_prefix):
    """Return the non-empty *lines* that precede the first line starting with
    *stop_prefix*."""
    head = takewhile(lambda line: not line.startswith(stop_prefix), lines)
    return [line for line in head if line]


def _parse_info_lines(infos):
    """Extract the labelled fields from the text nodes of the info area.

    Args:
        infos: iterable of strings, one per text node.

    Returns:
        dict mapping field keys to a string, or to a list of strings for the
        multi-line fields ``actors`` and ``profiles``.
    """
    lines = [str(info).strip() for info in infos]
    fields = {}
    for index, line in enumerate(lines):
        for pattern, key in _FIELD_PATTERNS:
            matched = pattern.match(line)
            if matched is None:
                continue
            # Drop only the leading label; the rest of the line is the value.
            value = line[matched.end():].strip()
            stop_prefix = _MULTILINE_STOP.get(key)
            if stop_prefix is None:
                fields[key] = value
            else:
                rest = _collect_until(lines[index + 1:], stop_prefix)
                fields[key] = [item for item in [value] + rest if item]
            break
    return fields


def get_detail_url(url):
    """Return the absolute detail-page URLs linked from the list page *url*.

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    html = _fetch_html(url)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin handles root-relative, relative and already-absolute hrefs.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]


def parse_detail_page(url):
    """Fetch one movie detail page and return its data as a dict.

    ``title``, ``cover`` and ``download_url`` are always present (empty
    string when the page lacks them); the remaining keys appear only when the
    corresponding label is found on the page.

    Raises:
        requests.RequestException: on connection problems or timeout.
    """
    html = _fetch_html(url)
    movie = {
        'title': _first(html.xpath(
            "//div[@class='title_all']//font[@color='#07519a']/text()")),
        'cover': "",
    }

    zoom = _first(html.xpath("//div[@id='Zoom']"), default=None)
    if zoom is not None:
        # Only the first image is kept; not every movie has a screenshot.
        movie['cover'] = _first(zoom.xpath(".//img/@src"))
        movie.update(_parse_info_lines(zoom.xpath(".//text()")))

    movie['download_url'] = _first(
        html.xpath("//td[@bgcolor='#fdfddf']/a/@href"))
    return movie


movies = []


def spider(start_page=1, end_page=1):
    """Crawl list pages *start_page*..*end_page* (inclusive) into ``movies``.

    Pages that cannot be downloaded are reported on stderr and skipped so a
    single network failure does not abort the crawl.
    """
    base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    for page in range(start_page, end_page + 1):
        list_url = base_url.format(page)
        try:
            detail_urls = get_detail_url(list_url)
        except requests.RequestException as err:
            print("skip list page {}: {}".format(list_url, err),
                  file=sys.stderr)
            continue
        for detail_url in detail_urls:
            try:
                movie = parse_detail_page(detail_url)
            except requests.RequestException as err:
                print("skip detail page {}: {}".format(detail_url, err),
                      file=sys.stderr)
                continue
            movies.append(movie)


def write_movies(movie_list, path='movies.txt'):
    """Append every movie in *movie_list* to the UTF-8 text file *path*."""
    with open(path, 'a', encoding='utf-8') as f:
        for movie in movie_list:
            f.write("=" * 30)
            f.write('\n' * 2)
            for key, value in movie.items():
                if key in ('actors', 'profiles'):
                    f.write('{} :{}'.format(key, value))
                else:
                    f.write('{}:{}'.format(key, value))
                f.write('\n')
            f.write('\n' * 3)


if __name__ == '__main__':
    spider()
    write_movies(movies)
效果如图所示: