爬虫11-lxml爬取复杂网页,电影天堂
import sys
from urllib.parse import urljoin

import requests
from lxml import etree

# Site root; links found on listing pages are resolved against it.
url_domain = "https://www.dytt8.net"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# Seconds to wait for the server before giving up on a request. Without a
# timeout, requests waits indefinitely and one stalled connection hangs
# the whole crawl.
REQUEST_TIMEOUT = 10

# (line prefix on the detail page, key stored in the movie dict)
_FIELD_LABELS = (
    ("◎年 代", "year"),
    ("◎产 地", "country"),
    ("◎类 别", "category"),
    ("◎上映日期", "date"),
    ("◎片 长", "time"),
    ("◎豆瓣评分", "score"),
    ("◎导 演", "director"),
)

# Prefix of the cast line; the cast continues on the following text nodes.
_ACTORS_LABEL = "◎主 演"


def _fetch_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response body is decoded as GBK, dropping undecodable bytes.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    text = response.content.decode("gbk", "ignore")
    return etree.HTML(text)


def get_detail_urls(url):
    """Return the absolute detail-page URLs linked from listing page *url*.

    Returns:
        A list of URL strings, one per matching link in the listing tables.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    html = _fetch_tree(url)
    hrefs = html.xpath("//table[@class='tbspan']//a[2]/@href")
    # urljoin yields the same result as plain concatenation for the
    # site-relative "/html/..." links, and also copes with hrefs that are
    # already absolute or lack the leading slash.
    return [urljoin(url_domain, href) for href in hrefs]


def parse_info(info, rule):
    """Remove the label *rule* from *info* and strip surrounding whitespace."""
    return info.replace(rule, "").strip()


def _collect_actors(infos, index):
    """Return the cast list that starts at ``infos[index]``.

    The first name shares a text node with the cast label; the remaining
    names are the following text nodes, up to the next node that starts
    with "◎". Whitespace-only nodes are skipped.
    """
    actors = [parse_info(infos[index], _ACTORS_LABEL)]
    for raw in infos[index + 1:]:
        actor = raw.strip()
        if actor.startswith("◎"):
            break
        if actor:
            actors.append(actor)
    return actors


def parse_detail_url(url="https://www.dytt8.net/html/gndy/dyzz/20200306/59787.html"):
    """Scrape one movie detail page into a dict.

    Returns:
        A dict with 'title' plus whichever of 'year', 'country', 'category',
        'date', 'time', 'score', 'director' and 'actors' (a list) were found
        on the page.

    Raises:
        requests.RequestException: on network failure or timeout.
        IndexError: if the page has no title element or no ``div#Zoom``.
    """
    movie = {}
    html = _fetch_tree(url)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']//text()")[0]
    movie['title'] = title
    zoom = html.xpath("//div[@id='Zoom']")[0]
    # The leading "." keeps the query relative to the Zoom element; a bare
    # "//text()" is an absolute path and would return every text node in
    # the whole document.
    infos = zoom.xpath(".//text()")
    for index, info in enumerate(infos):
        if info.startswith(_ACTORS_LABEL):
            movie['actors'] = _collect_actors(infos, index)
            continue
        for label, key in _FIELD_LABELS:
            if info.startswith(label):
                movie[key] = parse_info(info, label)
                break
    return movie


def spider(first_page=1, last_page=7):
    """Crawl listing pages *first_page*..*last_page* (inclusive).

    Each scraped movie dict is printed as soon as it is parsed. Pages that
    fail to download, or that lack the expected title/Zoom elements, are
    reported on stderr and skipped so one bad page does not end the crawl.

    Returns:
        The list of scraped movie dicts.
    """
    base_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for page in range(first_page, last_page + 1):
        list_url = base_url.format(page)
        try:
            detail_urls = get_detail_urls(list_url)
        except requests.RequestException as exc:
            print("skipping listing page {}: {}".format(list_url, exc), file=sys.stderr)
            continue
        for detail_url in detail_urls:
            try:
                movie = parse_detail_url(detail_url)
            except (requests.RequestException, IndexError) as exc:
                print("skipping detail page {}: {}".format(detail_url, exc), file=sys.stderr)
                continue
            movies.append(movie)
            print(movie)
    return movies


if __name__ == '__main__':
    spider()