爬虫
下载安装浏览器内核（首次调用 render() 时会自动下载 Chromium 内核）:
from requests_html import HTMLSession

# One session object is enough for the whole demo.
session = HTMLSession()
url = "https://www.baidu.com"
# FIX: use the `url` variable instead of repeating the literal
# (the original defined `url` but never used it).
r = session.request(method="get", url=url)
# Execute the page's JavaScript in a headless browser. On first use this
# downloads a Chromium kernel — that is the "install" step of this note.
r.html.render()
爬取图片案例:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import os """ 1.先获取需要爬取的url[生成器] 2.找到详情页的url 3.找到对应的标题 4.保存 """ from requests_html import HTMLSession class Spider(): def __init__(self): self.session = HTMLSession() def get_url(self): for i in range(1, 4): if i == 1: yield "http://www.xiaohuar.com/meinv/" yield "http://www.xiaohuar.com/meinv/index_{}.html".format(i) def get_img_info(self, index_url): r = self.session.get(url=index_url) for element in r.html.find('.items'): src_url:str = element.find('img', first=True).attrs.get("src") if not src_url.startswith("http"): src_url = "http://www.xiaohuar.com" + src_url title_name = element.find('.p_title', first=True).text yield src_url, title_name # print(src_url, title_name) def save(self, src_url, title_name): r = self.session.get(url=src_url) title = title_name.replace("\\", '').replace("/", '').replace('|', '') path = os.path.join("好看", title + ".jpg") with open(path, 'wb') as f: f.write(r.content) print('%s下载完成'%(title)) def run(self): for index_url in self.get_url(): for src_url, title_name in self.get_img_info(index_url): self.save(src_url, title_name) if __name__ == '__main__': spider = Spider() spider.run()