爬虫

下载安装浏览器内核:
  from requests_html import HTMLSession

  # One session is enough for the whole demo.
  session = HTMLSession()

  url = "https://www.baidu.com"

  # Fix: actually use the `url` variable instead of repeating the literal.
  # render() executes the page's JavaScript (downloads Chromium on first call).
  r = session.request(method="get", url=url)
  r.html.render()

爬取图片案例:
import os
"""
1.先获取需要爬取的url[生成器]
2.找到详情页的url
3.找到对应的标题
4.保存
"""

from requests_html import HTMLSession

class Spider():
    """Scrape image items from xiaohuar.com index pages.

    Workflow: walk the index pages, extract each item's image URL and
    title, then download every image into the local "好看" directory.
    """

    def __init__(self):
        # One shared HTTP session for all requests (connection reuse).
        self.session = HTMLSession()

    def get_url(self):
        """Yield the index-page URLs to crawl (pages 1 through 3).

        Page 1 lives at the bare directory URL; later pages follow the
        index_{n}.html naming scheme.  Fix: the original also yielded
        index_1.html for i == 1, which duplicates (or 404s for) page 1.
        """
        yield "http://www.xiaohuar.com/meinv/"
        for page in range(2, 4):
            yield "http://www.xiaohuar.com/meinv/index_{}.html".format(page)

    def get_img_info(self, index_url):
        """Yield (image_src_url, title) for every '.items' element on the page.

        :param index_url: URL of one index page produced by get_url().
        """
        r = self.session.get(url=index_url)
        for element in r.html.find('.items'):
            src_url: str = element.find('img', first=True).attrs.get("src")
            # The site sometimes uses relative image paths; make them absolute.
            if not src_url.startswith("http"):
                src_url = "http://www.xiaohuar.com" + src_url
            title_name = element.find('.p_title', first=True).text
            yield src_url, title_name

    def save(self, src_url, title_name):
        """Download one image and write it to '好看/<sanitized title>.jpg'.

        :param src_url: absolute URL of the image.
        :param title_name: raw title text used to build the file name.
        """
        r = self.session.get(url=src_url)
        # Strip characters that are illegal in file names.
        title = title_name.replace("\\", '').replace("/", '').replace('|', '')
        # Fix: create the target directory on demand — the original
        # crashed with FileNotFoundError when "好看" did not exist.
        os.makedirs("好看", exist_ok=True)
        path = os.path.join("好看", title + ".jpg")
        with open(path, 'wb') as f:
            f.write(r.content)
            print('%s下载完成'%(title))

    def run(self):
        """Crawl every index page and save every image found on it."""
        for index_url in self.get_url():
            for src_url, title_name in self.get_img_info(index_url):
                self.save(src_url, title_name)


if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    Spider().run()
View Code

 






posted @ 2019-09-10 15:28  胖啊  阅读(124)  评论(0编辑  收藏  举报