Crawler Example
# Crawl the images from the 糗图 (pics) section of qiushibaike.com
import os
import re
import urllib.request

# A common browser User-Agent so the site does not reject the request as a bot
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}


def handler_request(url, page):
    # Build the request for one listing page, e.g. .../pic/page/3/
    url = url + str(page) + "/"
    request = urllib.request.Request(url, headers=HEADERS)
    return request


def download_image(page, html):
    # Pull every image source out of the listing page with a regular expression
    pattern = re.compile(r'<img src="(.*?)" alt=".*?" />')
    src_list = pattern.findall(html)

    # Save everything into a "糗图" directory under the current working directory
    dirs = os.path.join(os.getcwd(), "糗图")
    if not os.path.exists(dirs):
        os.makedirs(dirs)

    for i, src in enumerate(src_list):
        # The captured URLs are protocol-relative ("//..."), so add the scheme
        src = "https:" + src
        file_name = os.path.join(dirs, "page%s_%s.jpg" % (page, i))
        print("Image page%s_%s.jpg: download started..." % (page, i))
        try:
            request = urllib.request.Request(src, headers=HEADERS)
            image = urllib.request.urlopen(request).read()
        except Exception as e:
            print("Image page%s_%s.jpg failed to download: %s" % (page, i, e))
            continue
        with open(file_name, "wb") as f:
            f.write(image)
        print("Image page%s_%s.jpg downloaded" % (page, i))


if __name__ == '__main__':
    url = "https://www.qiushibaike.com/pic/page/"
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(start_page, end_page + 1):
        print("Downloading page %s..." % page)
        request = handler_request(url, page)
        content = urllib.request.urlopen(request).read().decode()
        download_image(page, content)
        print("Page %s finished" % page)
        print()
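To see what the extraction step actually captures, the regular expression can be exercised on its own before running the full script. The sketch below is illustrative only: the sample_html snippet, its markup, and its URLs are made up for the example and are not taken from the real page source; only the pattern itself is the one used in download_image() above.

# Minimal sketch: run the same <img> regex against a hand-written HTML snippet
# (the markup and URLs below are illustrative, not the real page source).
import re

sample_html = (
    '<div class="thumb">'
    '<img src="//pic.qiushibaike.com/system/pictures/1.jpg" alt="demo one" />'
    '</div>'
    '<div class="thumb">'
    '<img src="//pic.qiushibaike.com/system/pictures/2.jpg" alt="demo two" />'
    '</div>'
)

pattern = re.compile(r'<img src="(.*?)" alt=".*?" />')
for src in pattern.findall(sample_html):
    # The captured src values are protocol-relative, which is why
    # download_image() prepends "https:" before requesting each image.
    print("https:" + src)

Because each captured src starts with "//", the scheme must be added before the URL can be opened, matching the src = "https:" + src line in the script above.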