【Python Learning】Web Scraper Source Code
1. Standing on the shoulders of giants: this was pieced together from resources found online.
2. It uses a few common packages, such as requests and re.
3. Note: use os.makedirs to create nested (multi-level) folders, and os.mkdir to create a single folder (see the short sketch below).
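For point 3, a minimal sketch of the difference between the two calls (the folder names here are made up for illustration):

import os

# os.mkdir creates exactly one folder level; create the parent once if it is missing
if not os.path.exists("pics"):
    os.mkdir("pics")

# os.makedirs creates every missing level of a nested path;
# exist_ok=True keeps it from raising an error when the folders already exist
os.makedirs("pics/2021/april", exist_ok=True)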
# Import packages
# requests -- fetch web pages
import requests
# re -- parse pages with regular expressions
import re
# time -- timestamps and polite delays between requests
import time

import os

# Request headers: identify the client to the server
headers = {
    "user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36 Edg/89.0.774.68'
}

############################################ Program entry ############################################
# Number of index pages to poll
polling_page_num = int(input("Enter the number of index pages to poll: "))

############################################ Functions ############################################

# Download the HTML source of a page
def download_page(url, html_encode='utf-8', *args, **kwargs):
    """
    Download the HTML source of a page.
    :param url: link of the page to download
    :param html_encode: page encoding, default "utf-8"
    :param args:
    :param kwargs:
    :return: (HTML text of the page, HTTP status code)
    """
    r = requests.get(url, headers=headers)
    r.encoding = html_encode
    response_status = r.status_code
    return r.text, response_status


# Parse the index page and return the processed list of gallery links
def anasy_main_page(html, *args, **kwargs):
    ex = '<a href="(.*?)" title=".*?"><img alt=".*?" src=".*?"><i>.*?</i></a>'
    # ex = '<a href="(.*?)" title=.*?><img alt=.*? src=.*?><i>.*?</i></a>'
    test_src_list = re.findall(ex, html, re.S)
    new_src_list = test_src_list[1:31]
    li_piclink = []
    for pic_link in new_src_list:
        new_pic_link = '链接' + pic_link  # '链接' is the author's placeholder for the site's base URL
        li_piclink.append(new_pic_link)
    return li_piclink


# Parse a detail page and return the folder name and the image URL
def anasy_Secondary_page(Secondary_html):
    """
    :param Secondary_html: HTML of the detail page
    :return: a tuple of
        dir_name -- folder name
        pic_link -- image link
    """
    ex_link = '<img alt=".*?" src="(.*?)" />'
    ex_name = '<h1>(.*?)</h1>'  # adjust this pattern for the target site
    pic_link = re.findall(ex_link, Secondary_html, re.S)[0]
    dir_name = re.findall(ex_name, Secondary_html, re.S)[0]
    return dir_name, pic_link


# Create a folder
def create_folder(dir_name):
    dir_name = "".join(dir_name.split())  # strip whitespace from the folder name
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
    return dir_name


# Download one image
def down_pic(dir_name, pic_link):
    """
    :param dir_name: folder to save into
    :param pic_link: image URL
    :return:
    """
    img_data = requests.get(url=pic_link, headers=headers).content
    img_name = pic_link.split('/')[-1]
    imgPath = dir_name + '/' + img_name
    with open(imgPath, 'wb') as f:
        f.write(img_data)
    return


# Index-page URL generator
def create_main_url(url_num):
    url_ys = '子链接'  # the author's placeholder for the index page's base URL
    mian_url_list = []
    if url_num > 1:
        start_num = 2
    else:
        start_num = 1

    for url_n in range(start_num, url_num + 1):
        if url_n != 1:
            url = url_ys + 'index_%d.html'
            new_url = url % url_n
        else:
            new_url = url_ys
        mian_url_list.append(new_url)
    return mian_url_list


# Detail-page URL generator
def create_sec_url(url, url_num, *args, **kwargs):
    """
    :param url: first page of the gallery
    :param url_num: number of pages to generate
    :return: list of page URLs
    """
    sec_url_list = []
    for url_n in range(1, url_num + 1):
        if url_n != 1:
            # new_url = url + '_' + str(url_n) + '.html'
            begin = url.find("h")
            end = url.rfind(".")
            find_url = url[begin:end]
            new_url = find_url + '_' + str(url_n) + '.html'
        else:
            new_url = url
        sec_url_list.append(new_url)
    return sec_url_list


# Download-log writer
def create_log(log_content):
    """
    Append an entry to the download log.
    :param log_content: text to write to the log
    :return: None
    """
    with open("log.txt", "a") as file:
        file.write(log_content)
    return


# Page recorder
def page_record(page_num=0, *args, **kwargs):
    with open("page_record.txt", "w+") as file:
        file.write(str(page_num))
    return


# Read the recorded page number
def page_read():
    with open("page_record.txt", "r") as file:
        r_page_num = file.readline()
    return r_page_num


############################################ Crawler work area ############################################

n_yema = int(page_read())
# print(n_yema)
if polling_page_num > 361:
    print("The number you entered is outside the polling range, please run again!")
elif polling_page_num > n_yema:
    end_page_num = polling_page_num
    print("The main program is about to start")

    # Generate the index-page URLs
    mian_url_list_ys = create_main_url(end_page_num)
    mian_url_list = mian_url_list_ys[int(n_yema) - 1:int(end_page_num) + 1]

    for url in mian_url_list:
        n_yema = n_yema + 1
        sec_url_li = anasy_main_page(download_page(url)[0])  # parse the index page to get the list of gallery links
        print(len(sec_url_li), sec_url_li)
        log_mian_start = "*" * 15 + "Page " + str(n_yema) + ", download starting --> " + url + "*" * 15
        print(log_mian_start)  # notice that this index page has started downloading
        n_tao = 0
        for url_sec in sec_url_li[0:31]:
            n_tao = n_tao + 1
            # parse the gallery page to get the gallery name
            dir_name = anasy_Secondary_page(download_page(url_sec, html_encode="utf-8")[0])[0]

            print("*" * 15 + "Page " + str(n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- download starting" + "*" * 15)
            dir_name_sj = create_folder(dir_name)
            sec_url_list = create_sec_url(url_sec, 60)
            m = 0
            for pic_link in sec_url_list:
                m = m + 1
                page_text, response_status_pic = download_page(pic_link)
                if response_status_pic == 200:
                    donw_pic_link = anasy_Secondary_page(page_text)[1]  # parse the page to get the image URL
                    down_pic(dir_name_sj, donw_pic_link)
                    print("Image " + str(m) + " downloaded successfully", donw_pic_link)
                    time.sleep(1)
                else:
                    continue

            print("Page " + str(n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- all images downloaded" + "\n")
            log_text = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + "---" + "Page " + str(
                n_yema) + ", gallery " + str(n_tao) + " -- " + dir_name + " -- finished downloading" + "\n"
            create_log(log_content=log_text)
        log_main_end = "*" * 10 + "Page " + str(n_yema) + ", download finished --> " + url + "*" * 10 + "\n\n"
        print(log_main_end)
        # record the index page number (n_yema) that has finished downloading
        page_record(str(n_yema))
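One practical note on running the script: page_read() opens page_record.txt directly, so that file has to exist before the first run. A one-time seed like the following avoids the initial FileNotFoundError (the value "1" is an assumption here, chosen so that the slice in the main loop, which starts at n_yema - 1, begins with the first generated index page):

# one-time setup before the first run: resume record starts at index page 1
with open("page_record.txt", "w") as f:
    f.write("1")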
Author: Xiao Fei