爬取先知论坛的文章(含图片)
为了一些线下ctf比赛无法联网的情况,特准备了此脚本爬取先知论坛的文章(含图片)。
新建xianzhi文件夹,包含如下2个子目录(picture、t)和1个txt文档(exist_tid.txt):
-rw-r--r--@ 1 shellyzhang staff 0 Sep 11 16:30 exist_tid.txt
drwxr-xr-x 3 shellyzhang staff 96 Sep 11 16:30 picture
drwxr-xr-x 3 shellyzhang staff 96 Sep 11 16:30 t
-rw-r--r-- 1 shellyzhang staff 1993 Sep 15 15:30 xianzhi_sprider.py
再在此文件夹下新建脚本xianzhi_sprider.py,内容为:
"""Mirror Xianzhi forum articles (HTML plus images) for offline reading.

Run from a working directory that holds ``exist_tid.txt`` and the ``picture``
and ``t`` sub-directories; missing directories are created on demand.
"""
import os
import re

import requests

# Remote prefix under which article images are stored, and the path that
# replaces it in saved pages (relative to the article files inside ./t/).
OLD_PIC_HOST = 'https://xzfile.aliyuncs.com/media/upload/picture'
NEW_PIC_HOST = '../picture'

# Local directory (relative to the working directory) receiving the images.
PICTURE_DIR = 'picture'

# Record of article ids that have already been saved, one id per line.
EXIST_TID_FILE = './exist_tid.txt'

# Seconds to wait on a request; without a timeout a stalled connection would
# block the whole crawl indefinitely.
REQUEST_TIMEOUT = 15

# Captures the src URL of every <img> hosted on xzfile.aliyuncs.com, whether
# or not other attributes precede or follow src.  "\s" before "src" keeps
# attributes such as data-src from matching.
IMG_SRC_RE = re.compile(
    r'<img\b[^>]*?\ssrc="(https://xzfile\.aliyuncs\.com[^"]*)"'
)


def format_pic_url(old_url):
    """Point image URLs at the local copy.

    Replaces every occurrence of the remote picture prefix in *old_url*
    (a single URL or a whole HTML document) with ``../picture``.
    """
    return old_url.replace(OLD_PIC_HOST, NEW_PIC_HOST)


def getXianZhiHTML(tid):
    """Fetch the article with id *tid*.

    Returns the page text, or ``None`` when the request fails or the server
    answers with anything other than HTTP 200.
    """
    url = 'https://xz.aliyun.com/t/{}'.format(tid)
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A single unreachable article must not abort the whole crawl.
        print('failed to fetch {}: {}'.format(url, e))
        return None
    if resp.status_code != 200:
        return None
    return resp.text


def getImgUlr(resp_text):
    """Return the image URLs hosted on xzfile.aliyuncs.com found in *resp_text*.

    URLs are returned in document order; duplicates are kept.
    """
    return IMG_SRC_RE.findall(resp_text)


def _local_img_path(img_url):
    """Map a remote image URL to its file path under the picture directory.

    Returns ``None`` when the URL is not below the remote picture prefix or
    when it would resolve to a location outside the picture directory.
    """
    prefix = OLD_PIC_HOST + '/'
    if not img_url.startswith(prefix):
        return None
    root = os.path.abspath(PICTURE_DIR)
    target = os.path.abspath(os.path.join(root, img_url[len(prefix):]))
    # The URL comes from downloaded HTML, so refuse anything (e.g. "../")
    # that would escape the picture directory.
    if not target.startswith(root + os.sep):
        return None
    return target


def download_img(img_url):
    """Download *img_url* into the local picture directory.

    Best effort: any failure is reported on stdout and the image is skipped.
    """
    local_img = _local_img_path(img_url)
    if local_img is None:
        print('skip image outside the picture host: {}'.format(img_url))
        return
    try:
        response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        print(e)
        return
    if response.status_code != 200:
        # Do not store an error page under an image file name.
        print('skip image {} (HTTP {})'.format(img_url, response.status_code))
        return
    try:
        os.makedirs(os.path.dirname(local_img), exist_ok=True)
        with open(local_img, 'wb') as f:
            f.write(response.content)
    except OSError as e:
        print(e)


def readFile(target_file):
    """Return the lines of the UTF-8 text file *target_file* as a list."""
    with open(target_file, 'r', encoding='utf-8') as f:
        return f.readlines()


def writeFile(target_file, STR, mode='a'):
    """Write *STR* to *target_file* as UTF-8.

    *mode* is passed to ``open``; it defaults to ``'a'`` (append).
    """
    with open(target_file, mode, encoding='utf-8') as f:
        f.write(STR)


def _load_exist_tids(target_file):
    """Return the set of article ids recorded in *target_file*.

    A missing file is treated as an empty record.
    """
    try:
        return {line.strip() for line in readFile(target_file)}
    except FileNotFoundError:
        return set()


def main(tid, exist_tids=None):
    """Save article *tid* and its images unless it was saved before.

    *exist_tids* is an optional set of already-saved ids (as strings).  When
    given it is consulted instead of re-reading the record file and is
    updated in place; when omitted the record file is read on every call.
    """
    if exist_tids is None:
        exist_tids = _load_exist_tids(EXIST_TID_FILE)
    key = str(tid)
    if key in exist_tids:
        print('{} is exist!'.format(tid))
        return

    resp_text = getXianZhiHTML(tid)
    if resp_text is None:
        return

    # Download every image once, even if the page references it repeatedly.
    for img_url in dict.fromkeys(getImgUlr(resp_text)):
        download_img(img_url)

    # Save the article with its image links rewritten to the local copies.
    # Mode 'w' keeps a re-run after a partial failure from appending a second
    # copy of the page to the same file.
    os.makedirs('./t', exist_ok=True)
    t_path = './t/{}.html'.format(tid)
    writeFile(t_path, format_pic_url(resp_text), mode='w')

    # Record the id only after the article has been written.
    writeFile(EXIST_TID_FILE, key + '\n')
    exist_tids.add(key)


if __name__ == '__main__':
    # Load the record once instead of re-reading it for each of the ids.
    saved_tids = _load_exist_tids(EXIST_TID_FILE)
    for tid in range(0, 10000):
        main(tid, saved_tids)