import requests
from bs4 import BeautifulSoup
import re
import time

# Example chapter URLs (as pasted in the original source):
# https://wwcom/28_28714/19953985.html
# https://wwwrg/83_83488/28981145.html

# First and one-past-last chapter ids of the book being scraped.
FIRST_CHAPTER = 28981145
LAST_CHAPTER = 28981337

# All downloaded chapters are appended to this single text file.
OUTPUT_FILE = r"TXT0XZ.txt"


def URL00(a):
    """Build the chapter page URL for chapter id *a*.

    NOTE(review): the host 'wwworg' looks garbled (compare the sample
    URLs above) — confirm the real domain before running.
    """
    return 'https://wwworg/83_83488/' + str(a) + '.html'


def DOWN00(a):
    """Download chapter *a* and return it as formatted plain text.

    The returned string is: heading, blank line, body text, then a
    40-character '=' separator, ready to be appended to the output file.
    """
    # timeout keeps one dead/slow request from hanging the whole run.
    strhtml = requests.get(URL00(a), timeout=30)
    strhtml.encoding = "UTF-8"
    soup = BeautifulSoup(strhtml.text, 'lxml')

    # Chapter body. CSS selector copied from the browser dev tools
    # ("Copy" -> "Copy Selector"); the inner text is then pulled out of
    # the stringified tag list with a non-greedy regex.
    data02 = str(soup.select('#read > div.container > div:nth-child(3) > div > div.panel.panel-default > div.panel-body.content-body.content-ext'))
    data02 = ''.join(re.findall(r'>(.*?)</div>', data02, re.S)) + "\n"

    # Title / chapter heading, extracted the same way.
    data01 = str(soup.select('#read > div.container > div:nth-child(3) > div > div.panel.panel-default > div.panel-heading'))
    data01 = ''.join(re.findall(r'">(.*?)</div>', data01, re.S)) + "\n\n"

    data = data01 + data02 + "=" * 40 + "\n\n"
    data = data.replace('<br/>', '')  # strip leftover HTML line breaks
    return data + "\n"


def SAVE00(data0):
    """Append *data0* to OUTPUT_FILE (created automatically if missing)."""
    # Mode 'a' creates the file when absent, so the original
    # try/except-IOError fallback to mode 'w' was dead code; the
    # context manager also guarantees the handle is closed on error.
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        f.write(data0)


def JINDU00(n):
    """Redraw a 100-character progress bar at *n* percent, then pause.

    The 2-second sleep doubles as a politeness delay between requests.
    """
    n = int(n)
    print('\r' + '#' * n + '=' * (100 - n), end="")
    time.sleep(2)


if __name__ == "__main__":
    for i in range(FIRST_CHAPTER, LAST_CHAPTER):
        n = (i - FIRST_CHAPTER) / (LAST_CHAPTER - FIRST_CHAPTER) * 100
        JINDU00(n)
        TXT0 = DOWN00(i)
        SAVE00(TXT0)
    print("\n完成!")
#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=
本文来自博客园,作者:凡是过去,皆为序曲,转载请注明原文链接:https://www.cnblogs.com/longhai3/p/15887912.html
如有疑问,欢迎提问
#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=#+=