https://www.cnblogs.com/longhai3/longhai

PYTHON>>爬虫爬取小说

Posted on 2022-02-12 23:51  凡是过去,皆为序曲  阅读(65)  评论(0)  编辑  收藏  举报
 1 import requests
 2 from bs4 import BeautifulSoup
 3 import re
 4 import time
 5 
 6 # https://wwcom/28_28714/19953985.html
 7 # https://wwwrg/83_83488/28981145.html
 8 
 9 def URL00(a):
10     url = 'https://wwworg/83_83488/'+ str(a) +'.html'
11     return url
12 
def _select_text00(soup, selector):
    """Return the concatenated text of every element matching *selector*.

    Yields an empty string when nothing matches, so a page with an
    unexpected layout produces an empty field instead of an exception.
    """
    return ''.join(node.get_text() for node in soup.select(selector))


def DOWN00(a):
    """Download one chapter page and return it as formatted plain text.

    Args:
        a: Chapter page id, passed to ``URL00`` to build the page URL.

    Returns:
        A string made of the chapter heading, a blank line, the chapter
        body, a line of 40 ``=`` characters and trailing blank lines.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not answer within the timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(URL00(a), timeout=30)
    response.encoding = "UTF-8"
    soup = BeautifulSoup(response.text, 'lxml')

    # Selectors come from the browser dev tools ("Copy" -> "Copy selector").
    panel = '#read > div.container > div:nth-child(3) > div > div.panel.panel-default'
    # Title / chapter heading.
    heading = _select_text00(soup, panel + ' > div.panel-heading')
    # Chapter body; get_text() drops the <br/> tags and any other markup.
    body = _select_text00(soup, panel + ' > div.panel-body.content-body.content-ext')

    return heading + "\n\n" + body + "\n" + "=" * 40 + "\n\n\n"
35 
def SAVE00(data0, path="TXT0XZ.txt"):
    """Append text to the output file, creating the file if needed.

    Args:
        data0: Text to append.
        path: Destination file; defaults to ``TXT0XZ.txt`` in the current
            working directory.

    Raises:
        OSError: If the file cannot be opened or written. The error is
            propagated rather than retried in ``'w'`` mode, because a
            retry would truncate the chapters that were already saved.
    """
    # Append mode creates a missing file itself, and the context manager
    # closes the handle even when the write fails.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data0)
45 
def JINDU00(n, delay=2):
    """Redraw a 100-character progress bar on the current line, then pause.

    Args:
        n: Progress in percent. It is truncated to an integer and clamped
            to the range 0..100 so the bar is always 100 characters wide.
        delay: Seconds to sleep after drawing; values <= 0 skip the pause.
    """
    filled = max(0, min(100, int(n)))
    # No newline is written, so flush explicitly or a buffered stdout
    # would not show the bar until much later.
    print('\r' + '#' * filled + '=' * (100 - filled), end="", flush=True)
    if delay > 0:
        time.sleep(delay)
50 
if __name__ == "__main__":
    # Chapter pages are fetched as one consecutive id run:
    # FIRST_ID inclusive, END_ID exclusive.
    FIRST_ID = 28981145
    END_ID = 28981337
    TOTAL = END_ID - FIRST_ID

    for done, i in enumerate(range(FIRST_ID, END_ID), start=1):
        TXT0 = DOWN00(i)
        SAVE00(TXT0)
        # Report after the chapter is saved so the bar finishes at 100%.
        # JINDU00 also sleeps, which spaces out successive requests.
        JINDU00(done / TOTAL * 100)
    print("\n完成!")

 

随心,随记

https://www.cnblogs.com/w1hg/331817