Batch-downloading novels with a crawler

Download every novel on the site's ranking page. The script is three nested steps: get_all() scrapes the ranking page for novel links, get_one_links() scrapes each novel's title and chapter index, and get_text() fetches each chapter and appends it to a per-novel .txt file under books/.

 

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss

from bs4 import BeautifulSoup
import requests
import time
import os

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}

def get_text(url, title1):
    # e.g. url = 'https://www.xxbiquge.com/0_36/8840634.html' (one chapter)
    data = requests.get(url, headers=headers)
    time.sleep(0.5)  # throttle so we don't hammer the site
    # The server mislabels its UTF-8 pages, so requests decodes them as
    # ISO-8859-1; round-trip the bytes to recover the real UTF-8 text.
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'), 'lxml')
    text = soup.select('div.content_read > div > div#content')[0].text
    title2 = soup.select('div.content_read > div > div.bookname > h1')[0].text
    # Strip the junk the page injects into the chapter body. (The original
    # tested single characters against one long string, which also deleted
    # every 'r', 'e', 'a', 'd', 'x' in the text; replace substrings instead.)
    for junk in ('\n', '\xa0', 'readx();'):
        text = text.replace(junk, '')
    # Append the chapter to the novel's file, headed by the chapter title
    # (the original wrote the book title before every chapter).
    with open(os.path.join('books', title1 + '.txt'), 'ab+') as f:
        f.write((title2 + '\r\n').encode())
        f.write(text.encode())
        f.write('\r\n\r\n'.encode())
    print('Downloading {}'.format(title2))

def get_one_links(url):
    # e.g. url = 'https://www.xxbiquge.com/0_36/' (a novel's chapter index)
    data = requests.get(url, headers=headers)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'), 'lxml')
    links = soup.select('div#list > dl > dd')
    title = soup.select('div#maininfo > div#info > h1')[0].text
    print('Start downloading {}'.format(title))
    for i in links:
        for m in i.select('a'):
            chapter_url = 'https://www.xxbiquge.com' + m.get('href')
            get_text(chapter_url, title)

def get_all():
    # The ranking page that lists every novel to download
    url = 'https://www.xxbiquge.com/xbqgph.html'
    data = requests.get(url, headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(data.text.encode('ISO-8859-1').decode('utf-8'), 'lxml')
    links = soup.select('div.novelslist2 > ul > li')
    for i in links:
        for m in i.select('span.s2 > a'):
            # Bug fix: the original used data[0] here, so every iteration
            # followed the first link in the list instead of m.
            book_url = 'https://www.xxbiquge.com' + m.get('href')
            get_one_links(book_url)

if __name__ == '__main__':
    if not os.path.exists('books'):
        os.mkdir('books')
    get_all()
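
If you only want one novel rather than the whole ranking list, you can call get_one_links directly with that novel's chapter-index URL. A minimal sketch, reusing the /0_36/ example ID from the commented URLs above (substitute the book you actually want):

    if not os.path.exists('books'):
        os.mkdir('books')
    # '0_36' is just the example book ID from the comments above
    get_one_links('https://www.xxbiquge.com/0_36/')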

 
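The script fires one request per chapter with no error handling, so a single timeout kills an entire run. A minimal hardening sketch, assuming you swap the module-level requests.get calls for a shared session; the session name, retry counts, and timeout below are my own choices, not part of the original script:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    session = requests.Session()
    session.headers.update(headers)  # reuse the User-Agent defined above
    # Retry transient failures with exponential backoff before giving up
    retries = Retry(total=3, backoff_factor=1,
                    status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))

    # Then replace each requests.get(url, headers=headers) with:
    #     session.get(url, timeout=10)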

posted @ 2018-05-14 16:11  爬虫爬一个

Life is about tinkering, about learning, about progress.