爬虫批量自动下载小说
下载排行榜的所有小说
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: ss
"""Batch-download novels from the xxbiquge ranking page.

Walks the site's ranking list, fetches each novel's chapter index, and
appends every chapter's text to ``books/<novel title>.txt``.
"""

import os
import re
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.xxbiquge.com'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}


def _decode(resp):
    """Decode a response body as UTF-8.

    The site serves UTF-8 but mislabels the charset, so ``resp.text``
    would be mojibake; decode the raw bytes directly instead of the
    original encode('ISO-8859-1').decode('utf-8') round-trip.
    """
    return resp.content.decode('utf-8', errors='replace')


def _clean_text(raw):
    """Strip layout whitespace, NBSPs and the ``readx();`` ad snippet.

    Bug fix: the original tested each character for membership in the
    literal string ``"' \\n','\\xa0','readx();'"``, which also deleted
    the letters r/e/a/d/x, quotes, commas, parens and semicolons.
    """
    return re.sub(r'readx\(\);|[\s\xa0]', '', raw)


def _safe_filename(name):
    """Replace characters that are invalid in Windows file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def get_text(url, title1):
    """Download one chapter page and append it to books/<title1>.txt.

    Parameters
    ----------
    url : str
        Absolute URL of the chapter page.
    title1 : str
        Novel title; used as the output file name.
    """
    resp = requests.get(url, headers=headers)
    time.sleep(0.5)  # throttle so we don't hammer the server
    soup = BeautifulSoup(_decode(resp), 'lxml')
    body = soup.select('div.content_read > div > div#content')
    heading = soup.select('div.content_read > div > div.bookname > h1')
    if not body or not heading:
        # Malformed page: skip this chapter instead of crashing the run.
        print('跳过（页面结构异常）: {}'.format(url))
        return
    text = _clean_text(body[0].text)
    title2 = heading[0].text
    path = os.path.join('books', _safe_filename(title1) + '.txt')
    with open(path, 'ab+') as f:
        # Write the chapter heading (the original wrote the book title
        # here while printing the chapter title — clearly unintended).
        f.write((title2 + '\r\n').encode('utf-8'))
        f.write(text.encode('utf-8'))
        f.write('\r\n\r\n'.encode('utf-8'))
    print('正在下载{}'.format(title2))


def get_one_links(url):
    """Fetch a novel's chapter index page and download every chapter.

    Parameters
    ----------
    url : str
        Absolute URL of the novel's index page (e.g. ``.../0_36/``).
    """
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(_decode(resp), 'lxml')
    info = soup.select('div#maininfo > div#info > h1')
    if not info:
        print('跳过（找不到书名）: {}'.format(url))
        return
    title = info[0].text
    print('开始下载{}'.format(title))
    for entry in soup.select('div#list > dl > dd'):
        for link in entry.select('a'):
            get_text(BASE_URL + link.get('href'), title)


def get_all():
    """Walk the ranking page and download every novel listed on it."""
    resp = requests.get(BASE_URL + '/xbqgph.html', headers=headers)
    time.sleep(0.5)
    soup = BeautifulSoup(_decode(resp), 'lxml')
    for item in soup.select('div.novelslist2 > ul > li'):
        for link in item.select('span.s2 > a'):
            # Bug fix: the original used data[0].get('href') inside the
            # loop, always following the FIRST link of each entry.
            get_one_links(BASE_URL + link.get('href'))


if __name__ == '__main__':
    # Portable replacement for the os.path.exists/mkdir pair; the
    # __main__ guard stops the crawl from starting on mere import.
    os.makedirs('books', exist_ok=True)
    get_all()