python爬虫--一次爬取小说的尝试
一次爬取小说的尝试
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""Scrape a web novel chapter by chapter and append it to a local text file.

Walks the book's index page for chapter links, downloads each chapter,
and appends the book name (once) plus every chapter title and body to a
file named after the book.
"""
import requests
from bs4 import BeautifulSoup

# Index page listing all chapters of the novel.
INDEX_URL = 'http://www.zanghaihua.org/nanbudangan/'


def fetch_soup(url):
    """GET *url* and return its HTML parsed into a BeautifulSoup tree.

    Raises requests.RequestException (incl. HTTPError) on network failure
    or a non-2xx response.
    """
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    # The site does not declare its charset reliably; trust the detector,
    # as the original script did via apparent_encoding.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def chapter_links(index_url):
    """Yield the href of every chapter link on the book's index page."""
    soup = fetch_soup(index_url)
    booklist = soup.find(name='div', attrs={'class': 'booklist'})
    if booklist is None:
        # Layout changed or the request was blocked; nothing to yield.
        return
    for span in booklist.find_all('span'):
        a = span.find('a')
        # Some spans (e.g. volume headers) carry no link; skip them.
        if a and a.get('href'):
            yield a.get('href')


def scrape_chapter(chapter_url):
    """Return (book_name, chapter_title, body_text) for one chapter page.

    Raises AttributeError if an expected element is missing from the page.
    """
    soup = fetch_soup(chapter_url)
    book_name = soup.find(name='h1', attrs={'align': 'center'}).text
    chapter_title = soup.find(name='div', attrs={'class': 'chaptertitle'}).text
    # get_text(separator, strip): '\n' joins the fragments that <br/> tags
    # separate, strip=True trims whitespace around each fragment.  The
    # original passed '<br/><br/>' as the *strip* argument, which only
    # worked by accident because any truthy value enables stripping.
    body = soup.find(name='div', attrs={'id': 'BookText'}).get_text('\n', strip=True)
    return book_name, chapter_title, body


def main():
    """Download every chapter and append it to a file named after the book.

    Bug fixed: the original wrote output only when the chapter title
    equalled one hard-coded string (a debugging leftover), so at most one
    chapter was ever saved.  Now every chapter is written; the book name
    is written once, before the first chapter.
    """
    wrote_header = False
    for url in chapter_links(INDEX_URL):
        try:
            book_name, chapter_title, body = scrape_chapter(url)
        except (requests.RequestException, AttributeError) as exc:
            # Best-effort: skip chapters whose page is unreachable or laid
            # out unexpectedly instead of aborting the whole download.
            print('skipping %s: %s' % (url, exc))
            continue
        # Text mode with an explicit encoding replaces the original's
        # binary-append ('ab+') plus manual .encode('utf-8') dance.
        # The output filename (the bare book name) is kept unchanged.
        with open(book_name, 'a', encoding='utf-8') as f:
            if not wrote_header:
                f.write(book_name + '\n\n')
                wrote_header = True
            f.write(chapter_title + '\n')
            f.write(body + '\n\n')


if __name__ == '__main__':
    main()