Python 爬虫——一次爬取小说的尝试

 一次爬取小说的尝试

 #!/usr/bin/python
# -*- coding:utf-8 -*-
"""Scrape every chapter linked from a novel's index page into one text file.

The script downloads the index page at ``url``, looks inside the
``<div class="booklist">`` element for ``<span>`` entries that contain a
link, downloads each linked chapter page, and appends the chapter title and
chapter text to a file in the current working directory named after the
book title found on the chapter page.
"""
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = 'http://www.zanghaihua.org/nanbudangan/'

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10


def _fetch_soup(page_url):
    """Download ``page_url`` and return it parsed as a BeautifulSoup tree.

    Raises:
        requests.RequestException: on network errors, timeouts, or a
            4xx/5xx response.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # guessed from the HTTP headers.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def _require(node, description, page_url):
    """Return ``node``, or raise ``ValueError`` if the lookup found nothing."""
    if node is None:
        raise ValueError('%s not found on %s' % (description, page_url))
    return node


index_page = _fetch_soup(url)
book_list = _require(
    index_page.find(name='div', attrs={'class': 'booklist'}),
    'chapter list (div.booklist)',
    url,
)

for span in book_list.find_all('span'):
    link = span.find('a')
    href = link.get('href') if link else None
    if not href:
        # Entries without a usable link cannot be followed.
        continue

    # Resolve relative links against the index page; absolute links are
    # returned unchanged.
    chapter_url = urljoin(url, href)
    chapter_page = _fetch_soup(chapter_url)

    book_name = _require(
        chapter_page.find(name='h1', attrs={'align': 'center'}),
        'book name (h1[align=center])',
        chapter_url,
    ).text.strip()
    chapter_title = _require(
        chapter_page.find(name='div', attrs={'class': 'chaptertitle'}),
        'chapter title (div.chaptertitle)',
        chapter_url,
    ).text.strip()
    # get_text(separator, strip): join the text fragments with '\n' and trim
    # the whitespace around each one. The tags themselves (including <br/>)
    # never appear in the output, so nothing needs to be "replaced".
    chapter_text = _require(
        chapter_page.find(name='div', attrs={'id': 'BookText'}),
        'chapter text (div#BookText)',
        chapter_url,
    ).get_text('\n', strip=True)

    # Append mode, so each chapter is added after the previous ones.
    with open(book_name, 'ab') as f:
        parts = []
        if f.tell() == 0:
            # Empty file: this is the first chapter written, so start the
            # file with the book name.
            parts.append(book_name)
        parts.append(chapter_title)
        parts.append(chapter_text)
        # Separate the pieces with newlines and leave a blank line after
        # each chapter so consecutive chapters do not run together.
        f.write(('\n'.join(parts) + '\n\n').encode('utf-8'))

 

posted @ 2019-06-27 18:04  莫使娇躯空对月  阅读(423)  评论(0)  编辑  收藏  举报