Scraping personal essays from my blog (a practice exercise, to be supplemented)

import requests, lxml
from bs4 import BeautifulSoup
url = 'https://www.cnblogs.com/wjlv/default.html?page=2'  # essay list page to open
html_index = requests.get(url).text  # fetch the page body
soap = BeautifulSoup(html_index, "lxml")
a_list = soap.find_all('a', {"class": "postTitle2"})  # all essay title/URL tags
for h in a_list:
    soap = BeautifulSoup(str(h), 'lxml')
    # print('{}:{}'.format(soap.find('a').text, soap.find('a')['href']))  # title and URL of a single essay
    article = requests.get(soap.find('a')['href']).text  # fetch a single essay
    soap_a = BeautifulSoup(article, 'lxml')
    p_lab = soap_a.find_all('p')  # paragraph tags of the essay body
    for txt in p_lab:
        soap_t = BeautifulSoup(str(txt), 'lxml')
        print(soap_t.find('p').text)  # text of each paragraph tag
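Each pass through the loop above re-parses the tag with BeautifulSoup(str(...)) before reading it. A minimal sketch of the same scrape, assuming the same cnblogs page layout, that reads the link and paragraphs straight off the Tag objects instead:

import requests
from bs4 import BeautifulSoup

url = 'https://www.cnblogs.com/wjlv/default.html?page=2'
soup = BeautifulSoup(requests.get(url).text, 'lxml')
for a in soup.find_all('a', {"class": "postTitle2"}):
    # a Tag already exposes .text and ['href'], so no second parse is needed
    article = BeautifulSoup(requests.get(a['href']).text, 'lxml')
    for p in article.find_all('p'):
        print(p.text)  # paragraph text of one essay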
Next, some practice on a novel from the Shuqi novel site.
There are still quite a few things that could be optimized:
1. Too many repeated content-extraction steps (each tag is parsed more than once)
2. Extraction is fairly slow; multithreading would help (see the sketch after the code below)
3. The novel source is single and hard-coded
import lxml, requests, os
from bs4 import BeautifulSoup

url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
response = requests.get(url).text  # fetch the chapter-list page
soap = BeautifulSoup(response, 'lxml')
title_list = soap.find('div', {"class": "t_list6"})
soap_t = BeautifulSoup(str(title_list), 'lxml')
href_list = soap_t.find_all('a')
for href in href_list:
    soap_h = BeautifulSoup(str(href), 'lxml')
    content_url, content_title = soap_h.find('a', {'class': 'nocur'})['href'], soap_h.find('a').text  # chapter URL and title
    content_response = requests.get(content_url).text
    soap_r = BeautifulSoup(content_response, 'lxml')
    content_p = soap_r.find_all('div', {'id': 'chaptercontent'})  # locate the chapter body by id
    for content in content_p:
        soap_c = BeautifulSoup(str(content), 'lxml')
        # print(soap_c.find('p'))
        if not os.path.exists(r'../books/'):  # create the output directory (check the same path that gets created)
            os.makedirs(r'../books/')
        with open(r'../books/' + content_title + '.txt', 'w+', encoding='utf8') as f:
            f.writelines(soap_c.find('p').text.split("<br/><br/>"))  # write the chapter text
        print('%s has already been downloaded' % content_title)
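On point 2 above, a minimal sketch (not the original script) of how the per-chapter downloads could run in parallel with concurrent.futures; the parsing is simplified and the selectors are assumed to match the same chapter-list and chapter pages:

import os, requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def download_chapter(chapter):
    url, title = chapter
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    body = soup.find('div', {'id': 'chaptercontent'})  # same id as in the script above
    os.makedirs(r'../books/', exist_ok=True)
    with open(r'../books/' + title + '.txt', 'w', encoding='utf8') as f:
        f.write(body.text)
    return title

index = BeautifulSoup(requests.get('http://book.txtbook.com.cn/shu/6478/chapterlist.html').text, 'lxml')
chapters = [(a['href'], a.text) for a in index.find('div', {'class': 't_list6'}).find_all('a')]
with ThreadPoolExecutor(max_workers=8) as pool:  # fetch several chapters at once
    for title in pool.map(download_chapter, chapters):
        print('%s downloaded' % title)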

Scraping a personal blog:

import re, os, requests


def get_url(url, pattern=None):
    # Fetch a page; if a regex pattern is given, return the captured groups.
    try:
        response = requests.get(url)
        section_info = []
        if response.status_code == 200:
            if pattern is not None:
                p_section = re.compile(pattern, re.S)  # e.g. r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
                section_info = re.findall(p_section, response.text)
            return section_info
        else:
            return None
    except Exception:
        return None


def write_content(content, section):
    # Write one post's paragraphs to ../blogs/<title>.txt.
    if not os.path.exists(r'../blogs/'):
        os.makedirs(r'../blogs/')
    try:
        with open(r'../blogs/' + section.strip().replace('.', '_').replace('——', '_').replace(' ', '') + '.txt', 'w+') as f:
            f.write(str(content).strip(']['))
            print('%s downloaded' % section)
    except IOError:
        print('%s failed to download' % section)


def get_content(section_list, pattern):
    # For every (url, title) pair, pull the <p> paragraphs and save them.
    for section in section_list:
        content_list = get_url(section[0], pattern)
        if not content_list:  # skip posts whose request or match failed
            continue
        content = [p.strip() for p in content_list]
        write_content(content, section[1])


if __name__ == '__main__':
    url = 'https://www.cnblogs.com/wjlv/'
    section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
    get_content(section_list, r'<p>(.*?)</p>')
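The main block above only pulls the first index page. A small sketch, reusing get_url and get_content from the script above and assuming the index paginates with the same ?page=N parameter seen in the first example, of how several pages could be walked:

if __name__ == '__main__':
    base = 'https://www.cnblogs.com/wjlv/default.html?page=%d'
    title_pattern = r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
    for page in range(1, 4):  # first three index pages; adjust as needed
        section_list = get_url(base % page, title_pattern)
        if section_list:
            get_content(section_list, r'<p>(.*?)</p>')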

posted @ 2019-08-26 16:57  唐大侠的小迷弟