Scraping my personal essay posts: just practice, to be expanded later.
import requests, lxml
from bs4 import BeautifulSoup

url = 'https://www.cnblogs.com/wjlv/default.html?page=2'  # listing page of essays
html_index = requests.get(url).text  # fetch the listing page
soap = BeautifulSoup(html_index, "lxml")
a_list = soap.find_all('a', {"class": "postTitle2"})  # all essay title/link tags
for h in a_list:
    soap = BeautifulSoup(str(h), 'lxml')
    # print('{}:{}'.format(soap.find('a').text, soap.find('a')['href']))  # title and URL of a single essay
    article = requests.get(soap.find('a')['href']).text  # fetch the body of a single essay
    soap_a = BeautifulSoup(article, 'lxml')
    p_lab = soap_a.find_all('p')  # paragraph tags of the essay body
    for txt in p_lab:
        soap_t = BeautifulSoup(str(txt), 'lxml')
        print(soap_t.find('p').text)  # print the text of every paragraph
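Re-wrapping every tag in a fresh BeautifulSoup is not strictly necessary; as a minimal sketch (same URL and class name as above, assuming the same page structure), the tag object itself already exposes .text and ['href']:

import requests
from bs4 import BeautifulSoup

url = 'https://www.cnblogs.com/wjlv/default.html?page=2'
soup = BeautifulSoup(requests.get(url).text, 'lxml')
for a in soup.find_all('a', {"class": "postTitle2"}):
    title, link = a.text.strip(), a['href']   # read straight off the tag, no re-parsing
    article = BeautifulSoup(requests.get(link).text, 'lxml')
    for p in article.find_all('p'):
        print(p.text)                         # paragraph text of each essay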
Next, practicing on a novel from the Shuqi novel site:
There are still quite a few things that need optimizing:
1. Too many repeated content-extraction steps
2. Extraction is fairly slow; multithreading would help (see the sketch after this list)
3. The novel being fetched is single and hard-coded
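A minimal multithreading sketch for item 2, using concurrent.futures; the fetch_chapter helper and the placeholder chapter_list are assumptions for illustration, not part of the original script:

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_chapter(item):
    # item is a (url, title) pair, collected the same way as in the chapter-list loop below
    url, title = item
    return title, requests.get(url).text

# placeholder list; in practice this would be built from the chapter-list page
chapter_list = [('http://book.txtbook.com.cn/shu/6478/chapterlist.html', 'chapterlist')]

with ThreadPoolExecutor(max_workers=8) as pool:
    for title, html in pool.map(fetch_chapter, chapter_list):
        print('%s fetched, %d characters' % (title, len(html)))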
import lxml, requests, os
from bs4 import BeautifulSoup

url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
response = requests.get(url).text  # request the chapter-list page
soap = BeautifulSoup(response, 'lxml')
title_list = soap.find('div', {"class": "t_list6"})
soap_t = BeautifulSoup(str(title_list), 'lxml')
href_list = soap_t.find_all('a')
for href in href_list:
    soap_h = BeautifulSoup(str(href), 'lxml')
    content_url, content_title = soap_h.find('a', {'class': 'nocur'})['href'], soap_h.find('a').text  # chapter URL and title
    content_response = requests.get(content_url).text
    soap_r = BeautifulSoup(content_response, 'lxml')
    content_p = soap_r.find_all('div', {'id': 'chaptercontent'})  # locate the chapter body element
    for content in content_p:
        soap_c = BeautifulSoup(str(content), 'lxml')
        # print(soap_c.find('p'))
        if not os.path.exists(r'../books/'):  # create the output directory if needed
            os.makedirs(r'../books/')
        with open(r'../books/' + content_title + '.txt', 'w+', encoding='utf8') as f:
            f.writelines(soap_c.find('p').text.split("<br/><br/>"))  # write out the chapter text
        print('%s has already been downloaded' % content_title)
Scraping my personal blog:
import re, os, requests


def get_url(url, pattern=None):
    try:
        response = requests.get(url)
        section_info = []
        if response.status_code == 200:
            if pattern is not None:
                p_section = re.compile(pattern, re.S)  # e.g. r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
                section_info = re.findall(p_section, response.text)
            return section_info
        else:
            return None
    except Exception as e:
        return e


def write_content(content, section):
    if not os.path.exists(r'../blogs/'):
        os.makedirs(r'../blogs/')
    try:
        # sanitize the post title so it can be used as a file name
        with open(r'../blogs/' + section.strip().replace('.', '_').replace('——', '_').replace(' ', '') + '.txt', 'w+') as f:
            f.write(str(content).strip(']['))
        print('%s downloaded...' % section)
    except IOError:
        print('%s download failed...' % section)


def get_content(section_list, pattern):
    for section in section_list:
        content_list = get_url(section[0], pattern)  # section is an (url, title) pair
        content = []
        for p in content_list:
            p = p.strip()
            content.append(p)
        write_content(content, section[1])


if __name__ == '__main__':
    url = 'https://www.cnblogs.com/wjlv/'
    section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
    get_content(section_list, r'<p>(.*?)</p>')
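As a quick check of what the two patterns capture, here is a small self-contained example run against a made-up HTML fragment (the snippet is illustrative only, not the real cnblogs markup):

import re

sample = ('<a id="x" class="postTitle2" href="https://www.cnblogs.com/wjlv/p/1.html">'
          'First post</a> <p>hello</p> <p>world</p>')

# first pattern: capture (href, title) pairs from the post-title anchors
sections = re.findall(r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>', sample, re.S)
print(sections)    # [('https://www.cnblogs.com/wjlv/p/1.html', 'First post')]

# second pattern: capture paragraph bodies from an article page
paragraphs = re.findall(r'<p>(.*?)</p>', sample, re.S)
print(paragraphs)  # ['hello', 'world']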