Basic usage of BeautifulSoup
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import re
import sys

if __name__ == "__main__":
    # Create the output txt file
    file = open('一念永恒.txt', 'w', encoding='utf-8')
    # Table-of-contents URL of the novel
    target_url = 'http://www.biqukan.com/1_1094/'
    # User-Agent
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    target_req = request.Request(url=target_url, headers=head)
    target_response = request.urlopen(target_req)
    target_html = target_response.read().decode('gbk', 'ignore')
    # Create the BeautifulSoup object
    listmain_soup = BeautifulSoup(target_html, 'lxml')

    # Search the document tree for every div tag whose class is "listmain"
    chapters = listmain_soup.find_all('div', class_='listmain')
    # Build a new BeautifulSoup object from the query result and keep parsing it
    download_soup = BeautifulSoup(str(chapters), 'lxml')
    # Count the chapters
    numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
    index = 1
    # Flag that marks where recording starts: keep only the links under the main
    # volume and drop the "latest chapters" links at the top of the list
    begin_flag = False
    # Walk every child node under the dl tag
    for child in download_soup.dl.children:
        # Skip bare newlines
        if child != '\n':
            # When the volume heading 《一念永恒》正文卷 is found, enable the flag
            if child.string == u"《一念永恒》正文卷":
                begin_flag = True
            # Crawl each chapter link and download its content
            if begin_flag == True and child.a != None:
                download_url = "http://www.biqukan.com" + child.a.get('href')
                download_req = request.Request(url=download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read().decode('gbk', 'ignore')
                download_name = child.string
                soup_texts = BeautifulSoup(download_html, 'lxml')
                texts = soup_texts.find_all(id='content', class_='showtxt')
                soup_text = BeautifulSoup(str(texts), 'lxml')
                write_flag = True
                file.write(download_name + '\n\n')
                # Write the chapter text to the file character by character
                for each in soup_text.div.text.replace('\xa0', ''):
                    # Stop writing at the first 'h': the chapter body ends with an
                    # http link back to the site, which we do not want in the file
                    if each == 'h':
                        write_flag = False
                    if write_flag == True and each != ' ':
                        file.write(each)
                    if write_flag == True and each == '\r':
                        file.write('\n')
                file.write('\n\n')
                # Print download progress
                sys.stdout.write("Downloaded: %.3f%%" % float(index / numbers * 100) + '\r')
                sys.stdout.flush()
                index += 1
    file.close()
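The two BeautifulSoup techniques the script leans on are find_all('div', class_='listmain') and walking dl.children while a flag marks where the real chapters begin. Below is a minimal, self-contained sketch of that pattern; the HTML fragment is invented for illustration and only mimics the structure of the real biqukan.com catalogue page.

from bs4 import BeautifulSoup

# Invented markup that mimics the catalogue structure the script expects:
# a div.listmain holding a dl of dt section headings and dd chapter links.
html = ('<div class="listmain"><dl>\n'
        '<dt>Latest chapters</dt>\n'
        '<dd><a href="/1_1094/999.html">Chapter 999</a></dd>\n'
        '<dt>《一念永恒》正文卷</dt>\n'
        '<dd><a href="/1_1094/1.html">Chapter 1</a></dd>\n'
        '<dd><a href="/1_1094/2.html">Chapter 2</a></dd>\n'
        '</dl></div>')

soup = BeautifulSoup(html, 'lxml')
listmain = soup.find('div', class_='listmain')  # same filter as find_all(...)[0]

begin_flag = False
for child in listmain.dl.children:
    if child == '\n':                  # skip the whitespace-only text nodes
        continue
    if child.string == u"《一念永恒》正文卷":
        begin_flag = True              # everything after this dt belongs to the main volume
    if begin_flag and child.a is not None:
        print(child.string, child.a.get('href'))
# Prints:
# Chapter 1 /1_1094/1.html
# Chapter 2 /1_1094/2.html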
>>> for link in soup.find_all('a'):
...     print(link.get('href'))
# Used to extract the href links from all the <a> tags
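The snippet above assumes a soup object has already been built from some page. A self-contained version, using a made-up HTML string purely for illustration:

from bs4 import BeautifulSoup

# Made-up HTML just to have something to parse
html = '<p><a href="http://example.com/1">one</a> <a href="http://example.com/2">two</a></p>'
soup = BeautifulSoup(html, 'lxml')

# Collect the href attribute of every <a> tag
for link in soup.find_all('a'):
    print(link.get('href'))
# Prints:
# http://example.com/1
# http://example.com/2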
Beautiful Soup 4.4.0 documentation (Chinese): http://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
The following article comes from a 博客园 (cnblogs) author: http://www.cnblogs.com/sakura3/p/8460224.html (copied here to make reviewing easier; thanks to the original author).
Scraping a novel:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup

# get_url_list: collect the URLs of every chapter into one list
def get_url_list(url):
    content = requests.get(url).content        # fetch the page content
    soup = BeautifulSoup(content, 'lxml')      # instantiate a BeautifulSoup object
    url_list = []                              # empty list for the chapter URLs
    # Select the chapter links on the page with a CSS selector; an equivalent
    # alternative is soup.find('div', {'id': 'list'}).find('dl').find_all('dd')
    urls = soup.select('#list > dl > dd > a')
    for i in urls:                             # walk the link of every chapter
        i = i.get('href')                      # get the href
        i = 'http://www.biquge.com.tw' + i     # the hrefs are relative, so build the full URL
        url_list.append(i)                     # add it to url_list
    return url_list

# Fetch the content of one chapter
def get_data(url):
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    # Without encoding='utf-8' the write below raises an encoding error
    f = open(r'C:\Users\HBX\Documents\staudy\HMXX.txt', 'a+', encoding='utf-8')
    # Chapter title
    text_name = soup.find('div', {'class': 'bookname'}).find('h1').text
    # Chapter body; it can also be fetched with a CSS selector such as soup.select('#content')
    text_content = soup.find('div', {'id': 'content'}).get_text()
    book = text_name + '\r\n' + text_content   # one complete chapter
    f.write(book + '\r\n')                     # write it out, followed by a newline
    f.close()                                  # close the file


if __name__ == '__main__':
    url = 'http://www.biquge.com.tw/18_18049/'  # table-of-contents page of the novel on biquge
    url_list = get_url_list(url)                # collect all the chapter URLs
    for i in url_list:                          # loop over the chapter URLs
        get_data(i)                             # download each chapter's content
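Compared with the first script, the main difference here is that the chapter links are picked with a CSS selector through soup.select(). A minimal sketch of both forms against an invented HTML fragment (the markup below is made up and only mimics the structure of the biquge list page):

from bs4 import BeautifulSoup

# Invented markup: a div#list holding a dl of dd chapter links
html = '''
<div id="list">
  <dl>
    <dd><a href="/18_18049/1.html">Chapter 1</a></dd>
    <dd><a href="/18_18049/2.html">Chapter 2</a></dd>
  </dl>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

# CSS-selector form, as used in get_url_list()
for a in soup.select('#list > dl > dd > a'):
    print('http://www.biquge.com.tw' + a.get('href'))

# Equivalent find()/find_all() form mentioned in the script's comments
for dd in soup.find('div', {'id': 'list'}).find('dl').find_all('dd'):
    print('http://www.biquge.com.tw' + dd.a.get('href'))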