爬虫基础-bs4数据解析样例
抓取三国演义小说
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import lxml  # not used directly; required as BeautifulSoup's 'lxml' parser backend

if __name__ == "__main__":
    # Browser-like User-Agent so the site does not reject a scripted request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # 1. Fetch the table-of-contents page of the novel.
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # site serves UTF-8; force it to avoid mojibake
    soup = BeautifulSoup(response.text, 'lxml')
    # Each <li> under .book-mulu > ul holds one chapter title + link.
    list_li = soup.select('.book-mulu > ul > li')
    # 'with' guarantees the file is closed even if a chapter request raises.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in list_li:
            detail_name = li.a.string
            detail_href = 'https://www.shicimingju.com' + li.a['href']
            # Fetch the chapter detail page and parse out the chapter text.
            detail_page = requests.get(url=detail_href, headers=headers)
            detail_page.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail_page.text, 'lxml')
            # Locate the div that holds the chapter body.
            div_tag = detail_soup.find('div', class_='chapter_content')
            if div_tag is None:
                # Page layout changed or request was blocked; skip rather than crash.
                print(detail_name, '解析失败,已跳过')
                continue
            # .text strips all nested tags, leaving only the chapter text.
            content = div_tag.text
            fp.write(detail_name + ':' + content + '\n')
            print(detail_name, '爬取成功!!!')
抓取圣墟小说
import requests
from bs4 import BeautifulSoup
import lxml  # not used directly; required as BeautifulSoup's 'lxml' parser backend
import time

if __name__ == "__main__":
    # Browser-like User-Agent so the site does not reject a scripted request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Fetch the chapter index page of the novel.
    url = 'http://www.bequgew.com/51561/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'  # force UTF-8 to avoid mojibake
    soup = BeautifulSoup(response.text, 'lxml')
    # The chapter list lives in the div with class 'article_texttitleb'.
    # That div contains several <ul>s, so search for <li> at any depth
    # (equivalent to the CSS selector '.article_texttitleb li').
    # A Tag supports find_all directly -- no need to re-parse str(tag).
    list_div = soup.find_all('div', class_='article_texttitleb')
    li_all = list_div[0].find_all('li')
    # 'with' guarantees the file is closed even if a chapter request raises.
    with open('./sx.txt', 'w', encoding='utf8') as fp:
        for li in li_all:
            detail_name = li.a.string
            detail_link = "http://www.bequgew.com" + li.a['href']
            detail_response = requests.get(url=detail_link, headers=headers)
            detail_response.encoding = 'utf-8'
            detail_soup = BeautifulSoup(detail_response.text, 'lxml')
            # find() returns the first tag with id='book_text'; .text strips
            # nested markup, leaving only the chapter body.
            content = detail_soup.find('div', id='book_text').text
            fp.write(detail_name + content + '\n\n')
            print(detail_name, '--', '下载完毕')
            # Politeness delay between requests to avoid hammering the server.
            time.sleep(10)
抓取糗事百科图片
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import lxml  # not used directly; required as BeautifulSoup's 'lxml' parser backend
import os

if __name__ == "__main__":
    # Create the download directory on first run.
    if not os.path.exists('./download'):
        os.mkdir('./download')
    # Browser-like User-Agent so the site does not reject a scripted request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://www.qiushibaike.com/imgrank/'
    response = requests.get(url=url, headers=headers)
    parse_page = BeautifulSoup(response.text, 'lxml')
    # Every thumbnail <img> sits inside a .thumb container.
    img_list = parse_page.select('.thumb img')
    for img_src in img_list:
        # src attributes are protocol-relative ('//...'), so prepend the scheme.
        img_http = 'http:' + img_src['src']
        # Use the last path segment as the local file name.
        img_name = img_http.split('/')[-1]
        img_path = './download/' + img_name
        # .content gives the raw bytes of the image response.
        img_content = requests.get(url=img_http, headers=headers).content
        # 'with' closes the file automatically -- the original's extra
        # fp.close() after the with-block was redundant and is removed.
        with open(img_path, 'wb') as fp:
            fp.write(img_content)
        print(img_name, '--', '下载完成')