Web Scraping Basics - bs4 Data Parsing Examples
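
All three examples below lean on the same few BeautifulSoup lookup methods: select() with CSS selectors, find(), and find_all(). As a quick reference, here is a minimal standalone sketch of how they behave; the HTML snippet is invented purely for illustration:

from bs4 import BeautifulSoup  # requires lxml to be installed

# Invented HTML snippet, shaped like the chapter lists scraped below
html = '<div class="book-mulu"><ul><li><a href="/ch1">Chapter 1</a></li></ul></div>'
soup = BeautifulSoup(html, 'lxml')

# select(): CSS selector, returns a list of Tags
lis = soup.select('.book-mulu > ul > li')

# find(): the first matching Tag (or None if nothing matches)
div = soup.find('div', class_='book-mulu')

# find_all(): every matching Tag, as a list-like ResultSet
links = div.find_all('a')

print(lis[0].a.string)   # Chapter 1  (text of the nested <a>)
print(links[0]['href'])  # /ch1       (attribute access on a Tag)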

Scraping the novel Romance of the Three Kingdoms (三国演义)

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup  # lxml must be installed for the 'lxml' parser

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # 1. Fetch the table-of-contents page
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Grab every li tag (one per chapter) with a CSS selector
    list_li = soup.select('.book-mulu > ul > li')
    fp = open('./sanguo.txt', 'w', encoding='utf-8')
    for li in list_li:
        detail_name = li.a.string
        detail_href = 'https://www.shicimingju.com' + li.a['href']
        # 2. Request each chapter's detail page and parse its content
        detail_page = requests.get(url=detail_href, headers=headers)
        detail_page.encoding = 'utf-8'
        detail_soup = BeautifulSoup(detail_page.text, 'lxml')
        # Locate the div that holds the chapter body by its class attribute
        div_tag = detail_soup.find('div', class_='chapter_content')
        # Strip the tags and keep only the text
        content = div_tag.text
        fp.write(detail_name + ':' + content + '\n')
        print(detail_name, 'scraped successfully!')
    fp.close()
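
The script above assumes every request succeeds. For anything beyond a one-off run it is worth adding a timeout and a status check; below is one possible hardening of the fetch step (the fetch() helper and its retry/timeout defaults are my own additions, not part of the original script):

import requests

def fetch(url, headers, retries=3, timeout=10):
    # Hypothetical helper: retry transient failures, raise on HTTP errors
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()   # turn 4xx/5xx responses into exceptions
            resp.encoding = 'utf-8'
            return resp
        except requests.RequestException as exc:
            print(f'attempt {attempt + 1}/{retries} failed: {exc}')
    raise RuntimeError(f'giving up on {url}')

Each requests.get(...) call in the scripts could then be swapped for fetch(...).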

Scraping the novel Sheng Xu (圣墟)

import requests
from bs4 import BeautifulSoup  # lxml must be installed for the 'lxml' parser
import time

if __name__ == "__main__":
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    fp = open('./sx.txt', 'w', encoding='utf8')
    url = 'http://www.bequgew.com/51561/'
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    # Option 1, with select(): grab every li under the div whose class is
    # article_texttitleb. The div holds several ul elements, so instead of the
    # child chain '.article_texttitleb > ul > li' use the descendant selector
    # '.article_texttitleb li', which matches li tags at any depth.
#    list_li = soup.select('.article_texttitleb li')

    # Option 2, the three lines below: equivalent to the select() call above
    # find_all() returns a list-like ResultSet of divs with class article_texttitleb
    list_div = soup.find_all('div', class_='article_texttitleb')
    # Re-parse the first matching div as its own soup
    div_bf = BeautifulSoup(str(list_div[0]), 'lxml')
    # find_all() again to collect every li tag (a ResultSet, not a dict)
    li_all = div_bf.find_all('li')
    for li in li_all:
        detail_name = li.a.string
        detail_link = "http://www.bequgew.com" + li.a['href']
        detail_response = requests.get(url=detail_link, headers=headers)
        detail_response.encoding = 'utf-8'
        detail_soup = BeautifulSoup(detail_response.text, 'lxml')
        # find() returns the first tag whose id is book_text (find_all() would
        # return a list); .text strips the markup, leaving only the chapter text
        content = detail_soup.find('div', id='book_text').text
        fp.write(detail_name + content + '\n\n')
        print(detail_name, '--', 'download complete')
        time.sleep(10)  # pause between chapter requests to go easy on the server
    fp.close()
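
One simplification worth noting: the str() + re-parse round trip in Option 2 is unnecessary, because a bs4 Tag supports the same search methods as the soup itself. A minimal sketch (the HTML snippet is invented for illustration):

from bs4 import BeautifulSoup

# Invented snippet mimicking a chapter list split across several ul tags
html = ('<div class="article_texttitleb">'
        '<ul><li><a href="/1">c1</a></li></ul>'
        '<ul><li><a href="/2">c2</a></li></ul>'
        '</div>')
soup = BeautifulSoup(html, 'lxml')

# A Tag can be searched directly; no need for BeautifulSoup(str(...), 'lxml')
div = soup.find('div', class_='article_texttitleb')
li_all = div.find_all('li')             # searches only inside that div
print([li.a['href'] for li in li_all])  # ['/1', '/2']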

Scraping images from Qiushibaike (糗事百科)

# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup  # lxml must be installed for the 'lxml' parser
import os

if __name__ == "__main__":
    # Create the output directory on the first run
    if not os.path.exists('./download'):
        os.mkdir('./download')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    url = 'https://www.qiushibaike.com/imgrank/'
    response = requests.get(url=url, headers=headers)
    parse_page = BeautifulSoup(response.text, 'lxml')
    # Every thumbnail img sits inside a div with class "thumb"
    img_list = parse_page.select('.thumb img')
    for img_src in img_list:
        # src is protocol-relative (//...), so prepend the scheme
        img_http = 'http:' + img_src['src']
        img_name = img_http.split('/')[-1]
        img_path = './download/' + img_name
        # .content gives the raw bytes, which is what an image file needs
        img_content = requests.get(url=img_http, headers=headers).content
        # The with block closes the file automatically; no explicit close() needed
        with open(img_path, 'wb') as fp:
            fp.write(img_content)
        print(img_name, '--', 'download complete')
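
Reading the whole file through .content is fine for thumbnails, but for larger downloads requests can stream the body in chunks instead of holding it all in memory. A sketch of that variant (download_image() and the chunk size are my own choices; stream=True and iter_content() are standard requests features):

import requests

def download_image(url, path, headers=None, chunk_size=8192):
    # Stream the response so large files never sit fully in memory
    with requests.get(url, headers=headers, stream=True, timeout=10) as resp:
        resp.raise_for_status()
        with open(path, 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                fp.write(chunk)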
