9. Web Scraping Case Studies

Case 1: Scraping the 中图网 (bookschina.com) TOP 1000 List

Approach:

  • Use requests and lxml to fetch the page's HTML;
  • Parse the HTML and extract the text under the relevant tags;
  • Start by scraping a single page, then loop to scrape multiple pages (a minimal skeleton of this flow is sketched below).
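A minimal sketch of that fetch-and-parse skeleton, with the URL and XPath taken from the full script below (headers and error handling omitted for brevity):

import requests
from lxml import etree

url = 'https://www.bookschina.com/24hour/1_0_1/'
response = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
response.encoding = 'gbk'  # the site serves GBK-encoded pages
html = etree.HTML(response.text)

# one <li> per book; print each title as a smoke test
for li in html.xpath("//div[@class='bookList']/ul/li"):
    print(li.xpath("./div[@class='infor']/h2/a/text()")[0])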

Scraping a Single Page

import requests
from lxml import etree
import pandas as pd

# Fetch the HTML
url = 'https://www.bookschina.com/24hour/1_0_1/'
headers = {
    'cookie': 'adtanchu=1; indexCache=yes; ASP.NET_SessionId=yqg0xeq1lqokp3x0d0ypq2lv; Hm_lvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733368583; HMACCOUNT=539CFA4C696961D6; user_sign=59baa7d23c894dfe9ddfb64184ca96cc; BookUser=1%7cff09a25d-60b7-46a3-beb0-7ec7de9a5fd8%7c1%7c0%7c638715864084629584%7c20180722%7cfa4041993af56a1a; UserSign=069f073dff21b10b; Hm_lpvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733371462',
    'host': 'www.bookschina.com',
    'referer': 'https://www.bookschina.com/24hour/1_0_1/',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36'
}
response = requests.get(url=url, headers=headers)
response.encoding = 'gbk'  # the site serves GBK-encoded pages ('gbk' is a superset of 'gb2312')
html_str = response.text
html = etree.HTML(html_str)

# Create empty lists up front to hold the scraped fields
name_list = []
author_list = []
publisher_list = []
comment_list = []
score_list = []
sellPrice_list = []
discount_list = []
priceTit_list = []

# Locate the tag path that holds each book's data
li_list = html.xpath("//div[@class='bookList']/ul/li")
for li in li_list:
    name = str(li.xpath("./div[@class='infor']/h2/a/text()")[0])
    name_list.append(name)

    author = str(li.xpath("./div[@class='infor']/div[@class='author']/a/text()")[0])
    author_list.append(author)

    publisher = str(li.xpath("./div[@class='infor']/div[@class='publisher']/a/text()")[0])
    publisher_list.append(publisher)

    comment = str(li.xpath("./div[@class='infor']/div[@class='startWrap']/a/text()")[0])
    comment_list.append(comment.replace('条评论', ''))

    # Sum the star icons to get the rating: class 'one' is a full star,
    # class 'half' is a half star
    startWrap_list = li.xpath("./div[@class='infor']/div[@class='startWrap']/i")
    score = 0
    for star in startWrap_list:
        if star.attrib.get('class') == 'one':
            score += 1
        elif star.attrib.get('class') == 'half':
            score += 0.5
    score_list.append(score)

    sellPrice = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='sellPrice']/text()")[0])
    sellPrice_list.append(sellPrice.replace('¥', ''))

    discount = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='discount']/text()")[0])
    discount_list.append(discount.replace('(', '').replace(')', ''))

    priceTit = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/del/text()")[0])
    priceTit_list.append(priceTit.replace('¥', ''))

dict1 = {
    '书名': name_list,
    '作者': author_list,
    '出版社': publisher_list,
    '评论数': comment_list,
    '评分': score_list,
    '售价': sellPrice_list,
    '折扣': discount_list,
    '原价': priceTit_list
}
pd.DataFrame(dict1).to_csv(r'D:\desk\中图网page1.csv', index=False)  # raw string avoids backslash-escape issues
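Hard-coding the encoding works here, but requests can also infer it from the response body. A hedged alternative using apparent_encoding (charset detection, which can be slow on large pages):

response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding  # let requests detect the charset instead of hard-coding it
html = etree.HTML(response.text)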

Scraping Multiple Pages

import requests
import pandas as pd
from lxml import etree
from datetime import datetime

# Create empty lists up front to hold the scraped fields
name_list = []
author_list = []
publisher_list = []
comment_list = []
score_list = []
sellPrice_list = []
discount_list = []
priceTit_list = []
activeIcon_list = []

if __name__ == '__main__':
    # Record the scrape time; later used to name the output CSV file
    time_now = datetime.strftime(datetime.now(), '%Y-%m-%d %H-%M-%S')

    for page in range(1, 35):
        print(f'Fetching page {page}'.center(50, '-'))

        # Fetch the HTML
        url = f'https://www.bookschina.com/24hour/1_0_{page}/'
        headers = {
            'cookie': 'adtanchu=1; indexCache=yes; ASP.NET_SessionId=yqg0xeq1lqokp3x0d0ypq2lv; Hm_lvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733368583; HMACCOUNT=539CFA4C696961D6; user_sign=59baa7d23c894dfe9ddfb64184ca96cc; BookUser=1%7cff09a25d-60b7-46a3-beb0-7ec7de9a5fd8%7c1%7c0%7c638715864084629584%7c20180722%7cfa4041993af56a1a; UserSign=069f073dff21b10b; Hm_lpvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733371462',
            'host': 'www.bookschina.com',
            'referer': f'https://www.bookschina.com/24hour/1_0_{page}/',
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36'
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'gbk'  # 'ansi' is a Windows-only codec; 'gbk' matches the site and is portable
        html_str = response.text
        html = etree.HTML(html_str)

        # Locate the tag path for each book and append the extracted text to the lists created above
        li_list = html.xpath("//div[@class='bookList']/ul/li")
        for li in li_list:
            name = str(li.xpath("./div[@class='infor']/h2/a/text()")[0])
            name_list.append(name)

            # Some books have no author or publisher; the xpath() call then returns an
            # empty list, so check for that instead of relying on try/except
            author = li.xpath("./div[@class='infor']/div[@class='author']/a/text()")
            author_list.append(', '.join(author) if author else '无作者信息')

            publisher = li.xpath("./div[@class='infor']/div[@class='publisher']/a/text()")
            publisher_list.append(', '.join(publisher) if publisher else '无出版信息')

            comment = str(li.xpath("./div[@class='infor']/div[@class='startWrap']/a/text()")[0])
            comment_list.append(comment.replace('条评论', ''))

            # Special cases need flexible handling: sum the star icons to rebuild the
            # rating, where class 'one' is a full star and 'half' is a half star
            startWrap_list = li.xpath("./div[@class='infor']/div[@class='startWrap']/i")
            score = 0
            for star in startWrap_list:
                if star.attrib.get('class') == 'one':
                    score += 1
                elif star.attrib.get('class') == 'half':
                    score += 0.5
            score_list.append(score)

            sellPrice = str(
                li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='sellPrice']/text()")[0])
            sellPrice_list.append(sellPrice.replace('¥', ''))

            discount = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='discount']/text()")[0])
            discount_list.append(discount.replace('(', '').replace(')', '').replace('折', ''))

            priceTit = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/del/text()")[0])
            priceTit_list.append(priceTit.replace('¥', ''))

            # Not every book carries an extra-discount badge; indexing [0] on an empty result raises IndexError
            try:
                activeIcon = str(li.xpath("./div[@class='infor']/div[@class='activeIcon']/a/text()")[0])
                activeIcon_list.append(activeIcon)
            except IndexError:
                activeIcon_list.append('无折上折')

    dict1 = {
        '书名': name_list,
        '作者': author_list,
        '出版社': publisher_list,
        '评论数': comment_list,
        '评分': score_list,
        '售价': sellPrice_list,
        '折扣': discount_list,
        '原价': priceTit_list,
        '折上折': activeIcon_list
    }
    pd.DataFrame(dict1).to_csv(rf'D:\desk\中图网page1-34({time_now}).csv', index=False)
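One caveat: the 34-page loop fires requests back to back. A short randomized pause at the end of each iteration is gentler on the server (a suggested addition, not part of the original script):

import random
import time

# inside the page loop, after each page has been parsed:
time.sleep(random.uniform(1, 3))  # wait 1-3 seconds before requesting the next page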

Case 2: Scraping the 猫眼电影 (Maoyan Movies) TOP 100 List

Overall approach:

  • Use selenium's webdriver to open a browser and fetch the page's HTML;
  • Parse the HTML and extract the text under the relevant tags;
  • Start by scraping a single page, then loop to scrape multiple pages (a minimal skeleton is sketched below).
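A minimal sketch of that flow, using the Selenium 4 locator API that the full scripts below also use (assumes Chrome and a matching chromedriver are installed):

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.maoyan.com/board/4")
time.sleep(5)  # crude wait for the page to render

# one <dd> per movie; print each title as a smoke test
for dd in browser.find_elements(By.XPATH, '//dl[@class="board-wrapper"]/dd'):
    print(dd.find_element(By.XPATH, './div/div/div[1]/p[1]/a').text)

browser.quit()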

Scraping a Single Page

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By  # Selenium 4 removed the find_element(s)_by_* helpers

# Create empty lists up front to hold the scraped fields
movie_rank_list = []
movie_name_list = []
movie_actor_list = []
movie_time_list = []
movie_score_list = []

# Open the browser and navigate to the target URL
browser = webdriver.Chrome()
browser.get("https://www.maoyan.com/board/4")
time.sleep(5)

# Grab the dd nodes that hold the movie entries
dd_list = browser.find_elements(By.XPATH, '//dl[@class="board-wrapper"]/dd')

# Walk the list and extract each field
for i in dd_list:
    movie_rank = i.find_element(By.XPATH, './i').text
    movie_rank_list.append(movie_rank)

    movie_name = i.find_element(By.XPATH, './div/div/div[1]/p[1]/a').text
    movie_name_list.append(movie_name)

    movie_actor = i.find_element(By.XPATH, './div/div/div[1]/p[2]').text
    movie_actor_list.append(movie_actor)

    movie_time = i.find_element(By.XPATH, './div/div/div[1]/p[3]').text
    movie_time_list.append(movie_time)

    # the score is split across two <i> tags: integer part and decimal part
    movie_score1 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[1]').text)
    movie_score2 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[2]').text)
    movie_score = movie_score1 + movie_score2
    movie_score_list.append(movie_score)

# Put the lists into a dict so they can be turned into a DataFrame
data_dict = {
    '排名': movie_rank_list,
    '电影名称': movie_name_list,
    '主演': movie_actor_list,
    '上映时间': movie_time_list,
    '评分': movie_score_list
}

# Convert the dict to a DataFrame and export it as a CSV file
pd.DataFrame(data_dict).to_csv(r'D:\desk\猫眼top10.csv', index=False)

# Close the browser
browser.quit()
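The fixed time.sleep(5) either over-waits or, on a slow connection, fires too early. Selenium's explicit waits block only until the target element exists; a sketch of swapping the sleep for one:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the movie board to appear, then continue immediately
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, '//dl[@class="board-wrapper"]'))
)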

Scraping Multiple Pages

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create empty lists up front to hold the scraped fields
movie_rank_list = []
movie_name_list = []
movie_actor_list = []
movie_time_list = []
movie_score_list = []

# Wrap the single-page scrape in a function the multi-page loop can call
def get_one_page(b):
    # Grab the dd nodes via the browser instance passed in
    dd_list = b.find_elements(By.XPATH, '//dl[@class="board-wrapper"]/dd')

    # Walk the list and extract each field
    for i in dd_list:
        movie_rank = i.find_element(By.XPATH, './i').text
        movie_rank_list.append(movie_rank)

        movie_name = i.find_element(By.XPATH, './div/div/div[1]/p[1]/a').text
        movie_name_list.append(movie_name)

        movie_actor = i.find_element(By.XPATH, './div/div/div[1]/p[2]').text
        movie_actor_list.append(movie_actor)

        movie_time = i.find_element(By.XPATH, './div/div/div[1]/p[3]').text
        movie_time_list.append(movie_time)

        # the score is split across two <i> tags: integer part and decimal part
        movie_score1 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[1]').text)
        movie_score2 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[2]').text)
        movie_score = movie_score1 + movie_score2
        movie_score_list.append(movie_score)


if __name__ == '__main__':
    # Open the browser and navigate to the target URL
    browser = webdriver.Chrome()
    browser.get("https://www.maoyan.com/board/4")
    time.sleep(3)

    # Loop by clicking the next-page button; when the last pager <li> no longer reads '下一页', the final page has been scraped
    while True:
        get_one_page(browser)
        next_page = browser.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/ul/li[8]/a')
        if next_page.text != '下一页':
            print('Done scraping!')
            break
        next_page.click()
        time.sleep(3)

    # Put the lists into a dict so they can be turned into a DataFrame
    data_dict = {
        '排名': movie_rank_list,
        '电影名称': movie_name_list,
        '主演': movie_actor_list,
        '上映时间': movie_time_list,
        '评分': movie_score_list
    }

    # Convert the dict to a DataFrame and export it as a CSV file
    pd.DataFrame(data_dict).to_csv(r'D:\desk\猫眼top100.csv', index=False)

    # Close the browser
    browser.quit()
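The pager XPath above hard-codes li[8], which breaks if the pagination layout shifts. Locating the link by its text and treating its absence as the stop signal is a sturdier variant (a sketch, assuming the 下一页 link disappears on the last page):

from selenium.common.exceptions import NoSuchElementException

while True:
    get_one_page(browser)
    try:
        next_page = browser.find_element(By.LINK_TEXT, '下一页')
    except NoSuchElementException:
        print('Done scraping!')  # no next-page link: last page reached
        break
    next_page.click()
    time.sleep(3)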