Python + Selenium 爬取网易云音乐歌单中的歌曲 id

不多说什么了,代码注释很全

项目依赖

import json
import time
from urllib.parse import parse_qs, urlsplit

from lxml import etree
from msedge.selenium_tools import Edge, EdgeOptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

代码

class Netease_spider:
    """Collect the song IDs linked from a NetEase Cloud Music playlist page.

    A Chromium-based Edge browser is driven through Selenium: the playlist
    page is opened, the content of the ``g_iframe`` frame is read once it is
    present, and every ``<a>`` whose ``href`` contains ``song?`` is parsed
    for its ``id`` query parameter.

    The default paths and URL below are machine/playlist specific; pass your
    own values to ``__init__`` instead of editing the class.
    """

    # Location of the Edge browser executable.
    DEFAULT_BINARY_LOCATION = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    # Location of the matching msedgedriver executable.
    DEFAULT_DRIVER_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
    # Playlist page that is crawled by default.
    DEFAULT_ORIGIN_URL = 'https://y.music.163.com/m/playlist?id=826019094&userid=551118830&creatorId=551118830'

    def __init__(self, origin_url=DEFAULT_ORIGIN_URL,
                 binary_location=DEFAULT_BINARY_LOCATION,
                 driver_path=DEFAULT_DRIVER_PATH):
        """Start the Edge browser used for crawling.

        Args:
            origin_url: URL of the playlist page to crawl.
            binary_location: Path to the Edge browser executable.
            driver_path: Path to the msedgedriver executable.
        """
        options = EdgeOptions()
        options.use_chromium = True
        options.binary_location = binary_location
        self.browser = Edge(options=options, executable_path=driver_path)

        self.originURL = origin_url
        # Kept for backward compatibility; nothing in this class fills it.
        self.data = []

    def close(self):
        """Quit the browser and its driver process."""
        self.browser.quit()

    def get_page(self, url, timeout=30, settle_seconds=5):
        """Open *url* and return the page source of its ``g_iframe`` frame.

        Args:
            url: Page to load.
            timeout: Seconds to wait for the ``g_iframe`` element to appear.
                The page may require the user to log in (QR-code scan)
                first, so this is also the time available for doing that.
            settle_seconds: Extra fixed delay that gives the playlist inside
                the frame time to load before its source is read.

        Returns:
            The HTML source of the frame as a string.

        Raises:
            selenium.common.exceptions.TimeoutException: If ``g_iframe`` is
                not present within *timeout* seconds.
        """
        self.browser.get(url)
        # Poll every 0.5 s until the frame element exists in the DOM.
        WebDriverWait(self.browser, timeout, 0.5).until(
            EC.presence_of_element_located((By.ID, "g_iframe"))
        )
        time.sleep(settle_seconds)
        self.browser.switch_to.frame('g_iframe')
        return self.browser.page_source

    @staticmethod
    def _song_id_from_href(href):
        """Return the ``id`` query parameter of *href*, or ``None`` if absent.

        Unlike splitting on ``=``, this picks the ``id`` parameter by name,
        so extra parameters or a different parameter order do not corrupt
        the result, and an href without an ``id`` does not raise.
        """
        parts = urlsplit(href)
        query = parts.query
        # Hash-routed links such as "#/song?id=1" carry the query string
        # inside the fragment.
        if not query and "?" in parts.fragment:
            query = parts.fragment.partition("?")[2]
        ids = parse_qs(query).get("id")
        return ids[0] if ids else None

    def parse4data(self, html):
        """Extract song IDs from *html*.

        Args:
            html: HTML source to scan.

        Returns:
            A list of ID strings, in document order, one per ``<a>`` element
            whose ``href`` contains ``song?`` and carries a non-empty ``id``
            parameter. Duplicates are kept.
        """
        tree = etree.HTML(html)
        if tree is None:
            # Nothing parseable (e.g. an empty document).
            return []
        id_list = []
        for href in tree.xpath('//a[contains(@href,"song?")]/@href'):
            music_id = self._song_id_from_href(href)
            if music_id is not None:
                id_list.append(music_id)
        return id_list

    def crawl(self):
        """Fetch the configured playlist page and return its song IDs."""
        print('爬取数据')
        html = self.get_page(self.originURL)
        return self.parse4data(html)

    
# Run the crawl and print the collected song IDs as a JSON array.
get = Netease_spider()
try:
    id_list = get.crawl()
finally:
    # Always shut the browser and its driver down, even when the crawl
    # fails, so no stray processes are left behind.
    get.browser.quit()
# json.dumps only returns a string; print it so a script run shows output.
print(json.dumps(id_list))
posted @ 2022-02-05 20:34  tifaIsMyWife  阅读(130)  评论(0)  编辑  收藏  举报