不多说什么了,代码注释很全
项目依赖
import json
import time
from urllib.parse import parse_qs

from lxml import etree
from msedge.selenium_tools import Edge, EdgeOptions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
代码
class Netease_spider:
    """Collect the song IDs of a NetEase Cloud Music playlist using Edge via Selenium.

    The playlist page is opened in a real browser, the ``g_iframe`` frame is
    read once it is present, and every anchor whose ``href`` contains
    ``song?`` is turned into a song ID.

    The instance owns a browser process; call :meth:`close` (or use the
    instance as a context manager) when finished.
    """

    # Machine-specific defaults; override them through the constructor.
    DEFAULT_BINARY_LOCATION = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    DEFAULT_DRIVER_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe"
    DEFAULT_PLAYLIST_URL = 'https://y.music.163.com/m/playlist?id=826019094&userid=551118830&creatorId=551118830'

    def __init__(self, url=None, binary_location=None, driver_path=None):
        """Start a Chromium-based Edge browser session.

        Args:
            url: Playlist URL to crawl; defaults to ``DEFAULT_PLAYLIST_URL``.
            binary_location: Path to the Edge executable; defaults to
                ``DEFAULT_BINARY_LOCATION``.
            driver_path: Path to the matching ``msedgedriver`` executable;
                defaults to ``DEFAULT_DRIVER_PATH``.
        """
        options = EdgeOptions()
        options.use_chromium = True
        options.binary_location = binary_location or self.DEFAULT_BINARY_LOCATION
        self.browser = Edge(
            options=options,
            executable_path=driver_path or self.DEFAULT_DRIVER_PATH,
        )
        # URL of the NetEase Cloud Music playlist to crawl.
        self.originURL = url or self.DEFAULT_PLAYLIST_URL
        self.data = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Quit the browser and end the driver session."""
        self.browser.quit()

    def get_page(self, url, wait_timeout=30, settle_seconds=5):
        """Load *url* and return the page source of its ``g_iframe`` frame.

        Args:
            url: Page to open.
            wait_timeout: Seconds to wait for the ``g_iframe`` element. The
                site asks for a scan-to-login step, so this is the time the
                user has to complete it.
            settle_seconds: Extra seconds to sleep after the frame element
                appears so the playlist can finish loading.

        Returns:
            The HTML source of the ``g_iframe`` frame as a string.

        Raises:
            selenium.common.exceptions.TimeoutException: if ``g_iframe`` is
                not present within *wait_timeout* seconds.
        """
        self.browser.get(url)
        # Poll every 0.5 s until the element with id "g_iframe" exists.
        WebDriverWait(self.browser, wait_timeout, 0.5).until(
            EC.presence_of_element_located((By.ID, "g_iframe"))
        )
        # Give the playlist time to load inside the frame.
        time.sleep(settle_seconds)
        self.browser.switch_to.frame('g_iframe')
        return self.browser.page_source

    @staticmethod
    def _song_id_from_href(href):
        """Return the ``id`` query parameter of a song link, or ``None`` if absent."""
        # Take everything after the first '?', then drop any trailing fragment,
        # so both "/song?id=1&x=2" and "#/song?id=1" style links are handled.
        query = href.partition('?')[2].partition('#')[0]
        ids = parse_qs(query).get('id')
        return ids[0] if ids else None

    def parse4data(self, html):
        """Extract song IDs from playlist HTML.

        Args:
            html: HTML source, normally the value returned by :meth:`get_page`.

        Returns:
            A list of song ID strings in document order (duplicates kept).
            Links without an ``id`` query parameter are skipped; empty input
            yields an empty list.
        """
        if not html:
            return []
        root = etree.HTML(html)
        if root is None:
            return []
        hrefs = root.xpath('//a[contains(@href,"song?")]/@href')
        song_ids = (self._song_id_from_href(href) for href in hrefs)
        return [song_id for song_id in song_ids if song_id is not None]

    def crawl(self):
        """Fetch the configured playlist page and return its song IDs."""
        print('爬取数据')
        html = self.get_page(self.originURL)
        return self.parse4data(html)
# Run the crawl and print the collected song IDs as a JSON array.
get = Netease_spider()
try:
    id_list = get.crawl()
finally:
    # Always shut down the browser and its driver process, even if the crawl fails.
    get.browser.quit()
# The serialized result was previously discarded; print it so the script has output.
print(json.dumps(id_list))