qq音乐付费爬取
使用selenium+Browsermob-Proxy实现付费爬取。(付费爬取需要有vip账号)
selenium负责实现自动化模拟点击,Browsermob-Proxy抓取请求。
分析
首先打开QQ音乐网站并搜索歌曲(URL中的歌曲名需要经过URL编码):https://y.qq.com/n/ryqq/search?w=美人鱼&t=song&remoteplace=txt.yqq.top
点击播放,然后按F12打开开发者工具,在网络(Network)面板中筛选media,圈起来的地址就是我们要下载的音频地址
将地址粘贴出来,并点击下载
会得到一个.m4a的文件,我们只需要使用第三方工具ffmpeg
将其转换为mp3格式即可。
源代码
爬取源代码如下
package com.watchmen.selenium;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.stream.Collectors;
import io.netty.handler.codec.http.HttpHeaders;
import org.openqa.selenium.*;
import io.netty.handler.codec.http.HttpRequest;
import io.netty.handler.codec.http.HttpResponse;
import net.lightbody.bmp.BrowserMobProxy;
import net.lightbody.bmp.BrowserMobProxyServer;
import net.lightbody.bmp.client.ClientUtil;
import net.lightbody.bmp.filters.RequestFilter;
import net.lightbody.bmp.proxy.CaptureType;
import net.lightbody.bmp.util.HttpMessageContents;
import net.lightbody.bmp.util.HttpMessageInfo;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxOptions;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.interactions.Actions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
/**
 * Batch-downloads songs from QQ Music by driving Firefox with Selenium and
 * capturing the player's audio request through BrowserMob Proxy.
 *
 * <p>For every title listed in {@code src/main/resources/music.txt} the browser
 * opens the search page and clicks "play" on the first hit. A proxy request
 * filter watches for requests whose {@code Sec-Fetch-Dest} header contains
 * {@code audio} and downloads that resource to the local music directory as
 * {@code <songName>.m4a}.
 *
 * <p>The proxy and the browser are started once, in the static initializer.
 */
public class SeleniumBrowserMobProxy {

    /** Proxy that all browser traffic is routed through. */
    public static BrowserMobProxy browserMobProxy;

    /** Firefox instance configured to use {@link #browserMobProxy}. */
    public static FirefoxDriver driver;

    /**
     * Title of the song whose audio request is expected next. Written by the
     * driving thread in {@link #catchMusic(String)} and read by the proxy's
     * filter callback, hence {@code volatile}.
     */
    public volatile String songName;

    static {
        browserMobProxy = new BrowserMobProxyServer();
        browserMobProxy.start();
        // The original code enabled REQUEST_CONTENT + RESPONSE_CONTENT and then
        // immediately called setHarCaptureTypes, which replaces the capture set;
        // only the effective call is kept.
        browserMobProxy.setHarCaptureTypes(CaptureType.RESPONSE_CONTENT);
        browserMobProxy.newHar("kk");
        driver = catchConfig();
    }

    public static void main(String[] args) throws Exception {
        new SeleniumBrowserMobProxy().start();
    }

    /**
     * Installs the audio-capturing request filter, then plays every song
     * listed in {@code src/main/resources/music.txt} (one title per line,
     * blank lines ignored).
     *
     * @throws Exception if the song list cannot be read or the browser
     *                   automation fails
     */
    private void start() throws Exception {
        browserMobProxy.addRequestFilter(new RequestFilter() {
            @Override
            public HttpResponse filterRequest(HttpRequest request, HttpMessageContents contents,
                                              HttpMessageInfo messageInfo) {
                HttpHeaders headers = request.headers();
                String dest = headers.get("Sec-Fetch-Dest");
                // Only requests issued for an audio destination are of interest.
                if (dest == null || !dest.contains("audio")) {
                    return null;
                }
                // Snapshot the title once so the file name and the log line agree
                // even if the driving thread moves on to the next song meanwhile.
                String currentSong = songName;
                if (currentSong == null) {
                    // No song has been requested yet; nothing sensible to name the file.
                    return null;
                }
                System.out.println("请求:" + request.uri());
                // The URI is prefixed with the streaming host before downloading.
                boolean saved = downloadFile("https://dl.stream.qqmusic.qq.com" + request.uri(),
                        "D:\\app\\qqmusic\\localMusic\\" + currentSong + ".m4a");
                if (saved) {
                    System.out.println(currentSong + "下载完成");
                } else {
                    System.err.println(currentSong + "下载失败");
                }
                // Returning null lets the original request continue unmodified.
                return null;
            }
        });

        List<String> musicList;
        // NOTE(review): FileReader decodes with the platform default charset;
        // kept as in the original so existing song lists keep working.
        try (BufferedReader reader = new BufferedReader(new FileReader("src/main/resources/music.txt"))) {
            musicList = reader.lines()
                    .map(String::trim)
                    .filter(line -> !line.isEmpty())
                    .collect(Collectors.toList());
        }
        for (String music : musicList) {
            catchMusic(music);
        }
    }

    /**
     * Searches for {@code name} and clicks the play button of the first result.
     *
     * @param name song title to search for
     * @throws InterruptedException if interrupted while waiting for the hover menu
     */
    private void catchMusic(String name) throws InterruptedException {
        WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(30));
        driver.get("https://y.qq.com/n/ryqq/search?w=" + URLEncoder.encode(name, StandardCharsets.UTF_8) + "&t=song&remoteplace=txt.yqq.top");
        WebElement songList = wait.until(ExpectedConditions.visibilityOfElementLocated(By.className("songlist__list")));

        List<WebElement> rows = songList.findElements(By.tagName("li"));
        if (rows.isEmpty()) {
            // Nothing to play; skip this title instead of failing the whole batch.
            System.err.println("未找到歌曲:" + name);
            return;
        }
        WebElement openDiv = rows.get(0).findElement(By.className("mod_list_menu"));

        // Hover the row menu before looking up the play icon.
        new Actions(driver).moveToElement(openDiv).perform();
        Thread.sleep(1500);

        WebElement open = openDiv.findElement(By.className("list_menu__icon_play"));
        // Publish the title BEFORE triggering playback: the proxy filter may see
        // the audio request as soon as the click is dispatched.
        songName = name;
        JavascriptExecutor js = (JavascriptExecutor) driver;
        js.executeScript("arguments[0].click();", open);
    }

    /**
     * Downloads {@code fileURL} and writes it to the file {@code saveDir}.
     *
     * @param fileURL absolute URL of the resource to fetch
     * @param saveDir destination file path (despite the name, a file, not a directory)
     * @return {@code true} if the file was written completely, {@code false} if
     *         an I/O error occurred (the stack trace is printed)
     */
    private static boolean downloadFile(String fileURL, String saveDir) {
        try {
            URLConnection connection = new URL(fileURL).openConnection();
            // Both streams are closed on every path, including failures mid-copy.
            try (InputStream inputStream = new BufferedInputStream(connection.getInputStream());
                 OutputStream outputStream = new FileOutputStream(saveDir)) {
                byte[] buffer = new byte[8192];
                int bytesRead;
                while ((bytesRead = inputStream.read(buffer)) != -1) {
                    outputStream.write(buffer, 0, bytesRead);
                }
            }
            System.out.println("File downloaded to: " + saveDir);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Walks through the password-login dialog. Not called anywhere in this
     * class; the account and password strings are empty placeholders that must
     * be filled in before use.
     *
     * @param wait explicit wait used for each step of the dialog
     */
    private static void login(WebDriverWait wait) {
        wait.until(ExpectedConditions.visibilityOfElementLocated(By.className("login-box-tit")));
        // Switch into the login frame.
        driver.switchTo().frame("login_frame");
        wait.until(ExpectedConditions.visibilityOfElementLocated(By.className("lay_login_form")));
        driver.switchTo().frame("ptlogin_iframe");
        // Wait until the ptlogin_iframe content is present.
        wait.until(ExpectedConditions.presenceOfElementLocated(By.id("u")));
        // Switch to password login.
        WebElement passLogin = driver.findElement(By.id("switcher_plogin"));
        Actions actions = new Actions(driver);
        actions.moveToElement(passLogin).perform();
        passLogin.click();
        // Wait for the password field to become visible.
        wait.until(ExpectedConditions.visibilityOfElementLocated(By.id("p")));
        // Enter the account.
        WebElement username = wait.until(ExpectedConditions.visibilityOfElementLocated(By.id("u")));
        username.sendKeys("");
        // Enter the password.
        driver.findElement(By.id("p")).sendKeys("");
        // Click the login button (the original located it but never clicked).
        driver.findElement(By.id("login_button")).click();
    }

    /**
     * Builds the Firefox driver, routed through {@link #browserMobProxy} and
     * accepting insecure certificates.
     *
     * @return a started {@link FirefoxDriver}
     */
    private static FirefoxDriver catchConfig() {
        System.setProperty("webdriver.gecko.driver", "D:\\app\\WebDriver\\geckodriver-v0.31.0-win64\\geckodriver.exe");
        FirefoxOptions options = new FirefoxOptions();
        FirefoxProfile profile = new FirefoxProfile();
        Proxy seleniumProxy = ClientUtil.createSeleniumProxy(browserMobProxy);
        options.setProxy(seleniumProxy);
        options.setAcceptInsecureCerts(true);
        // Path to the Firefox binary.
        options.setBinary("D:\\app\\firefox\\firefox.exe");
        // NOTE(review): the three arguments below look like Chrome-style switches;
        // confirm they have any effect on Firefox.
        // Disable GPU rendering.
        options.addArguments("--disable-gpu");
        // options.addArguments("--headless");
        // Ignore certificate errors.
        options.addArguments("ignore-certificate-errors");
        // Suppress the "browser is being automated" info bar.
        options.addArguments("--disable-infobars");
        // Anti-bot measure: intended to make window.navigator.webdriver report false.
        options.addPreference("dom.webdriver.enabled", false);
        // User-agent override.
        // NOTE(review): this profile is never attached to `options`, so the
        // override is not applied. Left as-is on purpose: a mobile user agent
        // may change the page the selectors in catchMusic rely on - confirm
        // intent before wiring it in or deleting it.
        profile.setPreference(
                "general.useragent.override",
                "Mozilla/5.0(iPhone;CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML,like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
        );
        return new FirefoxDriver(options);
    }
}
music.txt(位于src/main/resources目录下)中每行填写一个要搜索的歌曲名
maven依赖如下
<!-- Version properties referenced by the dependency list of this POM. -->
<!-- NOTE(review): the Java class in this article calls
     URLEncoder.encode(String, Charset), which was added in Java 10, and
     new WebDriverWait(driver, Duration), which is Selenium 4 API. Both look
     inconsistent with compiler level 8 and selenium 3.141.0 below; confirm
     the versions actually used to build. -->
<properties>
<maven.compiler.source>8</maven.compiler.source>
<guava.version>25.0-jre</guava.version>
<maven.compiler.target>8</maven.compiler.target>
<browsermob.version>2.1.0</browsermob.version>
<selenium.version>3.141.0</selenium.version>
<hutool.version>5.8.2</hutool.version>
<jsoup.version>1.5.2</jsoup.version>
</properties>
<!-- Libraries on the classpath of the crawler. -->
<dependencies>
<!-- BrowserMob Proxy; presumably the source of the net.lightbody.bmp imports - verify. -->
<dependency>
<groupId>net.lightbody.bmp</groupId>
<artifactId>browsermob-core</artifactId>
<version>${browsermob.version}</version>
</dependency>
<!-- Not referenced by the class shown in this article. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>${hutool.version}</version>
</dependency>
<!-- Not referenced by the class shown in this article. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<!-- Guava pinned explicitly; presumably to reconcile the versions pulled in
     transitively by Selenium and BrowserMob - TODO confirm. -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<!-- NOTE(review): declared in addition to browsermob-core; confirm it is needed. -->
<dependency>
<groupId>net.lightbody.bmp</groupId>
<artifactId>browsermob-legacy</artifactId>
<version>${browsermob.version}</version>
</dependency>
<!-- Selenium WebDriver (Firefox driver, waits, actions). -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>${selenium.version}</version>
</dependency>
</dependencies>